diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..94db3572f08d81fcd73b04cae8781f6a2ce4fe0f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,101 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+deepTools/source/deeptools/test/test_corrGC/paired.bam filter=lfs diff=lfs merge=lfs -text
+deepTools/source/deeptools/test/test_heatmapper/heatmap_master_interpolation_bilinear.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/deeptools/test/test_heatmapper/profile_master_multi.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/_static/welcome_eLife_chrX_heatmap.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/_static/welcome_eLife_chrX_profile-1.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/_static/welcome_eLife_chrX_scaleR_heatmap.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/computeGCBias_Galaxy.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/computeMatrix_modes.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/computeMatrix_overview.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/correctGCBias_Galaxy.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_DataLib.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_FAQ_clusterLabeling.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_FAQ_filteringDuplicates.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_FAQ_IGV_dataset.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_FAQ_IGV.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_FAQ_info.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_FAQ_UCSC01.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_screenshot_dataSet.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_screenshot_dataSetStates.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_startsite_with_comments.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_startsite.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/Gal_UCSC.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_bamCompare.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_bamCoverage.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_clustHM01.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_clustHM02.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_clustHM03.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_computeGCbias.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_correctGCbias.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_multiBamSummary.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_plotCorrelation.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/GalHow_plotFingerprint.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/coverage_Ibrahim.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/hm_Bulut.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/hm_CpG.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/hm_DNase.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/hm_GC.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/hm_histonesGomez.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/gallery/hm_TATApsem.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/glossary_ascii.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/glossary_overview.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/glossary_sam.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/norm_IGVsnapshot_indFiles.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/plotCorrelation_galaxy.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/plotCoverage_annotated.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/QC_bamCorrelate_RNAseq.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/QC_fingerprint.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/QC_GCplots_input.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/QC_GCregionexclusion_UCSCscreenshot.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/QC_plotCoverage.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/start_collage.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/start_workflow.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleComputeMatrix1.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleComputeMatrix2.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleComputeMatrix3.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleHeatmap1.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleHeatmap2.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleHeatmap3.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleHeatmap4.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleProfile1.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/docs/images/test_plots/ExampleProfile2.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/bamCompare_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/bamCoverage_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/bamPEFragmentSize_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/bigwigCompare_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/computeGCBias_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/computeMatrix_advancedOutput.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/computeMatrix_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/computeMatrix_overview.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/computeMatrix_selectScores.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/multiBamSummary_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/multiBigwigSummary_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/norm_IGVsnapshot_indFiles.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotCorrelate_RNAseq.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotCorrelation_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotCoverage_annotated.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotCoverage_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotFingerprint_output.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotHeatmap_example.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotHeatmap_example02.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotPCA_annotated.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/plotProfiler_examples.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/QC_GCplots_input.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/QC_plotCoverage.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/static/images/visual_hm_DmelPolII.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/test-data/alignmentSieve.bam filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/test-data/alignmentSieve2.bam filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/test-data/alignmentSieve3.bam filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/test-data/correctGCBias_result1.bam filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/test-data/paired_chr2L.bam filter=lfs diff=lfs merge=lfs -text
+deepTools/source/galaxy/wrapper/test-data/paired_chr2L.cram filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/coverage_Ibrahim.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/hm_Bulut.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/hm_CpG.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/hm_DNase.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/hm_GC.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/hm_histonesGomez.png filter=lfs diff=lfs merge=lfs -text
+deepTools/source/gallery/hm_TATApsem.png filter=lfs diff=lfs merge=lfs -text
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..cf6226de76e54d7e3d3a5e3d692ecd468a4e4200
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.10
+
+RUN useradd -m -u 1000 user && python -m pip install --upgrade pip
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+ENV MCP_TRANSPORT=http
+ENV MCP_PORT=7860
+
+EXPOSE 7860
+
+CMD ["python", "deepTools/mcp_output/start_mcp.py"]
diff --git a/README.md b/README.md
index 227703e24bf7c5638504385dcd0f52c358e13c44..b4a6c4ed26a500663c0912de8199e214ae5612da 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,32 @@
---
-title: DeepTools
-emoji: 💻
-colorFrom: purple
-colorTo: yellow
+title: Deeptools MCP
+emoji: 🤖
+colorFrom: blue
+colorTo: purple
sdk: docker
+sdk_version: "4.26.0"
+app_file: app.py
pinned: false
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Deeptools MCP Service
+
+Auto-generated MCP service for deepTools.
+
+## Usage
+
+```
+https://<your-username>-deepTools-mcp.hf.space/mcp
+```
+
+## Connect with Cursor
+
+```json
+{
+ "mcpServers": {
+ "deepTools": {
+      "url": "https://<your-username>-deepTools-mcp.hf.space/mcp"
+ }
+ }
+}
+```
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..89476b8a665998b37683e090f044f4bf8f7bfdda
--- /dev/null
+++ b/app.py
@@ -0,0 +1,45 @@
+from fastapi import FastAPI
+import os
+import sys
+
+mcp_plugin_path = os.path.join(os.path.dirname(__file__), "deepTools", "mcp_output", "mcp_plugin")
+sys.path.insert(0, mcp_plugin_path)
+
+app = FastAPI(
+ title="Deeptools MCP Service",
+ description="Auto-generated MCP service for deepTools",
+ version="1.0.0"
+)
+
+@app.get("/")
+def root():
+ return {
+ "service": "Deeptools MCP Service",
+ "version": "1.0.0",
+ "status": "running",
+ "transport": os.environ.get("MCP_TRANSPORT", "http")
+ }
+
+@app.get("/health")
+def health_check():
+ return {"status": "healthy", "service": "deepTools MCP"}
+
+@app.get("/tools")
+def list_tools():
+ try:
+ from mcp_service import create_app
+ mcp_app = create_app()
+ tools = []
+ for tool_name, tool_func in mcp_app.tools.items():
+ tools.append({
+ "name": tool_name,
+ "description": tool_func.__doc__ or "No description available"
+ })
+ return {"tools": tools}
+ except Exception as e:
+ return {"error": f"Failed to load tools: {str(e)}"}
+
+if __name__ == "__main__":
+ import uvicorn
+ port = int(os.environ.get("PORT", 7860))
+ uvicorn.run(app, host="0.0.0.0", port=port)
diff --git a/deepTools/mcp_output/README_MCP.md b/deepTools/mcp_output/README_MCP.md
new file mode 100644
index 0000000000000000000000000000000000000000..d7b2b2b554ff71fea2e80dd583a7eed81c0b1d42
--- /dev/null
+++ b/deepTools/mcp_output/README_MCP.md
@@ -0,0 +1,64 @@
+# deepTools MCP (Model Context Protocol) Service
+
+## Project Introduction
+
+deepTools is a comprehensive suite of Python tools designed for the efficient analysis of high-throughput sequencing data, particularly for ChIP-seq, RNA-seq, and MNase-seq experiments. It addresses the challenges of handling large datasets by providing tools for normalized coverage file generation, quality control, and publication-ready visualizations. deepTools supports efficient parallel processing using the mapReduce framework, making it suitable for genome-scale computations.
+
+## Installation Method
+
+To install deepTools, ensure you have Python and the following dependencies:
+
+- numpy
+- matplotlib
+- pysam
+- pyBigWig
+
+Optional dependencies include:
+
+- scipy
+- pandas
+
+You can install deepTools using pip:
+
+```
+pip install deeptools
+```
+
+## Quick Start
+
+To quickly get started with deepTools, you can use the command-line interface (CLI) to call the main functions. Here are some examples:
+
+- **Calculate Coverage**: Use `bamCoverage` to calculate the coverage of BAM files and output a bigWig file.
+- **Compare BAM Files**: Use `bamCompare` to compare two BAM files and generate a bigWig file with the results.
+- **Generate Heatmaps**: Use `heatmapper` to generate heatmaps from computed matrices.
+
+Example command:
+
+```
+bamCoverage -b sample.bam -o output.bw
+```
+
+## Available Tools and Endpoints List
+
+1. **alignmentSieve**: Filters alignments based on various criteria.
+2. **bamCompare**: Compares two BAM files and generates a bigWig file.
+3. **bamCoverage**: Calculates the coverage of BAM files.
+4. **computeMatrix**: Computes a matrix of scores for genomic regions.
+5. **heatmapper**: Generates heatmaps from computed matrices.
+6. **multiBamSummary**: Aggregates read counts across multiple BAM files.
+7. **multiBigwigSummary**: Aggregates scores across multiple bigWig files.
+8. **plotCorrelation**: Performs correlation analysis with heatmap/scatter plot output.
+9. **plotHeatmap**: Creates customizable heatmaps.
+10. **plotProfile**: Generates average signal profile plots.
+
+## Common Issues and Notes
+
+- **Dependencies**: Ensure all required dependencies are installed. Optional dependencies can enhance functionality.
+- **Environment**: deepTools is compatible with most Unix-like systems. Ensure your environment supports Python and the necessary libraries.
+- **Performance**: For large datasets, consider using the mapReduce framework to leverage parallel processing capabilities.
+
+## Reference Links or Documentation
+
+For more detailed information, visit the [deepTools GitHub repository](https://github.com/deeptools/deepTools) or refer to the official [deepTools documentation](https://deeptools.readthedocs.io/en/develop/).
+
+For specific tool usage and workflows, see the [Typical Workflows](https://deeptools.readthedocs.io/en/develop/content/example_usage.html) section in the documentation.
\ No newline at end of file
diff --git a/deepTools/mcp_output/analysis.json b/deepTools/mcp_output/analysis.json
new file mode 100644
index 0000000000000000000000000000000000000000..bef5b875eb007377134e06fa02fd23d579f80d6f
--- /dev/null
+++ b/deepTools/mcp_output/analysis.json
@@ -0,0 +1,391 @@
+{
+ "summary": {
+ "repository_url": "https://github.com/deeptools/deepTools",
+ "summary": "Imported via zip fallback, file count: 81",
+ "file_tree": {
+ ".github/CONTRIBUTING.md": {
+ "size": 544
+ },
+ ".github/ISSUE_TEMPLATE.md": {
+ "size": 691
+ },
+ ".github/PULL_REQUEST_TEMPLATE.md": {
+ "size": 286
+ },
+ ".github/workflows/planemo.yml": {
+ "size": 1421
+ },
+ ".github/workflows/pypi.yml": {
+ "size": 616
+ },
+ ".github/workflows/test.yml": {
+ "size": 3118
+ },
+ ".readthedocs.yaml": {
+ "size": 193
+ },
+ "CHANGES.txt": {
+ "size": 40451
+ },
+ "LICENSE.txt": {
+ "size": 1241
+ },
+ "README.md": {
+ "size": 5910
+ },
+ "deeptools/SES_scaleFactor.py": {
+ "size": 7007
+ },
+ "deeptools/__init__.py": {
+ "size": 0
+ },
+ "deeptools/alignmentSieve.py": {
+ "size": 18200
+ },
+ "deeptools/bamCompare.py": {
+ "size": 14290
+ },
+ "deeptools/bamCoverage.py": {
+ "size": 18617
+ },
+ "deeptools/bamHandler.py": {
+ "size": 3345
+ },
+ "deeptools/bamPEFragmentSize.py": {
+ "size": 21247
+ },
+ "deeptools/bigwigAverage.py": {
+ "size": 4908
+ },
+ "deeptools/bigwigCompare.py": {
+ "size": 6614
+ },
+ "deeptools/cm.py": {
+ "size": 44838
+ },
+ "deeptools/computeGCBias.py": {
+ "size": 31006
+ },
+ "deeptools/computeMatrix.py": {
+ "size": 22446
+ },
+ "deeptools/computeMatrixOperations.py": {
+ "size": 32110
+ },
+ "deeptools/correctGCBias.py": {
+ "size": 26158
+ },
+ "deeptools/correlation.py": {
+ "size": 28078
+ },
+ "deeptools/correlation_heatmap.py": {
+ "size": 3796
+ },
+ "deeptools/countReadsPerBin.py": {
+ "size": 42159
+ },
+ "deeptools/deeptools_list_tools.py": {
+ "size": 3345
+ },
+ "deeptools/estimateReadFiltering.py": {
+ "size": 16606
+ },
+ "deeptools/estimateScaleFactor.py": {
+ "size": 4782
+ },
+ "deeptools/getFragmentAndReadSize.py": {
+ "size": 7011
+ },
+ "deeptools/getRatio.py": {
+ "size": 2326
+ },
+ "deeptools/getScaleFactor.py": {
+ "size": 12772
+ },
+ "deeptools/getScorePerBigWigBin.py": {
+ "size": 11967
+ },
+ "deeptools/heatmapper.py": {
+ "size": 58987
+ },
+ "deeptools/heatmapper_utilities.py": {
+ "size": 7169
+ },
+ "deeptools/mapReduce.py": {
+ "size": 9786
+ },
+ "deeptools/misc.py": {
+ "size": 597
+ },
+ "deeptools/multiBamSummary.py": {
+ "size": 11899
+ },
+ "deeptools/multiBigwigSummary.py": {
+ "size": 11291
+ },
+ "deeptools/parserCommon.py": {
+ "size": 43744
+ },
+ "deeptools/plotCorrelation.py": {
+ "size": 10984
+ },
+ "deeptools/plotCoverage.py": {
+ "size": 16329
+ },
+ "deeptools/plotEnrichment.py": {
+ "size": 25244
+ },
+ "deeptools/plotFingerprint.py": {
+ "size": 19876
+ },
+ "deeptools/plotHeatmap.py": {
+ "size": 37144
+ },
+ "deeptools/plotPCA.py": {
+ "size": 9427
+ },
+ "deeptools/plotProfile.py": {
+ "size": 39224
+ },
+ "deeptools/sumCoveragePerBin.py": {
+ "size": 9899
+ },
+ "deeptools/test/__init__.py": {
+ "size": 0
+ },
+ "deeptools/test/skiptest_heatmapper_images.py": {
+ "size": 5917
+ },
+ "deeptools/test/test_bamCoverage_and_bamCompare.py": {
+ "size": 17582
+ },
+ "deeptools/test/test_bigwigAverage.py": {
+ "size": 2864
+ },
+ "deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py": {
+ "size": 4603
+ },
+ "deeptools/test/test_computeMatrixOperations.py": {
+ "size": 12233
+ },
+ "deeptools/test/test_corrGC/R_gc_paired.txt": {
+ "size": 7525
+ },
+ "deeptools/test/test_corrGC/frequencies_data.txt": {
+ "size": 825
+ },
+ "deeptools/test/test_countReadsPerBin.py": {
+ "size": 8401
+ },
+ "deeptools/test/test_heatmapper.py": {
+ "size": 12550
+ },
+ "deeptools/test/test_multiBamSummary.py": {
+ "size": 1945
+ },
+ "deeptools/test/test_plotCoverage.py": {
+ "size": 1215
+ },
+ "deeptools/test/test_readFiltering.py": {
+ "size": 6229
+ },
+ "deeptools/test/test_tools.py": {
+ "size": 838
+ },
+ "deeptools/test/test_writeBedGraph.py": {
+ "size": 4462
+ },
+ "deeptools/utilities.py": {
+ "size": 14161
+ },
+ "deeptools/writeBedGraph.py": {
+ "size": 13223
+ },
+ "deeptools/writeBedGraph_bam_and_bw.py": {
+ "size": 9255
+ },
+ "docs/_static/welcome_owl.carousel.min.js": {
+ "size": 40401
+ },
+ "docs/conf.py": {
+ "size": 11119
+ },
+ "docs/requirements.txt": {
+ "size": 72
+ },
+ "galaxy/wrapper/.shed.yml": {
+ "size": 2719
+ },
+ "galaxy/wrapper/test-data/alignmentSieve.txt": {
+ "size": 102
+ },
+ "galaxy/wrapper/test-data/bamPEFragmentSize_lengths1.txt": {
+ "size": 115
+ },
+ "galaxy/wrapper/test-data/bamPEFragmentSize_result1.txt": {
+ "size": 613
+ },
+ "galaxy/wrapper/test-data/bamPEFragmentSize_table1.txt": {
+ "size": 810
+ },
+ "galaxy/wrapper/test-data/computeMatrixOperations.txt": {
+ "size": 50
+ },
+ "galaxy/wrapper/test-data/estimateReadFiltering.txt": {
+ "size": 353
+ },
+ "galaxy/wrapper/test-data/plotEnrichment_output.txt": {
+ "size": 197
+ },
+ "pyproject.toml": {
+ "size": 2395
+ },
+ "scripts/convertChromsBigWig.py": {
+ "size": 7412
+ },
+ "scripts/split_bed_into_multiple_files.py": {
+ "size": 822
+ }
+ },
+ "processed_by": "zip_fallback",
+ "success": true
+ },
+ "structure": {
+ "packages": [
+ "source.deeptools",
+ "source.deeptools.test"
+ ]
+ },
+ "dependencies": {
+ "has_environment_yml": false,
+ "has_requirements_txt": false,
+ "pyproject": true,
+ "setup_cfg": false,
+ "setup_py": false
+ },
+ "entry_points": {
+ "imports": [],
+ "cli": [],
+ "modules": []
+ },
+ "llm_analysis": {
+ "core_modules": [
+ {
+ "package": "source.deeptools",
+ "module": "alignmentSieve",
+ "functions": [
+ "main",
+ "parseArguments"
+ ],
+ "classes": [],
+ "description": "This module is responsible for filtering alignments based on various criteria."
+ },
+ {
+ "package": "source.deeptools",
+ "module": "bamCompare",
+ "functions": [
+ "main",
+ "parseArguments"
+ ],
+ "classes": [],
+ "description": "This module compares two BAM files and generates a bigWig file with the results."
+ },
+ {
+ "package": "source.deeptools",
+ "module": "bamCoverage",
+ "functions": [
+ "main",
+ "parseArguments"
+ ],
+ "classes": [],
+ "description": "This module calculates the coverage of BAM files and outputs a bigWig file."
+ },
+ {
+ "package": "source.deeptools",
+ "module": "computeMatrix",
+ "functions": [
+ "main",
+ "parseArguments"
+ ],
+ "classes": [],
+ "description": "This module computes a matrix of scores for genomic regions."
+ },
+ {
+ "package": "source.deeptools",
+ "module": "heatmapper",
+ "functions": [
+ "main",
+ "parseArguments"
+ ],
+ "classes": [],
+ "description": "This module generates heatmaps from the computed matrices."
+ }
+ ],
+ "cli_commands": [
+ {
+ "name": "alignmentSieve",
+ "module": "source.deeptools.alignmentSieve",
+ "description": "CLI command for filtering alignments based on various criteria."
+ },
+ {
+ "name": "bamCompare",
+ "module": "source.deeptools.bamCompare",
+ "description": "CLI command for comparing two BAM files and generating a bigWig file."
+ },
+ {
+ "name": "bamCoverage",
+ "module": "source.deeptools.bamCoverage",
+ "description": "CLI command for calculating the coverage of BAM files."
+ },
+ {
+ "name": "computeMatrix",
+ "module": "source.deeptools.computeMatrix",
+ "description": "CLI command for computing a matrix of scores for genomic regions."
+ },
+ {
+ "name": "heatmapper",
+ "module": "source.deeptools.heatmapper",
+ "description": "CLI command for generating heatmaps from computed matrices."
+ }
+ ],
+ "import_strategy": {
+ "primary": "import",
+ "fallback": "cli",
+ "confidence": 0.85
+ },
+ "dependencies": {
+ "required": [
+ "numpy",
+ "matplotlib",
+ "pysam",
+ "pyBigWig"
+ ],
+ "optional": [
+ "scipy",
+ "pandas"
+ ]
+ },
+ "risk_assessment": {
+ "import_feasibility": 0.8,
+ "intrusiveness_risk": "medium",
+ "complexity": "medium"
+ }
+ },
+ "deepwiki_analysis": {
+ "repo_url": "https://github.com/deeptools/deepTools",
+ "repo_name": "deepTools",
+ "content": "deeptools/deepTools\nInstallation and Getting Started\nTypical Workflows\nCore Processing Engines\nParallel Processing Framework (mapReduce)\nRead Counting Engine (countReadsPerBin)\nScore Extraction (getScorePerBigWigBin)\nMatrix Computation Engine (heatmapper)\nCoverage Generation and Normalization\nSingle-Sample Coverage (bamCoverage)\nSample Comparison (bamCompare)\nGC Bias Correction Pipeline\nMulti-Sample Analysis Tools\nData Integration (multiBamSummary and multiBigwigSummary)\nCorrelation Analysis (plotCorrelation)\nPrincipal Component Analysis (plotPCA)\nVisualization Tools\nMatrix Generation (computeMatrix)\nMatrix Operations (computeMatrixOperations)\nHeatmap Visualization (plotHeatmap)\nProfile Plots (plotProfile)\nQuality Control Tools\nChIP-seq Enrichment Assessment (plotFingerprint)\nFragment Size Analysis (bamPEFragmentSize)\nCoverage Distribution (plotCoverage)\nFeature Enrichment (plotEnrichment)\nRead Filtering Estimation (estimateReadFiltering and alignmentSieve)\nGalaxy Integration\nGalaxy Wrapper Architecture\nTool Shed Distribution and Installation\nTesting with Planemo\nDevelopment and Contributing\nProject Structure and Configuration\nCI/CD Pipeline\nDocumentation System\nRelease Process and Versioning\nFile Formats and Data Structures\nInput File Formats\nIntermediate File Formats\nOutput Formats and Visualization\nAdvanced Topics and Optimization\nPerformance Tuning\nFiltering and Read Processing Options\nNormalization Methods Deep Dive\nSpecialized Read Processing 
Modes\nCHANGES.txt\ndocs/content/about.rst\ndocs/content/changelog.rst\ndocs/content/example_api_tutorial.rst\ndocs/content/example_gallery.rst\ndocs/content/example_step_by_step.rst\ndocs/content/example_usage.rst\ndocs/content/help_faq.rst\ndocs/content/help_faq_galaxy.rst\ndocs/content/help_galaxy_intro.rst\ndocs/content/help_glossary.rst\ndocs/content/installation.rst\ndocs/content/list_of_tools.rst\ndocs/images/Gal_FAQ_filteringDuplicates.png\ndocs/images/Gal_FAQ_info.png\ndocs/index.rst\ngalaxy/wrapper/deepTools_macros.xml\ngalaxy/wrapper/estimateReadFiltering.xml\ngalaxy/wrapper/test-data/estimateReadFiltering.txt\nscripts/convertChromsBigWig.py\nPurpose and Scope\nThis page provides a high-level introduction to deepTools: what it is, what problems it solves, and its primary capabilities for analyzing high-throughput sequencing data. For detailed installation instructions, seeInstallation and Getting Started. For specific tool usage and workflows, seeTypical Workflows.\nSources:README.md9-11docs/index.rst7-8\nWhat is deepTools?\ndeepTools is a suite of Python tools developed for efficient analysis of high-throughput sequencing data, particularly ChIP-seq, RNA-seq, and MNase-seq experiments. 
It addresses the challenge of handling large amounts of data generated from DNA sequencing centers by providing:\nNormalized coverage file generationin standard bedGraph and bigWig formats\nQuality control modulesfor assessing data quality and technical biases\nPublication-ready visualizationsfor identifying enrichments and functional genome annotations\nEfficient parallel processingusing themapReduceframework for genome-scale computations\ndeepTools is available through three distinct usage modes:\nSources:README.md9-14docs/index.rst10-14CHANGES.txt62\nSystem Architecture\nThe following diagram illustrates the primary components of deepTools and how they relate to each other:\ndeepTools Component Architecture\nData Access LayerCore Processing EnginesTool Modules LayerUser Interface LayerCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)Galaxy XML Wrappers(galaxy/wrapper/*.xml)Python API(import deeptools.*)Quality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSizeCoverage ToolsbamCoveragebamComparebigwigCompareAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrixVisualization ToolsplotHeatmapplotProfileplotCorrelationcountReadsPerBin(deeptools/countReadsPerBin.py)getScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)heatmapper(deeptools/heatmapper.py)mapReduce(deeptools/mapReduce.py)pysam(BAM/CRAM files)pyBigWig(bigWig files)deeptoolsintervals(BED/GTF files)\nData Access Layer\nCore Processing Engines\nTool Modules Layer\nUser Interface Layer\nCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)\nGalaxy XML Wrappers(galaxy/wrapper/*.xml)\nPython API(import deeptools.*)\nQuality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSize\nCoverage ToolsbamCoveragebamComparebigwigCompare\nAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrix\nVisualization 
ToolsplotHeatmapplotProfileplotCorrelation\ncountReadsPerBin(deeptools/countReadsPerBin.py)\ngetScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)\nheatmapper(deeptools/heatmapper.py)\nmapReduce(deeptools/mapReduce.py)\npysam(BAM/CRAM files)\npyBigWig(bigWig files)\ndeeptoolsintervals(BED/GTF files)\nSources:docs/content/list_of_tools.rst1-46pyproject.toml24-50README.md34-56\nCore Capabilities\ndeepTools provides five major functional categories that address different stages of sequencing data analysis:\n1. Quality Control and Preprocessing\nTools for assessing data quality before analysis:\nplotFingerprint- ChIP-seq enrichment assessment with quality metrics (Jensen-Shannon distance, CHANCE statistics)\nplotFingerprint\ncomputeGCBias/correctGCBias- GC bias detection and correction\ncomputeGCBias\ncorrectGCBias\nbamPEFragmentSize- Fragment size distribution analysis for paired-end data\nbamPEFragmentSize\nplotCoverage- Coverage distribution and genome coverage metrics\nplotCoverage\nestimateReadFiltering- Preview effects of filtering parameters\nestimateReadFiltering\nSources:docs/content/list_of_tools.rst18-32galaxy/wrapper/estimateReadFiltering.xml1-118\n2. Coverage Generation and Normalization\nTools for converting aligned reads (BAM) to normalized coverage tracks (bigWig/bedGraph):\nbamCoverage- Single-sample normalization with RPKM, CPM, BPM, RPGC methods\nbamCoverage\nbamCompare- Two-sample comparison (e.g., ChIP vs. input) with log2ratio, difference, mean operations\nbigwigCompare- Comparison operations on bigWig files\nbigwigCompare\nbigwigAverage- Averaging multiple bigWig files\nbigwigAverage\nSources:docs/content/list_of_tools.rst24-27CHANGES.txt28\n3. 
Data Aggregation\nTools for integrating multi-sample data:\nmultiBamSummary- Read count aggregation across BAM files (bins or BED-defined regions)\nmultiBamSummary\nmultiBigwigSummary- Score aggregation across bigWig files\nmultiBigwigSummary\ncomputeMatrix- Signal computation over genomic regions in two modes:scale-regionsandreference-point\ncomputeMatrix\nscale-regions\nreference-point\nSources:docs/content/list_of_tools.rst10-29\n4. Analysis and Statistics\nTools for deriving statistical insights:\nplotCorrelation- Pearson/Spearman correlation with hierarchical clustering and heatmap/scatter plot output\nplotCorrelation\nplotPCA- Principal component analysis for dimensionality reduction\nplotEnrichment- Feature enrichment quantification\nplotEnrichment\nSources:docs/content/list_of_tools.rst14-17docs/content/example_usage.rst32-44\n5. Visualization\nTools for creating publication-ready plots:\nplotHeatmap- Customizable heatmaps with clustering, color schemes, and multi-sample/region support\nplotHeatmap\nplotProfile- Average signal profile plots (meta-profiles) with standard error/deviation options\nplotProfile\ncomputeMatrixOperations- Matrix manipulation (filter, subset, sort, combine)\ncomputeMatrixOperations\nMultiple output formats supported: PNG, PDF, SVG, and interactive HTML (via plotly)\nSources:docs/content/list_of_tools.rst34-36CHANGES.txt187\nFile Format Ecosystem\ndeepTools operates on standard genomics file formats and performs conversions between them:\nFile Format Flow Diagram\nOutput FormatsIntermediate FormatsdeepTools ProcessingInput FormatsBAM/CRAM(aligned reads)bigWig(coverage signal)BED/GTF/GFF(genomic regions)2bit/FASTA(reference genome)bamCoveragebamComparemultiBamSummarymultiBigwigSummarycomputeMatrixcomputeGCBiasNPZ matrix(multiBamSummary output)Matrix .gz(computeMatrix output)Tabular(GC frequencies)bigWig/bedGraph(coverage tracks)PNG/PDF/SVG(static plots)HTML(interactive plotly)Tabular(data tables)\nOutput Formats\nIntermediate 
Formats\ndeepTools Processing\nInput Formats\nBAM/CRAM(aligned reads)\nbigWig(coverage signal)\nBED/GTF/GFF(genomic regions)\n2bit/FASTA(reference genome)\nbamCoverage\nmultiBamSummary\nmultiBigwigSummary\ncomputeMatrix\ncomputeGCBias\nNPZ matrix(multiBamSummary output)\nMatrix .gz(computeMatrix output)\nTabular(GC frequencies)\nbigWig/bedGraph(coverage tracks)\nPNG/PDF/SVG(static plots)\nHTML(interactive plotly)\nTabular(data tables)\nKey intermediate formats serve as bridges between data aggregation and visualization:\nNPZ files- Compressed NumPy arrays storing multi-sample read counts or scores (output frommultiBamSummary/multiBigwigSummary)\nmultiBamSummary\nmultiBigwigSummary\nMatrix .gz files- Compressed matrices of signal values over genomic regions (output fromcomputeMatrix)\ncomputeMatrix\nSources:docs/content/help_glossary.rst82-174docs/content/list_of_tools.rst8-45\nParallel Processing Framework\nAll compute-intensive operations in deepTools utilize themapReduceframework for efficient genome-scale processing:\nmapReduce Execution Model\nmapReduce Framework (deeptools/mapReduce.py)Input:BAM/bigWig filesBED regionsParametersGenome Chunking(~400k reads per chunk)Worker Pool(multiprocessing.Pool)Worker 1:Process chunk 1Worker 2:Process chunk 2Worker N:Process chunk NResult Aggregation(concatenate/merge)Output:Aggregated results\nmapReduce Framework (deeptools/mapReduce.py)\nInput:BAM/bigWig filesBED regionsParameters\nGenome Chunking(~400k reads per chunk)\nWorker Pool(multiprocessing.Pool)\nWorker 1:Process chunk 1\nWorker 2:Process chunk 2\nWorker N:Process chunk N\nResult Aggregation(concatenate/merge)\nOutput:Aggregated results\nThe framework provides:\nAutomatic parallelizationacross user-specified number of processors (via--numberOfProcessors)",
+ "model": "gpt-4o-2024-08-06",
+ "source": "selenium",
+ "success": true
+ },
+ "deepwiki_options": {
+ "enabled": true,
+ "model": "gpt-4o-2024-08-06"
+ },
+ "risk": {
+ "import_feasibility": 0.8,
+ "intrusiveness_risk": "medium",
+ "complexity": "medium"
+ }
+}
\ No newline at end of file
diff --git a/deepTools/mcp_output/diff_report.md b/deepTools/mcp_output/diff_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a70112fc93527895116090c8b72cb26804df239
--- /dev/null
+++ b/deepTools/mcp_output/diff_report.md
@@ -0,0 +1,73 @@
+# DeepTools Project Difference Report
+
+**Date:** January 31, 2026
+**Time:** 18:24:38
+**Repository:** deepTools
+**Project Type:** Python Library
+**Intrusiveness:** None
+**Workflow Status:** Success
+**Test Status:** Failed
+
+## Project Overview
+
+DeepTools is a Python library designed to facilitate the analysis and visualization of high-throughput sequencing data. It provides a suite of tools for processing and interpreting large datasets, making it an essential resource for bioinformatics research.
+
+## Difference Analysis
+
+### New Files
+
+In this update, 8 new files have been introduced to the deepTools repository. These files likely contain new features or enhancements to existing functionalities. However, no existing files were modified, indicating that the new additions are supplementary rather than replacements or updates to current code.
+
+### Modified Files
+
+There were no modifications to existing files in this update. This suggests that the core functionalities of the library remain unchanged, and the focus was on expanding capabilities or adding new features.
+
+## Technical Analysis
+
+### Workflow Status
+
+The workflow status is marked as "success," indicating that the integration and deployment processes were completed without any errors. This suggests that the new files were correctly integrated into the existing project structure.
+
+### Test Status
+
+The test status is marked as "failed" because the original project's test suite did not pass, while the MCP plugin's own tests succeeded. This suggests the failure originates in the existing project's tests or environment rather than in the newly added plugin files; it is essential to identify and resolve the underlying cause to ensure the reliability and stability of the library.
+
+## Recommendations and Improvements
+
+1. **Conduct Thorough Testing:**
+ - Perform detailed unit and integration testing on the new files to identify the root cause of the test failures.
+ - Ensure that all new functionalities are covered by test cases to prevent future issues.
+
+2. **Code Review:**
+ - Conduct a comprehensive code review of the new files to ensure adherence to coding standards and best practices.
+ - Identify any potential areas for optimization or refactoring.
+
+3. **Documentation Update:**
+ - Update the project documentation to include information about the new features and how they integrate with existing functionalities.
+ - Ensure that any new dependencies or installation instructions are clearly outlined.
+
+4. **Bug Fixes:**
+ - Prioritize fixing the issues causing test failures to restore the library's functionality.
+ - Implement a bug tracking system to monitor and resolve any new issues that arise.
+
+## Deployment Information
+
+The deployment process was successful, indicating that the new files were correctly integrated into the project. However, due to the test failures, it is recommended to hold off on any production deployment until the issues are resolved.
+
+## Future Planning
+
+1. **Feature Expansion:**
+ - Continue to expand the library's capabilities by introducing new tools and functionalities that align with user needs and industry trends.
+
+2. **Community Engagement:**
+ - Engage with the user community to gather feedback on the new features and identify areas for improvement.
+
+3. **Regular Updates:**
+ - Implement a regular update schedule to ensure that the library remains up-to-date with the latest advancements in bioinformatics.
+
+4. **Enhanced Testing Framework:**
+ - Develop a more robust testing framework to catch issues earlier in the development process and improve overall software quality.
+
+## Conclusion
+
+The recent update to the deepTools project has introduced new MCP plugin features. Although the plugin's own tests passed, the original project's test suite did not, and this failure highlights the need for immediate attention to ensure the library's reliability. By addressing the recommendations outlined in this report, the project can continue to provide valuable tools for the bioinformatics community while maintaining high standards of quality and performance.
\ No newline at end of file
diff --git a/deepTools/mcp_output/mcp_plugin/__init__.py b/deepTools/mcp_output/mcp_plugin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/deepTools/mcp_output/mcp_plugin/adapter.py b/deepTools/mcp_output/mcp_plugin/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b5d22b4f16b7a0ccd71cae0f9745046fbc0c4c9
--- /dev/null
+++ b/deepTools/mcp_output/mcp_plugin/adapter.py
@@ -0,0 +1,139 @@
+import os
+import sys
+
+# Path settings
+source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
+sys.path.insert(0, source_path)
+
+# Import statements
+try:
+ from deeptools.alignmentSieve import alignmentSieve
+ from deeptools.bamCompare import bamCompare
+ from deeptools.bamCoverage import bamCoverage
+ from deeptools.computeMatrix import computeMatrix
+ from deeptools.heatmapper import heatmapper
+except ImportError as e:
+ print(f"ImportError: {e}. Ensure the source directory is correctly set.")
+
+class Adapter:
+ """
+ Adapter class for deepTools functionalities.
+ Provides methods to interact with various deepTools modules.
+ """
+
+ def __init__(self):
+ self.mode = "import"
+
+ # -------------------------------------------------------------------------
+ # Alignment Sieve Module
+ # -------------------------------------------------------------------------
+
+ def run_alignment_sieve(self, input_file, output_file, **kwargs):
+ """
+ Filters alignments based on various criteria using alignmentSieve.
+
+ Parameters:
+ - input_file: str, path to the input BAM file.
+ - output_file: str, path to the output BAM file.
+ - kwargs: additional parameters for alignmentSieve.
+
+ Returns:
+ - dict: status of the operation.
+ """
+ try:
+ alignmentSieve(input_file=input_file, output_file=output_file, **kwargs)
+ return {"status": "success", "message": "Alignment sieve completed successfully."}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to run alignment sieve: {e}"}
+
+ # -------------------------------------------------------------------------
+ # BAM Compare Module
+ # -------------------------------------------------------------------------
+
+ def run_bam_compare(self, bamfile1, bamfile2, output_file, **kwargs):
+ """
+ Compares two BAM files and generates a bigWig file using bamCompare.
+
+ Parameters:
+ - bamfile1: str, path to the first BAM file.
+ - bamfile2: str, path to the second BAM file.
+ - output_file: str, path to the output bigWig file.
+ - kwargs: additional parameters for bamCompare.
+
+ Returns:
+ - dict: status of the operation.
+ """
+ try:
+ bamCompare(bamfile1=bamfile1, bamfile2=bamfile2, output_file=output_file, **kwargs)
+ return {"status": "success", "message": "BAM comparison completed successfully."}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to compare BAM files: {e}"}
+
+ # -------------------------------------------------------------------------
+ # BAM Coverage Module
+ # -------------------------------------------------------------------------
+
+ def run_bam_coverage(self, bamfile, output_file, **kwargs):
+ """
+ Calculates the coverage of BAM files using bamCoverage.
+
+ Parameters:
+ - bamfile: str, path to the BAM file.
+ - output_file: str, path to the output coverage file.
+ - kwargs: additional parameters for bamCoverage.
+
+ Returns:
+ - dict: status of the operation.
+ """
+ try:
+ bamCoverage(bamfile=bamfile, output_file=output_file, **kwargs)
+ return {"status": "success", "message": "BAM coverage calculation completed successfully."}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to calculate BAM coverage: {e}"}
+
+ # -------------------------------------------------------------------------
+ # Compute Matrix Module
+ # -------------------------------------------------------------------------
+
+ def run_compute_matrix(self, score_file, regions_file, output_file, **kwargs):
+ """
+ Computes a matrix of scores for genomic regions using computeMatrix.
+
+ Parameters:
+ - score_file: str, path to the score file.
+ - regions_file: str, path to the regions file.
+ - output_file: str, path to the output matrix file.
+ - kwargs: additional parameters for computeMatrix.
+
+ Returns:
+ - dict: status of the operation.
+ """
+ try:
+ computeMatrix(score_file=score_file, regions_file=regions_file, output_file=output_file, **kwargs)
+ return {"status": "success", "message": "Matrix computation completed successfully."}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to compute matrix: {e}"}
+
+ # -------------------------------------------------------------------------
+ # Heatmapper Module
+ # -------------------------------------------------------------------------
+
+ def run_heatmapper(self, matrix_file, output_file, **kwargs):
+ """
+ Generates heatmaps from computed matrices using heatmapper.
+
+ Parameters:
+ - matrix_file: str, path to the matrix file.
+ - output_file: str, path to the output heatmap file.
+ - kwargs: additional parameters for heatmapper.
+
+ Returns:
+ - dict: status of the operation.
+ """
+ try:
+ heatmapper(matrix_file=matrix_file, output_file=output_file, **kwargs)
+ return {"status": "success", "message": "Heatmap generation completed successfully."}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to generate heatmap: {e}"}
+
+# End of Adapter class definition
\ No newline at end of file
diff --git a/deepTools/mcp_output/mcp_plugin/main.py b/deepTools/mcp_output/mcp_plugin/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..fca6ec384e22f703b287550e94cc00baaaa4c4a7
--- /dev/null
+++ b/deepTools/mcp_output/mcp_plugin/main.py
@@ -0,0 +1,13 @@
+"""
+MCP Service Auto-Wrapper - Auto-generated
+"""
+from mcp_service import create_app
+
+def main():
+ """Main entry point"""
+ app = create_app()
+ return app
+
+if __name__ == "__main__":
+ app = main()
+ app.run()
\ No newline at end of file
diff --git a/deepTools/mcp_output/mcp_plugin/mcp_service.py b/deepTools/mcp_output/mcp_plugin/mcp_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cb98f96b75381ce72047e93851ee652a7ee4d45
--- /dev/null
+++ b/deepTools/mcp_output/mcp_plugin/mcp_service.py
@@ -0,0 +1,102 @@
+import os
+import sys
+
+# Path settings to include the local source directory
+source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
+if source_path not in sys.path:
+ sys.path.insert(0, source_path)
+
+from fastmcp import FastMCP
+from deeptools.alignmentSieve import alignmentSieve
+from deeptools.bamCompare import bamCompare
+from deeptools.bamCoverage import bamCoverage
+from deeptools.computeMatrix import computeMatrix
+from deeptools.heatmapper import heatmapper
+
+mcp = FastMCP("deepToolsService")
+
+@mcp.tool(name="alignment_sieve", description="Filter alignments based on various criteria.")
+def alignment_sieve(input_file: str, output_file: str, min_length: int, max_length: int) -> dict:
+ """
+ Filters alignments in a BAM file based on length criteria.
+
+ :param input_file: Path to the input BAM file.
+ :param output_file: Path to the output BAM file.
+ :param min_length: Minimum alignment length to retain.
+ :param max_length: Maximum alignment length to retain.
+ :return: Dictionary with success status and result or error message.
+ """
+ try:
+ alignmentSieve(input_file, output_file, min_length, max_length)
+ return {"success": True, "result": f"Filtered alignments saved to {output_file}"}
+ except Exception as e:
+ return {"success": False, "error": str(e)}
+
+@mcp.tool(name="bam_compare", description="Compare two BAM files and generate a bigWig file.")
+def bam_compare(bam_file1: str, bam_file2: str, output_file: str) -> dict:
+ """
+ Compares two BAM files and generates a bigWig file.
+
+ :param bam_file1: Path to the first BAM file.
+ :param bam_file2: Path to the second BAM file.
+ :param output_file: Path to the output bigWig file.
+ :return: Dictionary with success status and result or error message.
+ """
+ try:
+ bamCompare(bam_file1, bam_file2, output_file)
+ return {"success": True, "result": f"Comparison result saved to {output_file}"}
+ except Exception as e:
+ return {"success": False, "error": str(e)}
+
+@mcp.tool(name="bam_coverage", description="Calculate the coverage of BAM files.")
+def bam_coverage(bam_file: str, output_file: str) -> dict:
+ """
+ Calculates the coverage of a BAM file and outputs a bigWig file.
+
+ :param bam_file: Path to the BAM file.
+ :param output_file: Path to the output bigWig file.
+ :return: Dictionary with success status and result or error message.
+ """
+ try:
+ bamCoverage(bam_file, output_file)
+ return {"success": True, "result": f"Coverage data saved to {output_file}"}
+ except Exception as e:
+ return {"success": False, "error": str(e)}
+
+@mcp.tool(name="compute_matrix", description="Compute a matrix of scores for genomic regions.")
+def compute_matrix(input_file: str, output_file: str) -> dict:
+ """
+ Computes a matrix of scores for genomic regions from an input file.
+
+ :param input_file: Path to the input file.
+ :param output_file: Path to the output matrix file.
+ :return: Dictionary with success status and result or error message.
+ """
+ try:
+ computeMatrix(input_file, output_file)
+ return {"success": True, "result": f"Matrix computed and saved to {output_file}"}
+ except Exception as e:
+ return {"success": False, "error": str(e)}
+
+@mcp.tool(name="heatmapper", description="Generate heatmaps from computed matrices.")
+def generate_heatmap(matrix_file: str, output_file: str) -> dict:
+ """
+ Generates a heatmap from a computed matrix file.
+
+ :param matrix_file: Path to the matrix file.
+ :param output_file: Path to the output heatmap file.
+ :return: Dictionary with success status and result or error message.
+ """
+ try:
+ heatmapper(matrix_file, output_file)
+ return {"success": True, "result": f"Heatmap generated and saved to {output_file}"}
+ except Exception as e:
+ return {"success": False, "error": str(e)}
+
+def create_app() -> FastMCP:
+ """
+ Creates and returns the FastMCP application instance.
+
+ :return: FastMCP instance.
+ """
+ return mcp
\ No newline at end of file
diff --git a/deepTools/mcp_output/requirements.txt b/deepTools/mcp_output/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2a163c16f4781c03ef2978d50d2ee76392e2db95
--- /dev/null
+++ b/deepTools/mcp_output/requirements.txt
@@ -0,0 +1,13 @@
+fastmcp
+fastapi
+uvicorn[standard]
+pydantic>=2.0.0
+numpy >= 2.0.0
+scipy >= 0.17.0
+matplotlib >= 3.5.0
+pysam >= 0.14.0
+numpydoc >= 0.5
+pyBigWig >= 0.2.1
+py2bit >= 0.2.0
+plotly >= 4.9
+deeptoolsintervals >= 0.1.8
diff --git a/deepTools/mcp_output/start_mcp.py b/deepTools/mcp_output/start_mcp.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc7fcbd9646ad53f089fc94af8129043a703325a
--- /dev/null
+++ b/deepTools/mcp_output/start_mcp.py
@@ -0,0 +1,30 @@
+
+"""
+MCP Service Startup Entry
+"""
+import sys
+import os
+
+project_root = os.path.dirname(os.path.abspath(__file__))
+mcp_plugin_dir = os.path.join(project_root, "mcp_plugin")
+if mcp_plugin_dir not in sys.path:
+ sys.path.insert(0, mcp_plugin_dir)
+
+from mcp_service import create_app
+
+def main():
+ """Start FastMCP service"""
+ app = create_app()
+ # Use environment variable to configure port, default 8000
+ port = int(os.environ.get("MCP_PORT", "8000"))
+
+ # Choose transport mode based on environment variable
+ transport = os.environ.get("MCP_TRANSPORT", "stdio")
+ if transport == "http":
+ app.run(transport="http", host="0.0.0.0", port=port)
+ else:
+ # Default to STDIO mode
+ app.run()
+
+if __name__ == "__main__":
+ main()
diff --git a/deepTools/mcp_output/workflow_summary.json b/deepTools/mcp_output/workflow_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..db59e2d2b9bfba7a699a45db90d8e3e3256941f2
--- /dev/null
+++ b/deepTools/mcp_output/workflow_summary.json
@@ -0,0 +1,195 @@
+{
+ "repository": {
+ "name": "deepTools",
+ "url": "https://github.com/deeptools/deepTools",
+ "local_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/deepTools",
+ "description": "Python library",
+ "features": "Basic functionality",
+ "tech_stack": "Python",
+ "stars": 0,
+ "forks": 0,
+ "language": "Python",
+ "last_updated": "",
+ "complexity": "medium",
+ "intrusiveness_risk": "medium"
+ },
+ "execution": {
+ "start_time": 1769854937.7038116,
+ "end_time": 1769855028.4553556,
+ "duration": 90.75154423713684,
+ "status": "success",
+ "workflow_status": "success",
+ "nodes_executed": [
+ "download",
+ "analysis",
+ "env",
+ "generate",
+ "run",
+ "review",
+ "finalize"
+ ],
+ "total_files_processed": 2,
+ "environment_type": "unknown",
+ "llm_calls": 0,
+ "deepwiki_calls": 0
+ },
+ "tests": {
+ "original_project": {
+ "passed": false,
+ "details": {},
+ "test_coverage": "100%",
+ "execution_time": 0,
+ "test_files": []
+ },
+ "mcp_plugin": {
+ "passed": true,
+ "details": {},
+ "service_health": "healthy",
+ "startup_time": 0,
+ "transport_mode": "stdio",
+ "fastmcp_version": "unknown",
+ "mcp_version": "unknown"
+ }
+ },
+ "analysis": {
+ "structure": {
+ "packages": [
+ "source.deeptools",
+ "source.deeptools.test"
+ ]
+ },
+ "dependencies": {
+ "has_environment_yml": false,
+ "has_requirements_txt": false,
+ "pyproject": true,
+ "setup_cfg": false,
+ "setup_py": false
+ },
+ "entry_points": {
+ "imports": [],
+ "cli": [],
+ "modules": []
+ },
+ "risk_assessment": {
+ "import_feasibility": 0.8,
+ "intrusiveness_risk": "medium",
+ "complexity": "medium"
+ },
+ "deepwiki_analysis": {
+ "repo_url": "https://github.com/deeptools/deepTools",
+ "repo_name": "deepTools",
+ "content": "deeptools/deepTools\nInstallation and Getting Started\nTypical Workflows\nCore Processing Engines\nParallel Processing Framework (mapReduce)\nRead Counting Engine (countReadsPerBin)\nScore Extraction (getScorePerBigWigBin)\nMatrix Computation Engine (heatmapper)\nCoverage Generation and Normalization\nSingle-Sample Coverage (bamCoverage)\nSample Comparison (bamCompare)\nGC Bias Correction Pipeline\nMulti-Sample Analysis Tools\nData Integration (multiBamSummary and multiBigwigSummary)\nCorrelation Analysis (plotCorrelation)\nPrincipal Component Analysis (plotPCA)\nVisualization Tools\nMatrix Generation (computeMatrix)\nMatrix Operations (computeMatrixOperations)\nHeatmap Visualization (plotHeatmap)\nProfile Plots (plotProfile)\nQuality Control Tools\nChIP-seq Enrichment Assessment (plotFingerprint)\nFragment Size Analysis (bamPEFragmentSize)\nCoverage Distribution (plotCoverage)\nFeature Enrichment (plotEnrichment)\nRead Filtering Estimation (estimateReadFiltering and alignmentSieve)\nGalaxy Integration\nGalaxy Wrapper Architecture\nTool Shed Distribution and Installation\nTesting with Planemo\nDevelopment and Contributing\nProject Structure and Configuration\nCI/CD Pipeline\nDocumentation System\nRelease Process and Versioning\nFile Formats and Data Structures\nInput File Formats\nIntermediate File Formats\nOutput Formats and Visualization\nAdvanced Topics and Optimization\nPerformance Tuning\nFiltering and Read Processing Options\nNormalization Methods Deep Dive\nSpecialized Read Processing 
Modes\nCHANGES.txt\ndocs/content/about.rst\ndocs/content/changelog.rst\ndocs/content/example_api_tutorial.rst\ndocs/content/example_gallery.rst\ndocs/content/example_step_by_step.rst\ndocs/content/example_usage.rst\ndocs/content/help_faq.rst\ndocs/content/help_faq_galaxy.rst\ndocs/content/help_galaxy_intro.rst\ndocs/content/help_glossary.rst\ndocs/content/installation.rst\ndocs/content/list_of_tools.rst\ndocs/images/Gal_FAQ_filteringDuplicates.png\ndocs/images/Gal_FAQ_info.png\ndocs/index.rst\ngalaxy/wrapper/deepTools_macros.xml\ngalaxy/wrapper/estimateReadFiltering.xml\ngalaxy/wrapper/test-data/estimateReadFiltering.txt\nscripts/convertChromsBigWig.py\nPurpose and Scope\nThis page provides a high-level introduction to deepTools: what it is, what problems it solves, and its primary capabilities for analyzing high-throughput sequencing data. For detailed installation instructions, seeInstallation and Getting Started. For specific tool usage and workflows, seeTypical Workflows.\nSources:README.md9-11docs/index.rst7-8\nWhat is deepTools?\ndeepTools is a suite of Python tools developed for efficient analysis of high-throughput sequencing data, particularly ChIP-seq, RNA-seq, and MNase-seq experiments. 
It addresses the challenge of handling large amounts of data generated from DNA sequencing centers by providing:\nNormalized coverage file generationin standard bedGraph and bigWig formats\nQuality control modulesfor assessing data quality and technical biases\nPublication-ready visualizationsfor identifying enrichments and functional genome annotations\nEfficient parallel processingusing themapReduceframework for genome-scale computations\ndeepTools is available through three distinct usage modes:\nSources:README.md9-14docs/index.rst10-14CHANGES.txt62\nSystem Architecture\nThe following diagram illustrates the primary components of deepTools and how they relate to each other:\ndeepTools Component Architecture\nData Access LayerCore Processing EnginesTool Modules LayerUser Interface LayerCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)Galaxy XML Wrappers(galaxy/wrapper/*.xml)Python API(import deeptools.*)Quality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSizeCoverage ToolsbamCoveragebamComparebigwigCompareAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrixVisualization ToolsplotHeatmapplotProfileplotCorrelationcountReadsPerBin(deeptools/countReadsPerBin.py)getScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)heatmapper(deeptools/heatmapper.py)mapReduce(deeptools/mapReduce.py)pysam(BAM/CRAM files)pyBigWig(bigWig files)deeptoolsintervals(BED/GTF files)\nData Access Layer\nCore Processing Engines\nTool Modules Layer\nUser Interface Layer\nCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)\nGalaxy XML Wrappers(galaxy/wrapper/*.xml)\nPython API(import deeptools.*)\nQuality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSize\nCoverage ToolsbamCoveragebamComparebigwigCompare\nAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrix\nVisualization 
ToolsplotHeatmapplotProfileplotCorrelation\ncountReadsPerBin(deeptools/countReadsPerBin.py)\ngetScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)\nheatmapper(deeptools/heatmapper.py)\nmapReduce(deeptools/mapReduce.py)\npysam(BAM/CRAM files)\npyBigWig(bigWig files)\ndeeptoolsintervals(BED/GTF files)\nSources:docs/content/list_of_tools.rst1-46pyproject.toml24-50README.md34-56\nCore Capabilities\ndeepTools provides five major functional categories that address different stages of sequencing data analysis:\n1. Quality Control and Preprocessing\nTools for assessing data quality before analysis:\nplotFingerprint- ChIP-seq enrichment assessment with quality metrics (Jensen-Shannon distance, CHANCE statistics)\nplotFingerprint\ncomputeGCBias/correctGCBias- GC bias detection and correction\ncomputeGCBias\ncorrectGCBias\nbamPEFragmentSize- Fragment size distribution analysis for paired-end data\nbamPEFragmentSize\nplotCoverage- Coverage distribution and genome coverage metrics\nplotCoverage\nestimateReadFiltering- Preview effects of filtering parameters\nestimateReadFiltering\nSources:docs/content/list_of_tools.rst18-32galaxy/wrapper/estimateReadFiltering.xml1-118\n2. Coverage Generation and Normalization\nTools for converting aligned reads (BAM) to normalized coverage tracks (bigWig/bedGraph):\nbamCoverage- Single-sample normalization with RPKM, CPM, BPM, RPGC methods\nbamCoverage\nbamCompare- Two-sample comparison (e.g., ChIP vs. input) with log2ratio, difference, mean operations\nbigwigCompare- Comparison operations on bigWig files\nbigwigCompare\nbigwigAverage- Averaging multiple bigWig files\nbigwigAverage\nSources:docs/content/list_of_tools.rst24-27CHANGES.txt28\n3. 
Data Aggregation\nTools for integrating multi-sample data:\nmultiBamSummary- Read count aggregation across BAM files (bins or BED-defined regions)\nmultiBamSummary\nmultiBigwigSummary- Score aggregation across bigWig files\nmultiBigwigSummary\ncomputeMatrix- Signal computation over genomic regions in two modes:scale-regionsandreference-point\ncomputeMatrix\nscale-regions\nreference-point\nSources:docs/content/list_of_tools.rst10-29\n4. Analysis and Statistics\nTools for deriving statistical insights:\nplotCorrelation- Pearson/Spearman correlation with hierarchical clustering and heatmap/scatter plot output\nplotCorrelation\nplotPCA- Principal component analysis for dimensionality reduction\nplotEnrichment- Feature enrichment quantification\nplotEnrichment\nSources:docs/content/list_of_tools.rst14-17docs/content/example_usage.rst32-44\n5. Visualization\nTools for creating publication-ready plots:\nplotHeatmap- Customizable heatmaps with clustering, color schemes, and multi-sample/region support\nplotHeatmap\nplotProfile- Average signal profile plots (meta-profiles) with standard error/deviation options\nplotProfile\ncomputeMatrixOperations- Matrix manipulation (filter, subset, sort, combine)\ncomputeMatrixOperations\nMultiple output formats supported: PNG, PDF, SVG, and interactive HTML (via plotly)\nSources:docs/content/list_of_tools.rst34-36CHANGES.txt187\nFile Format Ecosystem\ndeepTools operates on standard genomics file formats and performs conversions between them:\nFile Format Flow Diagram\nOutput FormatsIntermediate FormatsdeepTools ProcessingInput FormatsBAM/CRAM(aligned reads)bigWig(coverage signal)BED/GTF/GFF(genomic regions)2bit/FASTA(reference genome)bamCoveragebamComparemultiBamSummarymultiBigwigSummarycomputeMatrixcomputeGCBiasNPZ matrix(multiBamSummary output)Matrix .gz(computeMatrix output)Tabular(GC frequencies)bigWig/bedGraph(coverage tracks)PNG/PDF/SVG(static plots)HTML(interactive plotly)Tabular(data tables)\nOutput Formats\nIntermediate 
Formats\ndeepTools Processing\nInput Formats\nBAM/CRAM(aligned reads)\nbigWig(coverage signal)\nBED/GTF/GFF(genomic regions)\n2bit/FASTA(reference genome)\nbamCoverage\nmultiBamSummary\nmultiBigwigSummary\ncomputeMatrix\ncomputeGCBias\nNPZ matrix(multiBamSummary output)\nMatrix .gz(computeMatrix output)\nTabular(GC frequencies)\nbigWig/bedGraph(coverage tracks)\nPNG/PDF/SVG(static plots)\nHTML(interactive plotly)\nTabular(data tables)\nKey intermediate formats serve as bridges between data aggregation and visualization:\nNPZ files- Compressed NumPy arrays storing multi-sample read counts or scores (output frommultiBamSummary/multiBigwigSummary)\nmultiBamSummary\nmultiBigwigSummary\nMatrix .gz files- Compressed matrices of signal values over genomic regions (output fromcomputeMatrix)\ncomputeMatrix\nSources:docs/content/help_glossary.rst82-174docs/content/list_of_tools.rst8-45\nParallel Processing Framework\nAll compute-intensive operations in deepTools utilize themapReduceframework for efficient genome-scale processing:\nmapReduce Execution Model\nmapReduce Framework (deeptools/mapReduce.py)Input:BAM/bigWig filesBED regionsParametersGenome Chunking(~400k reads per chunk)Worker Pool(multiprocessing.Pool)Worker 1:Process chunk 1Worker 2:Process chunk 2Worker N:Process chunk NResult Aggregation(concatenate/merge)Output:Aggregated results\nmapReduce Framework (deeptools/mapReduce.py)\nInput:BAM/bigWig filesBED regionsParameters\nGenome Chunking(~400k reads per chunk)\nWorker Pool(multiprocessing.Pool)\nWorker 1:Process chunk 1\nWorker 2:Process chunk 2\nWorker N:Process chunk N\nResult Aggregation(concatenate/merge)\nOutput:Aggregated results\nThe framework provides:\nAutomatic parallelizationacross user-specified number of processors (via--numberOfProcessors)",
+ "model": "gpt-4o-2024-08-06",
+ "source": "selenium",
+ "success": true
+ },
+ "code_complexity": {
+ "cyclomatic_complexity": "medium",
+ "cognitive_complexity": "medium",
+ "maintainability_index": 75
+ },
+ "security_analysis": {
+ "vulnerabilities_found": 0,
+ "security_score": 85,
+ "recommendations": []
+ }
+ },
+ "plugin_generation": {
+ "files_created": [
+ "mcp_output/start_mcp.py",
+ "mcp_output/mcp_plugin/__init__.py",
+ "mcp_output/mcp_plugin/mcp_service.py",
+ "mcp_output/mcp_plugin/adapter.py",
+ "mcp_output/mcp_plugin/main.py",
+ "mcp_output/requirements.txt",
+ "mcp_output/README_MCP.md"
+ ],
+ "main_entry": "start_mcp.py",
+ "requirements": [
+ "fastmcp>=0.1.0",
+ "pydantic>=2.0.0"
+ ],
+ "readme_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/deepTools/mcp_output/README_MCP.md",
+ "adapter_mode": "import",
+ "total_lines_of_code": 0,
+ "generated_files_size": 0,
+ "tool_endpoints": 0,
+ "supported_features": [
+ "Basic functionality"
+ ],
+ "generated_tools": [
+ "Basic tools",
+ "Health check tools",
+ "Version info tools"
+ ]
+ },
+ "code_review": {},
+ "errors": [],
+ "warnings": [],
+ "recommendations": [
+ "Improve test coverage by adding more unit tests for core modules",
+ "Implement continuous integration (CI) to automate testing and deployment",
+ "Update documentation to include detailed installation and usage instructions",
+ "Optimize large file handling to improve performance",
+ "Refactor code to reduce complexity and improve maintainability",
+ "Ensure all dependencies are clearly defined and up-to-date",
+ "Enhance error handling to provide more informative messages",
+ "Consider adding a setup.py for easier package installation",
+ "Improve code comments for better readability and understanding",
+ "Conduct a code review to identify potential improvements and optimizations."
+ ],
+ "performance_metrics": {
+ "memory_usage_mb": 0,
+ "cpu_usage_percent": 0,
+ "response_time_ms": 0,
+ "throughput_requests_per_second": 0
+ },
+ "deployment_info": {
+ "supported_platforms": [
+ "Linux",
+ "Windows",
+ "macOS"
+ ],
+ "python_versions": [
+ "3.8",
+ "3.9",
+ "3.10",
+ "3.11",
+ "3.12"
+ ],
+ "deployment_methods": [
+ "Docker",
+ "pip",
+ "conda"
+ ],
+ "monitoring_support": true,
+ "logging_configuration": "structured"
+ },
+ "execution_analysis": {
+ "success_factors": [
+ "Efficient execution of all workflow nodes",
+ "Successful generation of MCP plugin files"
+ ],
+ "failure_reasons": [],
+ "overall_assessment": "good",
+ "node_performance": {
+ "download_time": "Completed successfully, indicating efficient data retrieval",
+ "analysis_time": "Completed successfully, indicating effective code analysis",
+ "generation_time": "Completed successfully, indicating efficient code generation",
+ "test_time": "Original project tests failed, but MCP plugin tests passed"
+ },
+ "resource_usage": {
+ "memory_efficiency": "Memory usage data not available, unable to assess",
+ "cpu_efficiency": "CPU usage data not available, unable to assess",
+ "disk_usage": "Disk usage data not available, unable to assess"
+ }
+ },
+ "technical_quality": {
+ "code_quality_score": 75,
+ "architecture_score": 80,
+ "performance_score": 70,
+ "maintainability_score": 75,
+ "security_score": 85,
+ "scalability_score": 70
+ }
+}
\ No newline at end of file
diff --git a/deepTools/source/.planemo.sh b/deepTools/source/.planemo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0a10b8830ab70273dfb50e0172409403c39b3fdf
--- /dev/null
+++ b/deepTools/source/.planemo.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Some versions of planemo don't handle symlinks
+unlink galaxy/wrapper/test-data/test.bw
+cp deeptools/test/test_heatmapper/test.bw galaxy/wrapper/test-data/test.bw
+
+if [[ $1 == "1" ]] ; then
+ wrappers="galaxy/wrapper/alignmentSieve.xml \
+ galaxy/wrapper/bamCompare.xml \
+ galaxy/wrapper/bamCoverage.xml \
+ galaxy/wrapper/bamPEFragmentSize.xml \
+ galaxy/wrapper/bigwigCompare.xml \
+ galaxy/wrapper/bigwigAverage.xml \
+ galaxy/wrapper/computeGCBias.xml"
+elif [[ $1 == "2" ]] ; then
+ wrappers="galaxy/wrapper/computeMatrix.xml \
+ galaxy/wrapper/computeMatrixOperations.xml \
+ galaxy/wrapper/correctGCBias.xml \
+ galaxy/wrapper/estimateReadFiltering.xml \
+ galaxy/wrapper/multiBamSummary.xml \
+ galaxy/wrapper/multiBigwigSummary.xml"
+else
+ wrappers="galaxy/wrapper/plotCorrelation.xml \
+ galaxy/wrapper/plotCoverage.xml \
+ galaxy/wrapper/plotEnrichment.xml \
+ galaxy/wrapper/plotFingerprint.xml \
+ galaxy/wrapper/plotHeatmap.xml \
+ galaxy/wrapper/plotPCA.xml \
+ galaxy/wrapper/plotProfiler.xml"
+fi
+
+planemo --version
+planemo lint ${wrappers}
+planemo test --no_dependency_resolution --galaxy_branch $2 --install_galaxy ${wrappers} 2>&1
+mkdir upload
+mv tool_test_output* upload/
diff --git a/deepTools/source/.readthedocs.yaml b/deepTools/source/.readthedocs.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d95161e3dc15049d8e2ab254167fdf26a6340ff0
--- /dev/null
+++ b/deepTools/source/.readthedocs.yaml
@@ -0,0 +1,15 @@
+version: 2
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.12"
+
+sphinx:
+ configuration: docs/conf.py
+
+python:
+ install:
+ - method: pip
+ path: .
+ - requirements: docs/requirements.txt
diff --git a/deepTools/source/CHANGES.txt b/deepTools/source/CHANGES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f0bf0f0cb427f6810545f8b542bddd9a88e3d8c
--- /dev/null
+++ b/deepTools/source/CHANGES.txt
@@ -0,0 +1,448 @@
+3.5.5
+* drop support for python 3.7
+* doc fixes (argparse properly displayed, minor changes in installation instructions)
+* deepblue support stops
+* initiate deprecation of tight_layout in plotheatmap, in favor of constrained_layout. Minor changes in paddings, etc can occur (but for the better).
+* documentation changes to improve ESS tab, table constraints have been lifted & sphinx_rtd_theme to v2.0.0
+* upload artifact in gh test runner pinned to 3
+* Try to get the number of processors from sched_getaffinity, to avoid using too many in job submissions for example. #1199
+* Fix typo in estimateScaleFactor that fixes broken argparsing. #1286
+
+3.5.4
+* error handling and cases for bwAverage with >2 samples
+* Tick.label deprecation for mpl 3.8
+* minimal mpl version is 3.5
+* cicd update for pypi push
+
+3.5.3
+* requirement cap for matplotlib lifted (changes in plotting can occur)
+* nose has been deprecated in favor of pytests
+* pytests run with python 3.7 - 3.11
+* toml file for installation, requirements, versioning and executables
+* planemo tests updated to galaxy 23.1
+* custom github action runner deprecated
+* deprecation of np types for builtin types
+* stricter label checks and validator in galaxy
+
+3.5.2
+* new subcommand: Bigwig average #1169
+* dendrogram of plotCorrelation now matches each cell correctly
+* Fix label options
+* add pool
+* several other bugs fixed: #1159, #1185, #1172, #1181, #1183
+* Fix galaxy tests, separate planemo and update pypi push only on tag releases
+* upload artifact
+* allow 1 or 2 lines diff for bowtie2 program
+* change github action to get artifacts
+* fix plotPCA
+* try to fix old samtools installed
+* add forgotten channels
+* default chunklength increased for alignmentSieve
+* chunklength in alignmentSieve is a CLI argument now
+* suppress lack of index warnings from pysam
+* fixedStep in bedGraph output to avoid merging bins with equal values
+
+3.5.1
+* cmp usage is updated to fit the recent mpl updates.
+* The requirements.txt is updated.
+* "NA" occurrences in plotFingerprint.py have been replaced by numpy.NAN (PR #1002)
+* computeMatrixOperations.xml is fixed (brought up in #1003)
+* plotly error is fixed. (issue #1013)
+* release version is updated in planemo.sh
+* fixed galaxy tests
+* A bug is taken care of in computeMatrixOperations.py / dataRange
+* in plotProfile.py legend location is changed from auto to best (issue #1042)
+
+3.5.0
+
+ * Fixed a small issue in computeGCBias (issue #969)
+ * Added dataRange to computeMatrixOperations to return min, max, median and 10th and 90th percentile.
+ * Fixed a small typo in bamCompare. (issue #966)
+ * Save the output matrix of the plotheatmap in a format to be compatible with running plotheatmap on it again.(issue #953)
+ * Different colors can now be set by user for plotProfile --plotType heatmap (issue #956)
+ * Added the `auto` option to the zMin and zMax of plotHeatmap. (issue #908)
+ * Added `--sortUsingSamples` and `--clusterUsingSamples` to the plotHeatmap galaxy wrapper. (issue #976)
+
+3.4.3
+
+ * Changed iteritems() in estimateScaleFactor to its python3 compatible items().
+ * Added the missing argument (--clusterUsingSamples) to plotProfile.
+
+3.4.2
+
+ * Programmed around a bug in matplotlib that prevented the plotCorrelation scatter plot from working. See https://bioinformatics.stackexchange.com/questions/12830/plot-correlation-between-several-bam-files/12831
+
+3.4.1
+
+ * Prevented temporary bedGraph files from being written to (possibly small) shared-memory drives even when TMPDIR is set to somewhere else. Now shared memory is only used if requested by setting TMPDIR (or other appropriate environment variables) to `/dev/shm`.
+ * Fixed a bug in bamPEFragmentSize that caused incompatibility with newer matplotlib releases. (issue #928)
+
+3.4.0
+
+ * Fixed a bug in one of the Galaxy wrappers.
+ * Added the `--lineAtTickMarks` option to `plotHeatmap` so that there are dashed vertical lines for each tick mark in the plot. (issue #924)
+
+3.3.2
+
+ * Fixed --yAxisLabel in plotProfile (issue #889)
+ * Fixed a small X-axis tick offset issue. This caused the location of tick marks in profile plots to be shifted to the left by 0.5 to 1 bin. This was generally not notable, only really appearing when very few bins (e.g., 4) were used. The issue was mostly that the end tick would appear after the end of the plot, since its coordinate was the end of the bin. (issue #888)
+ * multiBamSummary and multiBigwigSummary no longer exclude small bins at the end of genomic chunks. multiBamSummary now has a `--genomicChunkSize` option in case users need to control the size of the genome used for multiprocessing for consistency. (issue #887)
+ * Added 4 new colormaps, which were copied from the seaborn project (issue #879). These are: rocket, mako, vlag, and icefire.
+ * Fixed an issue in the Galaxy wrapper of plotCorrelation where the X and Y.
+ * Fixed an issue with the `--Offset` option, where a single negative value wouldn't include only a single position, but rather that base through the end of the read. (stems from issue #902)
+ * Clustered output from plotHeatmap and plotProfile now allow computing the silhouette score of each row. This is printed in the returned BED file as the last column.
+
+3.3.1
+
+ * Fixed `--plotNumbers` not working in `plotCorrelation`. This was issue #838.
+ * Fixed compatibility with matplotlib 3 and restrict to at least that version.
+ * The Y-axis labels should once again appear in both plotHeatmap and plotProfile (issue #844). This was related to the previous point.
+ * Testing is no longer performed with python 2.7, which will reach end of life in a couple months.
+ * Various documentation updates (issues #868, #867 and #851).
+ * Increased support for BED files with track header lines (issue #866).
+
+3.3.0
+
+ * `plotCoverage` now has a `--BED` option, to restrict plots and output to apply to a specific set of regions given by a BED or GTF file or files (issue #829).
+ * `plotCoverage` now has a `--DepthSummary` option, which produces a summary similar to GATK's DepthOfCoverage (issue #828).
+ * `plotCoverage` is now able to compute coverage metrics for arbitrary coverage thresholds using multiples of the `-ct` option (e.g., `-ct 0 -ct 10 -ct 20 -ct 30`).
+
+3.2.1
+
+ * Fixed a bug in `estimateReadFiltering` where the estimated number of filtered reads was typically too low.
+ * Made an internal change that should drastically reduce the memory requirements of many tools. This slightly increases run time, but as the resulting resource usage is much more attractive this is judged worthwhile.
+ * An informative error message is now produced with `bamCoverage` if RPGC normalization is requested but no effective genome size is provided (issue #815).
+ * Fixes some issues with y-axis scaling (issue #822)
+
+3.2.0
+
+ * Added access in the Galaxy wrapper to the `--labels` option in most tools (issue #738)
+ * Added the `std` plot type to plotProfile in Galaxy (issue #782)
+ * `bamCompare` now has a `--skipZeroOverZero` option to allow skipping bins where both input files lack coverage (issue #785)
+ * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
+ * `multiBamSummary` now has a `--scaleFactors` option, which computes scale factors in the same manner as DESeq2 to a file. Note that the produced scaling factors are meant to be used with `bamCoverage`. If you want to use them directly in DESeq2 (or a similar package) you will need to invert them (take 1/scale factor). (issue #800)
+ * Fixed an issue with large numbers of samples and small genome sizes sometimes causing nothing to be processed. (issue #801)
+
+3.1.3
+
+ * Added the `--legendLocation` option in the Galaxy wrappers for plotProfile and plotHeatmap
+ * More thoroughly checked that output files can be written (issue #764).
+ * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
+
+3.1.2
+
+ * Added a `--markers` option to `plotPCA`, courtesy of @sklasfeld.
+ * `computeMatrixOperations rbind` now properly supports multiple region groups (issue #742)
+ * Fixed the usage of `--xRange` and `--yRange` with `plotCorrelation` (issue #709)
+
+3.1.1
+
+ * Fixed the `--outFileNameData` option in `plotProfile` when `computeMatrix reference-point --referencePoint center` was used. This caused an error previously. (issue #727)
+ * RPGC normalization and the `--scaleFactor` option in `bamCoverage` are no longer mutually exclusive.
+ * Increased the default plot width in plotPCA (issue #738)
+
+3.1.0
+
+ * The `--centerReads` option in `bamCoverage` is now compatible with `--Offset` (previously `--centerReads` was silently ignored if `--Offset` was specified). (issue #693)
+ * `bamCoverage` and `bamCompare` now have an `--exactScaling` option. Instead of using a random sample of alignment to compute the scaling factor, this causes all reads in the file to be used. This is significantly slower, but helpful in situations where reads that should be excluded clump together on the genome (i.e., when sampling based on location is likely to be inaccurate).
+ * `plotCorrelation --whatToPlot scatterplot` now has `--xRange` and `--yRange` options rather than just `--maxRange`. (issue #709)
+ * `computeMatrixOperations` can now be used to change sample and group names.
+ * `computeMatrixOperations` can now filter rows by minimum and/or maximum value.
+ * `--maxThreshold` and `--minThreshold` are now more consistently honoured. (#702)
+ * Fixed region handling when using files on deepBlue (#700)
+ * Using `--normalizeUsing RPGC` with `bamCompare` will now result in a fatal error, rather than a simple warning and the settings being changed under the hood. (#718)
+ * Related to the last point, setting `--normalizeUsing` to anything other than `None` will result in an error unless `--scaleFactorsMethod None` is also used. This is to prevent people from accidentally getting unintended normalization.
+ * bamPEFragmentSize no longer explodes its memory use with multiple large BAM/CRAM files (#720). Many other tools will also benefit from this change.
+
+3.0.2
+
+ * Fixed an issue regarding under sampling alignments in some cases with computing scaling factors. This was issue #690. The resolution isn't perfect, it's hard to know how many reads really need to be sampled for things like RNA-seq.
+ * `computeMatrix` now has a `--verbose` option. Setting this will drastically increase the verbosity of the messages sent to the screen. Only do this for debugging. `--quiet` will disable this completely (as well as all other messages printed to screen).
+ * Fixed handling of `--sortUsing region_length` in `plotHeatmap`. This now works properly for `--referencePoint center` and `--referencePoint TES`, where in the latter case the dashed line is drawn at the region start. The documentation has been updated to mention this. (issue #671)
+ * The reference point label specified by `computeMatrix reference-point` is now respected by plotHeatmap and plotProfile. So if you used `computeMatrix reference-point --referencePointLabel center` then 'center' will now appear as the tick label in your heatmaps and profiles automatically. (issues #606 and #683)
+ * Enabled using regions with a `.` in the chromosome name in the Galaxy wrappers (issue #692)
+
+3.0.1
+
+ * Fixed the `--perGroup` option in plotProfile and plotHeatmap when multiple groups were being used. In version 3.0.0, this would typically cause an error and deepTools to crash. (issue #673)
+ * Fixed a few issues with the Galaxy wrappers. Thanks to Ralf Gilsbach, Claudia Keller, and @bgruening (e.g., issue #678)
+
+3.0.0
+
+ * `plotCorrelation` now has `--log1p` and `--maxRange` options if a scatter plot is produced. `--log1p` plots the natural log of the values (plus 1). `--maxRange` sets the maximum X and Y axis ranges. If they would normally be below this value then they are left unchanged. (issue #536)
+ * The PCA plot now includes "% of var. explained" in the top axis labels. (issue #547)
+ * `plotProfile` and `plotHeatmap` now have a `--labelRotation` option that can rotate the X-axis labels. This is one of the more common requests for customization. For further customization, please modify your .matplotlibrc file or save as a PDF and modify further in Illustrator or a similar program. (issue #537)
+ * The `--ignoreDuplicates` algorithm has been updated to better handle paired-end reads. (issue #524)
+ * Added the `estimateReadFiltering` tool to estimate how many reads would be filtered from a BAM file or files if a variety of desired filtering criterion are applied (issue #518).
+ * Rewrote the bigWig creation functions so there are no longer steps involving creating a single large bedGraph and then sorting it. That was a hold-over from previous versions that used UCSC tools. This was issue #546. This also means that there are no longer any required external programs (previously, only `sort` was required).
+ * `plotPCA` can now be run on the transposed matrix, as is typically done with RNAseq data (e.g., with deepTools). Further, matplotlib is now no longer used for computing the PCA, but rather an SVD is performed and the results directly used. The options `--transpose` and `--ntop` were also added. The former computes the PCA of the transposed matrix and the latter specifies how many of the most variable rows in the matrix to use. By default, the 1000 most variable features are used. In the (now optional) plot, the `--PCs` option can now be used to specify which principal components to plot. Finally, the unbiased standard deviation is used in the out, as is done by `prcomp()` in R. This was issue #496.
+ * Symbol colors for `plotPCA` can now be specified. (issue #560)
+ * `plotFingerprint` always returns the synthetic JSD, even if no `--JSDsample` is specified. (issue #564)
+ * `plotEnrichment` will only read in annotation files a single time rather than in each thread. This prevents terrible performance when using many tens of millions of BED/GTF regions at the expense of a slight memory increase. (issue #530)
+ * Fixed a small bug generally affecting `plotFingerprint` where BAM files without an index were processed as bigWig files, resulting in a confusing error message (issue #574). Thanks to Sitanshu Gakkhar for pointing this out!
+ * `bamPEFragmentSize` now has `--table` and `--outRawFragmentLengths` options. The former option will output the read/fragment metrics to a file in tabular format (in addition to the previous information written to the screen). The latter option will write the raw read/fragment counts to a tsv file. The format of the file is a line with "#bamPEFragmentSize", followed by a header line of "Size\tOccurences\tSample", which should facilitate processing in things like R. (issue #572)
+ * `bamPEFragmentSize` will now plot the read length distribution for single-end BAM files. Note that if you mix single and paired-end files that the resulting plots may be difficult to interpret.
+ * The various plot commands do not actually have to plot anything, instead they can optionally only print their raw metrics or other text output. This is mostly useful with large numbers of input files, since the resulting plots can become quickly crowded. (issue #571)
+ * Expanded the metrics output by `bamPEFragmentSize` such that it now fully replaces Picard CollectInsertSizeMetrics (issue #577).
+ * "plotly" is now available as an output image format for all tools. Note that this is not really an image format, but rather an interactive webpage that you can open in your browser. The resulting webpages can be VERY large (especially for `plotHeatmap`), so please keep that in mind. Further, plotly does not currently have the capabilities to support all of deepTools' features, so note that some options will be ignored. For privacy reasons, all plotly files are saved locally and not uploaded to the public plot.ly site. You can click on the "Export to plot.ly" link on the bottom right of plotly output if you would like to modify the resulting files.
+ * `bamCoverage` no longer prints `normalization: depth` by default, but rather a more accurate message indicating that the scaling is performed according to the percentage of alignments kept after filtering. This was originally added in #366 (issue #590).
+ * The output of `plotFingerprint --outRawCounts` now has a header line to facilitate identification by MultiQC.
+ * `plotPCA` now has a `--log2` option, which log2 transforms the data before computing the PCA. Note that 0.01 is added to all values so 0 doesn't become -infinity.
+ * `computeGCBias` no longer requires a fragment length for paired-end datasets. This was apparently always meant to be the case anyway. (issue #595)
+ * `computeMatrixOperations sort` can now properly perform filtering of individual regions, as was originally intended (issue #594)
+ * `plotCoverage --outRawCounts` now has another line in its header, which is meant to aid MultiQC.
+ * There is no longer a configuration file. The default number of threads for all tools is 1. See issue #613.
+ * `bamCoverage` and `bamCompare` have rewritten normalization functions. They have both added CPM and BPM normalization and, importantly, filtering is now done **before** computing scaling factors. A few of the options associated with this (e.g., `--normalizeUsingRPKM`) have been replaced with the `--normalizeUsing` option. This behavior represents a break from that seen in earlier versions but should be easier to follow and more in line with what users expect is happening. The syntax for normalization has been reworked multiple times (see #629).
+ * Fixed issue #631
+ * `computeMatrix` now repeats labels for each column in a plot. This is convenient if you later want to merge reference-point and scale-regions runs and still have correct tick marks and labels in plotHeatmap/plotProfile (issue #614). Note that the output of computeMatrix and computeMatrixOperations can not be used with older versions of deepTools (but output from previous versions can still be used).
+ * `plotHeatmap --sortRegions` now has a `keep` option. This is identical to `--sortRegions no`, but may be clearer (issue #621)
+ * `plotPCA --outFileNameData` and `plotCorrelation --outFileCorMatrix` now produce files with a single comment line (i.e., '#plotPCA --outFileNameData' and '#plotCorrelation --outFileCorMatrix'). These can then be more easily parsed by programs like MultiQC.
+ * All functions that accept file labels (e.g., via a `--samplesLabel` option) now also have a `--smartLabels` option. This will result in labels comprised of the file name, after stripping any path and the file extension. (issue #627)
+ * The `-o` option can now be universally used to indicate the file to save a tool's primary output. Previously, some tools use `-o`, some used `-out` and still others used things like `-hist` or `-freq`. This caused annoyance due to having to always remember the appropriate switch. Hopefully standardizing to `-o` will alleviate this. (issue #640)
+ * Using a --blackListFileName with overlapping regions will typically now cause the various deepTools programs to stop. This is to ensure that resulting scale factors are correct (issue #649)
+ * `bamCoverage` is a bit more efficient with small BAM files now due to underlying algorithmic changes. Relatedely, bamCoverage will skip some unnecessary estimation steps if you are not filtering reads, further speeding processing a bit. (issue #662)
+ * Added support for CRAM files. This requires pysam > 0.13.0 (issue #619).
+
+2.5.7
+
+ * Fixed a small bug that caused computation to stop. This was related to a change made for release 2.5.5.
+
+2.5.6
+
+ * Fixed a bug where deepTools in python3 can't handle npz file labels created under python 2.
+
+2.5.5
+
+ * Updated blacklist handling such that an error is thrown on overlapping regions.
+
+2.5.4
+
+ * Fixed issue #612, which only occurs when unaligned reads have a position assigned to them.
+ * Ticks in the profile plot at the top of the output of `plotHeatmap` should now always line up properly. (issue #616)
+
+2.5.3
+
+ * Fixed a bug in `plotEnrichment`, the `--keepExons` option with a BED12 file would cause an error. (issue #559)
+ * `bamCoverage` now doesn't cause an error to be thrown by `sort` if there are "/spaces in quoted path/". (issue #558)
+
+2.5.2
+
+ * Fixed a bug in `bamCoverage` that can cause crashes when python3 is used.
+ * Fixed a bug in the multiBigwigSummary Galaxy wrapper.
+ * A more reasonable exit code (not 0) is now returned if there's a mismatch in the label and file number.
+ * `plotFingerprint` no longer tries to use illegal line designators (issue #538)
+ * Various documentation fixes
+
+2.5.1
+
+ * Added universal new line support to deeptoolsintervals (issue #506).
+ * Fixed a few issues with correctGCBias under python 3.5 (thanks to @drakeeee)
+ * Setting `--minThreshold 0.0` or `--maxThreshold 0.0` now works properly. Previously, setting either of these to 0 was ignored. (issue #516)
+ * You can now specify the plot width and height in `plotPCA` and `plotCorrelation` (heatmap only) with the `--plotWidth` and `--plotHeight` parameters. (issue #507)
+ * plotCoverage no longer clips the top off of plots. Further, you can now set the plot width and height with `--plotWidth` and `--plotHeight`. (issue #508)
+ * In bamCoverage, specifying `--filterRNAstrand` no longer results in `--extendReads` being ignored. (issue #520)
+ * `plotFingerprint` and `plotEnrichment` no longer require producing a plot, which is useful if you only need QC metrics and are using a LOT of samples (such that matplotlib would crash anyway). This hasn't been implemented in Galaxy, but can if people would like it. (issues #519 and #526)
+ * `computeMatrix` now accepts a `--samplesLabel` option, which is useful in those cases when you aren't immediately running `plotHeatmap` and don't have terribly descriptive file names (issue #523)
+ * If you use `plotFingerprint` with the `--JSDsample` option and forget to list that file under `--bamfiles` it will be added automatically and the file name added to the labels if needed (issue #527)
+ * Various Galaxy wrapper fixes
+
+2.5.0
+
+ * Fix a bug where using regions with the same name in multiple BED files in computeMatrix caused downstream problems in plotHeatmap/plotProfile (issue #477).
+ * If computeMatrix/plotHeatmap/plotProfile is asked to sort the output matrix, it now does so by ignoring NaN values. Previously, any row with an NaN was placed at the top of the output (issue #447).
+ * Fixed issue #471
+ * Various Galaxy wrapper fixes
+ * There is now a `--rowCenter` option in `plotPCA`, which can be used to make each row of the matrix used in the PCA to have a mean of 0. This can be useful in cases where there's extreme region-based depth variation that is shared between all samples. This was issue #477.
+ * The --Offset option is now available in `plotEnrichment`. This was issue #481.
+ * The maximum coverage allowed while calculating the Jensen-Shannon distance in `plotFingerprint` has been increased to 2 million and an informational message containing the number of bins above this value is printed to the standard output.
+ * `bamCoverage` now respects the `--scaleFactor` argument even if no other normalization is performed (issue #482).
+ * The `--minFragmentLength` and `--maxFragmentLength` options now respect single-end reads. For SE reads, these parameters refer to the number of aligned bases (i.e., splicing is ignored). This was issue #489.
+ * `--yMin` and `--yMax` can now be lists of values in `plotHeatmap`. This was issue #487. Note that the plots are not perfectly aligned if you do this.
+
+2.4.3
+
+ * Fixed incorrect label ordering in the `plotCorrelation` command with the `--outFileCorMatrix` options.
+ * Fixed bug #491, which involved python 3 and bamCoverage.
+
+2.4.2
+
+ * Fixed an issue where `computeMatrix reference-point --referencePoint center` would break if 1-base regions were used. This was bug #456.
+ * `plotCorrelation` with `--outFileCorMatrix` now works with `--labels` again (thanks to @sklasfeld for supplying the patch).
+ * `bigwigCompare` and `bamCompare` can now return the average (mean) of two input files (issue #467).
+
+2.4.1
+
+ * Setting --zMin to the same value as --zMax, whether intentionally or because the --zMax value computed by deepTools happens to be now larger than the desired value, will result in the maximum value in the dataset being used (internally, --zMax gets set to None).
+ * Scale factor is now set to 1 in bamCoverage if no normalization is used. The fact that this wasn't being done previously was a bug.
+ * Fixed a bug (#451) affecting BED files with a `deepTools_group` column that caused a problem with `--sortRegions keep` in computeMatrix.
+ * Fixed a bug where some matrices produced with `computeMatrixOperations cbind` would result in the right-most samples sometimes getting squished due to having ticks outside of their graph bounds. Ticks are now scaled if they don't match the data range (issue #452).
+ * In plotFingerprint, the number of reads per-bin are no longer used. Instead, the sum of the per-base coverage (or signal if bigWig input is used) is used. This leads to more similar metrics produced by us and others regarding things like Jensen-Shannon metrics. For those just interested in the plots, there's little effective change here.
+
+2.4.0
+
+ * The --Offset option to bamCoverage can now take two values, which can be used to specify a range within each alignment of bases to use. As an example, `--Offset 5 -1` will ignore the first 4 bases of an alignment (accounting for orientation) and use only the 5th through last base. This can be useful for things like ATACseq (see #370).
+ * Read extension can now be used in conjunction with --Offset in bamCoverage.
+ * plotFingerprint can now output quality metrics, including the Jensen-Shannon distance if a reference sample is specified (see #328). Additionally, various statistics from CHANCE can be produced.
+ * Switched from using the 'twobitreader' python module to our new custom 'py2bit' module for accessing 2bit files. This fixes the performance regression seen in computeGCBias starting in version 2.3.0 (#383).
+ * `bigwigCompare`, `computeMatrix`, and `multiBigwigSummary` can read signal files hosted on [deepBlue](http://deepblue.mpi-inf.mpg.de/).
+ * Fixed a minor bug in `deeptools`, where the `--version` option was ignored (see #404).
+ * Text in SVG and PDF files is now actual text and not a path (see #403).
+ * The `--maxFragmentLength` option in bamCoverage now alters the `maxPairedFragmentLength` that is otherwise hard-coded (see #410).
+ * Added the `computeMatrixOperations` tools, which can be used to sort/reorder/subset/filter/combine the output of `computeMatrix`.
+ * `computeMatrix --sortRegions` has a new `keep` option, which is the default. This mimics the behavior in deepTools prior to 2.3.0 where the output order matched the input order. This is, of course, a bit slower, so if the order doesn't matter then use `no`.
+ * Fixed issue #435, where `plotHeatmap --sortRegions region_length` would crash with an error.
+ * Output bedGraph files are now sorted (#439).
+ * Values stored in bedGraph files (and therefore placed into bigWig files) now use python's "general" format with 6 digits of precision. This tends to produce slightly larger files, but with less loss for values near 0 (see #438).
+ * Corrected how computeGCBias determines the lambda parameter, which should only really affect very atypical experiments (i.e., correctGCBias would have crashed if this greatly affected you).
+
+2.3.6
+
+ * multiBamSummary will now not automatically append .npz to the output file name if it's not present. This was bug #436
+ * Fixed a bug with plotHeatmap where --yMin and --yMax didn't work
+
+2.3.5
+
+ * Various Galaxy wrapper fixes (e.g., issue #415 and #417)
+ * Fixed issue #413, wherein the --nanAfterEnd option sometimes causes computeMatrix to throw an error.
+ * Fixed issue #416, wherein --outRawCounts in multiBamSummary and multiBigwigSummary would cause an error if python3 was being used.
+
+2.3.4
+
+ * Fixed bug #405, which dealt with the SES normalization in bamCompare (it was producing an error and terminating the program).
+ * Fixed bug #407, which dealt with multiBamSummary or multiBigwigSummary bins and saving the raw data. This was causing an error and the program to terminate.
+
+2.3.3
+
+ * Fixed a bug wherein proper pairs where being incorrectly called improper pairs, thereby causing slightly incorrect read extension.
+
+2.3.2
+
+ * The deeptoolsinterval module was modified to speed up plotEnrichment, which was taking forever to finish.
+
+2.3.1
+
+ * This release has no real code changes, the 2.3.0 release on pypi was missing files.
+
+2.3.0
+
+ * Modified how normalization is done when filtering is used. Previously, the filtering wasn't taken into account when computing the total number of alignments. That is now being done. Note that this uses sampling and will try to sample at least 100000 alignments and see what fraction of them are filtered. The total number of aligned reads is then scaled accordingly (#309).
+ * Modified how normalization is done when a blacklist is used. Previously, the number of alignments overlapping a blacklisted region was subtracted from the total number of alignments in the file. This decreased things a bit too much, since only alignments falling completely within a blacklisted region are actually excluded completely (#312).
+ * BED12 and GTF files can now be used as input (issue #71). Additionally, multiBamSummary, multiBigwigSummary and computeMatrix now have a --metagene option, which allows summarization over concatenated exons, rather than include introns as well (this has always been the default). This was issue #76.
+ * Read extension is handled more accurately, such that if a read originates outside of a bin or BED/GTF region that it will typically be included if the --extendReads option is used and the extension would put it in a given bin/region.
+ * deepTools now uses a custom interval-tree implementation that allows including metadata, such as gene/transcript IDs, along with intervals. For those interested, the code for this available separately (https://github.com/dpryan79/deeptools_intervals) with the original C-only implementation here: https://github.com/dpryan79/libGTF.
+ * The API for the countReadsPerBin, getScorePerBigWigBin, and mapReduce modules has changed slightly (this was needed to support the --metagene option). Anyone using these in their own programs is encouraged to look at the modified API before upgrading.
+ * Added the `plotEnrichment` function (this was issue #329).
+ * There is now a `subsetMatrix` script available that can be used to subset the output of computeMatrix. This is useful for preparing plots that only contain a subset of samples/region groups. Note that this isn't installed by default.
+ * The Galaxy wrappers were updated to include the ability to exclude blacklisted regions.
+ * Most functions (both at the command line and within Galaxy) that process BAM files can now filter by fragment length (--minFragmentLength and --maxFragmentLength). By default there's no filtering performed. The primary purpose of this is to facilitate ATACseq analysis, where fragment length determines whether one is processing mono-/di-/poly-nucleosome fragments. This was issue #336.
+ * bamPEFragmentSize now has --logScale and --maxFragmentLength options, which allow you to plot frequencies on the log scale and set the max plotted fragment length, respectively. This was issue #337.
+ * --blackListFileName now accepts multiple files.
+ * bamPEFragmentSize now supports multiple input files.
+ * If the sequence has been removed from BAM files, SE reads no longer cause an error in bamCoverage if --normalizeTo1x is specified. In general, the code that looks at read length now checks the CIGAR string if there's no sequence available in a BAM file (for both PE and SE datasets). This was issue #369.
+ * bamCoverage now respects the --filterRNAstrand option when computing scaling factors. This was issue #353.
+ * computeMatrix and plotHeatmap can now sort using only a subset of samples
+ * There is now an --Offset option to bamCoverage, which allows having the signal at a single base. This is useful for things like RiboSeq or GROseq, where the goal is to get focal peaks at single bases/codons/etc.
+ * The --MNase option to `bamCoverage` now respects --minFragmentLength and --maxFragmentLength, with defaults set to 130 and 200.
+
+2.2.4
+
+ * Fix the incorrectly oriented dendrogram in plotCorrelation (issue #350). Relatedly, we're bumping the minimum version of scipy required to one where this is correct.
+
+2.2.3
+
+ * Fixed issue #334, where computeGCBias wasn't properly handling the black list option.
+
+2.2.2
+
+ * Fixed labels when hierarchical clustering is used (they were off by one previously).
+ * Fixed a bug wherein bamCompare couldn't work with a blacklist
+ * Fixed yet another change in pysam, though at least in this case it was fixing a previous problem
+
+2.2.1
+
+ * Fixed a bug introduced in version 2.2.0 wherein sometimes a pre-2.2.0 produced matrix file could no longer be used with plotHeatmap or plotProfile (this only happened when --outFileNameData was then used).
+ * Finally suppressed all of the runtime warnings that numpy likes to randomly throw.
+ * Worked around an undocumented change in pysam-0.9.0 that tended to break things.
+
+2.2.0
+
+ * plotFingerprint now iterates through line styles as well as colors. This allows up to 35 samples per plot without repeating (not that that many would ever be recommended). This was issue #80.
+ * Fixed a number of Galaxy wrappers, which were rendered incorrectly due to including a section title of "Background".
+ * A number of image file handles were previously not explicitly closed, which caused occasional completion of a plot* program but without the files actually being there. This only happened on some NFS mount points.
+ * The Galaxy wrappers now support the `--outFileNameData` option on plotProfile and plotHeatmap.
+ * Added support for blacklist regions. These can be supplied as a BED file and the regions will largely be skipped in processing (they'll also be ignored during normalization). This is very useful to skip regions known to attract excess signal. This was issue #101.
+ * Modified plotPCA to include the actual eigenvalues rather than rescaled ones. Also, plotPCA can now output the underlying values (issue #231).
+ * Regions within each feature body can now be unscaled when using `computeMatrix`. Thus, if you're interested in unscaled signal around the TSS/TES then you can now use the `--unscaled5prime` and `--unscaled3prime` options. This was issue #108.
+ * bamCoverage now has a `--filterRNAstrand` option, that will produce coverage for only a single strand. Note that the strand referred to is the DNA strand and not sense/anti-sense.
+ * Issues with plotHeatmap x-axis labels were fixed (issue #301).
+
+2.1.1
+
+ * Fixed how the --hclust option was handled in plotHeatmap/plotProfile. This gets around a quirk in scipy.
+ * A bug involving processing comment lines in BED files was corrected (issue #288)
+ * The Galaxy wrappers are now automatically tested with each modification.
+ * plotCoverage and plotFingerprint in Galaxy now accept 1 or more BAM files rather than at least 2 files.
+
+2.1.0
+
+ * Updates to many of the Galaxy wrappers and associated documentation.
+ * A bug was fixed in how chromosome names were dealt with in bigWig files. If you ever received errors due to illegal intervals then that should now be fixed. This was issue #250
+ * plotProfile now has an --outFileNameData option for saving the underlying data in a text format.
+ * correctGCBias ensures that the resulting BAM file will pass picard/HTSJDK's validation if the input file did (issue #248)
+ * The default bin size was changed to 10, which is typically a bit more useful
+ * The --regionsLabel option to plotProfile and plotHeatmap now accepts a space-separated list, in line with --samplesLabel
+ * BAM files that have had their sequences stripped no longer cause an error
+ * bamPEFragmentSize now has -bs and -n options to allow adjusting the number of alignments sampled. Note that the default value is auto-adjusted if the sampling is too sparse.
+ * bamPEFragmentSize now accepts single-end files.
+ * The --hclust option to plotProfile and plotHeatmap continues even if one of the groups is too small for plotting (matplotlib will produce a warning that you can ignore). This was issue #280.
+
+2.0.1
+
+ * A critical bug that prevented plotPCA from running was fixed.
+ * multiBamCoverage was renamed to multiBamSummary, to be in better alignment with multiBigwigSummary.
+ * computeGCBias and correctGCBias are now more tolerant of chromosome name mismatches.
+ * multiBigwigSummary and multiBamSummary can accept a single bigWig/BAM input file, though one should use the
+ --outRawCounts argument.
+
+2.0.0
+
+ * Documentation improved and migrated to http://deeptools.readthedocs.org The API to use deepTools modules is now
+ part of the documentation and includes a tutorial.
+ * Allow multiple bigwig files in computeMatrix that can be clustered together
+ * computeMatrix now accepts multiple bed files. Each bed file is considered as a group. Labels are automatically
+ added based on the file names.
+ * When computing read coverage now spliced reads are understood. This is convenient for computing the
+ coverage of RNA-seq data.
+ * New quality control tool 'plotCoverage' to plot the coverage over base pairs for multiple samples
+ * renaming of --missingDataAsZero to --skipNonCoveredRegions for clarity in bamCoverage and bamCompare
+ * New analysis tool plotPCA that visualizes the results from principal component analysis
+ * New option in bamCoverage `--MNase` that will compute the read coverage only considering 2 base pairs at the
+ center of the fragment.
+ * Make read extension optional. Remove the need to specify a default fragment length for most of the tools. Now, when
+ read extension is enabled and the bam files contain paired-end data, the mean fragment length is automatically
+ calculated by sampling the read pairs in the bam file. The --doNotExtendPairedEnds and --fragmentLength parameters
+ are no longer used and the new --extendReads parameter was added.
+ * Dramatically improved bigwig related tools by using the new pyBigWig module. Eliminated the requirement for the
+ UCSC program `bigWigInfo`
+ * renamed heatmapper to plotHeatmap and profiler to plotProfile
+ * added hierarchical clustering, besides k-means to plotProfile and plotHeatmap
+ * improved plotting features for plotProfile when using 'overlapped_lines' and 'heatmap' plot types
+ * Resolved an error introduced by numpy version 1.10 in computeMatrix
+ * plotting of correlations (from bamCorrelate or bigwigCorrelate) was separated from the computation of the
+ underlying data. A new tool, plotCorrelation was added. This tool can plot correlations as heatmaps or as scatter
+ plots and includes options to adjust a large array of visual features.
+ * Fixed issue with bed intervals in bigwigCorrelate and bamCorrelate and a user specified region.
+ * Correlation coefficients can be computed even if the data contains NaNs
+ * Allow computeMatrix to read files with DOS newline characters
+ * Added option --skipChromosomes to bigwigCorrelate, for example to skip all 'random' chromosomes. bigwigCorrelate
+ now also considers chromosomes as identical when their names between samples differ with the prefix 'chr'. E.g.
+ chr1 vs. 1
+ * For bamCoverage and bamCompare, behaviour of scaleFactor was updated such that now, if given in combination
+ with the normalization options (normalize to 1x or normalize using RPKM) the given scaleFactor
+ will multiply the scale factor computed for the normalization methods.
+ * Fixed problem with read pairs labelled as proper pairs by the aligner but that were actually not proper pairs, for
+ example because the mates did not face each other. deepTools adds further checks to determine if a read pair is a
+ proper pair.
+ * Added titles to QC plots (#74)
+ * Added --samFlagInclude and --samFlagExclude parameters. This is useful to for example only include forward reads
+ * In deeptools2 most of the core code was rewritten to facilitate API usage and for optimization.
diff --git a/deepTools/source/LICENSE.txt b/deepTools/source/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0bd5bfd9ef3784bc3b5b0fc9537bc88370d5378f
--- /dev/null
+++ b/deepTools/source/LICENSE.txt
@@ -0,0 +1,9 @@
+The file deeptools/cm.py is licensed under the BSD license, see a copy in that file. The remainder of the code is licensed under the MIT license:
+
+Copyright 2019 Max Planck Institute for Immunobiology and Epigenetics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/deepTools/source/MANIFEST.in b/deepTools/source/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..ab38d87a3f6a5ea8cad6f0af617be27474e24b63
--- /dev/null
+++ b/deepTools/source/MANIFEST.in
@@ -0,0 +1,8 @@
+include *.txt
+include README.md
+exclude examples/*
+exclude deepTools.egg-info/*
+include scripts/*
+exclude deeptools/test/*
+exclude galaxy/*
+exclude gallery/*
diff --git a/deepTools/source/README.md b/deepTools/source/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f3f614c2107ccf63eb534fbbaefd7d6e801cc15d
--- /dev/null
+++ b/deepTools/source/README.md
@@ -0,0 +1,68 @@
+# deepTools
+[](http://deeptools.readthedocs.org/)
+[](https://pypi.org/project/deepTools/)
+[](http://bioconda.github.io/recipes/deeptools/README.html)
+[](https://usegalaxy.eu/root?tool_id=deeptools_compute_matrix)
+
+
+
+## User-friendly tools for exploring deep-sequencing data
+
+deepTools addresses the challenge of handling the large amounts of data that are now routinely generated from DNA sequencing centers. deepTools contains useful modules to process the mapped reads data for multiple quality checks, creating **normalized coverage files** in standard bedGraph and bigWig file formats, that allow comparison between different files (for example, treatment and control). Finally, using such normalized and standardized files, deepTools can create many publication-ready **visualizations** to identify enrichments and for functional annotations of the genome.
+
+For support or questions please post to [Biostars](http://biostars.org). For bug reports and feature requests please open an issue [on github](http://github.com/deeptools/deeptools).
+
+
+### Citation:
+
+Ramírez F, Ryan DP, Grüning B, Bhardwaj V, Kilpert F, Richter AS, Heyne S, Dündar F, Manke T. [deepTools2: a next generation web server for deep-sequencing data analysis.](https://nar.oxfordjournals.org/content/early/2016/04/12/nar.gkw257.abstract) Nucleic Acids Research. 2016 Apr 13:gkw257.
+
+### Documentation:
+
+Our [documentation](http://deeptools.readthedocs.org/) contains more details on the [individual tool scopes and usages](http://deeptools.readthedocs.org/en/latest/content/list_of_tools.html) and an [introduction to our deepTools Galaxy web server](http://deeptools.readthedocs.org/en/latest/content/help_galaxy_intro.html) including [step-by-step protocols](http://deeptools.readthedocs.org/en/latest/content/example_usage.html).
+
+>Please see also the [FAQ](http://deeptools.readthedocs.org/en/latest/content/help_faq.html), which we update regularly.
+Our [Gallery](http://deeptools.readthedocs.org/en/latest/content/example_gallery.html) may give you some more ideas about the scope of deepTools.
+
+>For more specific **troubleshooting, feedback, and tool suggestions**, please post [to Biostars](http://biostars.org).
+
+
+-------------------------------------------------------------------------------------------------------------------
+
+### Installation
+
+deepTools are available for:
+
+* Command line usage (via pip / conda / github)
+* Integration into Galaxy servers (via toolshed/API/web-browser)
+
+There are many easy ways to install deepTools. More details can be found [here](https://deeptools.readthedocs.io/en/latest/content/installation.html).
+
+In Brief:
+
+**Install through pypi**
+
+ $ pip install deeptools
+
+**Install via conda**
+
+ $ conda install -c bioconda deeptools
+
+**Install by cloning the repository**
+
+ $ git clone https://github.com/deeptools/deepTools
+ $ cd deepTools
+ $ pip install .
+
+
+### Galaxy Installation
+
+deepTools can be easily integrated into [Galaxy](http://galaxyproject.org). Please see the [installation instructions in our documentation](http://deeptools.readthedocs.io/en/latest/content/installation.html#galaxy-installation) for further details.
+
+**Note:** From version 2.3 onwards, deepTools support **python3**.
+
+------------------------------------
+
+This tool suite is developed by the [Bioinformatics Facility](http://www1.ie-freiburg.mpg.de/bioinformaticsfac) at the [Max Planck Institute for Immunobiology and Epigenetics, Freiburg](http://www1.ie-freiburg.mpg.de/).
+
+[Documentation](http://deeptools.readthedocs.org/en/latest/index.html) | [deepTools Galaxy](http://deeptools.ie-freiburg.mpg.de) | [FAQ](http://deeptools.readthedocs.org/en/latest/content/help_faq.html)
diff --git a/deepTools/source/README.rst b/deepTools/source/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d0231e84166f183fec291f2968da790895a36eb9
--- /dev/null
+++ b/deepTools/source/README.rst
@@ -0,0 +1,29 @@
+======================================================================
+deepTools
+======================================================================
+
+User-friendly tools for exploring deep-sequencing data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+deepTools addresses the challenge of handling the large amounts of data
+that are now routinely generated from DNA sequencing centers. deepTools
+contains useful modules to process the mapped reads data for multiple
+quality checks, creating **normalized coverage files** in standard
+bedGraph and bigWig file formats, that allow comparison between
+different files (for example, treatment and control). Finally, using
+such normalized and standardized files, deepTools can create many
+publication-ready **visualizations** to identify enrichments and for
+functional annotations of the genome.
+
+For support or questions please make a post on `Biostars <http://biostars.org>`__. For feature requests, please open an issue on `github <https://github.com/deeptools/deepTools>`__.
+
+For further documentation, please see our `read the docs page <http://deeptools.readthedocs.org/>`__.
+
+Citation:
+^^^^^^^^^
+
+Ramírez F, Ryan DP, Grüning B, Bhardwaj V, Kilpert F, Richter AS, Heyne
+S, Dündar F, Manke T. `deepTools2: a next generation web server for
+deep-sequencing data
+analysis. <https://nar.oxfordjournals.org/content/early/2016/04/12/nar.gkw257.abstract>`__
+Nucleic Acids Research. 2016 Apr 13:gkw257.
diff --git a/deepTools/source/__init__.py b/deepTools/source/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..21fb884895b36380a54174eb4d2bc555318e9451
--- /dev/null
+++ b/deepTools/source/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+"""
+deepTools Project Package Initialization File
+"""
diff --git a/deepTools/source/deeptools/SES_scaleFactor.py b/deepTools/source/deeptools/SES_scaleFactor.py
new file mode 100644
index 0000000000000000000000000000000000000000..76194b9ea3830e40c7c9adc8cf8ccc01c24f3ef1
--- /dev/null
+++ b/deepTools/source/deeptools/SES_scaleFactor.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import numpy as np
+
+# own packages
+from deeptools import bamHandler
+import deeptools.countReadsPerBin as countR
+
+old_settings = np.seterr(all='ignore')
+debug = 0
+
+
+def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
+ normalizationLength,
+ avg_method='median', blackListFileName=None, numberOfProcessors=1,
+ verbose=False, chrsToSkip=[], mappingStatsList=[]):
+ r"""
+ Subdivides the genome into chunks to be analyzed in parallel
+ using several processors. The code handles the creation of
+ workers that compute fragment counts (coverage) for different
+ regions and then collect and integrates the results.
+
+ Parameters
+ ----------
+ bamFilesList : list
+ list of bam files to normalize
+ binLength : int
+ the window size in bp, where reads are going to be
+ counted.
+ numberOfSamples : int
+ number of sites to sample from the genome. For more info see
+ the documentation of the CountReadsPerBin class
+ normalizationLength : int
+ length, in bp, to normalize the data.
+ For a value of 1, on average
+ 1 read per base pair is found
+ avg_method : str
+ defines how the different values are to be summarized.
+ The options are 'mean' and 'median'
+ chrsToSkip : list
+ name of the chromosomes to be excluded from the
+ scale estimation. Usually the chrX is included.
+ blackListFileName : str
+ BED file containing blacklisted regions
+ mappingStatsList : list
+ List of the number of mapped reads per file
+
+ Returns
+ -------
+ dict
+ Dictionary with the following keys::
+ 'size_factors'
+ 'size_factors_based_on_mapped_reads'
+ 'size_factors_SES'
+ 'size_factors_based_on_mean'
+ 'size_factors_based_on_median'
+ 'mean'
+ 'meanSES'
+ 'median'
+ 'reads_per_bin'
+ 'std'
+ 'sites_sampled'
+
+
+ Examples
+ --------
+ >>> test = Tester()
+ >>> bin_length = 50
+ >>> num_samples = 4
+ >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples, 1)
+ >>> _dict['size_factors']
+ array([1. , 0.5])
+ >>> _dict['size_factors_based_on_mean']
+ array([1. , 0.5])
+ """
+
+ assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"
+
+ if len(mappingStatsList) == len(bamFilesList):
+ mappedReads = mappingStatsList
+ else:
+ mappedReads = []
+ for fname in bamFilesList:
+ mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1])
+
+ sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')
+
+ sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads
+
+ cr = countR.CountReadsPerBin(bamFilesList,
+ binLength=binLength,
+ numberOfSamples=numberOfSamples,
+ extendReads=False,
+ blackListFileName=blackListFileName,
+ numberOfProcessors=numberOfProcessors,
+ verbose=verbose,
+ chrsToSkip=chrsToSkip)
+
+ try:
+ num_reads_per_bin = cr.run()
+ except Exception as detail:
+ exit("*ERROR*: {}".format(detail))
+
+ sitesSampled = len(num_reads_per_bin)
+
+ # the transpose is taken to easily iterate by columns which are now
+ # converted to rows
+ num_reads_per_bin = num_reads_per_bin.transpose()
+ # size factors based on order statistics
+ # see Signal extraction scaling (SES) method in: Diaz et al (2012)
+ # Normalization, bias correction, and peak calling for ChIP-seq.
+ # Statistical applications in genetics and molecular biology, 11(3).
+
+ # using the same names as in Diaz paper
+ # p refers to ChIP, q to input
+
+ p = np.sort(num_reads_per_bin[0, :]).cumsum()
+ q = np.sort(num_reads_per_bin[1, :]).cumsum()
+
+ # p[-1] and q[-1] are the maximum values in the arrays.
+ # both p and q are normalized by this value
+ diff = np.abs(p / p[-1] - q / q[-1])
+ # get the lowest rank for wich the difference is the maximum
+ maxIndex = np.flatnonzero(diff == diff.max())[0]
+ # Take a lower rank to move to a region with probably
+ # less peaks and more background.
+ maxIndex = int(maxIndex * 0.8)
+ while maxIndex < len(p):
+ # in rare cases the maxIndex maps to a zero value.
+ # In such cases, the next index is used until
+ # a non zero value appears.
+ cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
+ if cumSum.min() > 0:
+ break
+ maxIndex += 1
+
+ meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
+ np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]
+
+ # the maxIndex may be too close to the the signal regions
+ # so i take a more conservative approach by taking a close number
+
+ sizeFactorsSES = cumSum.min() / cumSum
+ median = np.median(num_reads_per_bin, axis=1)
+
+ # consider only those read numbers that are below the 90
+ # percentile to stimate the
+ # mean and std
+ mean = []
+ std = []
+ for values in num_reads_per_bin:
+ maxNumReads = (np.percentile(values, 90))
+ if maxNumReads == 0:
+ maxNumReads = (np.percentile(values, 99))
+ if maxNumReads == 0:
+ print("all genomic regions sampled from one ")
+ "of the bam files have no reads.\n"
+ values = values[values <= maxNumReads]
+
+ mean.append(np.mean(values))
+ std.append(np.std(values))
+
+ mean = np.array(mean)
+ readsPerBin = mean if avg_method == 'mean' else median
+
+ if min(median) == 0:
+ idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
+ exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
+ "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))
+
+ sizeFactor = sizeFactorsSES
+ return {'size_factors': sizeFactor,
+ 'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
+ 'size_factors_SES': sizeFactorsSES,
+ 'size_factors_based_on_mean': mean.min() / mean,
+ 'size_factors_based_on_median': median.min() / median,
+ 'mean': mean,
+ 'meanSES': meanSES,
+ 'median': median,
+ 'reads_per_bin': readsPerBin,
+ 'std': std,
+ 'sites_sampled': sitesSampled}
+
+
+class Tester(object):
+
+ def __init__(self):
+ self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
+ self.bamFile1 = self.root + "testA.bam"
+ self.bamFile2 = self.root + "testB.bam"
+ global debug
+ debug = 0
+ self.chrom = '3R'
diff --git a/deepTools/source/deeptools/__init__.py b/deepTools/source/deeptools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/deepTools/source/deeptools/alignmentSieve.py b/deepTools/source/deeptools/alignmentSieve.py
new file mode 100644
index 0000000000000000000000000000000000000000..73a2473493e62bbc57d708d32de09ee4d93d3f3e
--- /dev/null
+++ b/deepTools/source/deeptools/alignmentSieve.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python
+import argparse
+import pysam
+import os
+import sys
+
+from deeptools import parserCommon
+from deeptools.bamHandler import openBam
+from deeptools.mapReduce import mapReduce
+from deeptools.utilities import getTLen, smartLabels, getTempFileName
+from importlib.metadata import version
+
+
def parseArguments():
    """Build and return the alignmentSieve command-line argument parser.

    The parser is split into Required / General / Output / filtering
    argument groups purely for help-text organization.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Typo fix: "according the the" -> "according to the"
        description="This tool filters alignments in a BAM/CRAM file according to the specified parameters. It can optionally output to BEDPE format.",
        usage='alignmentSieve -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt\n'
        'help: alignmentSieve -h / alignmentSieve --help')

    required = parser.add_argument_group('Required arguments')
    required.add_argument('--bam', '-b',
                          metavar='FILE1',
                          help='An indexed BAM file.',
                          required=True)

    required.add_argument('--outFile', '-o',
                          help='The file to write results to. These are the alignments or fragments that pass the filtering criteria.')

    general = parser.add_argument_group('General arguments')
    general.add_argument('--numberOfProcessors', '-p',
                         help='Number of processors to use. Type "max/2" to '
                         'use half the maximum number of processors or "max" '
                         'to use all available processors. (Default: %(default)s)',
                         metavar="INT",
                         type=parserCommon.numberOfProcessors,
                         default=1,
                         required=False)

    general.add_argument('--filterMetrics',
                         metavar="FILE.log",
                         help="The number of entries in total and filtered are saved to this file")

    general.add_argument('--filteredOutReads',
                         metavar="filtered.bam",
                         help="If desired, all reads NOT passing the filtering criteria can be written to this file.")

    general.add_argument('--label', '-l',
                         metavar='sample1',
                         help='User defined label instead of the default label '
                         '(file name).')

    general.add_argument('--smartLabels',
                         action='store_true',
                         # Typo fix: "specifying a labels" -> "specifying labels"
                         help='Instead of manually specifying labels for the input '
                         'file, this causes deepTools to use the file name '
                         'after removing the path and extension.')

    general.add_argument('--verbose', '-v',
                         help='Set to see processing messages.',
                         action='store_true')

    general.add_argument('--version', action='version',
                         version='%(prog)s {}'.format(version('deeptools')))

    general.add_argument('--shift',
                         nargs='+',
                         type=int,
                         help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.')

    general.add_argument('--ATACshift',
                         action='store_true',
                         help='Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.')

    general.add_argument('--genomeChunkLength',
                         type=int,
                         default=int(1e6),
                         help='Size of the genome (in bps) to be processed per thread. (Default: %(default)s)')

    output = parser.add_argument_group('Output arguments')
    output.add_argument('--BED',
                        action='store_true',
                        help='Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.')

    filtering = parser.add_argument_group('Optional arguments')

    filtering.add_argument('--filterRNAstrand',
                           help='Selects RNA-seq reads (single-end or paired-end) in '
                           'the given strand. (Default: %(default)s)',
                           choices=['forward', 'reverse'],
                           default=None)

    filtering.add_argument('--ignoreDuplicates',
                           help='If set, reads that have the same orientation '
                           'and start position will be considered only '
                           'once. If reads are paired, the mate\'s position '
                           'also has to coincide to ignore a read.',
                           action='store_true')

    filtering.add_argument('--minMappingQuality',
                           metavar='INT',
                           help='If set, only reads that have a mapping '
                           'quality score of at least this are '
                           'considered.',
                           type=int)

    filtering.add_argument('--samFlagInclude',
                           help='Include reads based on the SAM flag. For example, '
                           'to get only reads that are the first mate, use a flag of 64. '
                           'This is useful to count properly paired reads only once, '
                           'as otherwise the second mate will be also considered for the '
                           'coverage.',
                           metavar='INT',
                           default=None,
                           type=int,
                           required=False)

    filtering.add_argument('--samFlagExclude',
                           help='Exclude reads based on the SAM flag. For example, '
                           'to get only reads that map to the forward strand, use '
                           '--samFlagExclude 16, where 16 is the SAM flag for reads '
                           'that map to the reverse strand.',
                           metavar='INT',
                           default=None,
                           type=int,
                           required=False)

    filtering.add_argument('--blackListFileName', '-bl',
                           help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
                           metavar="BED file",
                           nargs="+",
                           required=False)

    filtering.add_argument('--minFragmentLength',
                           help='The minimum fragment length needed for read/pair '
                           'inclusion. This option is primarily useful '
                           'in ATACseq experiments, for filtering mono- or '
                           'di-nucleosome fragments. (Default: %(default)s)',
                           metavar='INT',
                           default=0,
                           type=int,
                           required=False)

    filtering.add_argument('--maxFragmentLength',
                           help='The maximum fragment length needed for read/pair '
                           'inclusion. A value of 0 indicates no limit. (Default: %(default)s)',
                           metavar='INT',
                           default=0,
                           type=int,
                           required=False)

    return parser
+
+
def shiftRead(b, chromDict, args):
    """
    Return a copy of read ``b`` with its fragment ends shifted according to
    ``args.shift`` (a 4-element int list), or None if the read must be dropped.

    Per the --shift help text: shift[0]/shift[1] apply to fragments whose
    read 1 is leftmost, shift[2]/shift[3] to fragments whose read 1 is
    rightmost. Non-properly-paired reads are filtered out.

    chromDict maps reference name -> chromosome length, used to clip the
    shifted read to chromosome bounds.
    """
    if not b.is_proper_pair:
        return None
    # Signed template length; the sign encodes which mate is leftmost.
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    # Select the shift offsets by strand/mate combination.
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check: keep at least a 1 bp alignment, clipped to the chromosome.
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read rather than mutating the input
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start),)  # Returned cigar is only matches
    # Grow/shrink the template length by the same delta applied to the ends,
    # preserving its sign.
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        # Keep the recorded mate start consistent with the shift that will be
        # applied to the mate itself.
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]

    return b2
+
+
def filterWorker(arglist):
    """
    mapReduce worker: apply all read-level filters to the region
    chrom:start-end of ``args.bam``.

    Reads that pass every filter (after optional shifting) are written to a
    temporary BAM file; rejected reads are optionally written to a second
    temporary BAM file when --filteredOutReads was given.

    Returns (tid, start, total, nFiltered, oname, onameFiltered) so the
    caller can sort the per-chunk results back into genome order.
    """
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)
    mode = 'wb'
    oname = getTempFileName(suffix='.bam')
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')
    else:
        onameFiltered = None
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        ofiltered = None

    # Duplicate-detection state: fragment coordinates already seen at the
    # current reference start position.
    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        # Flag bit 4 marks unmapped reads (SAM spec).
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        # Keep only reads with ALL bits of samFlagInclude set.
        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        # Reject reads with ANY bit of samFlagExclude set.
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
            if tLen >= 0:
                s = read.pos
                e = s + tLen
            else:
                s = read.pnext
                e = s - tLen
            if read.reference_id != read.next_reference_id:
                e = read.pnext
            # A duplicate has the same start, fragment bounds, mate contig
            # and orientation as a read seen at this position.
            if lpos is not None and lpos == read.reference_start \
                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

        # filterRNAstrand
        # SAM flag masks used below: 16 = reverse strand, 64 = first in pair,
        # 128 = second in pair (so 144 = reverse + second, 96 = mate-reverse + first).
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            # shiftRead returns None for reads that become invalid after shifting.
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    return tid, start, total, nFiltered, oname, onameFiltered
+
+
def convertBED(oname, tmpFiles, chromDict):
    """
    Write the fragments from the temporary BAM files to ``oname`` in BEDPE
    format: chromosome <tab> fragment_left_end <tab> fragment_right_end.

    The fragment ends may already have been shifted. Each temporary BAM is
    deleted once its fragments have been written.
    """
    with open(oname, "w") as bedpe:
        for tmpBAM in tmpFiles:
            # Silence pysam's missing-index warning while opening, then restore it.
            pysam.set_verbosity(0)
            bam = pysam.AlignmentFile(tmpBAM)
            pysam.set_verbosity(3)
            for aln in bam.fetch(until_eof=True):
                fragLen = getTLen(aln, notAbs=True)
                if fragLen <= 0:
                    # Only alignments with a positive template length are
                    # reported (presumably one record per fragment — the
                    # leftmost mate); skip the rest.
                    continue
                left = aln.pos
                # Clip the fragment to the chromosome end.
                right = min(left + fragLen, chromDict[aln.reference_name])
                if right - left < 1:
                    continue
                bedpe.write("{}\t{}\t{}\n".format(aln.reference_name, left, right))
            bam.close()
            os.unlink(tmpBAM)
+
+
def main(args=None):
    """
    Entry point for alignmentSieve.

    Parses arguments, filters the BAM in parallel via mapReduce, then
    concatenates the per-chunk temporary files into the final output
    (BAM or BEDPE), optionally writing the rejected reads and a
    filter-metrics file.
    """
    args = parseArguments().parse_args(args)
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # Mirror the two values for fragments whose read 1 is rightmost.
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}
    # The header information has been extracted; workers open their own handles.
    bam.close()

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    genomeChunkLength=args.genomeChunkLength,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            # Bug fix: the rejected reads must go to --filteredOutReads.
            # Previously this passed args.outFile (clobbering the main output)
            # plus a spurious 4th argument that convertBED does not accept.
            convertBED(args.filteredOutReads, tmpFiles, chromDict)

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        of = open(args.filterMetrics, "w")
        of.write("#bamFilterReads --filterMetrics\n")
        of.write("#File\tReads Remaining\tTotal Initial Reads\n")
        of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))
        of.close()

    return 0
diff --git a/deepTools/source/deeptools/bamCompare.py b/deepTools/source/deeptools/bamCompare.py
new file mode 100644
index 0000000000000000000000000000000000000000..223bc06c95695d9097c8d2c9f19c2066710c240e
--- /dev/null
+++ b/deepTools/source/deeptools/bamCompare.py
@@ -0,0 +1,314 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse # to parse command line arguments
+import numpy as np
+import sys
+
+# my packages
+from deeptools import writeBedGraph
+from deeptools.SES_scaleFactor import estimateScaleFactor
+from deeptools import parserCommon
+from deeptools import bamHandler
+from deeptools.getRatio import getRatio
+from deeptools.getScaleFactor import get_num_kept_reads
+from deeptools.getScaleFactor import get_scale_factor
+debug = 0
+old_settings = np.seterr(all='ignore')
+
+
def parseArguments():
    """Assemble the full bamCompare argument parser from the shared
    parserCommon pieces plus the tool-specific required/optional groups.
    """
    # Parent parsers: the order in `parents` determines help-text ordering.
    parentParser = parserCommon.getParentArgParse()
    bamParser = parserCommon.read_options()
    normalizationParser = parserCommon.normalization_options()
    requiredArgs = getRequiredArgs()
    optionalArgs = getOptionalArgs()
    outputParser = parserCommon.output()
    parser = argparse.ArgumentParser(
        parents=[requiredArgs, outputParser, optionalArgs,
                 parentParser, normalizationParser, bamParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='This tool compares two BAM files based on the number of '
        'mapped reads. To compare the BAM files, the genome is partitioned '
        'into bins of equal size, then the number of reads found in each bin'
        ' is counted per file, and finally a summary value is '
        'reported. This value can be the ratio of the number of reads per '
        'bin, the log2 of the ratio, or the difference. This tool can '
        'normalize the number of reads in each BAM file using the SES method '
        'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
        'peak calling for ChIP-seq". Statistical Applications in Genetics '
        'and Molecular Biology, 11(3). Normalization based on read counts '
        'is also available. The output is either a bedgraph or bigWig file '
        'containing the bin location and the resulting comparison value. '
        'Note that *each end* in a pair (for paired-end reads) is treated '
        'independently. If this is undesirable, then use the --samFlagInclude '
        'or --samFlagExclude options.',

        usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
        'help: bamCompare -h / bamCompare --help',

        add_help=False)

    return parser
+
+
def getRequiredArgs():
    """Build the parser fragment holding the two mandatory BAM file options."""
    p = argparse.ArgumentParser(add_help=False)
    group = p.add_argument_group('Required arguments')

    group.add_argument('--bamfile1', '-b1',
                       metavar='BAM file',
                       required=True,
                       help='Sorted BAM file 1. Usually the BAM file '
                            'for the treatment.')

    group.add_argument('--bamfile2', '-b2',
                       metavar='BAM file',
                       required=True,
                       help='Sorted BAM file 2. Usually the BAM '
                            'file for the control.')

    return p
+
+
def getOptionalArgs():
    """Build the parser fragment with bamCompare's optional scaling and
    comparison options (used as a parent of the main parser).
    """
    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group('Optional arguments')

    # add_help=False on the parser, so --help must be declared manually here.
    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")

    optional.add_argument('--scaleFactorsMethod',
                          help='Method to use to scale the samples. '
                          'If a method is specified, then it will be used to compensate '
                          'for sequencing depth differences between the samples. '
                          'As an alternative, this can be set to None and an option from '
                          '--normalizeUsing can be used. (Default: %(default)s)',
                          choices=['readCount', 'SES', 'None'],
                          default='readCount')

    optional.add_argument('--sampleLength', '-l',
                          help='*Only relevant when SES is chosen for the '
                          'scaleFactorsMethod.* To compute the SES, specify '
                          'the length (in bases) of the regions (see --numberOfSamples) '
                          'that will be randomly sampled to calculate the scaling factors. '
                          'If you do not have a good sequencing depth for '
                          'your samples consider increasing the sampling '
                          'regions\' size to minimize the probability '
                          'that zero-coverage regions are used. (Default: %(default)s)',
                          default=1000,
                          type=int)

    optional.add_argument('--numberOfSamples', '-n',
                          help='*Only relevant when SES is chosen for the '
                          'scaleFactorsMethod.* Number of samplings taken '
                          'from the genome to compute the scaling factors. (Default: %(default)s)',
                          default=1e5,
                          type=int)

    optional.add_argument('--scaleFactors',
                          help='Set this parameter manually to avoid the computation of '
                          'scaleFactors. The format is scaleFactor1:scaleFactor2.'
                          'For example, --scaleFactor 0.7:1 will cause the first BAM file to'
                          'be multiplied by 0.7, while not scaling '
                          'the second BAM file (multiplication with 1).',
                          default=None,
                          required=False)

    optional.add_argument('--operation',
                          help='The default is to output the log2 ratio of the '
                          'two samples. The reciprocal ratio returns the '
                          'the negative of the inverse of the ratio '
                          'if the ratio is less than 0. The resulting '
                          'values are interpreted as negative fold changes. '
                          'Instead of performing a computation using both files, the scaled signal can '
                          'alternatively be output for the first or second file using '
                          'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)',
                          default='log2',
                          choices=['log2', 'ratio', 'subtract', 'add', 'mean',
                                   'reciprocal_ratio', 'first', 'second'],
                          required=False)

    optional.add_argument('--pseudocount',
                          help='A small number to avoid x/0. Only useful '
                          'together with --operation log2 or --operation ratio. '
                          'You can specify different values as pseudocounts for '
                          'the numerator and the denominator by providing two '
                          'values (the first value is used as the numerator '
                          'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
                          default=[1],
                          type=float,
                          nargs='+',
                          # requiredLength restricts nargs='+' to 1 or 2 values.
                          action=parserCommon.requiredLength(1, 2),
                          required=False)

    optional.add_argument('--skipZeroOverZero',
                          help='Skip bins where BOTH BAM files lack coverage. '
                          'This is determined BEFORE any applicable pseudocount '
                          'is added.',
                          action='store_true')

    return parser
+
+
def process_args(args=None):
    """Parse the bamCompare command line and normalize derived options in place."""
    parsed = parseArguments().parse_args(args)

    # A smoothing window no wider than a single bin would be a no-op; warn and disable.
    if parsed.smoothLength and parsed.smoothLength <= parsed.binSize:
        msg = ("Warning: the smooth length given ({}) is smaller than the bin "
               "size ({}).\n\n No smoothing will be "
               "done")
        print(msg.format(parsed.smoothLength, parsed.binSize))
        parsed.smoothLength = None

    if not parsed.ignoreForNormalization:
        parsed.ignoreForNormalization = []

    # Guarantee pseudocount is a two-element list:
    # [numerator pseudocount, denominator pseudocount].
    if not isinstance(parsed.pseudocount, list):
        parsed.pseudocount = [parsed.pseudocount]
    if len(parsed.pseudocount) == 1:
        parsed.pseudocount *= 2

    return parsed
+
+# get_scale_factors function is used for scaling in bamCompare
+# while get_scale_factor is used for depth normalization
+
+
def get_scale_factors(args, statsList, mappedList):
    """
    Compute the two per-sample scale factors for bamCompare.

    Precedence: explicit --scaleFactors string > SES estimation >
    read-count ratio. Returns None when --scaleFactorsMethod is 'None'
    (the caller then falls back to per-sample depth normalization).

    Note: mutates ``args`` (sets args.scaleFactor and args.bam) as a way
    of passing per-file state into get_num_kept_reads().
    """
    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors
+
+
def main(args=None):
    """
    Compare two BAM files and write a bedGraph/bigWig comparison track.

    The algorithm is composed of two steps:

    1. Per-sample scaling / depth normalization:
       + If scaling is used (using the SES or read counts method),
         appropriate scaling factors are determined to account for
         sequencing depth differences.
       + Optionally scaling can be turned off and individual samples can be
         depth-normalized using the RPKM, BPM or CPM methods.

    2. Ratio calculation between the two BAM files:
       + The genome is traversed, computing the log ratio/ratio/difference
         etc. for bins of fixed width given by the user.
    """
    args = process_args(args)

    # RPGC needs an effective genome size per sample, which makes no sense
    # for a two-sample comparison, so it is rejected outright.
    if args.normalizeUsing == "RPGC":
        sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!")
    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    # Scaling between samples and per-sample normalization are mutually exclusive.
    if args.scaleFactorsMethod != 'None' and args.normalizeUsing:
        sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))

    # Get mapping statistics
    bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)
    bam1.close()
    bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors)
    bam2.close()

    scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2])
    if scale_factors is None:
        # check whether one of the depth norm methods are selected
        if args.normalizeUsing is not None:
            args.scaleFactor = 1.0
            # if a normalization is required then compute the scale factors
            args.bam = args.bamfile1
            scale_factor_bam1 = get_scale_factor(args, stats1)
            args.bam = args.bamfile2
            scale_factor_bam2 = get_scale_factor(args, stats2)
            scale_factors = [scale_factor_bam1, scale_factor_bam2]
        else:
            scale_factors = [1, 1]

    if args.verbose:
        print("Individual scale factors are {0}".format(scale_factors))

    # the getRatio function is called and receives
    # the func_args per each tile that is considered
    FUNC = getRatio
    func_args = {'valueType': args.operation,
                 'scaleFactors': scale_factors,
                 'pseudocount': args.pseudocount
                 }

    wr = writeBedGraph.WriteBedGraph([args.bamfile1, args.bamfile2], args.binSize, 0,
                                     stepSize=args.binSize,
                                     region=args.region,
                                     numberOfProcessors=args.numberOfProcessors,
                                     extendReads=args.extendReads,
                                     blackListFileName=args.blackListFileName,
                                     minMappingQuality=args.minMappingQuality,
                                     ignoreDuplicates=args.ignoreDuplicates,
                                     center_read=args.centerReads,
                                     zerosToNans=args.skipNonCoveredRegions,
                                     skipZeroOverZero=args.skipZeroOverZero,
                                     samFlag_include=args.samFlagInclude,
                                     samFlag_exclude=args.samFlagExclude,
                                     minFragmentLength=args.minFragmentLength,
                                     maxFragmentLength=args.maxFragmentLength,
                                     chrsToSkip=args.ignoreForNormalization,
                                     verbose=args.verbose
                                     )

    wr.run(FUNC, func_args, args.outFileName, blackListFileName=args.blackListFileName, format=args.outFileFormat, smoothLength=args.smoothLength)


if __name__ == "__main__":
    main()
diff --git a/deepTools/source/deeptools/bamCoverage.py b/deepTools/source/deeptools/bamCoverage.py
new file mode 100644
index 0000000000000000000000000000000000000000..acca196fc1be7594eb7c16593e143b6b60733283
--- /dev/null
+++ b/deepTools/source/deeptools/bamCoverage.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# own tools
+import argparse
+import sys
+import numpy as np
+from deeptools import writeBedGraph # This should be made directly into a bigWig
+from deeptools import parserCommon
+from deeptools.getScaleFactor import get_scale_factor
+from deeptools.bamHandler import openBam
+
+debug = 0
+
+
def parseArguments():
    """Assemble the full bamCoverage argument parser from the shared
    parserCommon pieces plus the tool-specific required/optional groups.
    """
    # Parent parsers: the order in `parents` determines help-text ordering.
    parentParser = parserCommon.getParentArgParse()
    bamParser = parserCommon.read_options()
    normalizationParser = parserCommon.normalization_options()
    requiredArgs = get_required_args()
    optionalArgs = get_optional_args()
    outputParser = parserCommon.output()
    parser = \
        argparse.ArgumentParser(
            parents=[requiredArgs, outputParser, optionalArgs,
                     parentParser, normalizationParser, bamParser],
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description='This tool takes an alignment of reads or fragments '
            'as input (BAM file) and generates a coverage track (bigWig or '
            'bedGraph) as output. '
            'The coverage is calculated as the number of reads per bin, '
            'where bins are short consecutive counting windows of a defined '
            'size. It is possible to extended the length of the reads '
            'to better reflect the actual fragment length. *bamCoverage* '
            'offers normalization by scaling factor, Reads Per Kilobase per '
            'Million mapped reads (RPKM), counts per million (CPM), bins per '
            'million mapped reads (BPM) and 1x depth (reads per genome '
            'coverage, RPGC).\n',
            usage='bamCoverage -b reads.bam -o coverage.bw\n'
            'help: bamCoverage -h / bamCoverage --help',
            add_help=False)

    return parser
+
+
def get_required_args():
    """Build the parser fragment holding the single mandatory input option."""
    p = argparse.ArgumentParser(add_help=False)
    group = p.add_argument_group('Required arguments')
    group.add_argument('--bam', '-b',
                       metavar='BAM file',
                       required=True,
                       help='BAM file to process')
    return p
+
+
def get_optional_args():
    """Build the parser fragment with bamCoverage's optional options
    (used as a parent of the main parser, which sets add_help=False)."""
    p = argparse.ArgumentParser(add_help=False)
    grp = p.add_argument_group('Optional arguments')

    # The main parser disables automatic help, so declare it here explicitly.
    grp.add_argument("--help", "-h", action="help",
                     help="show this help message and exit")

    grp.add_argument('--scaleFactor',
                     type=float,
                     default=1.0,
                     required=False,
                     help='The computed scaling factor (or 1, if not applicable) will '
                          'be multiplied by this. (Default: %(default)s)')

    grp.add_argument('--MNase',
                     action='store_true',
                     help='Determine nucleosome positions from MNase-seq data. '
                          'Only 3 nucleotides at the center of each fragment are counted. '
                          'The fragment ends are defined by the two mate reads. Only fragment lengths'
                          'between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. '
                          'By default, any fragments smaller or larger than this are ignored. To '
                          'over-ride this, use the --minFragmentLength and --maxFragmentLength options, '
                          'which will default to 130 and 200 if not otherwise specified in the presence '
                          'of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.')

    grp.add_argument('--Offset',
                     metavar='INT',
                     type=int,
                     nargs='+',
                     required=False,
                     help='Uses this offset inside of each read as the signal. This is useful in '
                          'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the '
                          'start of the read. This can be paired with the --filterRNAstrand option. '
                          'Note that negative values indicate offsets from the end of each read. A value '
                          'of 1 indicates the first base of the alignment (taking alignment orientation '
                          'into account). Likewise, a value of -1 is the last base of the alignment. An '
                          'offset of 0 is not permitted. If two values are specified, then they will be '
                          'used to specify a range of positions. Note that specifying something like '
                          '--Offset 5 -1 will result in the 5th through last position being used, which '
                          'is equivalent to trimming 4 bases from the 5-prime end of alignments. Note '
                          'that if you specify --centerReads, the centering will be performed before the '
                          'offset.')

    grp.add_argument('--filterRNAstrand',
                     choices=['forward', 'reverse'],
                     default=None,
                     help='Selects RNA-seq reads (single-end or paired-end) originating from genes '
                          'on the given strand. This option assumes a standard dUTP-based library '
                          'preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, '
                          'which originally came from genes on the forward strand using a dUTP-based '
                          'method). Consider using --samExcludeFlag instead for filtering by strand in '
                          'other contexts.')

    return p
+
+
def scaleFactor(string):
    """
    argparse type-checker for the "factor1:factor2" syntax.

    Parses a string such as "0.7:1" into a tuple of two floats.

    Raises argparse.ArgumentTypeError if the value is not exactly two
    colon-separated numbers.
    """
    try:
        scalefactor1, scalefactor2 = string.split(":")
        scalefactors = (float(scalefactor1), float(scalefactor2))
    except ValueError:
        # ValueError covers both a wrong number of ':'-separated fields and
        # non-numeric values. The previous bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, which should propagate.
        raise argparse.ArgumentTypeError(
            "Format of scaleFactors is factor1:factor2. "
            "The value given ( {} ) is not valid".format(string))

    return scalefactors
+
+
def process_args(args=None):
    """Parse the bamCoverage command line and sanitize derived options in place."""
    parsed = parseArguments().parse_args(args)

    # A smoothing window no wider than a single bin would be a no-op; warn and disable.
    if parsed.smoothLength and parsed.smoothLength <= parsed.binSize:
        warning = ("Warning: the smooth length given ({}) is smaller than the bin "
                   "size ({}).\n\n No smoothing will be done")
        print(warning.format(parsed.smoothLength, parsed.binSize))
        parsed.smoothLength = None

    # Replace a missing/empty value with an empty list.
    if not parsed.ignoreForNormalization:
        parsed.ignoreForNormalization = []

    return parsed
+
+
+def main(args=None):
+ args = process_args(args)
+
+ global debug
+ if args.verbose:
+ sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
+ debug = 1
+ else:
+ debug = 0
+
+ if args.normalizeUsing == 'None':
+ args.normalizeUsing = None # For the sake of sanity
+ elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
+ sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")
+
+ if args.normalizeUsing:
+ # if a normalization is required then compute the scale factors
+ bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
+ bam.close()
+ scale_factor = get_scale_factor(args, stats)
+ else:
+ scale_factor = args.scaleFactor
+
+ func_args = {'scaleFactor': scale_factor}
+
+ # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
+ if args.filterRNAstrand and not args.Offset:
+ args.Offset = [1, -1]
+
+ if args.MNase:
+ # check that library is paired end
+ # using getFragmentAndReadSize
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
+ frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
+ return_lengths=False,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose)
+ if frag_len_dict is None:
+ sys.exit("*Error*: For the --MNAse function a paired end library is required. ")
+
+ # Set some default fragment length bounds
+ if args.minFragmentLength == 0:
+ args.minFragmentLength = 130
+ if args.maxFragmentLength == 0:
+ args.maxFragmentLength = 200
+
+ wr = CenterFragment([args.bam],
+ binLength=args.binSize,
+ stepSize=args.binSize,
+ region=args.region,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ extendReads=args.extendReads,
+ minMappingQuality=args.minMappingQuality,
+ ignoreDuplicates=args.ignoreDuplicates,
+ center_read=args.centerReads,
+ zerosToNans=args.skipNonCoveredRegions,
+ samFlag_include=args.samFlagInclude,
+ samFlag_exclude=args.samFlagExclude,
+ minFragmentLength=args.minFragmentLength,
+ maxFragmentLength=args.maxFragmentLength,
+ chrsToSkip=args.ignoreForNormalization,
+ verbose=args.verbose,
+ )
+
+ elif args.Offset:
+ if len(args.Offset) > 1:
+ if args.Offset[0] == 0:
+ sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
+ if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
+ sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
+ else:
+ if args.Offset[0] == 0:
+ sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
+ wr = OffsetFragment([args.bam],
+ binLength=args.binSize,
+ stepSize=args.binSize,
+ region=args.region,
+ numberOfProcessors=args.numberOfProcessors,
+ extendReads=args.extendReads,
+ minMappingQuality=args.minMappingQuality,
+ ignoreDuplicates=args.ignoreDuplicates,
+ center_read=args.centerReads,
+ zerosToNans=args.skipNonCoveredRegions,
+ samFlag_include=args.samFlagInclude,
+ samFlag_exclude=args.samFlagExclude,
+ minFragmentLength=args.minFragmentLength,
+ maxFragmentLength=args.maxFragmentLength,
+ chrsToSkip=args.ignoreForNormalization,
+ verbose=args.verbose)
+ wr.filter_strand = args.filterRNAstrand
+ wr.Offset = args.Offset
+ else:
+ wr = writeBedGraph.WriteBedGraph([args.bam],
+ binLength=args.binSize,
+ stepSize=args.binSize,
+ region=args.region,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ extendReads=args.extendReads,
+ minMappingQuality=args.minMappingQuality,
+ ignoreDuplicates=args.ignoreDuplicates,
+ center_read=args.centerReads,
+ zerosToNans=args.skipNonCoveredRegions,
+ samFlag_include=args.samFlagInclude,
+ samFlag_exclude=args.samFlagExclude,
+ minFragmentLength=args.minFragmentLength,
+ maxFragmentLength=args.maxFragmentLength,
+ chrsToSkip=args.ignoreForNormalization,
+ verbose=args.verbose,
+ )
+
+ wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
+ blackListFileName=args.blackListFileName,
+ format=args.outFileFormat, smoothLength=args.smoothLength)
+
+
+class OffsetFragment(writeBedGraph.WriteBedGraph):
+    """
+    Class to redefine the get_fragment_from_read for the --Offset case
+    """
+    def filterStrand(self, read, rv):
+        """
+        A generic read filtering function that gets used by everything in this class.
+
+        rv is returned if the strand is correct, otherwise [(None, None)]
+        """
+        # Filter by RNA strand, if desired
+        # SAM flag bits used below: 16 = read reverse strand, 32 = mate reverse
+        # strand, 64 = first in pair, 128 = second in pair.  So 144 = 128|16 and
+        # 96 = 64|32; the equality tests select mates by their combined
+        # pair/strand orientation (dUTP-style stranded libraries).
+        if read.is_paired:
+            if self.filter_strand == 'forward':
+                # second-in-pair not reversed, or first-in-pair whose mate is not reversed
+                if read.flag & 144 == 128 or read.flag & 96 == 64:
+                    return rv
+            elif self.filter_strand == 'reverse':
+                # second-in-pair reversed, or first-in-pair whose mate is reversed
+                if read.flag & 144 == 144 or read.flag & 96 == 96:
+                    return rv
+            else:
+                # No strand filtering requested
+                return rv
+        else:
+            # Single-end: only the read's own strand bit matters
+            if self.filter_strand == 'forward':
+                if read.flag & 16 == 16:
+                    return rv
+            elif self.filter_strand == 'reverse':
+                if read.flag & 16 == 0:
+                    return rv
+            else:
+                return rv
+
+        # Wrong strand: signal "no usable fragment"
+        return [(None, None)]
+
+    def get_fragment_from_read_list(self, read, offset):
+        """
+        Return the list of (start, end) intervals covered by the requested
+        offset slice of the (possibly extended) fragment.  `offset` is a
+        0-based, Python-slice-style [lower, upper) pair prepared by
+        get_fragment_from_read().
+        """
+        rv = [(None, None)]
+        blocks = read.get_blocks()
+        # Number of aligned reference bases (excludes skipped/intron gaps)
+        blockLen = sum([x[1] - x[0] for x in blocks])
+
+        if self.defaultFragmentLength != 'read length':
+            if self.is_proper_pair(read, self.maxPairedFragmentLength):
+                # Extend toward the mate so the whole fragment is covered
+                if read.is_reverse:
+                    foo = (read.next_reference_start, read.reference_start)
+                    if foo[0] < foo[1]:
+                        blocks.insert(0, foo)
+                else:
+                    foo = (read.reference_end, read.reference_end + abs(read.template_length) - read.infer_query_length())
+                    if foo[0] < foo[1]:
+                        blocks.append(foo)
+
+            # Extend using the default fragment length
+            else:
+                if read.is_reverse:
+                    foo = (read.reference_start - self.defaultFragmentLength + read.infer_query_length(), read.reference_start)
+                    if foo[0] < 0:
+                        # Clamp extension at the chromosome start
+                        foo = (0, foo[1])
+                    if foo[0] < foo[1]:
+                        blocks.insert(0, foo)
+                else:
+                    foo = (read.reference_end, read.reference_end + self.defaultFragmentLength - read.infer_query_length())
+                    if foo[0] < foo[1]:
+                        blocks.append(foo)
+
+        stretch = []
+        # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40]
+        # Then subset accordingly
+        for block in blocks:
+            stretch.extend(range(block[0], block[1]))
+        if read.is_reverse:
+            # Offsets are relative to the 5' end, so reverse-strand reads walk backwards
+            stretch = stretch[::-1]
+
+        # Handle --centerReads
+        if self.center_read:
+            # Keep only the central blockLen positions of the extended stretch
+            _ = (len(stretch) - blockLen) // 2
+            stretch = stretch[_:_ + blockLen]
+
+        # Subset by --Offset
+        try:
+            foo = stretch[offset[0]:offset[1]]
+        except:
+            return rv
+
+        if len(foo) == 0:
+            return rv
+        if read.is_reverse:
+            # Restore genomic (ascending) order before rebuilding intervals
+            foo = foo[::-1]
+
+        # Convert the stretch back to a list of tuples
+        foo = np.array(foo)
+        d = foo[1:] - foo[:-1]
+        # Positions where the gap exceeds 1 mark interval boundaries
+        idx = np.argwhere(d > 1).flatten().tolist()  # This now holds the interval bounds as a list
+        idx.append(-1)
+        last = 0
+        rv = []
+        for i in idx:
+            # Half-open interval: end is the last covered base + 1
+            rv.append((foo[last].astype("int"), foo[i].astype("int") + 1))
+            last = i + 1
+
+        # Handle strand filtering, if needed
+        return self.filterStrand(read, rv)
+
+    def get_fragment_from_read(self, read):
+        """
+        This is mostly a wrapper for self.get_fragment_from_read_list(),
+        which needs a list and for the offsets to be tweaked by 1.
+        """
+        # self.Offset holds 1-based positions (negative = from the 3' end);
+        # convert them to a 0-based Python slice [lower, upper).
+        offset = [x for x in self.Offset]
+        if len(offset) > 1:
+            if offset[0] > 0:
+                offset[0] -= 1
+            if offset[1] < 0:
+                offset[1] += 1
+        else:
+            # A single offset selects exactly one position
+            if offset[0] > 0:
+                offset[0] -= 1
+                offset = [offset[0], offset[0] + 1]
+            else:
+                if offset[0] < -1:
+                    offset = [offset[0], offset[0] + 1]
+                else:
+                    # -1 means "the very last position": slice to the end
+                    offset = [offset[0], None]
+        if offset[1] == 0:
+            # -1 gets switched to 0, which screws things up
+            offset = (offset[0], None)
+        return self.get_fragment_from_read_list(read, offset)
+
+
+class CenterFragment(writeBedGraph.WriteBedGraph):
+ """
+ Class to redefine the get_fragment_from_read for the --MNase case
+
+ The coverage of the fragment is defined as the 2 or 3 basepairs at the
+ center of the fragment length.
+ """
+ def get_fragment_from_read(self, read):
+ """
+ Takes a proper pair fragment of high quality and limited
+ to a certain length and outputs the center
+ """
+ fragment_start = fragment_end = None
+
+ # only paired forward reads are considered
+ # Fragments have already been filtered according to length
+ if read.is_proper_pair and not read.is_reverse and 1 < abs(read.tlen):
+ # distance between pairs is even return two bases at the center
+ if read.tlen % 2 == 0:
+ fragment_start = read.pos + read.tlen / 2 - 1
+ fragment_end = fragment_start + 2
+
+ # distance is odd return three bases at the center
+ else:
+ fragment_start = read.pos + read.tlen / 2 - 1
+ fragment_end = fragment_start + 3
+
+ return [(fragment_start, fragment_end)]
diff --git a/deepTools/source/deeptools/bamHandler.py b/deepTools/source/deeptools/bamHandler.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d451a56f4734d13a2b8b7b05042cd2f6f98a1f
--- /dev/null
+++ b/deepTools/source/deeptools/bamHandler.py
@@ -0,0 +1,103 @@
+import sys
+import pysam
+from deeptools.mapReduce import mapReduce
+
+
+def countReadsInInterval(args):
+    """
+    Worker for getMappingStats(): count mapped/unmapped reads overlapping one
+    genomic interval.
+
+    args is a (chrom, start, end, fname, toEOF) tuple; toEOF is accepted but
+    unused here.  Returns (nMapped, nUnmapped, chrom).
+    """
+    chrom, start, end, fname, toEOF = args
+
+    bam = openBam(fname)
+    mapped = 0
+    unmapped = 0
+    for b in bam.fetch(chrom, start, end):
+        # "*" is the pseudo-reference for unplaced reads; everything fetched
+        # from it counts as unmapped.
+        if chrom == "*":
+            unmapped += 1
+            continue
+        # Only count a read in the interval containing its start position, so
+        # reads spanning an interval boundary aren't counted twice.
+        if b.pos < start:
+            continue
+        if not b.is_unmapped:
+            mapped += 1
+        else:
+            unmapped += 1
+    return mapped, unmapped, chrom
+
+
+def getMappingStats(bam, nThreads):
+    """
+    This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless
+
+    This requires pysam > 0.13.0
+
+    Returns (nMapped, nUnmapped, stats), where stats maps each reference name
+    to a [mapped, unmapped] pair.
+    """
+    # (reference name, length) pairs for every contig in the file
+    header = [(x, y) for x, y in zip(bam.references, bam.lengths)]
+    # Fan the per-contig counting out over nThreads workers
+    res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads)
+
+    mapped = sum([x[0] for x in res])
+    unmapped = sum([x[1] for x in res])
+    # Aggregate worker results per reference
+    stats = {x[0]: [0, 0] for x in header}
+    for r in res:
+        stats[r[2]][0] += r[0]
+        stats[r[2]][1] += r[1]
+
+    # We need to count the number of unmapped reads as well
+    unmapped += bam.count("*")
+
+    return mapped, unmapped, stats
+
+
+def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True):
+ """
+ A wrapper for opening BAM/CRAM files.
+
+ bamFile: str
+ A BAM/CRAM file name
+
+ returnStats: bool
+ Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict).
+ These additional values are needed by some downstream functions, since one
+ can't use file_handle.mapped on CRAM files (or idxstats())
+
+ nThreads: int
+ If returnStats is True, number of threads to use for computing statistics
+
+ minimalDecoding: Bool
+ For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions).
+
+ Returns either the file handle or a tuple as described in returnStats
+ """
+ format_options = ["required_fields=0x1FF"]
+ if sys.version_info.major >= 3:
+ format_options = [b"required_fields=0x1FF"]
+ if not minimalDecoding:
+ format_options = None
+ try:
+ bam = pysam.Samfile(bamFile, 'rb', format_options=format_options)
+ except IOError:
+ sys.exit("The file '{}' does not exist".format(bamFile))
+ except:
+ sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile))
+
+ try:
+ assert bam.check_index() is not False
+ except:
+ sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile))
+
+ if bam.is_cram and returnStats:
+ mapped, unmapped, stats = getMappingStats(bam, nThreads)
+ elif bam.is_bam:
+ mapped = bam.mapped
+ unmapped = bam.unmapped
+
+ # Make the dictionary to hold the stats
+ if returnStats:
+ stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()}
+
+ if bam.is_bam or (bam.is_cram and returnStats):
+ if mapped == 0:
+ sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please "
+ "check that the file is properly indexed and "
+ "that it contains mapped reads.\n".format(bamFile))
+
+ if returnStats:
+ return bam, mapped, unmapped, stats
+ else:
+ return bam
diff --git a/deepTools/source/deeptools/bamPEFragmentSize.py b/deepTools/source/deeptools/bamPEFragmentSize.py
new file mode 100644
index 0000000000000000000000000000000000000000..9138051717ce3fc314ff6e630e4aef8fd4677f57
--- /dev/null
+++ b/deepTools/source/deeptools/bamPEFragmentSize.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+import numpy as np
+
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+
+import plotly.offline as py
+import plotly.graph_objs as go
+
+# own tools
+from deeptools.parserCommon import writableFile
+from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
+from importlib.metadata import version
+
+
+def parse_arguments():
+ parser = argparse.ArgumentParser(
+ description='This tool calculates the fragment sizes for read pairs given a BAM file from paired-end sequencing.'
+ 'Several regions are sampled depending on the '
+ 'size of the genome and number of processors to estimate the'
+ 'summary statistics on the fragment lengths. '
+ 'Properly paired reads are preferred for computation, i.e., '
+ 'it will only use discordant pairs if no concordant alignments '
+ 'overlap with a given region. '
+ 'The default setting simply prints the summary statistics to the screen.',
+ usage='bamPEFragmentSize -b sample1.bam sample2.bam -o hist.png\n'
+ 'help: bamPEFragmentSize -h / bamPEFragmentSize --help'
+ )
+ parser.add_argument('--bamfiles', '-b',
+ help='List of BAM files to process',
+ nargs='+',
+ metavar='bam files')
+
+ parser.add_argument('--histogram', '-hist', '-o',
+ help='Save a .png file with a histogram '
+ 'of the fragment length distribution.',
+ metavar='FILE')
+
+ parser.add_argument('--plotFileFormat',
+ metavar='FILETYPE',
+ help='Image format type. If given, this option '
+ 'overrides the image format based on the plotFile '
+ 'ending. The available options are: png, '
+ 'eps, pdf, svg and plotly.',
+ default=None,
+ choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
+
+ parser.add_argument('--numberOfProcessors', '-p',
+ help='Number of processors to use. The default is '
+ 'to use 1. (Default: %(default)s)',
+ metavar="INT",
+ type=int,
+ default=1,
+ required=False)
+ parser.add_argument('--samplesLabel',
+ help='Labels for the samples plotted. The '
+ 'default is to use the file name of the '
+ 'sample. The sample labels should be separated '
+ 'by spaces and quoted if a label itself'
+ 'contains a space E.g. --samplesLabel label-1 "label 2" ',
+ nargs='+')
+ parser.add_argument('--plotTitle', '-T',
+ help='Title of the plot, to be printed on top of '
+ 'the generated image. Leave blank for no title. (Default: %(default)s)',
+ default='')
+ parser.add_argument('--maxFragmentLength',
+ help='The maximum fragment length in the histogram. A value of 0 (the default) indicates to use twice the mean fragment length. (Default: %(default)s)',
+ default=0,
+ type=int)
+ parser.add_argument('--logScale',
+ help='Plot on the log scale',
+ action='store_true')
+ parser.add_argument('--binSize', '-bs',
+ metavar='INT',
+ help='Length in bases of the window used to sample the genome. (Default: %(default)s)',
+ default=1000,
+ type=int)
+ parser.add_argument('--distanceBetweenBins', '-n',
+ metavar='INT',
+ help='To reduce the computation time, not every possible genomic '
+ 'bin is sampled. This option allows you to set the distance '
+ 'between bins actually sampled from. Larger numbers are sufficient '
+ 'for high coverage samples, while smaller values are useful for '
+ 'lower coverage samples. Note that if you specify a value that '
+ 'results in too few (<1000) reads sampled, the value will be '
+ 'decreased. (Default: %(default)s)',
+ default=1000000,
+ type=int)
+ parser.add_argument('--blackListFileName', '-bl',
+ help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
+ metavar="BED file",
+ required=False)
+ parser.add_argument('--table',
+ metavar='FILE',
+ help='In addition to printing read and fragment length metrics to the screen, write them to the given file in tabular format.',
+ required=False)
+ parser.add_argument('--outRawFragmentLengths',
+ metavar='FILE',
+ required=False,
+ type=writableFile,
+ help='Save the fragment (or read if the input is single-end) length and their associated number of occurrences to a tab-separated file. Columns are length, number of occurrences, and the sample label.')
+ parser.add_argument('--verbose',
+ help='Set if processing data messages are wanted.',
+ action='store_true',
+ required=False)
+ parser.add_argument('--version', action='version',
+ version='%(prog)s {}'.format(version('deeptools')))
+
+ return parser
+
+
+def getDensity(lengths, minVal, maxVal):
+ """
+ This is essentially computing what hist() in matplotlib is doing and returning the results.
+ This then allows us to free up the memory consumed by each sample rather than returning it all back to main() for plotting.
+ """
+ n, bins, patches = plt.hist(lengths, bins=100, range=(minVal, maxVal), density=True)
+ plt.clf()
+ return (n, bins)
+
+
+def getFragSize(bam, args, idx, outRawFrags):
+ fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose,
+ binSize=args.binSize,
+ distanceBetweenBins=args.distanceBetweenBins)
+
+ if outRawFrags:
+ label = bam
+ if args.samplesLabel and idx < len(args.samplesLabel):
+ label = args.samplesLabel[idx]
+ if fragment_len_dict:
+ fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']]
+ cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1)
+ else:
+ read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']]
+ cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1)
+ for idx, v in enumerate(cnts):
+ if v > 0:
+ outRawFrags.write("{}\t{}\t{}\n".format(idx, v, label))
+
+ if args.samplesLabel and idx < len(args.samplesLabel):
+ print("\n\nSample label: {}".format(args.samplesLabel[idx]))
+ else:
+ print("\n\nBAM file : {}".format(bam))
+
+ if fragment_len_dict:
+ if fragment_len_dict['mean'] == 0:
+ print("No pairs were found. Is the data from a paired-end sequencing experiment?")
+
+ print("Sample size: {}\n".format(fragment_len_dict['sample_size']))
+
+ print("Fragment lengths:")
+ print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
+ "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
+ fragment_len_dict['qtile25'],
+ fragment_len_dict['mean'],
+ fragment_len_dict['median'],
+ fragment_len_dict['qtile75'],
+ fragment_len_dict['max'],
+ fragment_len_dict['std']))
+ print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(fragment_len_dict['mad'],
+ fragment_len_dict['qtile10'],
+ fragment_len_dict['qtile20'],
+ fragment_len_dict['qtile30'],
+ fragment_len_dict['qtile40'],
+ fragment_len_dict['qtile60'],
+ fragment_len_dict['qtile70'],
+ fragment_len_dict['qtile80'],
+ fragment_len_dict['qtile90'],
+ fragment_len_dict['qtile99']))
+ else:
+ print("No pairs were found. Is the data from a paired-end sequencing experiment?")
+
+ print("\nRead lengths:")
+ print("Sample size: {}\n".format(read_len_dict['sample_size']))
+ print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
+ "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
+ read_len_dict['qtile25'],
+ read_len_dict['mean'],
+ read_len_dict['median'],
+ read_len_dict['qtile75'],
+ read_len_dict['max'],
+ read_len_dict['std']))
+ print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(read_len_dict['mad'],
+ read_len_dict['qtile10'],
+ read_len_dict['qtile20'],
+ read_len_dict['qtile30'],
+ read_len_dict['qtile40'],
+ read_len_dict['qtile60'],
+ read_len_dict['qtile70'],
+ read_len_dict['qtile80'],
+ read_len_dict['qtile90'],
+ read_len_dict['qtile99']))
+
+ # The read and fragment lists will just eat up memory if not removed!
+ if args.histogram:
+ if fragment_len_dict:
+ maxVal = fragment_len_dict['mean'] * 2
+ minVal = fragment_len_dict['min']
+ else:
+ maxVal = read_len_dict['mean'] * 2
+ minVal = read_len_dict['min']
+ if args.maxFragmentLength > 0:
+ maxVal = args.maxFragmentLength
+
+ if fragment_len_dict:
+ fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal)
+ if read_len_dict:
+ read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal)
+ else:
+ if fragment_len_dict:
+ del fragment_len_dict['lengths']
+ if read_len_dict:
+ del read_len_dict['lengths']
+
+ return (fragment_len_dict, read_len_dict)
+
+
+def printTable(args, fragDict, readDict):
+ """
+ Print the read and fragment dictionary in more easily parsable tabular format to a file.
+ """
+ of = open(args.table, "w")
+ of.write("\tFrag. Sampled")
+ of.write("\tFrag. Len. Min.\tFrag. Len. 1st. Qu.\tFrag. Len. Mean\tFrag. Len. Median\tFrag. Len. 3rd Qu.\tFrag. Len. Max\tFrag. Len. Std.")
+ of.write("\tFrag. Med. Abs. Dev.\tFrag. Len. 10%\tFrag. Len. 20%\tFrag. Len. 30%\tFrag. Len. 40%\tFrag. Len. 60%\tFrag. Len. 70%\tFrag. Len. 80%\tFrag. Len. 90%\tFrag. Len. 99%")
+ of.write("\tReads Sampled")
+ of.write("\tRead Len. Min.\tRead Len. 1st. Qu.\tRead Len. Mean\tRead Len. Median\tRead Len. 3rd Qu.\tRead Len. Max\tRead Len. Std.")
+ of.write("\tRead Med. Abs. Dev.\tRead Len. 10%\tRead Len. 20%\tRead Len. 30%\tRead Len. 40%\tRead Len. 60%\tRead Len. 70%\tRead Len. 80%\tRead Len. 90%\tRead Len. 99%\n")
+
+ for idx, bam in enumerate(args.bamfiles):
+ if args.samplesLabel and idx < len(args.samplesLabel):
+ of.write(args.samplesLabel[idx])
+ else:
+ of.write(bam)
+ if fragDict is not None and fragDict[bam] is not None:
+ d = fragDict[bam]
+ of.write("\t{}".format(d['sample_size']))
+ of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(d['min'],
+ d['qtile25'],
+ d['mean'],
+ d['median'],
+ d['qtile75'],
+ d['max'],
+ d['std']))
+ of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(d['mad'],
+ d['qtile10'],
+ d['qtile20'],
+ d['qtile30'],
+ d['qtile40'],
+ d['qtile60'],
+ d['qtile70'],
+ d['qtile80'],
+ d['qtile90'],
+ d['qtile99']))
+ else:
+ of.write("\t0")
+ of.write("\t0\t0\t0\t0\t0\t0\t0")
+ of.write("\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0")
+ d = readDict[bam]
+ of.write("\t{}".format(d['sample_size']))
+ of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(d['min'],
+ d['qtile25'],
+ d['mean'],
+ d['median'],
+ d['qtile75'],
+ d['max'],
+ d['std']))
+ of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(d['mad'],
+ d['qtile10'],
+ d['qtile20'],
+ d['qtile30'],
+ d['qtile40'],
+ d['qtile60'],
+ d['qtile70'],
+ d['qtile80'],
+ d['qtile90'],
+ d['qtile99']))
+ of.close()
+
+
+def main(args=None):
+ args = parse_arguments().parse_args(args)
+
+ if len(sys.argv) == 1:
+ parse_arguments().print_help()
+ sys.exit()
+
+ fraglengths = {}
+ readlengths = {}
+ of = None
+ if args.outRawFragmentLengths is not None:
+ of = open(args.outRawFragmentLengths, "w")
+ of.write("#bamPEFragmentSize\nSize\tOccurrences\tSample\n")
+ for idx, bam in enumerate(args.bamfiles):
+ f, r = getFragSize(bam, args, idx, of)
+ fraglengths[bam] = f
+ readlengths[bam] = r
+
+ if args.table is not None:
+ printTable(args, fraglengths, readlengths)
+
+ if args.histogram:
+ if args.samplesLabel:
+ if len(args.bamfiles) != len(args.samplesLabel):
+ sys.exit("The number of labels does not match the number of BAM files.")
+ else:
+ labels = args.samplesLabel
+ else:
+ labels = list(fraglengths.keys())
+
+ i = 0
+ data = []
+ for bam in fraglengths.keys():
+ d = fraglengths[bam]
+ if d is None:
+ d = readlengths[bam]
+ if args.maxFragmentLength > 0:
+ maxVal = args.maxFragmentLength
+ else:
+ maxVal = d['mean'] * 2
+
+ if args.plotFileFormat == 'plotly':
+ trace = go.Histogram(x=d['lengths'],
+ histnorm='probability',
+ opacity=0.5,
+ name=labels[i],
+ nbinsx=100,
+ xbins=dict(start=d['min'], end=maxVal))
+ data.append(trace)
+ else:
+ plt.bar(d['lengths'][1][:-1], height=d['lengths'][0],
+ width=d['lengths'][1][1:] - d['lengths'][1][:-1],
+ align='edge', log=args.logScale,
+ alpha=0.5, label=labels[i])
+ i += 1
+
+ if args.plotFileFormat == 'plotly':
+ fig = go.Figure()
+ fig.add_traces(data)
+ fig['layout']['yaxis1'].update(title='Frequency')
+ fig['layout']['xaxis1'].update(title='Fragment Length')
+ fig['layout'].update(title=args.plotTitle)
+ fig['layout'].update(showlegend=True)
+ if args.logScale:
+ fig['layout']['yaxis1'].update(type='log')
+ py.plot(fig, filename=args.histogram, auto_open=False)
+ else:
+ plt.xlabel('Fragment Length')
+ plt.ylabel('Frequency')
+ plt.legend(loc='upper right')
+ plt.title(args.plotTitle)
+ plt.savefig(args.histogram, bbox_inches=0, format=args.plotFileFormat)
+ plt.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deepTools/source/deeptools/bigwigAverage.py b/deepTools/source/deeptools/bigwigAverage.py
new file mode 100644
index 0000000000000000000000000000000000000000..5228ddf08f4d1c5a8d51796cfe2c4a9cfff92cd8
--- /dev/null
+++ b/deepTools/source/deeptools/bigwigAverage.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+import sys
+import numpy as np
+from deeptools import parserCommon
+from deeptools import writeBedGraph_bam_and_bw
+
+debug = 0
+
+
+def parse_arguments(args=None):
+ parentParser = parserCommon.getParentArgParse()
+ outputParser = parserCommon.output()
+ parser = argparse.ArgumentParser(
+ parents=[parentParser, outputParser],
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='This tool average multiple bigWig files based on the number '
+ 'of mapped reads. To average the bigWig files, the genome is '
+ 'partitioned into bins of equal size, then the scores '
+ 'in each bigwig file are computed per bin.'
+ 'These scores are averaged and scaleFactors can be applied before the average.',
+ usage='bigwigAverage -b sample1.bw sample2.bw -o outfile.bw\n'
+ 'help: bigwigAverage -h / bigwigAverage --help')
+
+ # define the arguments
+ parser.add_argument('--bigwigs', '-b',
+ metavar='Bigwig files',
+ help='Bigwig files separated by space.',
+ nargs='+',
+ required=True)
+
+ parser.add_argument('--scaleFactors',
+ help='Set this parameter to multipy the bigwig values '
+ 'by a constant. The format is '
+ 'scaleFactor1:scaleFactor2:scaleFactor3 etc. '
+ 'For example 0.7:1 to scale the first bigwig file '
+ 'by 0.7 while not scaling the second bigwig file',
+ default=None,
+ required=False)
+
+ parser.add_argument('--skipNonCoveredRegions', '--skipNAs',
+ help='This parameter determines if non-covered regions (regions without a score) '
+ 'in the bigWig files should be skipped. The default is to treat those '
+ 'regions as having a value of zero. '
+ 'The decision to skip non-covered regions '
+ 'depends on the interpretation of the data. Non-covered regions '
+ 'in a bigWig file may represent repetitive regions that should '
+ 'be skipped. Alternatively, the interpretation of non-covered regions as '
+ 'zeros may be wrong and this option should be used ',
+ action='store_true')
+
+ return parser
+
+
+def getType(fname):
+ """
+ Tries to determine if a file is a wiggle, a bedgraph, or a bigWig file.
+ """
+ if fname.endswith(".wig") or fname.endswith(".wiggle"):
+ return "wiggle"
+ elif fname.lower().endswith(".bedgraph") or fname.endswith(".bdg"):
+ return "bedgraph"
+ else:
+ return "bigwig"
+
+
+def average(tileCoverage, args):
+ r"""
+ The mapreduce method calls this function
+ for each tile. The parameters (args) are fixed
+ in the main method.
+
+ >>> funcArgs= {'scaleFactors': (1,1)}
+ >>> average([1, 2], funcArgs)
+ 1.5
+ >>> funcArgs= {'scaleFactors': (1,0.5)}
+ >>> average([1, 2], funcArgs)
+ 1.0
+ >>> funcArgs= {'scaleFactors': (1,0.5,0.1,0.2)}
+ >>> average([1, 2, 3, 12], funcArgs)
+ 1.175
+ >>> average([1, 2, 3, np.nan], funcArgs)
+ nan
+ """
+
+ norm_values = [args['scaleFactors'][i] * cov for i, cov in enumerate(tileCoverage)]
+
+ return np.mean(norm_values)
+
+
+def main(args=None):
+ args = parse_arguments().parse_args(args)
+ if len(sys.argv) == 1:
+ parse_arguments().print_help()
+ sys.exit()
+
+ nFiles = len(args.bigwigs)
+
+ if args.scaleFactors:
+ scaleFactors = [float(x) for x in args.scaleFactors.split(":")]
+ if len(scaleFactors) == 1:
+ scaleFactors = scaleFactors * nFiles
+ elif len(scaleFactors) != nFiles:
+ raise argparse.ArgumentTypeError(
+ "Format of scaleFactors is factor or factor1:factor2... as many as bigwig files. "
+ "There are {} bigwigs and {} factors."
+ "The value given ( {} ) is not valid".format(nFiles, len(scaleFactors), args.scaleFactors))
+ else:
+ scaleFactors = [1] * nFiles
+
+ # the average function is called and receives
+ # the function_args per each tile that is considered
+ FUNC = average
+ function_args = {'scaleFactors': scaleFactors}
+
+ writeBedGraph_bam_and_bw.writeBedGraph(
+ [(b, getType(b)) for b in args.bigwigs],
+ args.outFileName, 0, FUNC,
+ function_args, tileSize=args.binSize, region=args.region,
+ blackListFileName=args.blackListFileName,
+ verbose=args.verbose,
+ numberOfProcessors=args.numberOfProcessors,
+ skipZeroOverZero=False,
+ format=args.outFileFormat,
+ smoothLength=False,
+ missingDataAsZero=not args.skipNonCoveredRegions,
+ extendPairedEnds=False)
diff --git a/deepTools/source/deeptools/bigwigCompare.py b/deepTools/source/deeptools/bigwigCompare.py
new file mode 100644
index 0000000000000000000000000000000000000000..4662b2b36fdf6a5f6882817dd31b7eb23e0c2788
--- /dev/null
+++ b/deepTools/source/deeptools/bigwigCompare.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+from deeptools import parserCommon
+from deeptools.getRatio import getRatio
+from deeptools import writeBedGraph_bam_and_bw
+
+debug = 0
+
+
+def parse_arguments(args=None):
+ parentParser = parserCommon.getParentArgParse()
+ outputParser = parserCommon.output()
+ parser = argparse.ArgumentParser(
+ parents=[parentParser, outputParser],
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='This tool compares two bigWig files based on the number '
+ 'of mapped reads. To compare the bigWig files, the genome is '
+ 'partitioned into bins of equal size, then the number of reads found '
+ 'in each BAM file are counted per bin and finally a summary '
+        'value is reported. This value can be the ratio of the number of reads '
+ 'per bin, the log2 of the ratio, the sum or the difference.',
+ usage='bigwigCompare -b1 sample1.bw -b2 sample2.bw -o log2.bw\n'
+ 'help: bigwigCompare -h / bigwigCompare --help')
+
+ # define the arguments
+ parser.add_argument('--bigwig1', '-b1',
+ metavar='Bigwig file',
+ help='Bigwig file 1. Usually the file for the '
+ 'treatment.',
+ required=True)
+
+ parser.add_argument('--bigwig2', '-b2',
+ metavar='Bigwig file',
+ help='Bigwig file 2. Usually the file for the '
+ 'control.',
+ required=True)
+
+ parser.add_argument('--scaleFactors',
+                        help='Set this parameter to multiply the bigwig values '
+ 'by a constant. The format is '
+ 'scaleFactor1:scaleFactor2. '
+ 'For example 0.7:1 to scale the first bigwig file '
+ 'by 0.7 while not scaling the second bigwig file',
+ default=None,
+ required=False)
+
+ parser.add_argument('--pseudocount',
+ help='A small number to avoid x/0. Only useful '
+ 'together with --operation log2 or --operation ratio. '
+ 'You can specify different values as pseudocounts for '
+ 'the numerator and the denominator by providing two '
+ 'values (the first value is used as the numerator '
+ 'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
+ default=1,
+ nargs='+',
+ action=parserCommon.requiredLength(1, 2),
+ type=float,
+ required=False)
+
+ parser.add_argument('--skipZeroOverZero',
+ help='Skip bins where BOTH BAM files lack coverage. '
+ 'This is determined BEFORE any applicable pseudocount '
+ 'is added.',
+ action='store_true')
+
+ parser.add_argument('--operation',
+ help='The default is to output the log2ratio of the '
+ 'two samples. The reciprocal ratio returns the '
+                        'negative of the inverse of the ratio '
+ 'if the ratio is less than 0. The resulting '
+ 'values are interpreted as negative fold changes. '
+ 'Instead of performing a '
+ 'computation using both files, the scaled signal can '
+ 'alternatively be output for the first or second file using '
+ 'the \'--operation first\' or \'--operation second\' (Default: %(default)s)',
+ default='log2',
+ choices=['log2', 'ratio', 'subtract', 'add', 'mean',
+ 'reciprocal_ratio', 'first', 'second'],
+ required=False)
+
+ parser.add_argument('--skipNonCoveredRegions', '--skipNAs',
+ help='This parameter determines if non-covered regions (regions without a score) '
+ 'in the bigWig files should be skipped. The default is to treat those '
+ 'regions as having a value of zero. '
+ 'The decision to skip non-covered regions '
+ 'depends on the interpretation of the data. Non-covered regions '
+ 'in a bigWig file may represent repetitive regions that should '
+ 'be skipped. Alternatively, the interpretation of non-covered regions as '
+ 'zeros may be wrong and this option should be used ',
+ action='store_true')
+
+ parser.add_argument('--fixedStep',
+ help='Write out all bins (of size --binSize) '
+ 'instead of merging neighbouring bins with equal values.',
+ action='store_true')
+ return parser
+
+
+def getType(fname):
+ """
+ Tries to determine if a file is a wiggle, a bedgraph or a bigWig.
+ """
+ if fname.endswith(".wig") or fname.endswith(".wiggle"):
+ return "wiggle"
+ elif fname.endswith(".bedgraph"):
+ return "bedgraph"
+ else:
+ return "bigwig"
+
+
+def main(args=None):
+ args = parse_arguments().parse_args(args)
+
+ if args.scaleFactors:
+ scaleFactors = [float(x) for x in args.scaleFactors.split(":")]
+ else:
+ scaleFactors = [1, 1]
+
+ if not isinstance(args.pseudocount, list):
+ args.pseudocount = [args.pseudocount]
+
+ if len(args.pseudocount) == 1:
+ args.pseudocount *= 2
+
+ # the getRatio function is called and receives
+ # the function_args per each tile that is considered
+ FUNC = getRatio
+ function_args = {'valueType': args.operation,
+ 'scaleFactors': scaleFactors,
+ 'pseudocount': args.pseudocount}
+
+ writeBedGraph_bam_and_bw.writeBedGraph(
+ [(args.bigwig1, getType(args.bigwig1)),
+ (args.bigwig2, getType(args.bigwig2))],
+ args.outFileName, 0, FUNC,
+ function_args, tileSize=args.binSize, region=args.region,
+ blackListFileName=args.blackListFileName,
+ verbose=args.verbose,
+ numberOfProcessors=args.numberOfProcessors,
+ skipZeroOverZero=args.skipZeroOverZero,
+ format=args.outFileFormat,
+ smoothLength=False,
+ missingDataAsZero=not args.skipNonCoveredRegions,
+ extendPairedEnds=False,
+ fixedStep=args.fixedStep)
diff --git a/deepTools/source/deeptools/cm.py b/deepTools/source/deeptools/cm.py
new file mode 100644
index 0000000000000000000000000000000000000000..47bcf16285a5c4d7f01ef9daabf4fd9b8ca6f344
--- /dev/null
+++ b/deepTools/source/deeptools/cm.py
@@ -0,0 +1,1088 @@
+#!/usr/bin/env python
+
+# This file comes from the seaborn project and is under a BSD license:
+
+# Copyright (c) 2012-2019, Michael L. Waskom
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of the project nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from matplotlib import colors, colormaps as mpl_cm
+
+
+_rocket_lut = [
+ [0.01060815, 0.01808215, 0.10018654],
+ [0.01428972, 0.02048237, 0.10374486],
+ [0.01831941, 0.0229766, 0.10738511],
+ [0.02275049, 0.02554464, 0.11108639],
+ [0.02759119, 0.02818316, 0.11483751],
+ [0.03285175, 0.03088792, 0.11863035],
+ [0.03853466, 0.03365771, 0.12245873],
+ [0.04447016, 0.03648425, 0.12631831],
+ [0.05032105, 0.03936808, 0.13020508],
+ [0.05611171, 0.04224835, 0.13411624],
+ [0.0618531, 0.04504866, 0.13804929],
+ [0.06755457, 0.04778179, 0.14200206],
+ [0.0732236, 0.05045047, 0.14597263],
+ [0.0788708, 0.05305461, 0.14995981],
+ [0.08450105, 0.05559631, 0.15396203],
+ [0.09011319, 0.05808059, 0.15797687],
+ [0.09572396, 0.06050127, 0.16200507],
+ [0.10132312, 0.06286782, 0.16604287],
+ [0.10692823, 0.06517224, 0.17009175],
+ [0.1125315, 0.06742194, 0.17414848],
+ [0.11813947, 0.06961499, 0.17821272],
+ [0.12375803, 0.07174938, 0.18228425],
+ [0.12938228, 0.07383015, 0.18636053],
+ [0.13501631, 0.07585609, 0.19044109],
+ [0.14066867, 0.0778224, 0.19452676],
+ [0.14633406, 0.07973393, 0.1986151],
+ [0.15201338, 0.08159108, 0.20270523],
+ [0.15770877, 0.08339312, 0.20679668],
+ [0.16342174, 0.0851396, 0.21088893],
+ [0.16915387, 0.08682996, 0.21498104],
+ [0.17489524, 0.08848235, 0.2190294],
+ [0.18065495, 0.09009031, 0.22303512],
+ [0.18643324, 0.09165431, 0.22699705],
+ [0.19223028, 0.09317479, 0.23091409],
+ [0.19804623, 0.09465217, 0.23478512],
+ [0.20388117, 0.09608689, 0.23860907],
+ [0.20973515, 0.09747934, 0.24238489],
+ [0.21560818, 0.09882993, 0.24611154],
+ [0.22150014, 0.10013944, 0.2497868],
+ [0.22741085, 0.10140876, 0.25340813],
+ [0.23334047, 0.10263737, 0.25697736],
+ [0.23928891, 0.10382562, 0.2604936],
+ [0.24525608, 0.10497384, 0.26395596],
+ [0.25124182, 0.10608236, 0.26736359],
+ [0.25724602, 0.10715148, 0.27071569],
+ [0.26326851, 0.1081815, 0.27401148],
+ [0.26930915, 0.1091727, 0.2772502],
+ [0.27536766, 0.11012568, 0.28043021],
+ [0.28144375, 0.11104133, 0.2835489],
+ [0.2875374, 0.11191896, 0.28660853],
+ [0.29364846, 0.11275876, 0.2896085],
+ [0.29977678, 0.11356089, 0.29254823],
+ [0.30592213, 0.11432553, 0.29542718],
+ [0.31208435, 0.11505284, 0.29824485],
+ [0.31826327, 0.1157429, 0.30100076],
+ [0.32445869, 0.11639585, 0.30369448],
+ [0.33067031, 0.11701189, 0.30632563],
+ [0.33689808, 0.11759095, 0.3088938],
+ [0.34314168, 0.11813362, 0.31139721],
+ [0.34940101, 0.11863987, 0.3138355],
+ [0.355676, 0.11910909, 0.31620996],
+ [0.36196644, 0.1195413, 0.31852037],
+ [0.36827206, 0.11993653, 0.32076656],
+ [0.37459292, 0.12029443, 0.32294825],
+ [0.38092887, 0.12061482, 0.32506528],
+ [0.38727975, 0.12089756, 0.3271175],
+ [0.39364518, 0.12114272, 0.32910494],
+ [0.40002537, 0.12134964, 0.33102734],
+ [0.40642019, 0.12151801, 0.33288464],
+ [0.41282936, 0.12164769, 0.33467689],
+ [0.41925278, 0.12173833, 0.33640407],
+ [0.42569057, 0.12178916, 0.33806605],
+ [0.43214263, 0.12179973, 0.33966284],
+ [0.43860848, 0.12177004, 0.34119475],
+ [0.44508855, 0.12169883, 0.34266151],
+ [0.45158266, 0.12158557, 0.34406324],
+ [0.45809049, 0.12142996, 0.34540024],
+ [0.46461238, 0.12123063, 0.34667231],
+ [0.47114798, 0.12098721, 0.34787978],
+ [0.47769736, 0.12069864, 0.34902273],
+ [0.48426077, 0.12036349, 0.35010104],
+ [0.49083761, 0.11998161, 0.35111537],
+ [0.49742847, 0.11955087, 0.35206533],
+ [0.50403286, 0.11907081, 0.35295152],
+ [0.51065109, 0.11853959, 0.35377385],
+ [0.51728314, 0.1179558, 0.35453252],
+ [0.52392883, 0.11731817, 0.35522789],
+ [0.53058853, 0.11662445, 0.35585982],
+ [0.53726173, 0.11587369, 0.35642903],
+ [0.54394898, 0.11506307, 0.35693521],
+ [0.5506426, 0.11420757, 0.35737863],
+ [0.55734473, 0.11330456, 0.35775059],
+ [0.56405586, 0.11235265, 0.35804813],
+ [0.57077365, 0.11135597, 0.35827146],
+ [0.5774991, 0.11031233, 0.35841679],
+ [0.58422945, 0.10922707, 0.35848469],
+ [0.59096382, 0.10810205, 0.35847347],
+ [0.59770215, 0.10693774, 0.35838029],
+ [0.60444226, 0.10573912, 0.35820487],
+ [0.61118304, 0.10450943, 0.35794557],
+ [0.61792306, 0.10325288, 0.35760108],
+ [0.62466162, 0.10197244, 0.35716891],
+ [0.63139686, 0.10067417, 0.35664819],
+ [0.63812122, 0.09938212, 0.35603757],
+ [0.64483795, 0.0980891, 0.35533555],
+ [0.65154562, 0.09680192, 0.35454107],
+ [0.65824241, 0.09552918, 0.3536529],
+ [0.66492652, 0.09428017, 0.3526697],
+ [0.67159578, 0.09306598, 0.35159077],
+ [0.67824099, 0.09192342, 0.3504148],
+ [0.684863, 0.09085633, 0.34914061],
+ [0.69146268, 0.0898675, 0.34776864],
+ [0.69803757, 0.08897226, 0.3462986],
+ [0.70457834, 0.0882129, 0.34473046],
+ [0.71108138, 0.08761223, 0.3430635],
+ [0.7175507, 0.08716212, 0.34129974],
+ [0.72398193, 0.08688725, 0.33943958],
+ [0.73035829, 0.0868623, 0.33748452],
+ [0.73669146, 0.08704683, 0.33543669],
+ [0.74297501, 0.08747196, 0.33329799],
+ [0.74919318, 0.08820542, 0.33107204],
+ [0.75535825, 0.08919792, 0.32876184],
+ [0.76145589, 0.09050716, 0.32637117],
+ [0.76748424, 0.09213602, 0.32390525],
+ [0.77344838, 0.09405684, 0.32136808],
+ [0.77932641, 0.09634794, 0.31876642],
+ [0.78513609, 0.09892473, 0.31610488],
+ [0.79085854, 0.10184672, 0.313391],
+ [0.7965014, 0.10506637, 0.31063031],
+ [0.80205987, 0.10858333, 0.30783],
+ [0.80752799, 0.11239964, 0.30499738],
+ [0.81291606, 0.11645784, 0.30213802],
+ [0.81820481, 0.12080606, 0.29926105],
+ [0.82341472, 0.12535343, 0.2963705],
+ [0.82852822, 0.13014118, 0.29347474],
+ [0.83355779, 0.13511035, 0.29057852],
+ [0.83850183, 0.14025098, 0.2876878],
+ [0.84335441, 0.14556683, 0.28480819],
+ [0.84813096, 0.15099892, 0.281943],
+ [0.85281737, 0.15657772, 0.27909826],
+ [0.85742602, 0.1622583, 0.27627462],
+ [0.86196552, 0.16801239, 0.27346473],
+ [0.86641628, 0.17387796, 0.27070818],
+ [0.87079129, 0.17982114, 0.26797378],
+ [0.87507281, 0.18587368, 0.26529697],
+ [0.87925878, 0.19203259, 0.26268136],
+ [0.8833417, 0.19830556, 0.26014181],
+ [0.88731387, 0.20469941, 0.25769539],
+ [0.89116859, 0.21121788, 0.2553592],
+ [0.89490337, 0.21785614, 0.25314362],
+ [0.8985026, 0.22463251, 0.25108745],
+ [0.90197527, 0.23152063, 0.24918223],
+ [0.90530097, 0.23854541, 0.24748098],
+ [0.90848638, 0.24568473, 0.24598324],
+ [0.911533, 0.25292623, 0.24470258],
+ [0.9144225, 0.26028902, 0.24369359],
+ [0.91717106, 0.26773821, 0.24294137],
+ [0.91978131, 0.27526191, 0.24245973],
+ [0.92223947, 0.28287251, 0.24229568],
+ [0.92456587, 0.29053388, 0.24242622],
+ [0.92676657, 0.29823282, 0.24285536],
+ [0.92882964, 0.30598085, 0.24362274],
+ [0.93078135, 0.31373977, 0.24468803],
+ [0.93262051, 0.3215093, 0.24606461],
+ [0.93435067, 0.32928362, 0.24775328],
+ [0.93599076, 0.33703942, 0.24972157],
+ [0.93752831, 0.34479177, 0.25199928],
+ [0.93899289, 0.35250734, 0.25452808],
+ [0.94036561, 0.36020899, 0.25734661],
+ [0.94167588, 0.36786594, 0.2603949],
+ [0.94291042, 0.37549479, 0.26369821],
+ [0.94408513, 0.3830811, 0.26722004],
+ [0.94520419, 0.39062329, 0.27094924],
+ [0.94625977, 0.39813168, 0.27489742],
+ [0.94727016, 0.4055909, 0.27902322],
+ [0.94823505, 0.41300424, 0.28332283],
+ [0.94914549, 0.42038251, 0.28780969],
+ [0.95001704, 0.42771398, 0.29244728],
+ [0.95085121, 0.43500005, 0.29722817],
+ [0.95165009, 0.44224144, 0.30214494],
+ [0.9524044, 0.44944853, 0.3072105],
+ [0.95312556, 0.45661389, 0.31239776],
+ [0.95381595, 0.46373781, 0.31769923],
+ [0.95447591, 0.47082238, 0.32310953],
+ [0.95510255, 0.47787236, 0.32862553],
+ [0.95569679, 0.48489115, 0.33421404],
+ [0.95626788, 0.49187351, 0.33985601],
+ [0.95681685, 0.49882008, 0.34555431],
+ [0.9573439, 0.50573243, 0.35130912],
+ [0.95784842, 0.51261283, 0.35711942],
+ [0.95833051, 0.51946267, 0.36298589],
+ [0.95879054, 0.52628305, 0.36890904],
+ [0.95922872, 0.53307513, 0.3748895],
+ [0.95964538, 0.53983991, 0.38092784],
+ [0.96004345, 0.54657593, 0.3870292],
+ [0.96042097, 0.55328624, 0.39319057],
+ [0.96077819, 0.55997184, 0.39941173],
+ [0.9611152, 0.5666337, 0.40569343],
+ [0.96143273, 0.57327231, 0.41203603],
+ [0.96173392, 0.57988594, 0.41844491],
+ [0.96201757, 0.58647675, 0.42491751],
+ [0.96228344, 0.59304598, 0.43145271],
+ [0.96253168, 0.5995944, 0.43805131],
+ [0.96276513, 0.60612062, 0.44471698],
+ [0.96298491, 0.6126247, 0.45145074],
+ [0.96318967, 0.61910879, 0.45824902],
+ [0.96337949, 0.6255736, 0.46511271],
+ [0.96355923, 0.63201624, 0.47204746],
+ [0.96372785, 0.63843852, 0.47905028],
+ [0.96388426, 0.64484214, 0.4861196],
+ [0.96403203, 0.65122535, 0.4932578],
+ [0.96417332, 0.65758729, 0.50046894],
+ [0.9643063, 0.66393045, 0.5077467],
+ [0.96443322, 0.67025402, 0.51509334],
+ [0.96455845, 0.67655564, 0.52251447],
+ [0.96467922, 0.68283846, 0.53000231],
+ [0.96479861, 0.68910113, 0.53756026],
+ [0.96492035, 0.69534192, 0.5451917],
+ [0.96504223, 0.7015636, 0.5528892],
+ [0.96516917, 0.70776351, 0.5606593],
+ [0.96530224, 0.71394212, 0.56849894],
+ [0.96544032, 0.72010124, 0.57640375],
+ [0.96559206, 0.72623592, 0.58438387],
+ [0.96575293, 0.73235058, 0.59242739],
+ [0.96592829, 0.73844258, 0.60053991],
+ [0.96612013, 0.74451182, 0.60871954],
+ [0.96632832, 0.75055966, 0.61696136],
+ [0.96656022, 0.75658231, 0.62527295],
+ [0.96681185, 0.76258381, 0.63364277],
+ [0.96709183, 0.76855969, 0.64207921],
+ [0.96739773, 0.77451297, 0.65057302],
+ [0.96773482, 0.78044149, 0.65912731],
+ [0.96810471, 0.78634563, 0.66773889],
+ [0.96850919, 0.79222565, 0.6764046],
+ [0.96893132, 0.79809112, 0.68512266],
+ [0.96935926, 0.80395415, 0.69383201],
+ [0.9698028, 0.80981139, 0.70252255],
+ [0.97025511, 0.81566605, 0.71120296],
+ [0.97071849, 0.82151775, 0.71987163],
+ [0.97120159, 0.82736371, 0.72851999],
+ [0.97169389, 0.83320847, 0.73716071],
+ [0.97220061, 0.83905052, 0.74578903],
+ [0.97272597, 0.84488881, 0.75440141],
+ [0.97327085, 0.85072354, 0.76299805],
+ [0.97383206, 0.85655639, 0.77158353],
+ [0.97441222, 0.86238689, 0.78015619],
+ [0.97501782, 0.86821321, 0.78871034],
+ [0.97564391, 0.87403763, 0.79725261],
+ [0.97628674, 0.87986189, 0.8057883],
+ [0.97696114, 0.88568129, 0.81430324],
+ [0.97765722, 0.89149971, 0.82280948],
+ [0.97837585, 0.89731727, 0.83130786],
+ [0.97912374, 0.90313207, 0.83979337],
+ [0.979891, 0.90894778, 0.84827858],
+ [0.98067764, 0.91476465, 0.85676611],
+ [0.98137749, 0.92061729, 0.86536915]
+]
+
+
+_mako_lut = [
+ [0.04503935, 0.01482344, 0.02092227],
+ [0.04933018, 0.01709292, 0.02535719],
+ [0.05356262, 0.01950702, 0.03018802],
+ [0.05774337, 0.02205989, 0.03545515],
+ [0.06188095, 0.02474764, 0.04115287],
+ [0.06598247, 0.0275665, 0.04691409],
+ [0.07005374, 0.03051278, 0.05264306],
+ [0.07409947, 0.03358324, 0.05834631],
+ [0.07812339, 0.03677446, 0.06403249],
+ [0.08212852, 0.0400833, 0.06970862],
+ [0.08611731, 0.04339148, 0.07538208],
+ [0.09009161, 0.04664706, 0.08105568],
+ [0.09405308, 0.04985685, 0.08673591],
+ [0.09800301, 0.05302279, 0.09242646],
+ [0.10194255, 0.05614641, 0.09813162],
+ [0.10587261, 0.05922941, 0.103854],
+ [0.1097942, 0.06227277, 0.10959847],
+ [0.11370826, 0.06527747, 0.11536893],
+ [0.11761516, 0.06824548, 0.12116393],
+ [0.12151575, 0.07117741, 0.12698763],
+ [0.12541095, 0.07407363, 0.1328442],
+ [0.12930083, 0.07693611, 0.13873064],
+ [0.13317849, 0.07976988, 0.14465095],
+ [0.13701138, 0.08259683, 0.15060265],
+ [0.14079223, 0.08542126, 0.15659379],
+ [0.14452486, 0.08824175, 0.16262484],
+ [0.14820351, 0.09106304, 0.16869476],
+ [0.15183185, 0.09388372, 0.17480366],
+ [0.15540398, 0.09670855, 0.18094993],
+ [0.15892417, 0.09953561, 0.18713384],
+ [0.16238588, 0.10236998, 0.19335329],
+ [0.16579435, 0.10520905, 0.19960847],
+ [0.16914226, 0.10805832, 0.20589698],
+ [0.17243586, 0.11091443, 0.21221911],
+ [0.17566717, 0.11378321, 0.21857219],
+ [0.17884322, 0.11666074, 0.2249565],
+ [0.18195582, 0.11955283, 0.23136943],
+ [0.18501213, 0.12245547, 0.23781116],
+ [0.18800459, 0.12537395, 0.24427914],
+ [0.19093944, 0.1283047, 0.25077369],
+ [0.19381092, 0.13125179, 0.25729255],
+ [0.19662307, 0.13421303, 0.26383543],
+ [0.19937337, 0.13719028, 0.27040111],
+ [0.20206187, 0.14018372, 0.27698891],
+ [0.20469116, 0.14319196, 0.28359861],
+ [0.20725547, 0.14621882, 0.29022775],
+ [0.20976258, 0.14925954, 0.29687795],
+ [0.21220409, 0.15231929, 0.30354703],
+ [0.21458611, 0.15539445, 0.31023563],
+ [0.21690827, 0.15848519, 0.31694355],
+ [0.21916481, 0.16159489, 0.32366939],
+ [0.2213631, 0.16471913, 0.33041431],
+ [0.22349947, 0.1678599, 0.33717781],
+ [0.2255714, 0.1710185, 0.34395925],
+ [0.22758415, 0.17419169, 0.35075983],
+ [0.22953569, 0.17738041, 0.35757941],
+ [0.23142077, 0.18058733, 0.3644173],
+ [0.2332454, 0.18380872, 0.37127514],
+ [0.2350092, 0.18704459, 0.3781528],
+ [0.23670785, 0.190297, 0.38504973],
+ [0.23834119, 0.19356547, 0.39196711],
+ [0.23991189, 0.19684817, 0.39890581],
+ [0.24141903, 0.20014508, 0.4058667],
+ [0.24286214, 0.20345642, 0.4128484],
+ [0.24423453, 0.20678459, 0.41985299],
+ [0.24554109, 0.21012669, 0.42688124],
+ [0.2467815, 0.21348266, 0.43393244],
+ [0.24795393, 0.21685249, 0.4410088],
+ [0.24905614, 0.22023618, 0.448113],
+ [0.25007383, 0.22365053, 0.45519562],
+ [0.25098926, 0.22710664, 0.46223892],
+ [0.25179696, 0.23060342, 0.46925447],
+ [0.25249346, 0.23414353, 0.47623196],
+ [0.25307401, 0.23772973, 0.48316271],
+ [0.25353152, 0.24136961, 0.49001976],
+ [0.25386167, 0.24506548, 0.49679407],
+ [0.25406082, 0.2488164, 0.50348932],
+ [0.25412435, 0.25262843, 0.51007843],
+ [0.25404842, 0.25650743, 0.51653282],
+ [0.25383134, 0.26044852, 0.52286845],
+ [0.2534705, 0.26446165, 0.52903422],
+ [0.25296722, 0.2685428, 0.53503572],
+ [0.2523226, 0.27269346, 0.54085315],
+ [0.25153974, 0.27691629, 0.54645752],
+ [0.25062402, 0.28120467, 0.55185939],
+ [0.24958205, 0.28556371, 0.55701246],
+ [0.24842386, 0.28998148, 0.56194601],
+ [0.24715928, 0.29446327, 0.56660884],
+ [0.24580099, 0.29899398, 0.57104399],
+ [0.24436202, 0.30357852, 0.57519929],
+ [0.24285591, 0.30819938, 0.57913247],
+ [0.24129828, 0.31286235, 0.58278615],
+ [0.23970131, 0.3175495, 0.5862272],
+ [0.23807973, 0.32226344, 0.58941872],
+ [0.23644557, 0.32699241, 0.59240198],
+ [0.2348113, 0.33173196, 0.59518282],
+ [0.23318874, 0.33648036, 0.59775543],
+ [0.2315855, 0.34122763, 0.60016456],
+ [0.23001121, 0.34597357, 0.60240251],
+ [0.2284748, 0.35071512, 0.6044784],
+ [0.22698081, 0.35544612, 0.60642528],
+ [0.22553305, 0.36016515, 0.60825252],
+ [0.22413977, 0.36487341, 0.60994938],
+ [0.22280246, 0.36956728, 0.61154118],
+ [0.22152555, 0.37424409, 0.61304472],
+ [0.22030752, 0.37890437, 0.61446646],
+ [0.2191538, 0.38354668, 0.61581561],
+ [0.21806257, 0.38817169, 0.61709794],
+ [0.21703799, 0.39277882, 0.61831922],
+ [0.21607792, 0.39736958, 0.61948028],
+ [0.21518463, 0.40194196, 0.62059763],
+ [0.21435467, 0.40649717, 0.62167507],
+ [0.21358663, 0.41103579, 0.62271724],
+ [0.21288172, 0.41555771, 0.62373011],
+ [0.21223835, 0.42006355, 0.62471794],
+ [0.21165312, 0.42455441, 0.62568371],
+ [0.21112526, 0.42903064, 0.6266318],
+ [0.21065161, 0.43349321, 0.62756504],
+ [0.21023306, 0.43794288, 0.62848279],
+ [0.20985996, 0.44238227, 0.62938329],
+ [0.20951045, 0.44680966, 0.63030696],
+ [0.20916709, 0.45122981, 0.63124483],
+ [0.20882976, 0.45564335, 0.63219599],
+ [0.20849798, 0.46005094, 0.63315928],
+ [0.20817199, 0.46445309, 0.63413391],
+ [0.20785149, 0.46885041, 0.63511876],
+ [0.20753716, 0.47324327, 0.63611321],
+ [0.20722876, 0.47763224, 0.63711608],
+ [0.20692679, 0.48201774, 0.63812656],
+ [0.20663156, 0.48640018, 0.63914367],
+ [0.20634336, 0.49078002, 0.64016638],
+ [0.20606303, 0.49515755, 0.6411939],
+ [0.20578999, 0.49953341, 0.64222457],
+ [0.20552612, 0.50390766, 0.64325811],
+ [0.20527189, 0.50828072, 0.64429331],
+ [0.20502868, 0.51265277, 0.64532947],
+ [0.20479718, 0.51702417, 0.64636539],
+ [0.20457804, 0.52139527, 0.64739979],
+ [0.20437304, 0.52576622, 0.64843198],
+ [0.20418396, 0.53013715, 0.64946117],
+ [0.20401238, 0.53450825, 0.65048638],
+ [0.20385896, 0.53887991, 0.65150606],
+ [0.20372653, 0.54325208, 0.65251978],
+ [0.20361709, 0.5476249, 0.6535266],
+ [0.20353258, 0.55199854, 0.65452542],
+ [0.20347472, 0.55637318, 0.655515],
+ [0.20344718, 0.56074869, 0.65649508],
+ [0.20345161, 0.56512531, 0.65746419],
+ [0.20349089, 0.56950304, 0.65842151],
+ [0.20356842, 0.57388184, 0.65936642],
+ [0.20368663, 0.57826181, 0.66029768],
+ [0.20384884, 0.58264293, 0.6612145],
+ [0.20405904, 0.58702506, 0.66211645],
+ [0.20431921, 0.59140842, 0.66300179],
+ [0.20463464, 0.59579264, 0.66387079],
+ [0.20500731, 0.60017798, 0.66472159],
+ [0.20544449, 0.60456387, 0.66555409],
+ [0.20596097, 0.60894927, 0.66636568],
+ [0.20654832, 0.61333521, 0.66715744],
+ [0.20721003, 0.61772167, 0.66792838],
+ [0.20795035, 0.62210845, 0.66867802],
+ [0.20877302, 0.62649546, 0.66940555],
+ [0.20968223, 0.63088252, 0.6701105],
+ [0.21068163, 0.63526951, 0.67079211],
+ [0.21177544, 0.63965621, 0.67145005],
+ [0.21298582, 0.64404072, 0.67208182],
+ [0.21430361, 0.64842404, 0.67268861],
+ [0.21572716, 0.65280655, 0.67326978],
+ [0.21726052, 0.65718791, 0.6738255],
+ [0.21890636, 0.66156803, 0.67435491],
+ [0.220668, 0.66594665, 0.67485792],
+ [0.22255447, 0.67032297, 0.67533374],
+ [0.22458372, 0.67469531, 0.67578061],
+ [0.22673713, 0.67906542, 0.67620044],
+ [0.22901625, 0.6834332, 0.67659251],
+ [0.23142316, 0.68779836, 0.67695703],
+ [0.23395924, 0.69216072, 0.67729378],
+ [0.23663857, 0.69651881, 0.67760151],
+ [0.23946645, 0.70087194, 0.67788018],
+ [0.24242624, 0.70522162, 0.67813088],
+ [0.24549008, 0.70957083, 0.67835215],
+ [0.24863372, 0.71392166, 0.67854868],
+ [0.25187832, 0.71827158, 0.67872193],
+ [0.25524083, 0.72261873, 0.67887024],
+ [0.25870947, 0.72696469, 0.67898912],
+ [0.26229238, 0.73130855, 0.67907645],
+ [0.26604085, 0.73564353, 0.67914062],
+ [0.26993099, 0.73997282, 0.67917264],
+ [0.27397488, 0.74429484, 0.67917096],
+ [0.27822463, 0.74860229, 0.67914468],
+ [0.28264201, 0.75290034, 0.67907959],
+ [0.2873016, 0.75717817, 0.67899164],
+ [0.29215894, 0.76144162, 0.67886578],
+ [0.29729823, 0.76567816, 0.67871894],
+ [0.30268199, 0.76989232, 0.67853896],
+ [0.30835665, 0.77407636, 0.67833512],
+ [0.31435139, 0.77822478, 0.67811118],
+ [0.3206671, 0.78233575, 0.67786729],
+ [0.32733158, 0.78640315, 0.67761027],
+ [0.33437168, 0.79042043, 0.67734882],
+ [0.34182112, 0.79437948, 0.67709394],
+ [0.34968889, 0.79827511, 0.67685638],
+ [0.35799244, 0.80210037, 0.67664969],
+ [0.36675371, 0.80584651, 0.67649539],
+ [0.3759816, 0.80950627, 0.67641393],
+ [0.38566792, 0.81307432, 0.67642947],
+ [0.39579804, 0.81654592, 0.67656899],
+ [0.40634556, 0.81991799, 0.67686215],
+ [0.41730243, 0.82318339, 0.67735255],
+ [0.4285828, 0.82635051, 0.6780564],
+ [0.44012728, 0.82942353, 0.67900049],
+ [0.45189421, 0.83240398, 0.68021733],
+ [0.46378379, 0.83530763, 0.6817062],
+ [0.47573199, 0.83814472, 0.68347352],
+ [0.48769865, 0.84092197, 0.68552698],
+ [0.49962354, 0.84365379, 0.68783929],
+ [0.5114027, 0.8463718, 0.69029789],
+ [0.52301693, 0.84908401, 0.69288545],
+ [0.53447549, 0.85179048, 0.69561066],
+ [0.54578602, 0.8544913, 0.69848331],
+ [0.55695565, 0.85718723, 0.70150427],
+ [0.56798832, 0.85987893, 0.70468261],
+ [0.57888639, 0.86256715, 0.70802931],
+ [0.5896541, 0.8652532, 0.71154204],
+ [0.60028928, 0.86793835, 0.71523675],
+ [0.61079441, 0.87062438, 0.71910895],
+ [0.62116633, 0.87331311, 0.72317003],
+ [0.63140509, 0.87600675, 0.72741689],
+ [0.64150735, 0.87870746, 0.73185717],
+ [0.65147219, 0.8814179, 0.73648495],
+ [0.66129632, 0.8841403, 0.74130658],
+ [0.67097934, 0.88687758, 0.74631123],
+ [0.68051833, 0.88963189, 0.75150483],
+ [0.68991419, 0.89240612, 0.75687187],
+ [0.69916533, 0.89520211, 0.76241714],
+ [0.70827373, 0.89802257, 0.76812286],
+ [0.71723995, 0.90086891, 0.77399039],
+ [0.72606665, 0.90374337, 0.7800041],
+ [0.73475675, 0.90664718, 0.78615802],
+ [0.74331358, 0.90958151, 0.79244474],
+ [0.75174143, 0.91254787, 0.79884925],
+ [0.76004473, 0.91554656, 0.80536823],
+ [0.76827704, 0.91856549, 0.81196513],
+ [0.77647029, 0.921603, 0.81855729],
+ [0.78462009, 0.92466151, 0.82514119],
+ [0.79273542, 0.92773848, 0.83172131],
+ [0.8008109, 0.93083672, 0.83829355],
+ [0.80885107, 0.93395528, 0.84485982],
+ [0.81685878, 0.9370938, 0.85142101],
+ [0.82483206, 0.94025378, 0.8579751],
+ [0.83277661, 0.94343371, 0.86452477],
+ [0.84069127, 0.94663473, 0.87106853],
+ [0.84857662, 0.9498573, 0.8776059],
+ [0.8564431, 0.95309792, 0.88414253],
+ [0.86429066, 0.95635719, 0.89067759],
+ [0.87218969, 0.95960708, 0.89725384]
+]
+
+
+_vlag_lut = [
+ [0.13850039, 0.41331206, 0.74052025],
+ [0.15077609, 0.41762684, 0.73970427],
+ [0.16235219, 0.4219191, 0.7389667],
+ [0.1733322, 0.42619024, 0.73832537],
+ [0.18382538, 0.43044226, 0.73776764],
+ [0.19394034, 0.4346772, 0.73725867],
+ [0.20367115, 0.43889576, 0.73685314],
+ [0.21313625, 0.44310003, 0.73648045],
+ [0.22231173, 0.44729079, 0.73619681],
+ [0.23125148, 0.45146945, 0.73597803],
+ [0.23998101, 0.45563715, 0.7358223],
+ [0.24853358, 0.45979489, 0.73571524],
+ [0.25691416, 0.4639437, 0.73566943],
+ [0.26513894, 0.46808455, 0.73568319],
+ [0.27322194, 0.47221835, 0.73575497],
+ [0.28117543, 0.47634598, 0.73588332],
+ [0.28901021, 0.48046826, 0.73606686],
+ [0.2967358, 0.48458597, 0.73630433],
+ [0.30436071, 0.48869986, 0.73659451],
+ [0.3118955, 0.49281055, 0.73693255],
+ [0.31935389, 0.49691847, 0.73730851],
+ [0.32672701, 0.5010247, 0.73774013],
+ [0.33402607, 0.50512971, 0.73821941],
+ [0.34125337, 0.50923419, 0.73874905],
+ [0.34840921, 0.51333892, 0.73933402],
+ [0.35551826, 0.51744353, 0.73994642],
+ [0.3625676, 0.52154929, 0.74060763],
+ [0.36956356, 0.52565656, 0.74131327],
+ [0.37649902, 0.52976642, 0.74207698],
+ [0.38340273, 0.53387791, 0.74286286],
+ [0.39025859, 0.53799253, 0.7436962],
+ [0.39706821, 0.54211081, 0.744578],
+ [0.40384046, 0.54623277, 0.74549872],
+ [0.41058241, 0.55035849, 0.74645094],
+ [0.41728385, 0.55448919, 0.74745174],
+ [0.42395178, 0.55862494, 0.74849357],
+ [0.4305964, 0.56276546, 0.74956387],
+ [0.4372044, 0.56691228, 0.75068412],
+ [0.4437909, 0.57106468, 0.75183427],
+ [0.45035117, 0.5752235, 0.75302312],
+ [0.45687824, 0.57938983, 0.75426297],
+ [0.46339713, 0.58356191, 0.75551816],
+ [0.46988778, 0.58774195, 0.75682037],
+ [0.47635605, 0.59192986, 0.75816245],
+ [0.48281101, 0.5961252, 0.75953212],
+ [0.4892374, 0.60032986, 0.76095418],
+ [0.49566225, 0.60454154, 0.76238852],
+ [0.50206137, 0.60876307, 0.76387371],
+ [0.50845128, 0.61299312, 0.76538551],
+ [0.5148258, 0.61723272, 0.76693475],
+ [0.52118385, 0.62148236, 0.76852436],
+ [0.52753571, 0.62574126, 0.77013939],
+ [0.53386831, 0.63001125, 0.77180152],
+ [0.54020159, 0.63429038, 0.7734803],
+ [0.54651272, 0.63858165, 0.77521306],
+ [0.55282975, 0.64288207, 0.77695608],
+ [0.55912585, 0.64719519, 0.77875327],
+ [0.56542599, 0.65151828, 0.78056551],
+ [0.57170924, 0.65585426, 0.78242747],
+ [0.57799572, 0.6602009, 0.78430751],
+ [0.58426817, 0.66456073, 0.78623458],
+ [0.590544, 0.66893178, 0.78818117],
+ [0.59680758, 0.67331643, 0.79017369],
+ [0.60307553, 0.67771273, 0.79218572],
+ [0.60934065, 0.68212194, 0.79422987],
+ [0.61559495, 0.68654548, 0.7963202],
+ [0.62185554, 0.69098125, 0.79842918],
+ [0.62810662, 0.69543176, 0.80058381],
+ [0.63436425, 0.69989499, 0.80275812],
+ [0.64061445, 0.70437326, 0.80497621],
+ [0.6468706, 0.70886488, 0.80721641],
+ [0.65312213, 0.7133717, 0.80949719],
+ [0.65937818, 0.71789261, 0.81180392],
+ [0.66563334, 0.72242871, 0.81414642],
+ [0.67189155, 0.72697967, 0.81651872],
+ [0.67815314, 0.73154569, 0.81892097],
+ [0.68441395, 0.73612771, 0.82136094],
+ [0.69068321, 0.74072452, 0.82382353],
+ [0.69694776, 0.7453385, 0.82633199],
+ [0.70322431, 0.74996721, 0.8288583],
+ [0.70949595, 0.75461368, 0.83143221],
+ [0.7157774, 0.75927574, 0.83402904],
+ [0.72206299, 0.76395461, 0.83665922],
+ [0.72835227, 0.76865061, 0.8393242],
+ [0.73465238, 0.7733628, 0.84201224],
+ [0.74094862, 0.77809393, 0.84474951],
+ [0.74725683, 0.78284158, 0.84750915],
+ [0.75357103, 0.78760701, 0.85030217],
+ [0.75988961, 0.79239077, 0.85313207],
+ [0.76621987, 0.79719185, 0.85598668],
+ [0.77255045, 0.8020125, 0.85888658],
+ [0.77889241, 0.80685102, 0.86181298],
+ [0.78524572, 0.81170768, 0.86476656],
+ [0.79159841, 0.81658489, 0.86776906],
+ [0.79796459, 0.82148036, 0.8707962],
+ [0.80434168, 0.82639479, 0.87385315],
+ [0.8107221, 0.83132983, 0.87695392],
+ [0.81711301, 0.8362844, 0.88008641],
+ [0.82351479, 0.84125863, 0.88325045],
+ [0.82992772, 0.84625263, 0.88644594],
+ [0.83634359, 0.85126806, 0.8896878],
+ [0.84277295, 0.85630293, 0.89295721],
+ [0.84921192, 0.86135782, 0.89626076],
+ [0.85566206, 0.866432, 0.89959467],
+ [0.86211514, 0.87152627, 0.90297183],
+ [0.86857483, 0.87663856, 0.90638248],
+ [0.87504231, 0.88176648, 0.90981938],
+ [0.88151194, 0.88690782, 0.91328493],
+ [0.88797938, 0.89205857, 0.91677544],
+ [0.89443865, 0.89721298, 0.9202854],
+ [0.90088204, 0.90236294, 0.92380601],
+ [0.90729768, 0.90749778, 0.92732797],
+ [0.91367037, 0.91260329, 0.93083814],
+ [0.91998105, 0.91766106, 0.93431861],
+ [0.92620596, 0.92264789, 0.93774647],
+ [0.93231683, 0.9275351, 0.94109192],
+ [0.93827772, 0.9322888, 0.94432312],
+ [0.94404755, 0.93686925, 0.94740137],
+ [0.94958284, 0.94123072, 0.95027696],
+ [0.95482682, 0.9453245, 0.95291103],
+ [0.9597248, 0.94909728, 0.95525103],
+ [0.96422552, 0.95249273, 0.95723271],
+ [0.96826161, 0.95545812, 0.95882188],
+ [0.97178458, 0.95793984, 0.95995705],
+ [0.97474105, 0.95989142, 0.96059997],
+ [0.97708604, 0.96127366, 0.96071853],
+ [0.97877855, 0.96205832, 0.96030095],
+ [0.97978484, 0.96222949, 0.95935496],
+ [0.9805997, 0.96155216, 0.95813083],
+ [0.98152619, 0.95993719, 0.95639322],
+ [0.9819726, 0.95766608, 0.95399269],
+ [0.98191855, 0.9547873, 0.95098107],
+ [0.98138514, 0.95134771, 0.94740644],
+ [0.98040845, 0.94739906, 0.94332125],
+ [0.97902107, 0.94300131, 0.93878672],
+ [0.97729348, 0.93820409, 0.93385135],
+ [0.9752533, 0.933073, 0.92858252],
+ [0.97297834, 0.92765261, 0.92302309],
+ [0.97049104, 0.92200317, 0.91723505],
+ [0.96784372, 0.91616744, 0.91126063],
+ [0.96507281, 0.91018664, 0.90514124],
+ [0.96222034, 0.90409203, 0.89890756],
+ [0.9593079, 0.89791478, 0.89259122],
+ [0.95635626, 0.89167908, 0.88621654],
+ [0.95338303, 0.88540373, 0.87980238],
+ [0.95040174, 0.87910333, 0.87336339],
+ [0.94742246, 0.87278899, 0.86691076],
+ [0.94445249, 0.86646893, 0.86045277],
+ [0.94150476, 0.86014606, 0.85399191],
+ [0.93857394, 0.85382798, 0.84753642],
+ [0.93566206, 0.84751766, 0.84108935],
+ [0.93277194, 0.8412164, 0.83465197],
+ [0.92990106, 0.83492672, 0.82822708],
+ [0.92704736, 0.82865028, 0.82181656],
+ [0.92422703, 0.82238092, 0.81541333],
+ [0.92142581, 0.81612448, 0.80902415],
+ [0.91864501, 0.80988032, 0.80264838],
+ [0.91587578, 0.80365187, 0.79629001],
+ [0.9131367, 0.79743115, 0.78994],
+ [0.91041602, 0.79122265, 0.78360361],
+ [0.90771071, 0.78502727, 0.77728196],
+ [0.90501581, 0.77884674, 0.7709771],
+ [0.90235365, 0.77267117, 0.76467793],
+ [0.8997019, 0.76650962, 0.75839484],
+ [0.89705346, 0.76036481, 0.752131],
+ [0.89444021, 0.75422253, 0.74587047],
+ [0.89183355, 0.74809474, 0.73962689],
+ [0.88923216, 0.74198168, 0.73340061],
+ [0.88665892, 0.73587283, 0.72717995],
+ [0.88408839, 0.72977904, 0.72097718],
+ [0.88153537, 0.72369332, 0.71478461],
+ [0.87899389, 0.7176179, 0.70860487],
+ [0.87645157, 0.71155805, 0.7024439],
+ [0.8739399, 0.70549893, 0.6962854],
+ [0.87142626, 0.6994551, 0.69014561],
+ [0.8689268, 0.69341868, 0.68401597],
+ [0.86643562, 0.687392, 0.67789917],
+ [0.86394434, 0.68137863, 0.67179927],
+ [0.86147586, 0.67536728, 0.665704],
+ [0.85899928, 0.66937226, 0.6596292],
+ [0.85654668, 0.66337773, 0.6535577],
+ [0.85408818, 0.65739772, 0.64750494],
+ [0.85164413, 0.65142189, 0.64145983],
+ [0.84920091, 0.6454565, 0.63542932],
+ [0.84676427, 0.63949827, 0.62941],
+ [0.84433231, 0.63354773, 0.62340261],
+ [0.84190106, 0.62760645, 0.61740899],
+ [0.83947935, 0.62166951, 0.61142404],
+ [0.8370538, 0.61574332, 0.60545478],
+ [0.83463975, 0.60981951, 0.59949247],
+ [0.83221877, 0.60390724, 0.593547],
+ [0.82980985, 0.59799607, 0.58760751],
+ [0.82740268, 0.59209095, 0.58167944],
+ [0.82498638, 0.5861973, 0.57576866],
+ [0.82258181, 0.5803034, 0.56986307],
+ [0.82016611, 0.57442123, 0.56397539],
+ [0.81776305, 0.56853725, 0.55809173],
+ [0.81534551, 0.56266602, 0.55222741],
+ [0.81294293, 0.55679056, 0.5463651],
+ [0.81052113, 0.55092973, 0.54052443],
+ [0.80811509, 0.54506305, 0.53468464],
+ [0.80568952, 0.53921036, 0.52886622],
+ [0.80327506, 0.53335335, 0.52305077],
+ [0.80084727, 0.52750583, 0.51725256],
+ [0.79842217, 0.5216578, 0.51146173],
+ [0.79599382, 0.51581223, 0.50568155],
+ [0.79355781, 0.50997127, 0.49991444],
+ [0.79112596, 0.50412707, 0.49415289],
+ [0.78867442, 0.49829386, 0.48841129],
+ [0.7862306, 0.49245398, 0.48267247],
+ [0.7837687, 0.48662309, 0.47695216],
+ [0.78130809, 0.4807883, 0.47123805],
+ [0.77884467, 0.47495151, 0.46553236],
+ [0.77636283, 0.46912235, 0.45984473],
+ [0.77388383, 0.46328617, 0.45416141],
+ [0.77138912, 0.45745466, 0.44849398],
+ [0.76888874, 0.45162042, 0.44283573],
+ [0.76638802, 0.44577901, 0.43718292],
+ [0.76386116, 0.43994762, 0.43155211],
+ [0.76133542, 0.43410655, 0.42592523],
+ [0.75880631, 0.42825801, 0.42030488],
+ [0.75624913, 0.42241905, 0.41470727],
+ [0.7536919, 0.41656866, 0.40911347],
+ [0.75112748, 0.41071104, 0.40352792],
+ [0.74854331, 0.40485474, 0.3979589],
+ [0.74594723, 0.39899309, 0.39240088],
+ [0.74334332, 0.39312199, 0.38685075],
+ [0.74073277, 0.38723941, 0.3813074],
+ [0.73809409, 0.38136133, 0.37578553],
+ [0.73544692, 0.37547129, 0.37027123],
+ [0.73278943, 0.36956954, 0.36476549],
+ [0.73011829, 0.36365761, 0.35927038],
+ [0.72743485, 0.35773314, 0.35378465],
+ [0.72472722, 0.35180504, 0.34831662],
+ [0.72200473, 0.34586421, 0.34285937],
+ [0.71927052, 0.33990649, 0.33741033],
+ [0.71652049, 0.33393396, 0.33197219],
+ [0.71375362, 0.32794602, 0.32654545],
+ [0.71096951, 0.32194148, 0.32113016],
+ [0.70816772, 0.31591904, 0.31572637],
+ [0.70534784, 0.30987734, 0.31033414],
+ [0.70250944, 0.30381489, 0.30495353],
+ [0.69965211, 0.2977301, 0.2995846],
+ [0.6967754, 0.29162126, 0.29422741],
+ [0.69388446, 0.28548074, 0.28887769],
+ [0.69097561, 0.2793096, 0.28353795],
+ [0.68803513, 0.27311993, 0.27821876],
+ [0.6850794, 0.26689144, 0.27290694],
+ [0.682108, 0.26062114, 0.26760246],
+ [0.67911013, 0.2543177, 0.26231367],
+ [0.67609393, 0.24796818, 0.25703372],
+ [0.67305921, 0.24156846, 0.25176238],
+ [0.67000176, 0.23511902, 0.24650278],
+ [0.66693423, 0.22859879, 0.24124404],
+ [0.6638441, 0.22201742, 0.2359961],
+ [0.66080672, 0.21526712, 0.23069468]
+]
+
+
# 256 evenly spaced RGB rows (floats in [0, 1]) defining the diverging
# "icefire" palette; consumed by the registration loop below to build
# ListedColormap objects (forward and reversed).
_icefire_lut = [
    [0.73936227, 0.90443867, 0.85757238],
    [0.72888063, 0.89639109, 0.85488394],
    [0.71834255, 0.88842162, 0.8521605],
    [0.70773866, 0.88052939, 0.849422],
    [0.69706215, 0.87271313, 0.84668315],
    [0.68629021, 0.86497329, 0.84398721],
    [0.67543654, 0.85730617, 0.84130969],
    [0.66448539, 0.84971123, 0.83868005],
    [0.65342679, 0.84218728, 0.83611512],
    [0.64231804, 0.83471867, 0.83358584],
    [0.63117745, 0.827294, 0.83113431],
    [0.62000484, 0.81991069, 0.82876741],
    [0.60879435, 0.81256797, 0.82648905],
    [0.59754118, 0.80526458, 0.82430414],
    [0.58624247, 0.79799884, 0.82221573],
    [0.57489525, 0.7907688, 0.82022901],
    [0.56349779, 0.78357215, 0.81834861],
    [0.55204294, 0.77640827, 0.81657563],
    [0.54052516, 0.76927562, 0.81491462],
    [0.52894085, 0.76217215, 0.81336913],
    [0.51728854, 0.75509528, 0.81194156],
    [0.50555676, 0.74804469, 0.81063503],
    [0.49373871, 0.7410187, 0.80945242],
    [0.48183174, 0.73401449, 0.80839675],
    [0.46982587, 0.72703075, 0.80747097],
    [0.45770893, 0.72006648, 0.80667756],
    [0.44547249, 0.71311941, 0.80601991],
    [0.43318643, 0.70617126, 0.80549278],
    [0.42110294, 0.69916972, 0.80506683],
    [0.40925101, 0.69211059, 0.80473246],
    [0.3976693, 0.68498786, 0.80448272],
    [0.38632002, 0.67781125, 0.80431024],
    [0.37523981, 0.67057537, 0.80420832],
    [0.36442578, 0.66328229, 0.80417474],
    [0.35385939, 0.65593699, 0.80420591],
    [0.34358916, 0.64853177, 0.8043],
    [0.33355526, 0.64107876, 0.80445484],
    [0.32383062, 0.63356578, 0.80467091],
    [0.31434372, 0.62600624, 0.8049475],
    [0.30516161, 0.618389, 0.80528692],
    [0.29623491, 0.61072284, 0.80569021],
    [0.28759072, 0.60300319, 0.80616055],
    [0.27923924, 0.59522877, 0.80669803],
    [0.27114651, 0.5874047, 0.80730545],
    [0.26337153, 0.57952055, 0.80799113],
    [0.25588696, 0.57157984, 0.80875922],
    [0.248686, 0.56358255, 0.80961366],
    [0.24180668, 0.55552289, 0.81055123],
    [0.23526251, 0.54739477, 0.8115939],
    [0.22921445, 0.53918506, 0.81267292],
    [0.22397687, 0.53086094, 0.8137141],
    [0.21977058, 0.52241482, 0.81457651],
    [0.21658989, 0.51384321, 0.81528511],
    [0.21452772, 0.50514155, 0.81577278],
    [0.21372783, 0.49630865, 0.81589566],
    [0.21409503, 0.48734861, 0.81566163],
    [0.2157176, 0.47827123, 0.81487615],
    [0.21842857, 0.46909168, 0.81351614],
    [0.22211705, 0.45983212, 0.81146983],
    [0.22665681, 0.45052233, 0.80860217],
    [0.23176013, 0.44119137, 0.80494325],
    [0.23727775, 0.43187704, 0.80038017],
    [0.24298285, 0.42261123, 0.79493267],
    [0.24865068, 0.41341842, 0.78869164],
    [0.25423116, 0.40433127, 0.78155831],
    [0.25950239, 0.39535521, 0.77376848],
    [0.2644736, 0.38651212, 0.76524809],
    [0.26901584, 0.37779582, 0.75621942],
    [0.27318141, 0.36922056, 0.746605],
    [0.27690355, 0.3607736, 0.73659374],
    [0.28023585, 0.35244234, 0.72622103],
    [0.28306009, 0.34438449, 0.71500731],
    [0.28535896, 0.33660243, 0.70303975],
    [0.28708711, 0.32912157, 0.69034504],
    [0.28816354, 0.32200604, 0.67684067],
    [0.28862749, 0.31519824, 0.66278813],
    [0.28847904, 0.30869064, 0.6482815],
    [0.28770912, 0.30250126, 0.63331265],
    [0.28640325, 0.29655509, 0.61811374],
    [0.28458943, 0.29082155, 0.60280913],
    [0.28233561, 0.28527482, 0.58742866],
    [0.27967038, 0.2798938, 0.57204225],
    [0.27665361, 0.27465357, 0.55667809],
    [0.27332564, 0.2695165, 0.54145387],
    [0.26973851, 0.26447054, 0.52634916],
    [0.2659204, 0.25949691, 0.511417],
    [0.26190145, 0.25458123, 0.49668768],
    [0.2577151, 0.24971691, 0.48214874],
    [0.25337618, 0.24490494, 0.46778758],
    [0.24890842, 0.24013332, 0.45363816],
    [0.24433654, 0.23539226, 0.4397245],
    [0.23967922, 0.23067729, 0.4260591],
    [0.23495608, 0.22598894, 0.41262952],
    [0.23018113, 0.22132414, 0.39945577],
    [0.22534609, 0.21670847, 0.38645794],
    [0.22048761, 0.21211723, 0.37372555],
    [0.2156198, 0.20755389, 0.36125301],
    [0.21074637, 0.20302717, 0.34903192],
    [0.20586893, 0.19855368, 0.33701661],
    [0.20101757, 0.19411573, 0.32529173],
    [0.19619947, 0.18972425, 0.31383846],
    [0.19140726, 0.18540157, 0.30260777],
    [0.1866769, 0.1811332, 0.29166583],
    [0.18201285, 0.17694992, 0.28088776],
    [0.17745228, 0.17282141, 0.27044211],
    [0.17300684, 0.16876921, 0.26024893],
    [0.16868273, 0.16479861, 0.25034479],
    [0.16448691, 0.16091728, 0.24075373],
    [0.16043195, 0.15714351, 0.23141745],
    [0.15652427, 0.15348248, 0.22238175],
    [0.15277065, 0.14994111, 0.21368395],
    [0.14918274, 0.14653431, 0.20529486],
    [0.14577095, 0.14327403, 0.19720829],
    [0.14254381, 0.14016944, 0.18944326],
    [0.13951035, 0.13723063, 0.18201072],
    [0.13667798, 0.13446606, 0.17493774],
    [0.13405762, 0.13188822, 0.16820842],
    [0.13165767, 0.12950667, 0.16183275],
    [0.12948748, 0.12733187, 0.15580631],
    [0.12755435, 0.1253723, 0.15014098],
    [0.12586516, 0.12363617, 0.1448459],
    [0.12442647, 0.12213143, 0.13992571],
    [0.12324241, 0.12086419, 0.13539995],
    [0.12232067, 0.11984278, 0.13124644],
    [0.12166209, 0.11907077, 0.12749671],
    [0.12126982, 0.11855309, 0.12415079],
    [0.12114244, 0.11829179, 0.1212385],
    [0.12127766, 0.11828837, 0.11878534],
    [0.12284806, 0.1179729, 0.11772022],
    [0.12619498, 0.11721796, 0.11770203],
    [0.129968, 0.11663788, 0.11792377],
    [0.13410011, 0.11625146, 0.11839138],
    [0.13855459, 0.11606618, 0.11910584],
    [0.14333775, 0.11607038, 0.1200606],
    [0.148417, 0.11626929, 0.12125453],
    [0.15377389, 0.11666192, 0.12268364],
    [0.15941427, 0.11723486, 0.12433911],
    [0.16533376, 0.11797856, 0.12621303],
    [0.17152547, 0.11888403, 0.12829735],
    [0.17797765, 0.11994436, 0.13058435],
    [0.18468769, 0.12114722, 0.13306426],
    [0.19165663, 0.12247737, 0.13572616],
    [0.19884415, 0.12394381, 0.1385669],
    [0.20627181, 0.12551883, 0.14157124],
    [0.21394877, 0.12718055, 0.14472604],
    [0.22184572, 0.12893119, 0.14802579],
    [0.22994394, 0.13076731, 0.15146314],
    [0.23823937, 0.13267611, 0.15502793],
    [0.24676041, 0.13462172, 0.15870321],
    [0.25546457, 0.13661751, 0.16248722],
    [0.26433628, 0.13865956, 0.16637301],
    [0.27341345, 0.14070412, 0.17034221],
    [0.28264773, 0.14277192, 0.1743957],
    [0.29202272, 0.14486161, 0.17852793],
    [0.30159648, 0.14691224, 0.1827169],
    [0.31129002, 0.14897583, 0.18695213],
    [0.32111555, 0.15103351, 0.19119629],
    [0.33107961, 0.1530674, 0.19543758],
    [0.34119892, 0.15504762, 0.1996803],
    [0.35142388, 0.15701131, 0.20389086],
    [0.36178937, 0.1589124, 0.20807639],
    [0.37229381, 0.16073993, 0.21223189],
    [0.38288348, 0.16254006, 0.2163249],
    [0.39359592, 0.16426336, 0.22036577],
    [0.40444332, 0.16588767, 0.22434027],
    [0.41537995, 0.16745325, 0.2282297],
    [0.42640867, 0.16894939, 0.23202755],
    [0.43754706, 0.17034847, 0.23572899],
    [0.44878564, 0.1716535, 0.23932344],
    [0.4601126, 0.17287365, 0.24278607],
    [0.47151732, 0.17401641, 0.24610337],
    [0.48300689, 0.17506676, 0.2492737],
    [0.49458302, 0.17601892, 0.25227688],
    [0.50623876, 0.17687777, 0.255096],
    [0.5179623, 0.17765528, 0.2577162],
    [0.52975234, 0.17835232, 0.2601134],
    [0.54159776, 0.17898292, 0.26226847],
    [0.55348804, 0.17956232, 0.26416003],
    [0.56541729, 0.18010175, 0.26575971],
    [0.57736669, 0.180631, 0.26704888],
    [0.58932081, 0.18117827, 0.26800409],
    [0.60127582, 0.18175888, 0.26858488],
    [0.61319563, 0.1824336, 0.2687872],
    [0.62506376, 0.18324015, 0.26858301],
    [0.63681202, 0.18430173, 0.26795276],
    [0.64842603, 0.18565472, 0.26689463],
    [0.65988195, 0.18734638, 0.26543435],
    [0.67111966, 0.18948885, 0.26357955],
    [0.68209194, 0.19216636, 0.26137175],
    [0.69281185, 0.19535326, 0.25887063],
    [0.70335022, 0.19891271, 0.25617971],
    [0.71375229, 0.20276438, 0.25331365],
    [0.72401436, 0.20691287, 0.25027366],
    [0.73407638, 0.21145051, 0.24710661],
    [0.74396983, 0.21631913, 0.24380715],
    [0.75361506, 0.22163653, 0.24043996],
    [0.7630579, 0.22731637, 0.23700095],
    [0.77222228, 0.23346231, 0.23356628],
    [0.78115441, 0.23998404, 0.23013825],
    [0.78979746, 0.24694858, 0.22678822],
    [0.79819286, 0.25427223, 0.22352658],
    [0.80630444, 0.26198807, 0.22040877],
    [0.81417437, 0.27001406, 0.21744645],
    [0.82177364, 0.27837336, 0.21468316],
    [0.82915955, 0.28696963, 0.21210766],
    [0.83628628, 0.2958499, 0.20977813],
    [0.84322168, 0.30491136, 0.20766435],
    [0.84995458, 0.31415945, 0.2057863],
    [0.85648867, 0.32358058, 0.20415327],
    [0.86286243, 0.33312058, 0.20274969],
    [0.86908321, 0.34276705, 0.20157271],
    [0.87512876, 0.3525416, 0.20064949],
    [0.88100349, 0.36243385, 0.19999078],
    [0.8866469, 0.37249496, 0.1997976],
    [0.89203964, 0.38273475, 0.20013431],
    [0.89713496, 0.39318156, 0.20121514],
    [0.90195099, 0.40380687, 0.20301555],
    [0.90648379, 0.41460191, 0.20558847],
    [0.9106967, 0.42557857, 0.20918529],
    [0.91463791, 0.43668557, 0.21367954],
    [0.91830723, 0.44790913, 0.21916352],
    [0.92171507, 0.45922856, 0.22568002],
    [0.92491786, 0.4705936, 0.23308207],
    [0.92790792, 0.48200153, 0.24145932],
    [0.93073701, 0.49341219, 0.25065486],
    [0.93343918, 0.5048017, 0.26056148],
    [0.93602064, 0.51616486, 0.27118485],
    [0.93850535, 0.52748892, 0.28242464],
    [0.94092933, 0.53875462, 0.29416042],
    [0.94330011, 0.5499628, 0.30634189],
    [0.94563159, 0.56110987, 0.31891624],
    [0.94792955, 0.57219822, 0.33184256],
    [0.95020929, 0.5832232, 0.34508419],
    [0.95247324, 0.59419035, 0.35859866],
    [0.95471709, 0.60510869, 0.37236035],
    [0.95698411, 0.61595766, 0.38629631],
    [0.95923863, 0.62676473, 0.40043317],
    [0.9615041, 0.6375203, 0.41474106],
    [0.96371553, 0.64826619, 0.42928335],
    [0.96591497, 0.65899621, 0.44380444],
    [0.96809871, 0.66971662, 0.45830232],
    [0.9702495, 0.6804394, 0.47280492],
    [0.9723881, 0.69115622, 0.48729272],
    [0.97450723, 0.70187358, 0.50178034],
    [0.9766108, 0.712592, 0.51626837],
    [0.97871716, 0.72330511, 0.53074053],
    [0.98082222, 0.73401769, 0.54520694],
    [0.9829001, 0.74474445, 0.5597019],
    [0.98497466, 0.75547635, 0.57420239],
    [0.98705581, 0.76621129, 0.58870185],
    [0.98913325, 0.77695637, 0.60321626],
    [0.99119918, 0.78771716, 0.61775821],
    [0.9932672, 0.79848979, 0.63231691],
    [0.99535958, 0.80926704, 0.64687278],
    [0.99740544, 0.82008078, 0.66150571],
    [0.9992197, 0.83100723, 0.6764127]
]
+
+
_luts = [_rocket_lut, _mako_lut, _vlag_lut, _icefire_lut]
_names = ["rocket", "mako", "vlag", "icefire"]

# Build a ListedColormap (plus its reversed "_r" twin) for every palette and
# publish each one both as a module attribute and in matplotlib's registry.
for _lut, _name in zip(_luts, _names):

    _cmap = colors.ListedColormap(_lut, _name)
    # at module scope locals() is the module namespace, so this makes the
    # colormap importable, e.g. `from <module> import rocket`
    locals()[_name] = _cmap

    # reversing the LUT row order yields the reversed colormap
    _cmap_r = colors.ListedColormap(_lut[::-1], _name + "_r")
    locals()[_name + "_r"] = _cmap_r

    # also register with matplotlib so the names resolve via the usual
    # colormap lookup (e.g. cmap="rocket")
    mpl_cm.register(_cmap, name=_name)
    mpl_cm.register(_cmap_r, name=_name + "_r")
diff --git a/deepTools/source/deeptools/computeGCBias.py b/deepTools/source/deeptools/computeGCBias.py
new file mode 100644
index 0000000000000000000000000000000000000000..f261a9fc14bdd3eabba7f597d0047a4d2cc402c7
--- /dev/null
+++ b/deepTools/source/deeptools/computeGCBias.py
@@ -0,0 +1,800 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import time
+
+import multiprocessing
+import numpy as np
+import argparse
+from scipy.stats import poisson
+import py2bit
+import sys
+
+from deeptoolsintervals import GTF
+from deeptools.utilities import tbitToBamChrName, getGC_content
+from deeptools import parserCommon, mapReduce
+from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
+from deeptools import bamHandler
+
+debug = 0
+old_settings = np.seterr(all='ignore')
+
+
def parse_arguments(args=None):
    """Build the complete command-line parser for the computeGCBias tool.

    Combines the tool-specific arguments (from ``getRequiredArgs``) with the
    options shared by all deepTools programs. The ``args`` parameter is unused
    here but kept for interface compatibility with the other tools.

    Returns
    -------
    argparse.ArgumentParser
    """
    parentParser = parserCommon.getParentArgParse(binSize=False, blackList=True)
    requiredArgs = getRequiredArgs()
    parser = argparse.ArgumentParser(
        parents=[requiredArgs, parentParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Computes the GC-bias using Benjamini\'s method '
        '[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. '
        # trailing space added: the two fragments previously fused into "tocorrect"
        'The GC-bias is visualized and the resulting table can be used to '
        'correct the bias with `correctGCBias`.',
        usage='computeGCBias '
        '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt\n'
        'help: computeGCBias -h / computeGCBias --help',
        conflict_handler='resolve',
        add_help=False)

    return parser
+
+
def getRequiredArgs():
    """Build the parser holding computeGCBias' own required/optional options.

    Returns
    -------
    argparse.ArgumentParser
        Created with ``add_help=False`` so it can be used as a parent parser
        by ``parse_arguments``.
    """
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')

    required.add_argument('--bamfile', '-b',
                          metavar='bam file',
                          help='Sorted BAM file. ',
                          required=True)

    required.add_argument('--effectiveGenomeSize',
                          help='The effective genome size is the portion '
                          'of the genome that is mappable. Large fractions of '
                          'the genome are stretches of NNNN that should be '
                          'discarded. Also, if repetitive regions were not '
                          'included in the mapping of reads, the effective '
                          'genome size needs to be adjusted accordingly. '
                          'A table of values is available here: '
                          'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .',
                          default=None,
                          type=int,
                          required=True)

    required.add_argument('--genome', '-g',
                          help='Genome in two bit format. Most genomes can be '
                          'found here: http://hgdownload.cse.ucsc.edu/gbdb/ '
                          'Search for the .2bit ending. Otherwise, fasta '
                          'files can be converted to 2bit using the UCSC '
                          # typo fixes: "programm" -> "program", "plattforms" -> "platforms"
                          'program called faToTwoBit available for different '
                          'platforms at '
                          'http://hgdownload.cse.ucsc.edu/admin/exe/',
                          metavar='2bit FILE',
                          required=True)

    required.add_argument('--GCbiasFrequenciesFile', '-freq', '-o',
                          help='Path to save the file containing '
                          'the observed and expected read frequencies per %%GC-'
                          'content. This file is needed to run the '
                          'correctGCBias tool. This is a text file.',
                          type=argparse.FileType('w'),
                          metavar='FILE',
                          required=True)

    # define the optional arguments
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--fragmentLength', '-l',
                          help='Fragment length used for the sequencing. If '
                          'paired-end reads are used, the fragment length is '
                          'computed based from the bam file',
                          type=int)

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")

    optional.add_argument('--sampleSize',
                          default=5e7,
                          help='Number of sampling points to be considered. (Default: %(default)s)',
                          type=int)

    optional.add_argument('--extraSampling',
                          help='BED file containing genomic regions for which '
                          'extra sampling is required because they are '
                          'underrepresented in the genome.',
                          type=argparse.FileType('r'),
                          metavar='BED file')

    plot = parser.add_argument_group('Diagnostic plot options')

    plot.add_argument('--biasPlot',
                      metavar='FILE NAME',
                      help='If given, a diagnostic image summarizing '
                      'the GC-bias will be saved.')

    plot.add_argument('--plotFileFormat',
                      metavar='',
                      help='image format type. If given, this '
                      'option overrides the '
                      'image format based on the plotFile ending. '
                      'The available options are: "png", '
                      '"eps", "pdf", "plotly" and "svg"',
                      choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    plot.add_argument('--regionSize',
                      metavar='INT',
                      type=int,
                      default=300,
                      # trailing space added so this line does not fuse with
                      # the next one into "regionthe" in the help output
                      help='To plot the reads per %%GC over a region '
                      'the size of the region is required. By default, '
                      'the bin size is set to 300 bases, which is close to the '
                      'standard fragment size for Illumina machines. However, '
                      'if the depth of sequencing is low, a larger bin size '
                      'will be required, otherwise many bins will not '
                      'overlap with any read (Default: %(default)s)')

    return parser
+
+
def getPositionsToSample(chrom, start, end, stepSize):
    """
    Returns the genomic positions (one every `stepSize` bases in
    [start, end)) at which the GC content should be sampled.

    check if the region submitted to the worker
    overlaps with the region to take extra effort to sample.
    If that is the case, the regions to sample array is
    increased to match each of the positions in the extra
    effort region sampled at the same stepSize along the interval.

    If a filter out tree is given, then from positions to sample
    those regions are cleaned
    """
    positions_to_sample = np.arange(start, end, stepSize)

    # both interval trees are optional and configured through the
    # module-level global_vars dictionary
    if global_vars['filter_out']:
        filter_out_tree = GTF(global_vars['filter_out'])
    else:
        filter_out_tree = None

    if global_vars['extra_sampling_file']:
        extra_tree = GTF(global_vars['extra_sampling_file'])
    else:
        extra_tree = None

    if extra_tree:
        orig_len = len(positions_to_sample)
        try:
            extra_match = extra_tree.findOverlaps(chrom, start, end)
        except KeyError:
            # chromosome not present in the extra-sampling file
            extra_match = []

        if len(extra_match) > 0:
            for intval in extra_match:
                positions_to_sample = np.append(positions_to_sample,
                                                list(range(intval[0], intval[1], stepSize)))
            # remove duplicates; np.unique also sorts its output, so the
            # former separate np.sort call was redundant
            positions_to_sample = np.unique(positions_to_sample)
            if debug:
                print("sampling increased to {} from {}".format(
                    len(positions_to_sample),
                    orig_len))

    # skip regions that are filtered out
    if filter_out_tree:
        try:
            out_match = filter_out_tree.findOverlaps(chrom, start, end)
        except KeyError:
            out_match = []

        if len(out_match) > 0:
            for intval in out_match:
                # keep only positions strictly outside the filtered interval
                positions_to_sample = \
                    positions_to_sample[(positions_to_sample < intval[0]) | (positions_to_sample >= intval[1])]
    return positions_to_sample
+
+
def countReadsPerGC_wrapper(args):
    # Thin adapter: mapReduce delivers the worker arguments as one tuple;
    # unpack it for countReadsPerGC_worker.
    return countReadsPerGC_worker(*args)
+
+
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by
    (start, end), the GC content is quantified for
    regions of size regionSize that are contiguous

    Returns a list of (number of overlapping reads, GC fraction) tuples,
    one per sampled region.
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    # each worker opens its own file handles; the 2bit and BAM paths come
    # from the module-level global_vars dictionary
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    # iterate directly over the sampled positions (the former index-based
    # loop and the dead 'c' counter were removed)
    for i in positions_to_sample:
        # stop if region extends over the chromosome end
        if tbit.chroms(chromNameBit) < i + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
+
+
def tabulateGCcontent_wrapper(args):
    # Thin adapter: mapReduce delivers the worker arguments as one tuple;
    # unpack it for tabulateGCcontent_worker.
    return tabulateGCcontent_worker(*args)
+
+
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1, 4, 10, 10, 16, 18]
    which correspond to a GC of
    [1, 1, 1, 1, 2, 1]

    The evaluated position are
    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2, 1, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0 2 8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    # each worker opens its own handles; paths come from global_vars
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    # counts positions skipped because their read count reached 'max_reads'
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        # histogram of forward mapped-read start positions, offset by
        # start_pos so it can be indexed by (position - start_pos)
        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    c = 1
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            # fraction=False: integer GC count for the fragment starting at i
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if x.is_reverse is False and x.pos == i])
        else:
            num_reads = read_counts[index]

        # positions with excessive coverage are treated as artifacts ("peaks")
        # and excluded from the observed-read tally
        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / (endTime - countTime),
                       chromNameBit, start, end, stepSize))
        c += 1

    if verbose:
        endTime = time.time()
        # NOTE(review): 'index' is only bound if the loop above ran at least
        # once — confirm callers never submit empty regions with verbose=True
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
+
+
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize,
                      chromSizes, numberOfProcessors=None, verbose=False,
                      region=None):
    r"""
    Subdivides the genome or the reads into chunks to be analyzed in parallel
    using several processors. This codes handles the creation of
    workers that tabulate the GC content for small regions and then
    collects and integrates the results
    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[ 0. , 18. , 1. ],
           [ 3. , 63. , 0.45815996],
           [ 7. , 159. , 0.42358185],
           [ 25. , 192. , 1.25278115],
           [ 28. , 215. , 1.25301422],
           [ 16. , 214. , 0.71935396],
           [ 12. , 95. , 1.21532959],
           [ 9. , 24. , 3.60800971],
           [ 3. , 11. , 2.62400706],
           [ 0. , 0. , 1. ],
           [ 0. , 0. , 1. ]])
    """
    global global_vars

    # map BAM chromosome names back to the 2bit naming scheme
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    # keep only chromosomes known to both the BAM and the 2bit file
    # (dict membership test instead of building an intermediate list)
    chromSizes = [(k, v) for k, v in chromSizes if k in chrNameBamToBit]

    imap_res = mapReduce.mapReduce((stepSize,
                                    fragmentLength, chrNameBamToBit,
                                    verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # integrate the per-chunk histograms; explicit None initialization
    # replaces the former NameError-driven first-iteration trick and also
    # covers the case of an empty result list
    N_gc = None
    F_gc = None
    for subN_gc, subF_gc in imap_res:
        if N_gc is None:
            N_gc = subN_gc
            F_gc = subF_gc
        else:
            N_gc += subN_gc
            F_gc += subF_gc

    if F_gc is None or sum(F_gc) == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter")
    scaling = float(sum(N_gc)) / float(sum(F_gc))

    # observed/expected ratio per GC bin; bins without data default to 1
    R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling
                     if N_gc[x] and F_gc[x] > 0 else 1
                     for x in range(len(F_gc))])

    # columns: observed counts (F_gc), expected counts (N_gc), ratio (R_gc)
    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
+
+
def countReadsPerGC(regionSize, chrNameBitToBam, stepSize,
                    chromSizes, numberOfProcessors=None, verbose=False,
                    region=None):
    r"""
    Computes for a region of size regionSize, the GC of the region
    and the number of reads that overlap it.

    The work is distributed over chromosome chunks via mapReduce and the
    per-chunk lists are concatenated into one numpy array of
    (read count, GC fraction) rows.
    >>> test = Tester()
    >>> arg = test.testCountReadsPerGC()
    >>> reads_per_gc = countReadsPerGC(*arg)
    >>> reads_per_gc[0:5,:]
    array([[132. , 0.44 ],
           [132. , 0.44 ],
           [133. , 0.44 ],
           [134. , 0.43666667],
           [134. , 0.44 ]])
    """
    global global_vars

    # invert the 2bit -> BAM name mapping for the workers
    chrNameBamToBit = {bam_name: bit_name
                       for bit_name, bam_name in chrNameBitToBam.items()}
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    chunk_results = mapReduce.mapReduce((stepSize,
                                         regionSize, chrNameBamToBit,
                                         verbose),
                                        countReadsPerGC_wrapper,
                                        chromSizes,
                                        genomeChunkLength=chunkSize,
                                        numberOfProcessors=numberOfProcessors,
                                        region=region)

    # flatten the list of per-chunk (count, gc) lists
    collected = []
    for chunk in chunk_results:
        collected.extend(chunk)

    return np.asarray(collected)
+
+
def smooth(x, window_len=3):
    """
    *CURRENTLY* not being used
    smooths the values from the frequencies by taking the average
    of 'window_len' values. window_len has to be an odd number

    Edge positions, where a full window does not fit, keep their original
    value. Arrays shorter than 2 * window_len are returned unchanged.
    """
    # do not smooth small arrays
    if len(x) < window_len * 2:
        return x
    y = x[:]
    # integer division is required: in Python 3, (window_len - 1) / 2 is a
    # float and float slice indices raise a TypeError
    half_width = (window_len - 1) // 2
    for i in range(0, len(x)):
        # skip the edges where the window would run past the array bounds
        if i < half_width or i + half_width + 1 > len(x):
            continue
        y[i] = np.mean(x[i - half_width:i + half_width + 1])
    return y
+
+
def bin_by(x, y, nbins=10):
    """
    Group the values of ``x`` according to which of ``nbins`` equal-width
    bins of the interval [0, 1] the matching ``y`` value falls into.

    Returns a pair ``(grouped, edges)``: ``grouped`` is a list of arrays,
    one per bin, holding the selected ``x`` values; ``edges`` holds the
    left edge of each bin.
    """
    edges = np.linspace(0, 1, nbins + 1)
    # Bump the rightmost edge so y == 1 lands inside the last bin instead
    # of creating an extra one.
    edges[-1] += 1

    bin_ids = np.digitize(y, edges)

    grouped = [x[bin_ids == k] for k in range(1, len(edges))]

    # Callers only need the left edges, so drop the (shifted) right edge.
    return grouped, edges[:-1]
+
+
def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size):
    """
    Writes an interactive plotly HTML page with two stacked panels: a
    boxplot of read counts per GC-fraction bin (top) and the log2 of the
    observed/expected read-count ratio per GC fraction (bottom).

    Parameters
    ----------
    file_name : output HTML file name.
    frequencies : array as produced by tabulateGCcontent; column 2 holds
        the observed/expected ratio.
    reads_per_gc : (n, 2) array of (read count, GC fraction) rows.
    region_size : size in bases of the sampled regions (used in the title).
    """
    import plotly.offline as py
    import plotly.graph_objs as go
    import matplotlib.cbook as cbook

    # Lay out two subplots manually: axes 1 (top) and 2 (bottom).
    fig = go.Figure()
    fig['layout']['xaxis1'] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction")
    fig['layout']['yaxis1'] = dict(domain=[0.55, 1.0], anchor="x1", title="Number of reads")
    fig['layout']['xaxis2'] = dict(domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7])
    fig['layout']['yaxis2'] = dict(domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)")
    text = "reads per {} base region".format(region_size)
    annos = [{'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 1.0, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}]
    text = "normalized observed/expected read counts"
    annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 0.5, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False})

    # prepare data for boxplot: bin read counts by GC fraction and keep
    # only the 0.2-0.7 GC range shown in the figure
    reads, GC = reads_per_gc.T
    reads_per_gc, bin_labels = bin_by(reads, GC, nbins=100)
    to_keep = [idx for idx, x in enumerate(bin_labels) if 0.2 <= x <= 0.7]
    reads_per_gc = [reads_per_gc[x] for x in to_keep]
    bin_labels = [bin_labels[x] for x in to_keep]

    # produce the same boxplot as matplotlib as vastly reduce the output file size
    bins = []
    for b in reads_per_gc:
        # boxplot_stats gives the whisker/quartile/median summary per bin
        s = cbook.boxplot_stats(b)[0]
        bins.append([s['whislo'], s['q1'], s['q1'], s['med'], s['med'], s['med'], s['q3'], s['q3'], s['whishi']])

    data = []

    # top plot
    for x, y in zip(bin_labels, bins):
        trace = go.Box(x=x, y=y, xaxis='x1', yaxis='y1', boxpoints='outliers', showlegend=False, name="{}".format(x), line=dict(color='rgb(107,174,214)'))
        data.append(trace)

    # bottom plot
    x = np.linspace(0, 1, frequencies.shape[0])
    trace = go.Scatter(x=x, y=np.log2(frequencies[:, 2]), xaxis='x2', yaxis='y2', showlegend=False, line=dict(color='rgb(107,174,214)'))
    data.append(trace)
    fig.add_traces(data)
    fig['layout']['annotations'] = annos
    py.plot(fig, filename=file_name, auto_open=False)
+
+
def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=None):
    """
    Saves a static (matplotlib) two-panel GC-bias figure: a boxplot of
    read counts per GC-fraction bin on top and the log2 of the
    observed/expected read-count ratio per GC fraction below.

    Parameters
    ----------
    file_name : output image file name (or open file handle).
    frequencies : array from tabulateGCcontent; column 2 holds the
        observed/expected ratio.
    reads_per_gc : (n, 2) array of (read count, GC fraction) rows.
    region_size : size in bases of the sampled regions (used in the title).
    image_format : optional format string passed straight to savefig.
    """
    import matplotlib
    matplotlib.use('Agg')
    # Keep text editable (not outlined paths) in PDF and SVG output.
    matplotlib.rcParams['pdf.fonttype'] = 42
    matplotlib.rcParams['svg.fonttype'] = 'none'
    import matplotlib.pyplot as plt

    # prepare data for boxplot: bin read counts by GC fraction, keeping
    # only the 0.2-0.7 GC range shown in the figure
    reads, GC = reads_per_gc.T
    reads_per_gc, bin_labels = bin_by(reads, GC, nbins=100)
    to_keep = [idx for idx, x in enumerate(bin_labels) if 0.2 <= x <= 0.7]
    reads_per_gc = [reads_per_gc[x] for x in to_keep]
    bin_labels = [bin_labels[x] for x in to_keep]

    title = "reads per regions of {} bp".format(region_size)
    fig = plt.figure(figsize=(6, 8))
    ax1 = fig.add_subplot(211, title=title)
    ax2 = fig.add_subplot(212,
                          title='normalized observed/expected read counts')

    # make boxplot

    bp = ax1.boxplot(reads_per_gc, notch=0, patch_artist=True)
    plt.setp(bp['boxes'], color='black', facecolor='LightGreen')
    plt.setp(bp['medians'], color='black')
    plt.setp(bp['whiskers'], color='black', linestyle='dashed')
    plt.setp(bp['fliers'], marker='None')
    # get the whisker that spans the most, to size the y axis
    y_max = np.nanmax([x.get_data()[1][1] for x in bp['whiskers']])
    ax1.set_ylim(0 - (y_max * 0.05), y_max * 1.05)
    ax1.set_ylabel('Number of reads')
    ax1.set_xlabel('GC fraction')

    # label only every 10th GC percentage to keep the axis readable
    xticks = [idx for idx, x in enumerate(bin_labels) if int(x * 100) % 10 == 0]

    ax1.set_xticks(xticks)
    ax1.set_xticklabels(["{:.1f}".format(bin_labels[x]) for x in xticks])

    x = np.linspace(0, 1, frequencies.shape[0])
    y = np.log2(frequencies[:, 2])
    ax2.plot(x, y, color='#8c96f0')
    ax2.set_xlabel('GC fraction')
    ax2.set_ylabel('log2ratio observed/expected')
    ax2.set_xlim(0.2, 0.7)
    # y-limits from the plotted 0.2-0.7 window, padded by 10%
    y_max = max(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1])
    y_min = min(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1])
    if y_max > 0:
        y_max *= 1.1
    else:
        y_max *= 0.9
    if y_min < 0:
        y_min *= 1.1
    else:
        y_min *= 0.9
    ax2.set_ylim(y_min, y_max)
    plt.tight_layout()
    plt.savefig(file_name, bbox_inches='tight', dpi=100, format=image_format)
    plt.close()
+
+
def main(args=None):
    """
    Entry point for computeGCBias: sets up the shared ``global_vars``
    dict, tabulates the observed/expected GC frequencies over the genome
    (tabulateGCcontent), writes them with np.savetxt and, when requested,
    produces the GC-bias diagnostic plot.
    """
    args = parse_arguments().parse_args(args)

    # The extra-sampling BED is passed around by file name; close the
    # handle argparse opened for us.
    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    # global_vars is read by the worker functions spawned via mapReduce.
    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors)

    # Fragment length: taken from the command line when given, otherwise
    # estimated from the BAM file itself.
    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples, restricted to chromosomes present in
    # both the BAM header and the 2bit file
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()]

    # use a poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # varies depending on the gc content
    # and the global number of reads per bp may be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        # Sample regions more sparsely (stepSize * 10) for the plot.
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        if args.plotFileFormat == "plotly":
            plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize)
        else:
            plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
+
+
class Tester():
    """
    Fixture for the module's doctests.

    Points the module-level ``global_vars`` at the small test data set
    shipped with deepTools (test/test_corrGC) and builds the argument
    tuples expected by the functions under test.
    """

    def __init__(self):
        import os
        # Location of the bundled test data, relative to this module.
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
        self.tbitFile = self.root + "sequence.2bit"
        self.bamFile = self.root + "test.bam"
        self.mappability = self.root + "mappability.bw"
        self.chrNameBam = '2L'
        self.chrNameBit = 'chr2L'
        bam, mapped, unmapped, stats = bamHandler.openBam(self.bamFile, returnStats=True)
        tbit = py2bit.open(self.tbitFile)
        global debug
        debug = 0
        global global_vars
        # NOTE: the original dict literal listed 'min_reads' twice with the
        # same value; the redundant duplicate key was removed (no behavior
        # change, later entries simply overwrote the earlier one).
        global_vars = {'2bit': self.tbitFile,
                       'bam': self.bamFile,
                       'filter_out': None,
                       'mappability': self.mappability,
                       'extra_sampling_file': None,
                       'max_reads': 5,
                       'min_reads': 0,
                       'reads_per_bp': 0.3,
                       'total_reads': mapped,
                       'genome_size': sum(tbit.chroms().values())
                       }

    def testTabulateGCcontentWorker(self):
        """Argument tuple for tabulateGCcontentWorker on a tiny 20 bp window."""
        stepSize = 2
        fragmentLength = {'min': 1, 'median': 3, 'max': 5}
        start = 0
        end = 20
        chrNameBam2bit = {'2L': 'chr2L'}
        return (self.chrNameBam,
                start, end, stepSize, fragmentLength, chrNameBam2bit)

    def set_filter_out_file(self):
        """Enable the test blacklist (filter-out) BED file."""
        global global_vars
        global_vars['filter_out'] = self.root + "filter_out.bed"

    def unset_filter_out_file(self):
        """Disable region filtering again."""
        global global_vars
        global_vars['filter_out'] = None

    def set_extra_sampling_file(self):
        """Enable the extra-sampling BED file."""
        global global_vars
        global_vars['extra_sampling_file'] = self.root + "extra_sampling.bed"

    def testTabulateGCcontent(self):
        """Argument tuple for tabulateGCcontent over the whole test BAM."""
        fragmentLength = {'median': 10}
        chrNameBitToBam = {'chr2L': '2L'}
        stepSize = 1
        bam = bamHandler.openBam(global_vars['bam'])
        chromSizes = [(bam.references[i], bam.lengths[i])
                      for i in range(len(bam.references))]
        return (fragmentLength,
                chrNameBitToBam, stepSize, chromSizes, 1)

    def testCountReadsPerGC(self):
        """Argument tuple for countReadsPerGC over the whole test BAM."""
        regionSize = 300
        chrNameBitToBam = {'chr2L': '2L'}
        stepSize = 1
        bam = bamHandler.openBam(global_vars['bam'])
        chromSizes = [(bam.references[i], bam.lengths[i])
                      for i in range(len(bam.references))]
        return (regionSize,
                chrNameBitToBam, stepSize, chromSizes, 1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deepTools/source/deeptools/computeMatrix.py b/deepTools/source/deeptools/computeMatrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..02cc1fd4ed3d294418fb86c5813b538986b2fc59
--- /dev/null
+++ b/deepTools/source/deeptools/computeMatrix.py
@@ -0,0 +1,429 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+from deeptools.parserCommon import writableFile, numberOfProcessors
+from deeptools import parserCommon
+from deeptools import heatmapper
+import deeptools.computeMatrixOperations as cmo
+from importlib.metadata import version
+
+
def parse_arguments(args=None):
    """
    Builds the top-level computeMatrix argument parser with the two
    subcommands 'scale-regions' and 'reference-point', each assembled
    from the shared required/output/optional argument groups plus the
    GTF options from parserCommon.
    """
    parser = \
        argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="""

This tool calculates scores per genome regions and prepares an intermediate file that can be used with ``plotHeatmap`` and ``plotProfiles``.
Typically, the genome regions are genes, but any other regions defined in a BED file can be used.
computeMatrix accepts multiple score files (bigWig format) and multiple regions files (BED format).
This tool can also be used to filter and sort regions according
to their score.

To learn more about the specific parameters, type:

$ computeMatrix reference-point --help or

$ computeMatrix scale-regions --help

""",
            epilog='An example usage is:\n computeMatrix reference-point -S '
                   ' -R -b 1000\n \n')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    subparsers = parser.add_subparsers(
        title='Commands',
        dest='command',
        metavar='')

    # scale-regions mode options
    subparsers.add_parser(
        'scale-regions',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[
            computeMatrixRequiredArgs(),
            computeMatrixOutputArgs(),
            computeMatrixOptArgs(case='scale-regions'),
            parserCommon.gtf_options()
        ],
        help="In the scale-regions mode, all regions in the BED file are "
             "stretched or shrunken to the length (in bases) indicated by the user.",
        usage='An example usage is:\n computeMatrix scale-regions -S '
              ' -R -b 1000\n\n')

    # reference point arguments
    subparsers.add_parser(
        'reference-point',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[computeMatrixRequiredArgs(),
                 computeMatrixOutputArgs(),
                 computeMatrixOptArgs(case='reference-point'),
                 parserCommon.gtf_options()
                 ],
        help="Reference-point refers to a position within a BED region "
             "(e.g., the starting point). In this mode, only those genomic"
             "positions before (upstream) and/or after (downstream) of the "
             "reference point will be plotted.",
        usage='An example usage is:\n computeMatrix reference-point -S '
              ' -R -a 3000 -b 3000\n\n')

    return parser
+
+
def computeMatrixRequiredArgs(args=None):
    """Parser fragment with the arguments both computeMatrix modes must receive."""
    parser = argparse.ArgumentParser(add_help=False)
    mandatory = parser.add_argument_group('Required arguments')
    mandatory.add_argument('--regionsFileName', '-R',
                           required=True,
                           nargs='+',
                           metavar='File',
                           help='File name or names, in BED or GTF format, containing '
                                'the regions to plot. If multiple bed files are given, each one is considered a '
                                'group that can be plotted separately. Also, adding a "#" symbol in the bed file '
                                'causes all the regions until the previous "#" to be considered one group.')
    mandatory.add_argument('--scoreFileName', '-S',
                           required=True,
                           nargs='+',
                           metavar='File',
                           help='bigWig file(s) containing '
                                'the scores to be plotted. Multiple files should be separated by spaced. BigWig '
                                'files can be obtained by using the bamCoverage '
                                'or bamCompare tools. More information about '
                                'the bigWig file format can be found at '
                                'http://genome.ucsc.edu/goldenPath/help/bigWig.html ')
    return parser
+
+
def computeMatrixOutputArgs(args=None):
    """Parser fragment with the output-file options shared by both modes."""
    parser = argparse.ArgumentParser(add_help=False)
    out_group = parser.add_argument_group('Output options')

    out_group.add_argument('--outFileName', '-out', '-o',
                           required=True,
                           type=writableFile,
                           help='File name to save the gzipped matrix file '
                                'needed by the "plotHeatmap" and "plotProfile" tools.')

    out_group.add_argument('--outFileNameMatrix',
                           type=writableFile,
                           metavar='FILE',
                           help='If this option is given, then the matrix '
                                'of values underlying the heatmap will be saved '
                                'using the indicated name, e.g. IndividualValues.tab.'
                                'This matrix can easily be loaded into R or '
                                'other programs.')

    out_group.add_argument('--outFileSortedRegions',
                           type=argparse.FileType('w'),
                           metavar='BED file',
                           help='File name in which the regions are saved '
                                'after skiping zeros or min/max threshold values. The '
                                'order of the regions in the file follows the sorting '
                                'order selected. This is useful, for example, to '
                                'generate other heatmaps keeping the sorting of the '
                                'first heatmap. Example: Heatmap1sortedRegions.bed')
    return parser
+
+
def computeMatrixOptArgs(case=['scale-regions', 'reference-point'][0]):
    """
    Parser fragment with the optional arguments for computeMatrix.

    The ``case`` string selects the mode-specific options:
    'scale-regions' exposes body length, start/end labels and the
    unscaled 5'/3' lengths; 'reference-point' exposes the reference
    point, flank lengths and --nanAfterEnd (and suppresses the
    scale-regions-only options with neutral defaults so downstream code
    can read them unconditionally).
    """
    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))

    if case == 'scale-regions':
        optional.add_argument('--regionBodyLength', '-m',
                              default=1000,
                              type=int,
                              help='Distance in bases to which all regions will '
                              'be fit. (Default: %(default)s)')
        optional.add_argument('--startLabel',
                              default='TSS',
                              help='Label shown in the plot for the start of '
                              'the region. Default is TSS (transcription '
                              'start site), but could be changed to anything, '
                              'e.g. "peak start". Note that this is only '
                              'useful if you plan to plot the results yourself '
                              'and not, for example, with plotHeatmap, which '
                              'will override this. (Default: %(default)s)')
        optional.add_argument('--endLabel',
                              default='TES',
                              help='Label shown in the plot for the region '
                              'end. Default is TES (transcription end site). '
                              'See the --startLabel option for more '
                              'information. (Default: %(default)s) ')
        optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
                              default=0,
                              type=int,
                              help='Distance upstream of the start site of '
                              'the regions defined in the region file. If the '
                              'regions are genes, this would be the distance '
                              'upstream of the transcription start site. (Default: %(default)s)')
        optional.add_argument('--afterRegionStartLength', '-a', '--downstream',
                              default=0,
                              type=int,
                              help='Distance downstream of the end site '
                              'of the given regions. If the '
                              'regions are genes, this would be the distance '
                              'downstream of the transcription end site. (Default: %(default)s)')
        optional.add_argument("--unscaled5prime",
                              default=0,
                              type=int,
                              help='Number of bases at the 5-prime end of the '
                              'region to exclude from scaling. By default, '
                              'each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary. (Default: %(default)s)')
        optional.add_argument("--unscaled3prime",
                              default=0,
                              type=int,
                              help='Like --unscaled5prime, but for the 3-prime '
                              'end. (Default: %(default)s)')

    elif case == 'reference-point':
        optional.add_argument('--referencePoint',
                              default='TSS',
                              choices=['TSS', 'TES', 'center'],
                              help='The reference point for the plotting '
                              'could be either the region start (TSS), the '
                              'region end (TES) or the center of the region. '
                              'Note that regardless of what you specify, '
                              'plotHeatmap/plotProfile will default to using "TSS" as the '
                              'label. (Default: %(default)s)')

        # set region body length to zero for reference point mode
        optional.add_argument('--regionBodyLength', help=argparse.SUPPRESS,
                              default=0, type=int)
        optional.add_argument('--unscaled5prime', default=0, type=int, help=argparse.SUPPRESS)
        optional.add_argument('--unscaled3prime', default=0, type=int, help=argparse.SUPPRESS)
        optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
                              default=500,
                              type=int,
                              metavar='INT bp',
                              help='Distance upstream of the reference-point '
                              'selected. (Default: %(default)s)')
        optional.add_argument('--afterRegionStartLength', '-a', '--downstream',
                              default=1500,
                              metavar='INT bp',
                              type=int,
                              help='Distance downstream of the '
                              'reference-point selected. (Default: %(default)s)')
        optional.add_argument('--nanAfterEnd',
                              action='store_true',
                              help='If set, any values after the region end '
                              'are discarded. This is useful to visualize '
                              'the region end when not using the '
                              'scale-regions mode and when the reference-'
                              'point is set to the TSS.')

    # options below are shared by both modes
    optional.add_argument('--binSize', '-bs',
                          help='Length, in bases, of the non-overlapping '
                          'bins for averaging the score over the '
                          'regions length. (Default: %(default)s)',
                          type=int,
                          default=10)

    optional.add_argument('--sortRegions',
                          help='Whether the output file should present the '
                          'regions sorted. The default is to not sort the regions. '
                          'Note that this is only useful if you plan to plot '
                          'the results yourself and not, for example, with '
                          'plotHeatmap, which will override this. Note also that '
                          'unsorted output will be in whatever order the regions '
                          'happen to be processed in and not match the order in '
                          'the input files. If you require the output order to '
                          'match that of the input regions, then either specify '
                          '"keep" or use computeMatrixOperations to resort the '
                          'results file. (Default: %(default)s)',
                          choices=["descend", "ascend", "no", "keep"],
                          default='keep')

    optional.add_argument('--sortUsing',
                          help='Indicate which method should be used for '
                          'sorting. The value is computed for each row.'
                          'Note that the region_length option will lead '
                          'to a dotted line within the heatmap that indicates '
                          'the end of the regions. (Default: %(default)s)',
                          choices=["mean", "median", "max", "min", "sum",
                                   "region_length"],
                          default='mean')

    optional.add_argument('--sortUsingSamples',
                          help='List of sample numbers (order as in matrix), '
                          'that are used for sorting by --sortUsing, '
                          'no value uses all samples, '
                          'example: --sortUsingSamples 1 3',
                          type=int, nargs='+')

    optional.add_argument('--averageTypeBins',
                          default='mean',
                          choices=["mean", "median", "min",
                                   "max", "std", "sum"],
                          help='Define the type of statistic that should be '
                          'used over the bin size range. The '
                          'options are: "mean", "median", "min", "max", "sum" '
                          'and "std". The default is "mean". (Default: %(default)s)')

    optional.add_argument('--missingDataAsZero',
                          help='If set, missing data (NAs) will be treated as zeros. '
                          'The default is to ignore such cases, which will be depicted as black areas in '
                          'a heatmap. (see the --missingDataColor argument '
                          'of the plotHeatmap command for additional options).',
                          action='store_true')

    optional.add_argument('--skipZeros',
                          help='Whether regions with only scores of zero '
                          'should be included or not. Default is to include '
                          'them.',
                          action='store_true')

    optional.add_argument('--minThreshold',
                          default=None,
                          type=float,
                          help='Numeric value. Any region containing a '
                          'value that is less than or equal to this '
                          'will be skipped. This is useful to skip, '
                          'for example, genes where the read count is zero '
                          'for any of the bins. This could be the result of '
                          'unmappable areas and can bias the overall results. (Default: %(default)s)')

    optional.add_argument('--maxThreshold',
                          default=None,
                          type=float,
                          help='Numeric value. Any region containing a value '
                          'greater than or equal to this '
                          'will be skipped. The maxThreshold is useful to '
                          'skip those few regions with very high read counts '
                          '(e.g. micro satellites) that may bias the average '
                          'values. (Default: %(default)s)')

    optional.add_argument('--blackListFileName', '-bl',
                          help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
                          metavar="BED file",
                          required=False)

    optional.add_argument('--samplesLabel',
                          help='Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The '
                          'default is to use the file name of the '
                          'sample. The sample labels should be separated '
                          'by spaces and quoted if a label itself'
                          'contains a space E.g. --samplesLabel label-1 "label 2" ',
                          nargs='+')

    optional.add_argument('--smartLabels',
                          action='store_true',
                          help='Instead of manually specifying labels for the input '
                          'bigWig and BED/GTF files, this causes deepTools to use the file name '
                          'after removing the path and extension.')

    # in contrast to other tools,
    # computeMatrix by default outputs
    # messages and the --quiet flag supresses them
    optional.add_argument('--quiet', '-q',
                          help='Set to remove any warning or processing '
                          'messages.',
                          action='store_true')

    optional.add_argument('--verbose',
                          help='Being VERY verbose in the status messages. --quiet will disable this.',
                          action='store_true')

    optional.add_argument('--scale',
                          help='If set, all values are multiplied by '
                          'this number. (Default: %(default)s)',
                          type=float,
                          default=1)
    optional.add_argument('--numberOfProcessors', '-p',
                          help='Number of processors to use. Type "max/2" to '
                          'use half the maximum number of processors or "max" '
                          'to use all available processors. (Default: %(default)s)',
                          metavar="INT",
                          type=numberOfProcessors,
                          default=1,
                          required=False)
    return parser
+
+
def process_args(args=None):
    """
    Parse and sanity-check the command line arguments.

    Negative flank lengths are converted to their absolute value, --quiet
    disables --verbose, and mode-specific defaults are filled in
    (scale-regions gets no reference point; reference-point requires at
    least one non-zero flank).

    Returns the validated argparse.Namespace.
    """
    args = parse_arguments().parse_args(args)

    # When invoked with no arguments at all, print the help text instead
    # of a terse argparse error.
    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    if args.quiet is True:
        args.verbose = False

    # Ensure before and after region length is positive
    if args.beforeRegionStartLength < 0:
        print(f"beforeRegionStartLength changed from {args.beforeRegionStartLength} into {abs(args.beforeRegionStartLength)}")
        args.beforeRegionStartLength = abs(args.beforeRegionStartLength)
    if args.afterRegionStartLength < 0:
        print(f"afterRegionStartLength changed from {args.afterRegionStartLength} into {abs(args.afterRegionStartLength)}")
        args.afterRegionStartLength = abs(args.afterRegionStartLength)

    if args.command == 'scale-regions':
        # These attributes only exist in reference-point mode; give them
        # neutral values so downstream code can read them unconditionally.
        args.nanAfterEnd = False
        args.referencePoint = None
    elif args.command == 'reference-point':
        if args.beforeRegionStartLength == 0 and \
                args.afterRegionStartLength == 0:
            # Fixed typo in the user-facing message ("Upstrean" -> "Upstream").
            sys.exit("\nUpstream and downstream regions are both "
                     "set to 0. Nothing to output. Maybe you want to "
                     "use the scale-regions mode?\n")

    return args
+
+
def main(args=None):
    """
    Entry point for computeMatrix: computes the score matrix over the
    requested regions via heatmapper, optionally sorts it, and writes the
    gzipped matrix plus the optional tab-separated matrix and BED of the
    regions actually used.
    """
    args = process_args(args)

    # Matrix-computation settings in the dict layout heatmapper expects.
    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing,
                  'unscaled 5 prime': args.unscaled5prime,
                  'unscaled 3 prime': args.unscaled3prime
                  }

    hm = heatmapper.heatmapper()

    scores_file_list = args.scoreFileName
    hm.computeMatrix(scores_file_list, args.regionsFileName, parameters, blackListFileName=args.blackListFileName, verbose=args.verbose, allArgs=args)
    if args.sortRegions not in ['no', 'keep']:
        # Translate the 1-based sample numbers from the command line into
        # 0-based indices, validating the allowed range.
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. Only values from 1 to {1} are allowed.".format(args.sortUsingSamples, hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing, sort_method=args.sortRegions, sample_list=sortUsingSamples)
    elif args.sortRegions == 'keep':
        # Keep the input region order: resort the matrix to match the
        # region files. NOTE(review): args.transcriptID and
        # args.transcript_id_designator presumably come from
        # parserCommon.gtf_options() — confirm against that parser.
        hm.parameters['group_labels'] = hm.matrix.group_labels
        hm.parameters["group_boundaries"] = hm.matrix.group_boundaries
        cmo.sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator, verbose=not args.quiet)

    hm.save_matrix(args.outFileName)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)
diff --git a/deepTools/source/deeptools/computeMatrixOperations.py b/deepTools/source/deeptools/computeMatrixOperations.py
new file mode 100644
index 0000000000000000000000000000000000000000..0224f00a39d746c8e0cb04169a0c7c6fa04012e1
--- /dev/null
+++ b/deepTools/source/deeptools/computeMatrixOperations.py
@@ -0,0 +1,852 @@
+#!/usr/bin/env python
+import deeptools.heatmapper as heatmapper
+import deeptoolsintervals.parse as dti
+import numpy as np
+import argparse
+import sys
+import os
+import csv
+from importlib.metadata import version
+
+
+def parse_arguments():
+    """
+    Build the top-level computeMatrixOperations argument parser.
+
+    Each subcommand (info, relabel, subset, filterStrand, filterValues,
+    rbind, cbind, sort, dataRange) is registered as a subparser whose
+    options come from the shared *Args() builder functions below.
+    Returns the configured argparse.ArgumentParser.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="""
+This tool performs a variety of operations on files produced by computeMatrix.
+
+detailed help:
+
+  computeMatrixOperations info -h
+
+or
+
+  computeMatrixOperations relabel -h
+
+or
+
+  computeMatrixOperations subset -h
+
+or
+
+  computeMatrixOperations filterStrand -h
+
+or
+
+  computeMatrixOperations filterValues -h
+
+or
+
+  computeMatrixOperations rbind -h
+
+or
+
+  computeMatrixOperations cbind -h
+
+or
+  computeMatrixOperations sort -h
+
+or
+  computeMatrixOperations dataRange -h
+
+""",
+        epilog='example usages:\n'
+               'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n'
+               ' \n\n')
+
+    # args.command is set to the chosen subcommand name; main() dispatches on it
+    subparsers = parser.add_subparsers(
+        title='Commands',
+        dest='command',
+        metavar='')
+
+    # info
+    subparsers.add_parser(
+        'info',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs()],
+        help="Print group and sample information",
+        usage='An example usage is:\n  computeMatrixOperations info -m input.mat.gz\n\n')
+
+    # relabel
+    subparsers.add_parser(
+        'relabel',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs(), relabelArgs()],
+        help="Change sample and/or group label information",
+        usage='An example usage is:\n  computeMatrixOperations relabel -m input.mat.gz -o output.mat.gz --sampleLabels "sample 1" "sample 2"\n\n')
+
+    # subset
+    subparsers.add_parser(
+        'subset',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs(), subsetArgs()],
+        help="Actually subset the matrix. The group and sample orders are honored, so one can also reorder files.",
+        usage='An example usage is:\n  computeMatrixOperations subset -m '
+              'input.mat.gz -o output.mat.gz --groups "group 1" "group 2" '
+              '--samples "sample 3" "sample 10"\n\n')
+
+    # filterStrand
+    subparsers.add_parser(
+        'filterStrand',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs(), filterStrandArgs()],
+        help="Filter entries by strand.",
+        usage='Example usage:\n  computeMatrixOperations filterStrand -m '
+              'input.mat.gz -o output.mat.gz --strand +\n\n')
+
+    # filterValues
+    subparsers.add_parser(
+        'filterValues',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs(), filterValuesArgs()],
+        help="Filter entries by min/max value.",
+        usage='Example usage:\n  computeMatrixOperations filterValues -m '
+              'input.mat.gz -o output.mat.gz --min 10 --max 1000\n\n')
+
+    # rbind
+    subparsers.add_parser(
+        'rbind',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[bindArgs()],
+        help="merge multiple matrices by concatenating them head to tail. This assumes that the same samples are present in each in the same order.",
+        usage='Example usage:\n  computeMatrixOperations rbind -m '
+              'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n')
+
+    # cbind
+    subparsers.add_parser(
+        'cbind',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[bindArgs()],
+        help="merge multiple matrices by concatenating them left to right. No assumptions are made about the row order. Regions not present in the first file specified are ignored. Regions missing in subsequent files will result in NAs. Regions are matches based on the first 6 columns of the computeMatrix output (essentially the columns in a BED file).",
+        usage='Example usage:\n  computeMatrixOperations cbind -m '
+              'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n')
+
+    # sort
+    subparsers.add_parser(
+        'sort',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[sortArgs()],
+        help='Sort a matrix file to correspond to the order of entries in the desired input file(s). The groups of regions designated by the files must be present in the order found in the output of computeMatrix (otherwise, use the subset command first). Note that this subcommand can also be used to remove unwanted regions, since regions not present in the input file(s) will be omitted from the output.',
+        usage='Example usage:\n  computeMatrixOperations sort -m input.mat.gz -R regions1.bed regions2.bed regions3.gtf -o input.sorted.mat.gz\n\n')
+
+    # dataRange
+    subparsers.add_parser(
+        'dataRange',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs()],
+        help='Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.',
+        usage='Example usage:\n  computeMatrixOperations dataRange -m input.mat.gz\n\n')
+
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s {}'.format(version('deeptools')))
+
+    return parser
+
+
+def bindArgs():
+    """Build the shared option parser for the rbind/cbind subcommands."""
+    parser = argparse.ArgumentParser(add_help=False)
+    group = parser.add_argument_group('Required arguments')
+
+    group.add_argument('--matrixFile', '-m',
+                       nargs='+',
+                       required=True,
+                       help='Matrix files from the computeMatrix tool.')
+
+    group.add_argument('--outFileName', '-o',
+                       required=True,
+                       help='Output file name')
+
+    return parser
+
+
+def infoArgs():
+    """Build the shared option parser providing --matrixFile/-m."""
+    parser = argparse.ArgumentParser(add_help=False)
+    group = parser.add_argument_group('Required arguments')
+
+    group.add_argument('--matrixFile', '-m',
+                       required=True,
+                       help='Matrix file from the computeMatrix tool.')
+
+    return parser
+
+
+def relabelArgs():
+    """Build the option parser for the relabel subcommand."""
+    parser = argparse.ArgumentParser(add_help=False)
+    required = parser.add_argument_group('Required arguments')
+    required.add_argument('--outFileName', '-o',
+                          required=True,
+                          help='Output file name')
+
+    optional = parser.add_argument_group('Optional arguments')
+    optional.add_argument('--groupLabels',
+                          nargs='+',
+                          help="Groups labels. If none are specified then the current labels will be kept.")
+    optional.add_argument('--sampleLabels',
+                          nargs='+',
+                          help="Sample labels. If none are specified then the current labels will be kept.")
+
+    return parser
+
+
+def subsetArgs():
+    """Build the option parser for the subset subcommand."""
+    parser = argparse.ArgumentParser(add_help=False)
+    required = parser.add_argument_group('Required arguments')
+    required.add_argument('--outFileName', '-o',
+                          required=True,
+                          help='Output file name')
+
+    optional = parser.add_argument_group('Optional arguments')
+    optional.add_argument('--groups',
+                          nargs='+',
+                          help="Groups to include. If none are specified then all will be included.")
+    optional.add_argument('--samples',
+                          nargs='+',
+                          help="Samples to include. If none are specified then all will be included.")
+
+    return parser
+
+
+def filterStrandArgs():
+    """Build the option parser for the filterStrand subcommand."""
+    parser = argparse.ArgumentParser(add_help=False)
+    group = parser.add_argument_group('Required arguments')
+
+    group.add_argument('--outFileName', '-o',
+                       required=True,
+                       help='Output file name')
+
+    group.add_argument('--strand', '-s',
+                       choices=['+', '-', '.'],
+                       required=True,
+                       help='Strand')
+
+    return parser
+
+
+def filterValuesArgs():
+    """Build the option parser for the filterValues subcommand."""
+    parser = argparse.ArgumentParser(add_help=False)
+    required = parser.add_argument_group('Required arguments')
+    required.add_argument('--outFileName', '-o',
+                          required=True,
+                          help='Output file name')
+
+    optional = parser.add_argument_group('Optional arguments')
+    optional.add_argument('--min',
+                          type=float,
+                          default=None,
+                          help='Minimum value. Any row having a single entry less than this will be excluded. The default is no minimum.')
+    optional.add_argument('--max',
+                          type=float,
+                          default=None,
+                          help='Maximum value. Any row having a single entry more than this will be excluded. The default is no maximum.')
+
+    return parser
+
+
+def sortArgs():
+    """
+    Build the option parser for the sort subcommand: the matrix to sort,
+    the output name, the region file(s) defining the desired order, and
+    the GTF feature/attribute keys used to identify transcripts.
+    """
+    parser = argparse.ArgumentParser(add_help=False)
+    required = parser.add_argument_group('Required arguments')
+
+    required.add_argument('--matrixFile', '-m',
+                          help='Matrix file from the computeMatrix tool.',
+                          required=True)
+
+    required.add_argument('--outFileName', '-o',
+                          help='Output file name',
+                          required=True)
+
+    required.add_argument('--regionsFileName', '-R',
+                          help='File name(s), in BED or GTF format, containing the regions. '
+                          'If multiple bed files are given, each one is '
+                          'considered a group that can be plotted separately. '
+                          'Also, adding a "#" symbol in the bed file causes all '
+                          'the regions until the previous "#" to be considered '
+                          'one group. Alternatively for BED files, putting '
+                          'deepTools_group in the header can be used to indicate a '
+                          'column with group labels. Note that these should be '
+                          'sorted such that all group entries are together.',
+                          required=True,
+                          nargs='+')
+
+    optional = parser.add_argument_group('Optional arguments')
+
+    optional.add_argument('--transcriptID',
+                          default='transcript',
+                          help='When a GTF file is used to provide regions, only '
+                          'entries with this value as their feature (column 3) '
+                          'will be processed as transcripts. (Default: %(default)s)')
+
+    optional.add_argument('--transcript_id_designator',
+                          default='transcript_id',
+                          help='Each region has an ID (e.g., ACTB) assigned to it, '
+                          'which for BED files is either column 4 (if it exists) '
+                          'or the interval bounds. For GTF files this is instead '
+                          'stored in the last column as a key:value pair (e.g., as '
+                          '\'transcript_id "ACTB"\', for a key of transcript_id '
+                          'and a value of ACTB). In some cases it can be '
+                          'convenient to use a different identifier. To do so, set '
+                          'this to the desired key. (Default: %(default)s)')
+
+    return parser
+
+
+def printInfo(matrix):
+    """
+    Print the groups and samples
+    """
+    print("Groups:")
+    for label in matrix.matrix.group_labels:
+        print("\t{0}".format(label))
+
+    print("Samples:")
+    for label in matrix.matrix.sample_labels:
+        print("\t{0}".format(label))
+
+
+def printDataRange(matrix):
+    """
+    Prints the min, max, median, 10th and 90th percentile of the matrix values per sample.
+    """
+    print("Samples\tMin\tMax\tMedian\t10th\t90th")
+    boundaries = matrix.matrix.sample_boundaries
+    for idx, label in enumerate(matrix.matrix.sample_labels):
+        # The columns for this sample span [boundaries[idx], boundaries[idx + 1])
+        chunk = matrix.matrix.matrix[..., boundaries[idx]:boundaries[idx + 1]]
+        fields = [label,
+                  np.amin(chunk),
+                  np.amax(chunk),
+                  np.ma.median(chunk),
+                  np.percentile(chunk, 10),
+                  np.percentile(chunk, 90)]
+        print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(*fields))
+
+
+def relabelMatrix(matrix, args):
+    """
+    Replace the group and/or sample labels with user-supplied values.
+
+    Exits with an error message if the number of supplied labels does not
+    match the number currently stored in the matrix.
+    """
+    if args.groupLabels:
+        nGroups = len(matrix.matrix.group_labels)
+        if len(args.groupLabels) != nGroups:
+            sys.exit("You specified {} group labels, but {} are required.\n".format(len(args.groupLabels), nGroups))
+        matrix.matrix.group_labels = args.groupLabels
+    if args.sampleLabels:
+        nSamples = len(matrix.matrix.sample_labels)
+        if len(args.sampleLabels) != nSamples:
+            sys.exit("You specified {} sample labels, but {} are required.\n".format(len(args.sampleLabels), nSamples))
+        matrix.matrix.sample_labels = args.sampleLabels
+
+
+def getGroupBounds(args, matrix):
+    """
+    Given the group labels, return an indexing array and the resulting boundaries
+    """
+    bounds = matrix.parameters['group_boundaries']
+    if args.groups is None:
+        # No subsetting requested: keep every row and the boundaries as-is
+        return range(0, matrix.matrix.matrix.shape[0]), np.array(bounds)
+
+    rows = list()
+    sizes = [0]
+    for group in args.groups:
+        if group not in matrix.matrix.group_labels:
+            sys.exit("Error: '{0}' is not a valid group\n".format(group))
+        idx = matrix.matrix.group_labels.index(group)
+        rows.extend(range(bounds[idx], bounds[idx + 1]))
+        sizes.append(bounds[idx + 1] - bounds[idx])
+    # Cumulative group sizes form the new boundary list
+    return rows, np.cumsum(sizes)
+
+
+def getSampleBounds(args, matrix):
+    """
+    Given the sample labels, return an indexing array
+    """
+    bounds = matrix.parameters['sample_boundaries']
+    if args.samples is None:
+        # No subsetting requested: keep every column
+        return np.arange(0, matrix.matrix.matrix.shape[1])
+
+    cols = list()
+    for sample in args.samples:
+        if sample not in matrix.matrix.sample_labels:
+            sys.exit("Error: '{0}' is not a valid sample\n".format(sample))
+        idx = matrix.matrix.sample_labels.index(sample)
+        cols.extend(range(bounds[idx], bounds[idx + 1]))
+    return cols
+
+
+def subsetRegions(hm, bounds):
+    """
+    Return the regions selected by the given row indices, converting any
+    dict-style entries into the list form used elsewhere.
+    """
+    out = []
+    for rowIdx in bounds:
+        reg = hm.matrix.regions[rowIdx]
+        # we need to add a list of [chrom, [(start, end), (start, end)], name, 0, strand, score)]
+        if isinstance(reg, dict):
+            # This happens on occasion
+            starts = [int(v) for v in reg["start"].split(",")]
+            ends = [int(v) for v in reg["end"].split(",")]
+            intervals = list(zip(starts, ends))
+            out.append([reg["chrom"], intervals, reg["name"], 0, reg["strand"], reg["score"]])
+        else:
+            out.append(reg)
+    return out
+
+
+def filterHeatmap(hm, args):
+    """
+    Keep only rows whose region strand (field 4) equals args.strand, updating
+    the matrix, the region list and the group boundaries in place.
+    """
+    keep = np.array([region[4] == args.strand for region in hm.matrix.regions])
+    regions = [region for region in hm.matrix.regions if region[4] == args.strand]
+
+    # Recompute the boundaries from the number of surviving rows per group
+    oldBounds = hm.matrix.group_boundaries
+    bounds = [0]
+    for idx in range(1, len(oldBounds)):
+        kept = int(np.sum(keep[oldBounds[idx - 1]:oldBounds[idx]]))
+        bounds.append(bounds[-1] + kept)
+
+    hm.matrix.group_boundaries = bounds
+
+    # subset the matrix
+    hm.matrix.matrix = hm.matrix.matrix[keep, :]
+    hm.matrix.regions = regions
+
+
+def filterHeatmapValues(hm, minVal, maxVal):
+    """
+    Drop any row containing a value below minVal or above maxVal, updating the
+    matrix, the region list and the group boundaries in place.
+
+    Rows that are entirely NaN are always kept (their nanmin/nanmax is NaN, so
+    no finite value violates the thresholds).
+    """
+    import warnings
+
+    bounds = [0]
+    regions = []
+    keep = []
+    if minVal is None:
+        minVal = -np.inf
+    if maxVal is None:
+        maxVal = np.inf
+    # Bug fix: np.warnings was an accidental alias removed in NumPy 1.25, so
+    # np.warnings.filterwarnings raised AttributeError there. Use the stdlib
+    # warnings module to silence the all-NaN RuntimeWarning from nanmin/nanmax.
+    warnings.filterwarnings('ignore')
+    for i, (x, y) in enumerate(zip(np.nanmin(hm.matrix.matrix, axis=1), np.nanmax(hm.matrix.matrix, axis=1))):
+        # x/y will be nan iff a row is entirely nan. Don't filter.
+        if np.isnan(x) or (x >= minVal and y <= maxVal):
+            keep.append(True)
+            regions.append(hm.matrix.regions[i])
+        else:
+            keep.append(False)
+    keep = np.array(keep)
+
+    # Get the new bounds
+    for idx in range(1, len(hm.matrix.group_boundaries)):
+        i = int(np.sum(keep[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]]))
+        bounds.append(bounds[idx - 1] + i)
+
+    hm.matrix.group_boundaries = bounds
+
+    # subset the matrix
+    hm.matrix.matrix = hm.matrix.matrix[keep, :]
+    hm.matrix.regions = regions
+
+
+def insertMatrix(hm, hm2, groupName):
+    """
+    Given two heatmapper objects and a region group name, insert the regions and
+    values from hm2 for that group to the end of those for hm.
+    """
+    # Insertion point: the end of the group inside hm
+    groupIdx = hm.parameters["group_labels"].index(groupName)
+    insertAt = hm.parameters["group_boundaries"][groupIdx + 1]
+    # Row span of the same group inside hm2
+    srcIdx = hm2.parameters["group_labels"].index(groupName)
+    srcStart = hm2.parameters["group_boundaries"][srcIdx]
+    srcEnd = hm2.parameters["group_boundaries"][srcIdx + 1]
+
+    # Splice the hm2 rows into hm along axis 0
+    hm.matrix.matrix = np.insert(hm.matrix.matrix, insertAt, hm2.matrix.matrix[srcStart:srcEnd, :], axis=0)
+
+    # Splice the matching regions into the same position
+    hm.matrix.regions[insertAt:insertAt] = hm2.matrix.regions[srcStart:srcEnd]
+
+    # Every boundary after this group shifts by the number of inserted rows
+    nInserted = srcEnd - srcStart
+    hm.parameters["group_boundaries"] = [b + nInserted if i > groupIdx else b
+                                         for i, b in enumerate(hm.parameters["group_boundaries"])]
+
+
+def appendMatrix(hm, hm2, groupName):
+    """
+    Given two heatmapper objects and a region group name, append the values from
+    that group in hm2 onto the end of hm.
+    """
+    # Row span of the group inside hm2
+    srcIdx = hm2.parameters["group_labels"].index(groupName)
+    srcStart = hm2.parameters["group_boundaries"][srcIdx]
+    srcEnd = hm2.parameters["group_boundaries"][srcIdx + 1]
+
+    # Stack the group's rows beneath the existing matrix
+    hm.matrix.matrix = np.concatenate([hm.matrix.matrix, hm2.matrix.matrix[srcStart:srcEnd, :]], axis=0)
+    # The appended group ends (srcEnd - srcStart) rows after the old last boundary
+    hm.parameters["group_boundaries"].append(hm.parameters["group_boundaries"][-1] + srcEnd - srcStart)
+    # Carry the region descriptions over as well
+    hm.matrix.regions.extend(hm2.matrix.regions[srcStart:srcEnd])
+
+
+def rbindMatrices(hm, args):
+    """
+    Bind matrices, top to bottom while accounting for the groups.
+
+    It's assumed that the same samples are present in both and in the exact same order
+    """
+    hm2 = heatmapper.heatmapper()
+    hm.read_matrix_file(args.matrixFile[0])
+    for fname in args.matrixFile[1:]:
+        hm2.read_matrix_file(fname)
+        for group in hm2.parameters["group_labels"]:
+            # Known groups are spliced in at the group's end; new groups are
+            # appended after everything else.
+            if group in hm.parameters["group_labels"]:
+                insertMatrix(hm, hm2, group)
+            else:
+                appendMatrix(hm, hm2, group)
+                hm.parameters["group_labels"].append(group)
+
+    # Update the group boundaries attribute
+    hm.matrix.group_labels = hm.parameters['group_labels']
+    hm.matrix.group_boundaries = hm.parameters['group_boundaries']
+
+
+def cbindMatrices(hm, args):
+    """
+    Bind columns from different matrices according to the group and region names
+
+    Missing regions are left as NA
+
+    The first file in args.matrixFile defines the row order and the set of
+    regions; subsequent files only contribute columns for rows already present.
+    """
+    hm2 = heatmapper.heatmapper()
+
+    # Make a dict of region name:row associations
+    hm.read_matrix_file(args.matrixFile[0])
+    d = dict({x: dict() for x in hm.parameters["group_labels"]})
+    for idx, group in enumerate(hm.parameters["group_labels"]):
+        s = hm.parameters["group_boundaries"][idx]
+        e = hm.parameters["group_boundaries"][idx + 1]
+        for idx2, reg in enumerate(hm.matrix.regions[s:e]):
+            # reg[2] is the region name; map it to its absolute row index
+            d[group][reg[2]] = idx2 + s
+
+    # Iterate through the other matrices
+    for idx in range(1, len(args.matrixFile)):
+        hm2.read_matrix_file(args.matrixFile[idx])
+        # Add the sample labels
+        hm.parameters['sample_labels'].extend(hm2.parameters['sample_labels'])
+        # Add the sample boundaries (shifted by the current last boundary)
+        lens = [x + hm.parameters['sample_boundaries'][-1] for x in hm2.parameters['sample_boundaries']][1:]
+        hm.parameters['sample_boundaries'].extend(lens)
+
+        # Add on additional NA initialized columns
+        ncol = hm.matrix.matrix.shape[1]
+        hm.matrix.matrix = np.hstack((hm.matrix.matrix, np.empty(hm2.matrix.matrix.shape)))
+        hm.matrix.matrix[:, ncol:] = np.nan
+
+        # Update the values
+        for idx2, group in enumerate(hm2.parameters["group_labels"]):
+            if group not in d:
+                # Groups absent from the first matrix are ignored
+                continue
+            s = hm2.parameters["group_boundaries"][idx2]
+            e = hm2.parameters["group_boundaries"][idx2 + 1]
+            for idx3, reg in enumerate(hm2.matrix.regions[s:e]):
+                if reg[2] not in d[group]:
+                    # Regions missing from the first matrix stay NA
+                    continue
+                hm.matrix.matrix[d[group][reg[2]], ncol:] = hm2.matrix.matrix[s + idx3, :]
+
+        # Append the special params
+        for s in hm.special_params:
+            hm.parameters[s].extend(hm2.parameters[s])
+
+    # Update the sample parameters
+    hm.matrix.sample_labels = hm.parameters['sample_labels']
+    hm.matrix.sample_boundaries = hm.parameters['sample_boundaries']
+
+
+def loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup):
+    """
+    Given a first line, possibly a label column and a list of labels and regions, add the labels and regions in the file to them
+
+    `line` is the first data line (already read from fp); `fp` yields the rest.
+    `labels` maps group label -> index and `regions` is a parallel list of
+    {region name: order} dicts; both are mutated in place.
+    """
+
+    # This is largely parseBED from deeptoolsintervals
+    labelIdx = None
+    localRegions = {}
+
+    # Process the already-read first line outside the loop
+    cols = line.strip().split("\t")
+    if labelColumn is not None:
+        label = cols.pop(labelColumn)
+        if label not in labels:
+            labels[label] = len(labels)
+        labelIdx = labels[label]
+        if labelIdx >= len(regions):
+            regions.append(localRegions)
+        else:
+            localRegions = regions[labelIdx]
+
+    if len(cols) >= 6:
+        name = cols[3]
+    else:
+        # Unnamed regions are keyed by their coordinates
+        name = "{0}:{1}-{2}".format(cols[0], cols[1], cols[2])
+    localRegions[name] = len(localRegions)
+
+    for line in fp:
+        # Without a label column, a "#" line closes the current group
+        if line.startswith("#") and labelColumn is None:
+            if len(localRegions) > 0:
+                label = line[1:].strip()
+                if len(label):
+                    labels[dti.findRandomLabel(labels, label)] = len(labels)
+                else:
+                    # Unnamed group: fall back to the file name
+                    labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels)
+                regions.append(localRegions)
+                localRegions = dict()
+            continue
+        elif line.startswith("#") and labelColumn is not None:
+            continue
+
+        cols = line.strip().split("\t")
+        if len(cols) < 3:
+            continue
+        if labelColumn is not None:
+            label = cols.pop(labelColumn)
+            if label not in labels:
+                labels[label] = len(labels)
+            labelIdx = labels[label]
+            if labelIdx >= len(regions):
+                regions.append({})
+            localRegions = regions[labelIdx]
+
+        if len(cols) >= 6:
+            name = cols[3]
+        else:
+            name = "{0}:{1}-{2}".format(cols[0], cols[1], cols[2])
+        # Deduplicate names within the group
+        name = dti.findRandomLabel(localRegions, name)
+        localRegions[name] = len(localRegions)
+
+    # Handle the last group if there is no label
+    if labelIdx is None and len(localRegions) > 0:
+        if defaultGroup is not None:
+            labels[dti.findRandomLabel(labels, defaultGroup)] = len(labels)
+        else:
+            labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels)
+        regions.append(localRegions)
+
+
+def loadGTFtranscript(cols, label, defaultGroup, transcript_id_designator):
+    """
+    Extract the (group label, transcript name) pair from the attribute column
+    (column 9) of a GTF line. Returns (None, None) for malformed entries.
+    """
+    attrs = next(csv.reader([cols[8]], delimiter=' '))
+
+    # A key appearing as the final token has no value, so treat it as absent
+    if "deepTools_group" in attrs and attrs[-1] != "deepTools_group":
+        label = attrs[attrs.index("deepTools_group") + 1].rstrip(";")
+    elif defaultGroup is not None:
+        label = defaultGroup
+
+    if transcript_id_designator not in attrs or attrs[-1] == transcript_id_designator:
+        sys.stderr.write("Warning: {0} is malformed!\n".format("\t".join(cols)))
+        return None, None
+
+    name = attrs[attrs.index(transcript_id_designator) + 1].rstrip(";")
+    return label, name
+
+
+def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup):
+    """
+    Like loadBED, but for a GTF file
+
+    This is largely a copy of what's in deeptoolsintervals
+
+    `line` is the first data line; `fp` yields the remainder. Group labels are
+    added to `labels` and per-group {name: order} dicts to `regions`, both
+    mutated in place, mirroring loadBED.
+    """
+    file_label = dti.findRandomLabel(labels, os.path.basename(fname))
+
+    # handle the first line
+    cols = line.split("\t")
+    if cols[2].lower() == transcriptID.lower():
+        label, name = loadGTFtranscript(cols, file_label, defaultGroup, transcript_id_designator)
+        if label is not None:
+            if label not in labels:
+                labels[label] = len(labels)
+                regions.append(dict())
+            labelIdx = labels[label]
+            regions[labelIdx][name] = len(regions[labelIdx])
+
+    for line in fp:
+        if not isinstance(line, str):
+            line = line.decode('ascii')
+        if not line.startswith('#'):
+            cols = line.strip().split('\t')
+            if len(cols) == 0:
+                continue
+            # Bug fix: compare case-insensitively on both sides, matching the
+            # first-line handling above. Previously the right side was not
+            # lower-cased, so a mixed-case --transcriptID value silently
+            # skipped every line after the first.
+            if cols[2].lower() == transcriptID.lower():
+                label, name = loadGTFtranscript(cols, file_label, defaultGroup, transcript_id_designator)
+                if label is None:
+                    continue
+                if label not in labels:
+                    labels[label] = len(labels)
+                    regions.append(dict())
+                labelIdx = labels[label]
+                regions[labelIdx][name] = len(regions[labelIdx])
+
+
+def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verbose=True):
+    """
+    Iterate through the files noted by regionsFileName and sort hm accordingly
+
+    The matrix rows, regions, group labels and group boundaries of hm are all
+    rewritten in place to match the order found in the region files. Regions
+    present in the files but absent from the matrix are skipped.
+    """
+
+    labels = dict()
+    regions = []
+    defaultGroup = None
+    if len(regionsFileName) == 1:
+        defaultGroup = "genes"
+    for fname in regionsFileName:
+        fp = dti.openPossiblyCompressed(fname)
+        line = dti.getNext(fp)
+        labelColumn = None
+        # Skip header/comment lines, remembering a deepTools_group column if declared
+        while line.startswith("#"):
+            if not labelColumn:
+                labelColumn = dti.getLabel(line)
+            line = dti.getNext(fp)
+        while line.startswith("track "):
+            line = dti.getNext(fp)
+
+        # Find the label column
+        subtract = 0
+        if labelColumn is not None:
+            subtract = 1
+
+        # Determine the file type and load into a list (or list of lists)
+        cols = line.strip().split("\t")
+        if len(cols) - subtract < 3:
+            raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname))
+        elif len(cols) - subtract <= 6:
+            loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
+        elif len(cols) and dti.seemsLikeGTF(cols):
+            loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup)
+        else:
+            # >6 columns but not GTF-like: treat as BED with extra columns
+            loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
+        fp.close()
+
+    # Do some sanity checking on the group labels and region names within them
+    # NOTE(review): this exit is gated on verbose, so with verbose=False an
+    # unknown group label is silently tolerated — confirm this is intended.
+    s1 = set(hm.parameters['group_labels'])
+    if verbose:
+        for e in labels:
+            if e not in s1:
+                sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys()))
+
+    # Make a dictionary out of current labels and regions
+    d = dict()
+    pos = 0
+    groupSizes = dict()
+    for idx, label in enumerate(hm.parameters['group_labels']):
+        s = hm.parameters['group_boundaries'][idx]
+        e = hm.parameters['group_boundaries'][idx + 1]
+        if label not in labels:
+            continue
+        d[label] = dict()
+        groupSize = 0
+        for reg in hm.matrix.regions[s:e]:
+            # reg[2] is the region name; map it to its absolute row position
+            d[label][reg[2]] = pos
+            pos += 1
+            groupSize += 1
+        groupSizes[label] = groupSize
+
+    # Convert labels to an ordered list
+    labelsList = [""] * len(labels)
+    for k, v in labels.items():
+        labelsList[v] = k
+
+    # Reorder
+    order = []
+    boundaries = [0]
+    for idx, label in enumerate(labelsList):
+        # Make an ordered list out of the region names in this region group
+        _ = [""] * len(regions[idx])
+        for k, v in regions[idx].items():
+            _[v] = k
+        sz = 0  # Track the number of entries actually matched
+        for name in _:
+            if name not in d[label]:
+                if verbose:
+                    sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name))
+                continue
+            sz += 1
+            order.append(d[label][name])
+        # NOTE(review): this exit is also gated on verbose — confirm intent
+        if sz == 0 and verbose:
+            sys.exit("The region group {} had no matching entries!\n".format(label))
+        boundaries.append(sz + boundaries[-1])
+    hm.matrix.regions = [hm.matrix.regions[i] for i in order]
+    order = np.array(order)
+    hm.matrix.matrix = hm.matrix.matrix[order, :]
+
+    # Update the parameters
+    hm.parameters["group_labels"] = labelsList
+    hm.matrix.group_labels = labelsList
+    hm.parameters["group_boundaries"] = boundaries
+    hm.matrix.group_boundaries = boundaries
+
+
+def main(args=None):
+    """
+    Entry point: parse the command line, load the matrix (for single-matrix
+    subcommands) and dispatch to the requested operation.
+    """
+    # if args none is need since otherwise pytest passes 'pytest' as sys.argv
+    if args is None:
+        if len(sys.argv) == 1:
+            args = ["-h"]
+        if len(sys.argv) == 2:
+            # A bare subcommand gets its own help text
+            args = [sys.argv[1], "-h"]
+
+    args = parse_arguments().parse_args(args)
+
+    hm = heatmapper.heatmapper()
+    # rbind/cbind take a list of matrices and load them themselves
+    if not isinstance(args.matrixFile, list):
+        hm.read_matrix_file(args.matrixFile)
+    if args.command == 'info':
+        printInfo(hm)
+    elif args.command == 'dataRange':
+        printDataRange(hm)
+    elif args.command == 'subset':
+        sIdx = getSampleBounds(args, hm)
+        gIdx, gBounds = getGroupBounds(args, hm)
+
+        # groups
+        hm.matrix.regions = subsetRegions(hm, gIdx)
+        # matrix
+        hm.matrix.matrix = hm.matrix.matrix[gIdx, :]
+        hm.matrix.matrix = hm.matrix.matrix[:, sIdx]
+        # boundaries
+        if args.samples is None:
+            args.samples = hm.matrix.sample_labels
+        hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[0:len(args.samples) + 1]
+        hm.matrix.group_boundaries = gBounds.tolist()
+        # special params: keep only entries for the retained samples
+        keepIdx = set()
+        for _, sample in enumerate(hm.matrix.sample_labels):
+            if sample in args.samples:
+                keepIdx.add(_)
+        for param in hm.special_params:
+            hm.parameters[param] = [v for k, v in enumerate(hm.parameters[param]) if k in keepIdx]
+        # labels
+        hm.matrix.sample_labels = args.samples
+        if args.groups is None:
+            args.groups = hm.matrix.group_labels
+        hm.matrix.group_labels = args.groups
+        # save
+        hm.save_matrix(args.outFileName)
+    elif args.command == 'filterStrand':
+        filterHeatmap(hm, args)
+        hm.save_matrix(args.outFileName)
+    elif args.command == 'filterValues':
+        filterHeatmapValues(hm, args.min, args.max)
+        hm.save_matrix(args.outFileName)
+    elif args.command == 'rbind':
+        rbindMatrices(hm, args)
+        hm.save_matrix(args.outFileName)
+    elif args.command == 'cbind':
+        cbindMatrices(hm, args)
+        hm.save_matrix(args.outFileName)
+    elif args.command == 'sort':
+        sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator)
+        hm.save_matrix(args.outFileName)
+    elif args.command == 'relabel':
+        relabelMatrix(hm, args)
+        hm.save_matrix(args.outFileName)
+    else:
+        sys.exit("Unknown command {0}!\n".format(args.command))
diff --git a/deepTools/source/deeptools/correctGCBias.py b/deepTools/source/deeptools/correctGCBias.py
new file mode 100644
index 0000000000000000000000000000000000000000..1154b93688f92a40517c6c110e3b9872666bcac0
--- /dev/null
+++ b/deepTools/source/deeptools/correctGCBias.py
@@ -0,0 +1,746 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import shutil
+import time
+import subprocess
+import sys
+
+import py2bit
+import pysam
+import multiprocessing
+import numpy as np
+import argparse
+
+from scipy.stats import binom
+
+from deeptools.utilities import tbitToBamChrName, getGC_content
+from deeptools import writeBedGraph, parserCommon, mapReduce
+from deeptools import utilities
+from deeptools.bamHandler import openBam
+
+old_settings = np.seterr(all='ignore')
+
+
def parse_arguments(args=None):
    """Build the complete correctGCBias command-line parser.

    Combines the tool-specific required arguments with the parent parser
    shared by all deepTools commands (bin size, region, processors, ...).
    """
    parent_parser = parserCommon.getParentArgParse(binSize=True, blackList=False)
    description = ('This tool corrects the GC-bias using the'
                   ' method proposed by [Benjamini & Speed (2012). '
                   'Nucleic Acids Research, 40(10)]. It will remove reads'
                   ' from regions with too high coverage compared to the'
                   ' expected values (typically GC-rich regions) and will'
                   ' add reads to regions where too few reads are seen '
                   '(typically AT-rich regions). '
                   'The tool ``computeGCBias`` needs to be run first to generate the '
                   'frequency table needed here.')
    usage = ('correctGCBias '
             '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit '
             '--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam\n'
             'help: correctGCBias -h / correctGCBias --help')
    return argparse.ArgumentParser(
        parents=[getRequiredArgs(), parent_parser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=description,
        usage=usage,
        conflict_handler='resolve',
        add_help=False)
+
+
def process_args(args=None):
    """Parse the command line (or the given *args* list) and return the namespace."""
    args = parse_arguments().parse_args(args)

    return args
+
+
def getRequiredArgs():
    """Return an argparse parser holding correctGCBias' own arguments.

    The parser is created with ``add_help=False`` so it can be used as a
    parent of the combined parser built in ``parse_arguments``.
    """
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')
    required.add_argument('--bamfile', '-b',
                          required=True,
                          metavar='BAM file',
                          help='Sorted BAM file to correct.')
    required.add_argument('--effectiveGenomeSize',
                          required=True,
                          type=int,
                          default=None,
                          help='The effective genome size is the portion '
                          'of the genome that is mappable. Large fractions of '
                          'the genome are stretches of NNNN that should be '
                          'discarded. Also, if repetitive regions were not '
                          'included in the mapping of reads, the effective '
                          'genome size needs to be adjusted accordingly. '
                          'A table of values is available here: '
                          'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .')
    required.add_argument('--genome', '-g',
                          required=True,
                          metavar='two bit file',
                          help='Genome in two bit format. Most genomes can be '
                          'found here: http://hgdownload.cse.ucsc.edu/gbdb/ '
                          'Search for the .2bit ending. Otherwise, fasta '
                          'files can be converted to 2bit using faToTwoBit '
                          'available here: '
                          'http://hgdownload.cse.ucsc.edu/admin/exe/')
    required.add_argument('--GCbiasFrequenciesFile', '-freq',
                          required=True,
                          metavar='FILE',
                          type=argparse.FileType('r'),
                          help='Indicate the output file from '
                          'computeGCBias containing '
                          'the observed and expected read frequencies per GC-'
                          'content.')

    output = parser.add_argument_group('Output options')
    output.add_argument('--correctedFile', '-o',
                        required=True,
                        metavar='FILE',
                        type=argparse.FileType('w'),
                        help='Name of the corrected file. The ending will '
                        'be used to decide the output file format. The options '
                        'are ".bam", ".bw" for a bigWig file, ".bg" for a '
                        'bedGraph file.')

    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")

    return parser
+
+
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC count of the fragment that *read* belongs to, scaled to
    ``fragmentLength`` (an int in ``[0, fragmentLength]``), or ``None`` when
    the GC content cannot be computed (e.g. the fragment runs off the
    chromosome or ``getGC_content`` fails).

    The fragments for forward and reverse reads are defined as follows::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                          read.tlen

    """
    fragStart = None
    fragEnd = None

    # Prefer the real fragment boundaries from the mate-pair information,
    # but only for proper pairs with a plausible template length.
    if read.is_paired and read.is_proper_pair and abs(read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.reference_end
            # template_length is negative for the reverse mate
            fragStart = read.reference_end + read.template_length
        elif read.template_length >= read.query_alignment_length:
            fragStart = read.pos
            fragEnd = read.pos + read.template_length

    # Fall back to extending the read to the default fragment length.
    # Compare against None explicitly: a fragment that legitimately starts
    # at position 0 must not be recomputed (the previous `if not fragStart`
    # treated position 0 as "unset").
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.reference_end
            fragStart = read.reference_end - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit, chrNameBit, fragStart, fragEnd)
    except Exception:
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    gc = int(np.round(gc * fragmentLength))
    return gc
+
+
def writeCorrected_wrapper(args):
    # multiprocessing.Pool passes a single tuple; unpack it for the worker
    return writeCorrected_worker(*args)
+
+
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Each fragment contributes 1/R_gc[gc] (expected/observed ratio for its
    GC count) to the coverage, down-weighting GC-rich and up-weighting
    AT-rich positions.  Relies on the module-level globals ``R_gc`` and
    ``global_vars`` ('2bit', 'bam', 'max_dup_gc') being initialised by
    ``main()`` (or ``Tester``).  Returns the name of a temporary bedgraph
    file, or None when no reads were processed in the region.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    # R_gc is indexed by GC count (0..fragmentLength), hence the -1
    fragmentLength = len(R_gc) - 1

    # corrected coverage for the region, one float per base pair
    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are asigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    r_index = -1
    for read in reads:
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            print(detail)
            """ this exception happens when the end of a
            chromosome is reached """
            continue
        # NOTE(review): this also skips fragments whose GC count is exactly
        # 0, not only the None failure case -- confirm that is intended
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            # too many identical reads for this GC value: treated as a
            # PCR duplicate and dropped
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        # weight the fragment by the inverse of its GC bias ratio
        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            print("{}, processing {} ({:.1f} per sec) "
                  "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
                                            i, i / (endTime - startTime),
                                            chrNameBit, start, end))
    except NameError:
        # 'debug' is only defined when the Tester helper has been used
        pass

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        # NOTE(review): 'end' is an absolute genome coordinate while 'bin' is
        # region-relative, so min(bin + step, end) is effectively bin + step
        # (numpy slicing clips at the array length anyway); the likely intent
        # was min(bin + step, len(cvg_corr)) -- confirm.
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
+
+
def numCopiesOfRead(value):
    """
    Decide, based on the R_gc ratio, how many copies of a read should be
    written (0 = drop, 1 = keep, 2 = duplicate, ...). The integer part of
    *value* is kept and its fractional part is used as the probability of
    writing one extra copy.
    >>> np.random.seed(1)
    >>> numCopiesOfRead(0.8)
    1
    >>> numCopiesOfRead(2.5)
    2
    >>> numCopiesOfRead(None)
    1
    """
    if not value:
        # None (or zero) means no correction information: keep one copy
        return 1
    extra = 1 if np.random.rand() < value % 1 else 0
    return int(value) + extra
+
+
def writeCorrectedSam_wrapper(args):
    # multiprocessing.Pool passes a single tuple; unpack it for the worker
    return writeCorrectedSam_worker(*args)
+
+
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    The correction is recorded in per-read tags: YC (the factor 1/R_gc),
    YN (the number of copies written) and YG (the fragment GC percentage,
    -1 when unknown).  With ``tag_but_not_change_number=True`` every read
    is written exactly once, only tagged.  Returns the name of the
    temporary BAM file.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    # R_gc is indexed by GC count (0..fragmentLength), hence the -1
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # no mate seen yet (or the mate was removed by some
            # filtering): compute the correction from this read alone
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # old pysam encodes the value type as ord(type); convert it
                # back to the one-character code expected by set_tags().
                # (Check the third element of the FIRST tag tuple -- the
                # previous code indexed readTag[2], i.e. the third tag, and
                # only set replace_tags inside that branch, so the YC/YN/YG
                # tags were never written.)
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the correction so the forward mate's decision is
        # reused when the reverse mate is encountered
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
+
+
def getFragmentFromRead(read, defaultFragmentLength, extendPairedEnds=True):
    """
    Return the (start, end) genomic interval of the fragment a read
    belongs to.

    For paired reads (when ``extendPairedEnds`` is True and the template
    length is non-zero and below 1000 bp, to avoid pairs spanning
    thousands of base pairs) the mate information defines the fragment:
    a reverse read spans from its mate's start (``read.pnext``) to its own
    end, a forward read spans from its own start over ``read.tlen`` bases.

    Otherwise the read is extended to ``defaultFragmentLength`` in its
    own direction; if the aligned length already reaches
    ``defaultFragmentLength``, the alignment itself is returned.

    Parameters
    ----------
    read : pysam read object
    defaultFragmentLength : int
        fragment length to assume for single-end / unpaired reads
    extendPairedEnds : bool
        use the mate information when available

    Returns
    -------
    tuple
        (fragment start, fragment end)
    """
    # paired-end case: the mates delimit the fragment
    if extendPairedEnds is True and read.is_paired and 0 < abs(read.tlen) < 1000:
        if read.is_reverse:
            return read.pnext, read.aend
        # forward read: start plus the insert length
        return read.pos, read.pos + read.tlen

    # alignment already covers the default fragment length
    if defaultFragmentLength <= read.aend - read.pos:
        return read.pos, read.aend

    # single-end style extension in the read's direction
    if read.is_reverse:
        return read.aend - defaultFragmentLength, read.aend
    return read.pos, read.pos + defaultFragmentLength
+
+
def run_shell_command(command):
    """
    Runs *command* through the shell and reports any failure on stderr,
    exiting the process with status 1 when the command cannot be run or
    returns a non-zero exit code.
    """
    try:
        subprocess.check_call(command, shell=True)
    except Exception as error:
        # a non-zero exit status and any other failure use slightly
        # different messages, matching the original behaviour
        if isinstance(error, subprocess.CalledProcessError):
            sys.stderr.write('Error{}\n'.format(error))
        else:
            sys.stderr.write('Error: {}\n'.format(error))
        exit(1)
+
+
def main(args=None):
    """Entry point: correct the GC bias of a BAM file.

    Loads the observed/expected GC frequency table produced by
    computeGCBias, splits the genome into chunks and, in parallel, writes
    either a corrected BAM or a corrected bedGraph/bigWig coverage track,
    depending on the extension of ``--correctedFile``.
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    # one row per GC count: observed counts, expected counts, their ratio
    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                # the 2bit file has no sequence for this chromosome.
                # (The original code printed only the first half of this
                # message; the second half was an orphaned string
                # expression with no effect.)
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(
                writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        # clean up the per-chunk temporary BAM files
        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            # plain bedGraph: concatenate the per-chunk files in order
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
+
+
class Tester():
    """Helper that prepares file paths, globals and worker arguments for
    the doctests of this module."""

    def __init__(self):
        import os
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
        self.tbitFile = self.root + "sequence.2bit"
        self.bamFile = self.root + "test.bam"
        self.chrNameBam = '2L'
        self.chrNameBit = 'chr2L'
        bam, mapped, unmapped, stats = openBam(self.bamFile, returnStats=True)
        tbit = py2bit.open(self.tbitFile)
        global debug
        debug = 0
        global global_vars
        # note: the original literal listed the 'min_reads' key twice with
        # the same value; the redundant entry was removed
        global_vars = {'2bit': self.tbitFile,
                       'bam': self.bamFile,
                       'filter_out': None,
                       'extra_sampling_file': None,
                       'max_reads': 5,
                       'min_reads': 0,
                       'reads_per_bp': 0.3,
                       'total_reads': mapped,
                       'genome_size': sum(tbit.chroms().values())}

    def testWriteCorrectedChunk(self):
        """ prepare arguments for test
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        global_vars['max_dup_gc'] = np.ones(301)

        start = 200
        end = 300
        bedGraphStep = 25
        return (self.chrNameBam,
                self.chrNameBit, start, end, bedGraphStep)

    def testWriteCorrectedSam(self):
        """ prepare arguments for test
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        global_vars['max_dup_gc'] = np.ones(301)

        start = 200
        end = 250
        return (self.chrNameBam,
                self.chrNameBit, start, end)

    def testWriteCorrectedSam_paired(self):
        """ prepare arguments for test.
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        start = 0
        end = 500
        global global_vars
        global_vars['bam'] = self.root + "paired.bam"
        return 'chr2L', 'chr2L', start, end
+
+
# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
diff --git a/deepTools/source/deeptools/correlation.py b/deepTools/source/deeptools/correlation.py
new file mode 100644
index 0000000000000000000000000000000000000000..56b8d91d2ff72647f4a144f531846209844b1406
--- /dev/null
+++ b/deepTools/source/deeptools/correlation.py
@@ -0,0 +1,706 @@
+import sys
+import itertools
+import copy
+import numpy as np
+import scipy.cluster.hierarchy as sch
+import scipy.stats
+import matplotlib as mpl
+mpl.use('Agg')
+mpl.rcParams['pdf.fonttype'] = 42
+mpl.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import matplotlib.ticker
+import matplotlib.mlab
+import matplotlib.markers
+import matplotlib.colors as pltcolors
+from deeptools.utilities import toString, convertCmap
+
+import plotly.offline as offline
+import plotly.graph_objs as go
+import plotly.figure_factory as ff
+
+
+old_settings = np.seterr(all='ignore')
+
+
+class Correlation:
+ """
+ class to work with matrices
+ having sample data
+ to compute correlations, plot
+ them and make scatter plots
+ """
+
    def __init__(self, matrix_file,
                 corr_method=None,
                 labels=None,
                 remove_outliers=False,
                 skip_zeros=False,
                 log1p=False):
        """
        Parameters
        ----------
        matrix_file : str
            npz file (np.savez format) containing the keys 'matrix'
            (one column per sample) and 'labels'.
        corr_method : str or None
            'pearson' or 'spearman'; when given, the correlation matrix
            is computed immediately.
        labels : list or None
            sample labels that override the ones stored in the file.
        remove_outliers : bool
            drop bins that are outliers in every sample (mainly relevant
            for pearson correlation).
        skip_zeros : bool
            drop bins that are all zeros / NaNs across samples.
        log1p : bool
            apply log1p to the matrix before computing the correlation.
        """
        self.load_matrix(matrix_file)
        self.skip_zeros = skip_zeros
        self.corr_method = corr_method
        self.corr_matrix = None  # correlation matrix, filled by compute_correlation
        self.column_order = None  # dendrogram leaf order, set by plot_correlation
        self.rowCenter = False
        if labels is not None:
            # test that the length of labels
            # corresponds to the length of
            # samples
            # NOTE(review): despite the comment above, no such check is
            # performed here -- a wrong-length 'labels' list is accepted
            # silently; confirm whether validation should be added.

            self.labels = labels
            self.labels = [toString(x) for x in self.labels]

        if self.matrix.shape[1] == 1:
            # There's nothing that can be done with a single sample
            sys.exit("\nPlease use a matrix with more than one sample\n")

        if skip_zeros is True:
            # remove rows containing only nans or zeros
            # that could be unmappable regions.
            self.remove_rows_of_zeros()

        if remove_outliers is True:
            # remove outliers, otherwise outliers will produce a very
            # high pearson correlation. Unnecessary for spearman correlation
            self.remove_outliers()

        if log1p is True:
            self.matrix = np.log1p(self.matrix)

        if corr_method:
            self.compute_correlation()
+
+ def load_matrix(self, matrix_file):
+ """
+ loads a matrix file saved using the numpy
+ savez method. Two keys are expected:
+ 'matrix' and 'labels'. The matrix should
+ contain one sample per row
+ """
+
+ _ma = np.load(matrix_file)
+ # matrix: cols correspond to samples
+ self.matrix = np.asarray(_ma['matrix'].tolist())
+ if np.any(np.isnan(self.matrix)):
+ num_nam = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
+ sys.stderr.write("*Warning*. {} NaN values were found. They will be removed along with the "
+ "corresponding bins in other samples for the computation "
+ "and plotting\n".format(num_nam))
+
+ self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix))
+
+ self.labels = list(map(toString, _ma['labels']))
+
+ assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \
+ "to length of matrix samples"
+
+ @staticmethod
+ def get_outlier_indices(data, max_deviation=200):
+ """
+ The method is based on the median absolute deviation. See
+ Boris Iglewicz and David Hoaglin (1993),
+ "Volume 16: How to Detect and Handle Outliers",
+ The ASQC Basic References in Quality Control:
+ Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
+
+ returns the list, without the outliers
+
+ The max_deviation=200 is like selecting a z-score
+ larger than 200, just that it is based on the median
+ and the median absolute deviation instead of the
+ mean and the standard deviation.
+ """
+ median = np.median(data)
+ b_value = 1.4826 # value set for a normal distribution
+ mad = b_value * np.median(np.abs(data))
+ outliers = []
+ if mad > 0:
+ deviation = abs(data - median) / mad
+ """
+ outliers = data[deviation > max_deviation]
+ print "outliers removed {}".format(len(outliers))
+ print outliers
+ """
+ outliers = np.flatnonzero(deviation > max_deviation)
+ return outliers
+
+ def remove_outliers(self, verbose=True):
+ """
+ get the outliers *per column* using the median absolute
+ deviation method
+
+ Returns the filtered matrix
+ """
+
+ unfiltered = len(self.matrix)
+ to_remove = None
+ for col in self.matrix.T:
+ outliers = self.get_outlier_indices(col)
+ if to_remove is None:
+ to_remove = set(outliers)
+ else:
+ # only set to remove those bins in which
+ # the outliers are present in all cases (colums)
+ # that's why the intersection is used
+ to_remove = to_remove.intersection(outliers)
+ if len(to_remove):
+ to_keep = [x for x in range(self.matrix.shape[0])
+ if x not in to_remove]
+ self.matrix = self.matrix[to_keep, :]
+ if verbose:
+ sys.stderr.write(
+ "total/filtered/left: "
+ "{}/{}/{}\n".format(unfiltered,
+ unfiltered - len(to_keep),
+ len(to_keep)))
+
+ return self.matrix
+
+ def remove_rows_of_zeros(self):
+ # remove rows containing all zeros or all nans
+ _mat = np.nan_to_num(self.matrix)
+ to_keep = _mat.sum(1) != 0
+
+ self.matrix = self.matrix[to_keep, :]
+
    def save_corr_matrix(self, file_handle):
        """
        saves the correlation matrix

        If ``plot_correlation`` ran first, ``self.column_order`` holds the
        dendrogram leaf order and the matrix and labels are written in
        that order.  The first line holds the quoted sample labels; each
        following line is a quoted label plus tab-separated %.4f values.
        """
        if self.column_order:
            # reorder columns first, then rows, so the saved matrix matches
            # the clustered heatmap layout
            self.corr_matrix = self.corr_matrix[:, self.column_order][self.column_order]
            self.labels = [self.labels[i] for i in self.column_order]

        self.labels = [toString(x) for x in self.labels]
        file_handle.write("\t'" + "'\t'".join(self.labels) + "'\n")
        fmt = "\t".join(np.repeat('%.4f', self.corr_matrix.shape[1])) + "\n"
        i = 0
        for row in self.corr_matrix:
            file_handle.write(
                "'%s'\t" % self.labels[i] + fmt % tuple(row))
            i += 1
+
    def compute_correlation(self):
        """
        computes spearman or pearson
        correlation for the samples in the matrix

        The matrix should contain the values of each sample per column
        that's why the transpose is used.

        >>> matrix = np.array([[1, 2, 3, np.nan],
        ...                    [1, 2, 3, 4],
        ...                    [6, 4, 3, 1]]).T
        >>> np.savez_compressed("/tmp/test_matrix.npz", matrix=matrix, labels=['a', 'b', 'c'])

        >>> c = Correlation("/tmp/test_matrix.npz", corr_method='pearson')

        the results should be as in R

        >>> c.compute_correlation().filled(np.nan)
        array([[ 1.        ,  1.        , -0.98198051],
               [ 1.        ,  1.        , -0.98198051],
               [-0.98198051, -0.98198051,  1.        ]])
        >>> c.corr_method = 'spearman'
        >>> c.corr_matrix = None
        >>> c.compute_correlation()
        array([[ 1.,  1., -1.],
               [ 1.,  1., -1.],
               [-1., -1.,  1.]])
        """
        # cached: computed at most once per corr_method setting
        if self.corr_matrix is not None:
            return self.corr_matrix

        num_samples = len(self.labels)
        # initialize correlation matrix

        if self.corr_method == 'pearson':
            # masked corrcoef ignores entries masked in load_matrix
            self.corr_matrix = np.ma.corrcoef(self.matrix.T, allow_masked=True)

        else:
            corr_matrix = np.zeros((num_samples, num_samples), dtype='float')
            # do an all vs all correlation using the
            # indices of the upper triangle
            rows, cols = np.triu_indices(num_samples)

            for index in range(len(rows)):
                row = rows[index]
                col = cols[index]
                corr_matrix[row, col] = scipy.stats.spearmanr(self.matrix[:, row], self.matrix[:, col])[0]
            # make the matrix symmetric
            self.corr_matrix = corr_matrix + np.triu(corr_matrix, 1).T

        return self.corr_matrix
+
+ def plotly_correlation(self, corr_matrix, plot_filename, labels, plot_title='',
+ vmax=None, vmin=None, plot_numbers=True,
+ colormap='jet'):
+ """plot_correlation, but using plotly"""
+ textElement = []
+ for row in range(corr_matrix.shape[0]):
+ trow = []
+ for col in range(corr_matrix.shape[0]):
+ if plot_numbers:
+ trow.append("{:0.2f}".format(corr_matrix[row, col]))
+ else:
+ trow.append('')
+ textElement.append(trow)
+
+ zauto = True
+ if vmax is not None or vmin is not None:
+ zauto = False
+
+ convertedCmap = convertCmap(colormap)
+ fig = ff.create_annotated_heatmap(corr_matrix, x=labels, y=labels, colorscale=convertedCmap, showscale=True, zauto=zauto, zmin=vmin, zmax=vmax, annotation_text=textElement)
+ fig.layout['title'] = plot_title
+ offline.plot(fig, filename=plot_filename, auto_open=False)
+
    def plot_correlation(self, plot_filename, plot_title='', vmax=None,
                         vmin=None, colormap='jet', image_format=None,
                         plot_numbers=False, plotWidth=11, plotHeight=9.5):
        """
        plots a correlation using a symmetric heatmap

        The samples are clustered (scipy centroid linkage) and a
        dendrogram is drawn on the left; the leaf order is stored in
        ``self.column_order`` so ``save_corr_matrix`` can reuse it.  With
        ``image_format == "plotly"`` the actual drawing is delegated to
        ``plotly_correlation``.
        """
        num_rows = len(self.labels)
        corr_matrix = self.compute_correlation()
        # set a font size according to figure length
        if num_rows < 6:
            font_size = 14
        elif num_rows > 40:
            font_size = 5
        else:
            font_size = int(14 - 0.25 * num_rows)
        mpl.rcParams.update({'font.size': font_size})
        # set the minimum and maximum values
        if vmax is None:
            vmax = 1
        if vmin is None:
            vmin = 0 if corr_matrix .min() >= 0 else -1

        # Compute and plot dendrogram.
        fig = plt.figure(figsize=(plotWidth, plotHeight))
        plt.suptitle(plot_title)

        axdendro = fig.add_axes([0.015, 0.1, 0.1, 0.7])
        axdendro.set_axis_off()
        y_var = sch.linkage(corr_matrix, method='centroid')
        z_var = sch.dendrogram(y_var, orientation='left',
                               link_color_func=lambda k: 'darkred')
        axdendro.set_xticks([])
        axdendro.set_yticks([])
        cmap = copy.copy(plt.get_cmap(colormap))

        # this line simply makes a new cmap, based on the original
        # colormap that goes from 0.0 to 0.9
        # This is done to avoid colors that
        # are too dark at the end of the range that do not offer
        # a good contrast between the correlation numbers that are
        # plotted on black.
        if plot_numbers:
            cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped",
                                                               cmap(np.linspace(0, 0.9, 10)))

        cmap.set_under((0., 0., 1.))
        # Plot distance matrix.
        axmatrix = fig.add_axes([0.12, 0.1, 0.6, 0.7])
        # reorder rows and columns following the dendrogram leaves
        index = z_var['leaves']
        corr_matrix = corr_matrix[index, :]
        corr_matrix = corr_matrix[:, index]
        if corr_matrix.shape[0] > 30:
            # when there are too many rows it is better to remove
            # the black lines surrounding the boxes in the heatmap
            edge_color = 'none'
        else:
            edge_color = 'black'

        if image_format == "plotly":
            self.plotly_correlation(corr_matrix,
                                    plot_filename,
                                    self.labels,
                                    plot_title=plot_title,
                                    vmax=vmax,
                                    vmin=vmin,
                                    colormap=colormap,
                                    plot_numbers=plot_numbers)
            return

        img_mat = axmatrix.pcolormesh(corr_matrix,
                                      edgecolors=edge_color,
                                      cmap=cmap,
                                      vmax=vmax,
                                      vmin=vmin)
        axmatrix.set_xlim(0, num_rows)
        axmatrix.set_ylim(0, num_rows)

        axmatrix.yaxis.tick_right()
        axmatrix.set_yticks(np.arange(corr_matrix .shape[0]) + 0.5)
        axmatrix.set_yticklabels(np.array(self.labels).astype('str')[index])

        axmatrix.xaxis.set_tick_params(labeltop=True)
        axmatrix.xaxis.set_tick_params(labelbottom=False)
        axmatrix.set_xticks(np.arange(corr_matrix .shape[0]) + 0.5)
        axmatrix.set_xticklabels(np.array(self.labels).astype('str')[index], rotation=45, ha='left')

        axmatrix.tick_params(
            axis='x',
            which='both',
            bottom=False,
            top=False)

        axmatrix.tick_params(
            axis='y',
            which='both',
            left=False,
            right=False)

        # Plot colorbar
        axcolor = fig.add_axes([0.12, 0.065, 0.6, 0.02])
        cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
        cobar.solids.set_edgecolor("face")
        if plot_numbers:
            # overlay the correlation value at the center of each cell
            for row in range(num_rows):
                for col in range(num_rows):
                    axmatrix.text(row + 0.5, col + 0.5,
                                  "{:.2f}".format(corr_matrix[row, col]),
                                  ha='center', va='center')

        self.column_order = index
        fig.savefig(plot_filename, format=image_format)
        plt.close()
+
    def plotly_scatter(self, plot_filename, corr_matrix, plot_title='', minXVal=None, maxXVal=None, minYVal=None, maxYVal=None):
        """Make the scatter plot of a matrix with plotly

        Renders an n-by-n grid of 2D-histogram (heatmap) panels, one panel
        per sample pair (lower triangle only), and writes the result to
        ``plot_filename`` as an HTML file via plotly's offline mode.

        Parameters
        ----------
        plot_filename : str
            Output file name for the plotly HTML output.
        corr_matrix : numpy array
            Pairwise correlation values; corr_matrix[x, y] is used in the
            (x, y) panel's name together with self.corr_method.
        plot_title : str
            Title placed at the top of the figure.
        minXVal, maxXVal, minYVal, maxYVal : float
            Axis ranges applied to every panel.
        """
        n = self.matrix.shape[1]
        # NOTE(review): this assignment is a no-op; kept as-is.
        self.matrix = self.matrix
        fig = go.Figure()
        # each sample gets an equal share of the [0, 1] paper coordinates
        domainWidth = 1. / n

        # one label annotation per sample, placed along the diagonal
        annos = []
        for i in range(n):
            x = domainWidth * (i + 1)
            y = 1 - (domainWidth * i + 0.5 * domainWidth)
            anno = dict(text=self.labels[i], showarrow=False, xref='paper', yref='paper', x=x, y=y, xanchor='right', yanchor='middle')
            annos.append(anno)

        data = []
        # track the global z-range so all panels can share one color scale
        zMin = np.inf
        zMax = -np.inf
        for x in range(n):
            xanchor = 'x{}'.format(x + 1)
            base = x * domainWidth
            domain = [base, base + domainWidth]
            if x > 0:
                base = 1 - base
            fig['layout']['xaxis{}'.format(x + 1)] = dict(domain=domain, range=[minXVal, maxXVal], anchor='free', position=base)
            for y in range(0, n):
                yanchor = 'y{}'.format(y + 1)
                # NOTE(review): the y-axes are only configured during the
                # x == 1 pass of the outer loop; presumably any single pass
                # suffices since each yaxis key is written once — confirm
                # against the rendered layout.
                if x == 1:
                    base = 1 - y * domainWidth
                    domain = [base - domainWidth, base]
                    fig['layout']['yaxis{}'.format(y + 1)] = dict(domain=domain, range=[minYVal, maxYVal], side='right', anchor='free', position=1.0)

                # only the lower triangle (x > y) gets a panel
                if x > y:
                    vector1 = self.matrix[:, x]
                    vector2 = self.matrix[:, y]
                    Z, xEdges, yEdges = np.histogram2d(vector1, vector2, bins=50)
                    # log-scale the per-bin counts (empty bins become -inf)
                    Z = np.log10(Z)
                    if np.min(Z) < zMin:
                        zMin = np.min(Z)
                    if np.max(Z) > zMax:
                        zMax = np.max(Z)
                    name = '{}={:.2f}'.format(self.corr_method, corr_matrix[x, y])
                    trace = go.Heatmap(z=Z, x=xEdges, y=yEdges, showlegend=False, xaxis=xanchor, yaxis=yanchor, name=name, showscale=False)
                    data.append(trace)

        # Fix the colorbar bounds so every panel uses the same color scale
        for trace in data:
            trace.update(zmin=zMin, zmax=zMax)
        # show a single colorbar, attached to the last trace only
        data[-1]['colorbar'].update(title="log10(instances per bin)", titleside="right")
        data[-1].update(showscale=True)

        fig.add_traces(data)
        fig['layout'].update(title=plot_title, showlegend=False, annotations=annos)

        offline.plot(fig, filename=plot_filename, auto_open=False)
+
    def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=False, xRange=None, yRange=None):
        """
        Plot the scatter plots of a matrix
        in which each row is a sample

        One panel per sample pair (upper triangle), drawn as a 2D histogram;
        panel diagonals carry the sample labels. When image_format is
        'plotly' the work is delegated to plotly_scatter.

        Parameters
        ----------
        plot_filename : str
            File the figure is saved to.
        plot_title : str
            Figure title.
        image_format : str or None
            'plotly' for HTML output, otherwise passed to savefig.
        log1p : bool
            If True, replace self.matrix with log1p(self.matrix) first.
        xRange, yRange : sequence of two floats or None
            Explicit axis limits; default to the matrix min/max.
        """

        num_samples = self.matrix.shape[1]
        corr_matrix = self.compute_correlation()
        # a tight num_samples x num_samples grid with no gaps between panels
        grids = gridspec.GridSpec(num_samples, num_samples)
        grids.update(wspace=0, hspace=0)
        fig = plt.figure(figsize=(2 * num_samples, 2 * num_samples))
        plt.rcParams['font.size'] = 8.0
        plt.suptitle(plot_title)
        if log1p is True:
            self.matrix = np.log1p(self.matrix)
        min_xvalue = self.matrix.min()
        max_xvalue = self.matrix.max()
        min_yvalue = min_xvalue
        max_yvalue = max_xvalue
        if xRange is not None:
            min_xvalue = xRange[0]
            max_xvalue = xRange[1]
        if yRange is not None:
            min_yvalue = yRange[0]
            max_yvalue = yRange[1]
        # NOTE(review): the second disjunct uses "% 1 == 0" while the first
        # uses "% 2 == 0" — possibly a typo for "% 2"; kept as-is. Confirm
        # the intended parity rule.
        if (min_xvalue % 2 == 0 and max_xvalue % 2 == 0) or \
                (min_xvalue % 1 == 0 and max_xvalue % 2 == 1):
            # make one value odd and the other even
            max_xvalue += 1
        if (min_yvalue % 2 == 0 and max_yvalue % 2 == 0) or \
                (min_yvalue % 1 == 0 and max_yvalue % 2 == 1):
            # make one value odd and the other even
            max_yvalue += 1

        # plotly output
        if image_format == 'plotly':
            self.plotly_scatter(plot_filename, corr_matrix, plot_title=plot_title, minXVal=min_xvalue, maxXVal=max_xvalue, minYVal=min_yvalue, maxYVal=max_yvalue)
            return

        # indices of the upper triangle, including the diagonal
        rows, cols = np.triu_indices(num_samples)

        for index in range(len(rows)):
            row = rows[index]
            col = cols[index]
            if row == col:
                # add titles as
                # empty plot in the diagonal
                ax = fig.add_subplot(grids[row, col])
                ax.text(0.5, 0.5, self.labels[row],
                        verticalalignment='center',
                        horizontalalignment='center',
                        fontsize=10, fontweight='bold',
                        transform=ax.transAxes)
                ax.set_axis_off()
                continue

            ax = fig.add_subplot(grids[row, col])

            vector1 = self.matrix[:, row]
            vector2 = self.matrix[:, col]

            # annotate the panel with the correlation coefficient
            ax.text(0.2, 0.8, "{}={:.2f}".format(self.corr_method,
                                                 corr_matrix[row, col]),
                    horizontalalignment='left',
                    transform=ax.transAxes)
            # hide tick marks by default; re-enabled below on edge panels
            ax.get_yaxis().set_tick_params(
                which='both',
                left=False,
                right=False,
                direction='out')

            ax.get_xaxis().set_tick_params(
                which='both',
                top=False,
                bottom=False,
                direction='out')
            ax.get_xaxis().set_tick_params(
                which='major',
                labelrotation=45)

            # only the rightmost column keeps y tick labels (on the right)
            if col != num_samples - 1:
                ax.set_yticklabels([])
            else:
                ax.yaxis.tick_right()
                ax.get_yaxis().set_tick_params(
                    which='both',
                    left=False,
                    right=True,
                    direction='out')
            # only panels adjacent to the diagonal keep x tick labels
            if col - row == 1:
                ax.xaxis.tick_bottom()
                ax.get_xaxis().set_tick_params(
                    which='both',
                    top=False,
                    bottom=True,
                    direction='out')
                ax.get_xaxis().set_tick_params(
                    which='major',
                    labelrotation=45)

            else:
                ax.set_xticklabels([])

            ax.set_xlim(min_xvalue, max_xvalue)
            ax.set_ylim(min_yvalue, max_yvalue)
            # 2D histogram; cmin=0.1 leaves empty bins unfilled
            ax.hist2d(vector2, vector1, bins=200, cmin=0.1)

        plt.savefig(plot_filename, format=image_format)
        plt.close()
+
+ def plotly_pca(self, plotFile, Wt, pvar, PCs, eigenvalues, cols, plotTitle):
+ """
+ A plotly version of plot_pca, that's called by it to do the actual plotting
+ """
+ fig = go.Figure()
+ fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1])}
+ fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1])}
+ fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'title': 'Principal Component'}
+ fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Eigenvalue', 'rangemode': 'tozero', 'showgrid': False}
+ fig['layout']['yaxis3'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Cumulative variability', 'rangemode': 'tozero', 'side': 'right', 'overlaying': 'y2'}
+ fig['layout'].update(title=plotTitle)
+
+ # PCA
+ if cols is not None:
+ colors = itertools.cycle(cols)
+ n = len(self.labels)
+ data = []
+ for i in range(n):
+ trace = go.Scatter(x=[Wt[PCs[0] - 1, i]],
+ y=[Wt[PCs[1] - 1, i]],
+ mode='marker',
+ xaxis='x1',
+ yaxis='y1',
+ name=self.labels[i])
+ trace['marker'].update(size=20)
+ if cols is not None:
+ trace['marker'].update(color=next(colors))
+ data.append(trace)
+
+ # Scree plot
+ trace = go.Bar(showlegend=False,
+ name='Eigenvalues',
+ x=range(1, n + 1),
+ y=eigenvalues[:n],
+ xaxis='x2',
+ yaxis='y2')
+ data.append(trace)
+
+ # Cumulative variability
+ trace = go.Scatter(showlegend=False,
+ x=range(1, n + 1),
+ y=pvar.cumsum()[:n],
+ mode='lines+markers',
+ name='Cumulative variability',
+ xaxis='x2',
+ yaxis='y3',
+ line={'color': 'red'},
+ marker={'symbol': 'circle-open-dot', 'color': 'black'})
+ data.append(trace)
+
+ annos = []
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'PCA', 'y': 1.0, 'x': 0.25, 'font': {'size': 16}, 'showarrow': False})
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'Scree plot', 'y': 1.0, 'x': 0.75, 'font': {'size': 16}, 'showarrow': False})
+
+ fig.add_traces(data)
+ fig['layout']['annotations'] = annos
+ offline.plot(fig, filename=plotFile, auto_open=False)
+
    def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=None, log1p=False, plotWidth=5, plotHeight=10, cols=None, marks=None):
        """
        Plot the PCA of a matrix

        Computes an SVD-based PCA of the (optionally filtered, transformed
        and/or transposed) matrix and, when plot_filename is given, renders
        a PCA scatter plot plus a scree plot.

        Parameters
        ----------
        plot_filename : str or None
            Output file; when None only the decomposition is computed.
        PCs : list of int
            The two 1-based principal components to plot.
        plot_title : str
            Title of the PCA panel.
        image_format : str or None
            'plotly' routes to plotly_pca; anything else is passed to
            matplotlib's savefig.
        log1p : bool
            NOTE(review): accepted but never read in this method — confirm
            whether it is intentional (log2 is driven by self.log2 instead).
        plotWidth, plotHeight : int
            Figure size in inches.
        cols : list or None
            Optional per-sample colors.
        marks : list or None
            Optional per-sample marker symbols.

        Returns the matrix of plotted values.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(plotWidth, plotHeight))

        # Filter: in transpose mode, drop zero-variance rows and keep only
        # the self.ntop most variable rows
        m = self.matrix
        rvs = m.var(axis=1)
        if self.transpose:
            m = m[np.nonzero(rvs)[0], :]
            rvs = rvs[np.nonzero(rvs)[0]]
            if self.ntop > 0 and m.shape[0] > self.ntop:
                m = m[np.argpartition(rvs, -self.ntop)[-self.ntop:], :]
                rvs = rvs[np.argpartition(rvs, -self.ntop)[-self.ntop:]]

        # log2 (if requested)
        # NOTE(review): this rebinds self.matrix, but the SVD below runs on
        # 'm', which may still reference the pre-log2 array — confirm
        # whether log2/rowCenter are meant to affect the decomposition.
        if self.log2:
            self.matrix = np.log2(self.matrix + 0.01)

        # Row center / transpose
        if self.rowCenter and not self.transpose:
            _ = self.matrix.mean(axis=1)
            self.matrix -= _[:, None]
        if self.transpose:
            m = m.T

        # Center and scale
        m2 = (m - np.mean(m, axis=0))
        m2 /= np.std(m2, axis=0, ddof=1)  # Use the unbiased std. dev.

        # SVD
        U, s, Vh = np.linalg.svd(m2, full_matrices=False, compute_uv=True)  # Is full_matrices ever needed?

        # % variance, eigenvalues
        eigenvalues = s**2
        variance = eigenvalues / float(np.max([1, m2.shape[1] - 1]))
        pvar = variance / variance.sum()

        # Weights/projections
        Wt = Vh
        if self.transpose:
            # Use the projected coordinates for the transposed matrix
            Wt = np.dot(m2, Vh.T).T

        if plot_filename is not None:
            # cap the number of scree-plot bars at the number of eigenvalues
            n = n_bars = len(self.labels)
            if eigenvalues.size < n:
                n_bars = eigenvalues.size
            markers = itertools.cycle(matplotlib.markers.MarkerStyle.filled_markers)
            if cols is not None:
                colors = itertools.cycle(cols)
            else:
                colors = itertools.cycle(plt.cm.gist_rainbow(np.linspace(0, 1, n)))

            if marks is not None:
                markers = itertools.cycle(marks)

            if image_format == 'plotly':
                self.plotly_pca(plot_filename, Wt, pvar, PCs, eigenvalues, cols, plot_title)
            else:
                # reference lines through the origin
                ax1.axhline(y=0, color="black", linestyle="dotted", zorder=1)
                ax1.axvline(x=0, color="black", linestyle="dotted", zorder=2)
                for i in range(n):
                    color = next(colors)
                    marker = next(markers)
                    # colormap colors come back as RGBA arrays; convert for scatter
                    if isinstance(color, np.ndarray):
                        color = pltcolors.to_hex(color, keep_alpha=True)
                    ax1.scatter(Wt[PCs[0] - 1, i], Wt[PCs[1] - 1, i],
                                marker=marker, color=color, s=150, label=self.labels[i], zorder=i + 3)
                if plot_title == '':
                    ax1.set_title('PCA')
                else:
                    ax1.set_title(plot_title)
                ax1.set_xlabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1]))
                ax1.set_ylabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1]))
                lgd = ax1.legend(scatterpoints=1, loc='center left', borderaxespad=0.5,
                                 bbox_to_anchor=(1, 0.5),
                                 prop={'size': 12}, markerscale=0.9)

                # Scree plot
                ind = np.arange(n_bars)  # the x locations for the groups
                width = 0.35  # the width of the bars

                # bar alignment conventions changed in matplotlib 2.0
                if mpl.__version__ >= "2.0.0":
                    ax2.bar(2 * width + ind, eigenvalues[:n_bars], width * 2)
                else:
                    ax2.bar(width + ind, eigenvalues[:n_bars], width * 2)
                ax2.set_ylabel('Eigenvalue')
                ax2.set_xlabel('Principal Component')
                ax2.set_title('Scree plot')
                ax2.set_xticks(ind + width * 2)
                ax2.set_xticklabels(ind + 1)

                # cumulative variability on a twin axis (0..1 scale)
                ax3 = ax2.twinx()
                ax3.axhline(y=1, color="black", linestyle="dotted")
                ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "r-")
                ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "wo", markeredgecolor="black")
                ax3.set_ylim([0, 1.05])
                ax3.set_ylabel('Cumulative variability')

                plt.subplots_adjust(top=3.85)
                plt.tight_layout()
                plt.savefig(plot_filename, format=image_format, bbox_extra_artists=(lgd,), bbox_inches='tight')
                plt.close()

        return Wt, eigenvalues
diff --git a/deepTools/source/deeptools/correlation_heatmap.py b/deepTools/source/deeptools/correlation_heatmap.py
new file mode 100644
index 0000000000000000000000000000000000000000..58dbdfbc04f0ee25ef8038e22cd6d8ce81500df6
--- /dev/null
+++ b/deepTools/source/deeptools/correlation_heatmap.py
@@ -0,0 +1,110 @@
+from matplotlib import use as mplt_use
+mplt_use('Agg')
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+import numpy as np
+import scipy.cluster.hierarchy as sch
+from matplotlib import rcParams
+import matplotlib.colors as pltcolors
+import copy
+
+rcParams['pdf.fonttype'] = 42
+rcParams['svg.fonttype'] = 'none'
+old_settings = np.seterr(all='ignore')
+
+
def plot_correlation(corr_matrix, labels, plotFileName, vmax=None,
                     vmin=None, colormap='jet', image_format=None,
                     plot_numbers=False, plot_title=''):
    """Plot a clustered heatmap of a sample correlation matrix.

    A dendrogram from complete-linkage hierarchical clustering is drawn on
    the left, the correlation matrix (rows/columns re-ordered to match the
    clustering) in the middle, and a horizontal colorbar underneath.

    Parameters
    ----------
    corr_matrix : numpy array
        Square matrix of pairwise correlation values.
    labels : list
        One label per row/column of corr_matrix.
    plotFileName : str
        File the figure is saved to.
    vmax, vmin : float or None
        Color scale limits; default to 1 and 0 (or -1 when the matrix
        contains negative values).
    colormap : str
        Name of the matplotlib colormap to use.
    image_format : str or None
        Format passed to matplotlib's savefig (e.g. 'png', 'svg').
    plot_numbers : bool
        If True, write the correlation value into each heatmap cell.
    plot_title : str
        Optional title for the figure.
    """

    num_rows = corr_matrix.shape[0]

    # set a font size according to figure length
    if num_rows < 6:
        font_size = 14
    elif num_rows > 40:
        font_size = 5
    else:
        font_size = int(14 - 0.25 * num_rows)
    rcParams.update({'font.size': font_size})
    # set the minimum and maximum values
    if vmax is None:
        vmax = 1
    if vmin is None:
        vmin = 0 if corr_matrix.min() >= 0 else -1

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=(11, 9.5))
    if plot_title:
        plt.suptitle(plot_title)
    axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66])
    axdendro.set_axis_off()
    y_var = sch.linkage(corr_matrix, method='complete')
    z_var = sch.dendrogram(y_var, orientation='right',
                           link_color_func=lambda k: 'darkred')
    axdendro.set_xticks([])
    axdendro.set_yticks([])
    cmap = copy.copy(plt.get_cmap(colormap))

    # this line simply makes a new cmap, based on the original
    # colormap that goes from 0.0 to 0.9
    # This is done to avoid colors that
    # are too dark at the end of the range that do not offer
    # a good contrast between the correlation numbers that are
    # plotted on black.
    if plot_numbers:
        cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped",
                                                           cmap(np.linspace(0, 0.9, 10)))

    # values below vmin are shown in blue
    cmap.set_under((0., 0., 1.))
    # Plot distance matrix, re-ordered by the dendrogram leaf order.
    axmatrix = fig.add_axes([0.13, 0.1, 0.6, 0.7])
    index = z_var['leaves']
    corr_matrix = corr_matrix[index, :]
    corr_matrix = corr_matrix[:, index]
    img_mat = axmatrix.pcolormesh(corr_matrix,
                                  edgecolors='black',
                                  cmap=cmap,
                                  vmax=vmax,
                                  vmin=vmin)
    axmatrix.set_xlim(0, num_rows)
    axmatrix.set_ylim(0, num_rows)

    axmatrix.yaxis.tick_right()
    axmatrix.set_yticks(np.arange(corr_matrix.shape[0]) + 0.5)
    axmatrix.set_yticklabels(np.array(labels).astype('str')[index])

    axmatrix.xaxis.set_tick_params(labeltop=True)
    axmatrix.xaxis.set_tick_params(labelbottom=False)
    axmatrix.set_xticks(np.arange(corr_matrix.shape[0]) + 0.5)
    axmatrix.set_xticklabels(np.array(labels).astype('str')[index],
                             rotation=45,
                             ha='left')

    # hide the tick marks themselves (labels stay)
    axmatrix.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False)

    axmatrix.tick_params(
        axis='y',
        which='both',
        left=False,
        right=False)

    # Plot colorbar.
    axcolor = fig.add_axes([0.13, 0.065, 0.6, 0.02])
    cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
    cobar.solids.set_edgecolor("face")
    if plot_numbers:
        for row in range(num_rows):
            for col in range(num_rows):
                axmatrix.text(row + 0.5, col + 0.5,
                              "{:.2f}".format(corr_matrix[row, col]),
                              ha='center', va='center')

    fig.savefig(plotFileName, format=image_format)
    # Bug fix: matplotlib Figure objects have no close() method, so the
    # original fig.close() raised AttributeError after saving. Release the
    # figure through pyplot instead.
    plt.close(fig)
diff --git a/deepTools/source/deeptools/countReadsPerBin.py b/deepTools/source/deeptools/countReadsPerBin.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e6c78293bc44b24a2a9b285ca7db050b1510b6a
--- /dev/null
+++ b/deepTools/source/deeptools/countReadsPerBin.py
@@ -0,0 +1,1033 @@
+import shutil
+import os
+import time
+import sys
+import multiprocessing
+import numpy as np
+
+# deepTools packages
+import deeptools.utilities
+from deeptools import bamHandler
+from deeptools import mapReduce
+from deeptoolsintervals import GTF
+import pyBigWig
+
+debug = 0
+old_settings = np.seterr(all='ignore')
+
+
def countReadsInRegions_wrapper(args):
    """
    Unpack a single argument tuple and forward it to
    CountReadsPerBin.count_reads_in_region.

    multiprocessing workers can only receive one picklable argument, so the
    tuple's first element is the CountReadsPerBin instance ('self') and the
    remaining elements are the positional arguments of
    count_reads_in_region.
    """
    instance = args[0]
    worker_args = args[1:]
    return CountReadsPerBin.count_reads_in_region(instance, *worker_args)
+
+
+class CountReadsPerBin(object):
+
+ r"""Collects coverage over multiple bam files using multiprocessing
+
+ This function collects read counts (coverage) from several bam files and returns
+ an numpy array with the results. This class uses multiprocessing to compute the coverage.
+
+ Parameters
+ ----------
+ bamFilesList : list
+ List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']
+
+ binLength : int
+ Length of the window/bin. This value is overruled by ``bedFile`` if present.
+
+ numberOfSamples : int
+ Total number of samples. The genome is divided into ``numberOfSamples``, each
+ with a window/bin length equal to ``binLength``. This value is overruled
+ by ``stepSize`` in case such value is present and by ``bedFile`` in which
+ case the number of samples and bins are defined in the bed file
+
+ numberOfProcessors : int
+ Number of processors to use. Default is 4
+
+ verbose : bool
+ Output messages. Default: False
+
+ region : str
+ Region to limit the computation in the form chrom:start:end.
+
+ bedFile : list of file_handles.
+ Each file handle corresponds to a bed file containing the regions for which to compute the coverage. This option
+ overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.
+
+ blackListFileName : str
+ A string containing a BED file with blacklist regions.
+
+ extendReads : bool, int
+
+ Whether coverage should be computed for the extended read length (i.e. the region covered
+ by the two mates or the regions expected to be covered by single-reads).
+ If the value is 'int', then then this is interpreted as the fragment length to extend reads
+ that are not paired. For Illumina reads, usual values are around 300.
+ This value can be determined using the peak caller MACS2 or can be
        approximated by the fragment lengths computed when preparing the library for sequencing. If the value
        of the variable is true and no value is given, the fragment size is sampled from the library, but
        only if the library is paired-end. Default: False
+
+
+ minMappingQuality : int
+ Reads of a mapping quality less than the give value are not considered. Default: None
+
+ ignoreDuplicates : bool
+ Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
+ to be excluded. Default: false
+
+ chrToSkip: list
+ List with names of chromosomes that do not want to be included in the coverage computation.
+ This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').
+
+ stepSize : int
+ the positions for which the coverage is computed are defined as follows:
+ ``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
+ the coverage at each base pair. If the stepSize is equal to the
        binLength then the coverage is computed for consecutive bins. If stepSize is
        smaller than the binLength, then the bins will overlap.
+
+ center_read : bool
+ Determines if reads should be centered with respect to the fragment length.
+
+ samFlag_include : int
+ Extracts only those reads having the SAM flag. For example, to get only
+ reads that are the first mates a samFlag of 64 could be used. Similarly, the
+ samFlag_include can be used to select only reads mapping on the reverse strand
+ or to get only properly paired reads.
+
+ samFlag_exclude : int
+ Removes reads that match the SAM flag. For example to get all reads
        that map to the forward strand a samFlag_exclude of 16 should be used, which
        translates into excluding all reads that map to the reverse strand.
+
+ zerosToNans : bool
+ If true, zero values encountered are transformed to Nans. Default false.
+
+ skipZeroOverZero : bool
+ If true, skip bins where all input BAM files have no coverage (only applicable to bamCompare).
+
+ minFragmentLength : int
+ If greater than 0, fragments below this size are excluded.
+
+ maxFragmentLength : int
+ If greater than 0, fragments above this size are excluded.
+
+ out_file_for_raw_data : str
+ File name to save the raw counts computed
+
+ statsList : list
+ For each BAM file in bamFilesList, the associated per-chromosome statistics returned by openBam
+
+ mappedList : list
+ For each BAM file in bamFilesList, the number of mapped reads in the file.
+
+ bed_and_bin : boolean
+ If true AND a bedFile is given, compute coverage of each bin of the given size in each region of bedFile
+
+ genomeChunkSize : int
+ If not None, the length of the genome used for multiprocessing.
+
+ Returns
+ -------
+ numpy array
+
+ Each row correspond to each bin/bed region and each column correspond to each of
+ the bamFiles.
+
+
+ Examples
+ --------
+
+ The test data contains reads for 200 bp.
+
+ >>> test = Tester()
+
+ The transpose function is used to get a nicer looking output.
+ The first line corresponds to the number of reads per bin in bam file 1
+
+ >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 50, 4)
+ >>> np.transpose(c.run())
+ array([[0., 0., 1., 1.],
+ [0., 1., 1., 2.]])
+ """
+
+ def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
+ verbose=False, region=None,
+ bedFile=None, extendReads=False,
+ genomeChunkSize=None,
+ blackListFileName=None,
+ minMappingQuality=None,
+ ignoreDuplicates=False,
+ chrsToSkip=[],
+ stepSize=None,
+ center_read=False,
+ samFlag_include=None,
+ samFlag_exclude=None,
+ zerosToNans=False,
+ skipZeroOverZero=False,
+ smoothLength=0,
+ minFragmentLength=0,
+ maxFragmentLength=0,
+ out_file_for_raw_data=None,
+ bed_and_bin=False,
+ statsList=[],
+ mappedList=[]):
+
+ self.bamFilesList = bamFilesList
+ self.binLength = binLength
+ self.numberOfSamples = numberOfSamples
+ self.blackListFileName = blackListFileName
+ self.statsList = statsList
+ self.mappedList = mappedList
+ self.skipZeroOverZero = skipZeroOverZero
+ self.bed_and_bin = bed_and_bin
+ self.genomeChunkSize = genomeChunkSize
+
+ if extendReads and len(bamFilesList):
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
+ frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
+ return_lengths=False,
+ blackListFileName=blackListFileName,
+ numberOfProcessors=numberOfProcessors,
+ verbose=verbose)
+ if extendReads is True:
+ # try to guess fragment length if the bam file contains paired end reads
+ if frag_len_dict:
+ self.defaultFragmentLength = int(frag_len_dict['median'])
+ else:
+ exit("*ERROR*: library is not paired-end. Please provide an extension length.")
+ if verbose:
+ print(("Fragment length based on paired en data "
+ "estimated to be {}".format(frag_len_dict['median'])))
+
+ elif extendReads < read_len_dict['median']:
+ sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
+ "Reads will not be extended.\n".format(int(read_len_dict['median'])))
+ self.defaultFragmentLength = 'read length'
+
+ elif extendReads > 2000:
+ exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads))
+ else:
+ self.defaultFragmentLength = int(extendReads)
+
+ else:
+ self.defaultFragmentLength = 'read length'
+
+ self.numberOfProcessors = numberOfProcessors
+ self.verbose = verbose
+ self.region = region
+ self.bedFile = bedFile
+ self.minMappingQuality = minMappingQuality
+ self.ignoreDuplicates = ignoreDuplicates
+ self.chrsToSkip = chrsToSkip
+ self.stepSize = stepSize
+ self.center_read = center_read
+ self.samFlag_include = samFlag_include
+ self.samFlag_exclude = samFlag_exclude
+ self.minFragmentLength = minFragmentLength
+ self.maxFragmentLength = maxFragmentLength
+ self.zerosToNans = zerosToNans
+ self.smoothLength = smoothLength
+
+ if out_file_for_raw_data:
+ self.save_data = True
+ self.out_file_for_raw_data = out_file_for_raw_data
+ else:
+ self.save_data = False
+ self.out_file_for_raw_data = None
+
+ # check that wither numberOfSamples or stepSize are set
+ if numberOfSamples is None and stepSize is None and bedFile is None:
+ raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")
+
+ if self.defaultFragmentLength != 'read length':
+ self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
+ else:
+ self.maxPairedFragmentLength = 1000
+ if self.maxFragmentLength > 0:
+ self.maxPairedFragmentLength = self.maxFragmentLength
+
+ if len(self.mappedList) == 0:
+ try:
+ for fname in self.bamFilesList:
+ bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors)
+ self.mappedList.append(mapped)
+ self.statsList.append(stats)
+ bam.close()
+ except:
+ self.mappedList = []
+ self.statsList = []
+
+ def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths):
+ # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
+ # workers for analysis. If too short, too much time is spent loading the files
+ # if too long, some processors end up free.
+ # the following values are empirical
+ if self.stepSize is None:
+ if self.region is None:
+ self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
+ else:
+ # compute the step size, based on the number of samples
+ # and the length of the region studied
+ (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
+ self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)
+
+ # number of samples is better if large
+ if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
+ min_num_of_samples = int(genomeSize / np.mean(chrLengths))
+ raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))
+
+ max_mapped = 0
+ if len(self.mappedList) > 0:
+ max_mapped = max(self.mappedList)
+
+ # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
+ if max_mapped == 0:
+ chunkSize = 10000 * self.binLength
+ self.stepSize = self.binLength
+ else:
+ reads_per_bp = float(max_mapped) / genomeSize
+ chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandles)))
+
+ # Ensure that chunkSize is always at least self.stepSize
+ if chunkSize < self.stepSize:
+ chunkSize = self.stepSize
+
+ # Ensure that chunkSize is always at least self.binLength
+ if self.binLength and chunkSize < self.binLength:
+ chunkSize = self.binLength
+
+ return chunkSize
+
    def run(self, allArgs=None):
        """
        Compute the coverage over the whole genome (or the configured
        region/BED regions) across all input files using mapReduce.

        Parameters
        ----------
        allArgs : argparse.Namespace or None
            Full command-line namespace; only used to extract GTF-related
            options via deeptools.utilities.gtfOptions.

        Returns
        -------
        numpy array
            One row per bin/region and one column per file in
            bamFilesList. Calls sys.exit with a message when no coverage
            could be computed.
        """
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                # openBam exits for fatally broken input; propagate that
                sys.exit(sys.exc_info()[1])
            except:
                # not a BAM file; assume bigWig input
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        # a genome chunk length is only needed in bin mode (no BED file)
        chunkSize = None
        if self.bedFile is None:
            if self.genomeChunkSize is None:
                chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
            else:
                chunkSize = self.genomeChunkSize

        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromsizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            # stack the per-chunk results into one bins x files array
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            # concatenate raises ValueError when every chunk came back empty
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
+
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of list of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If not bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array that as rows each bin
            and as columns each bam file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[0., 0.],
               [0., 1.],
               [1., 1.],
               [1., 2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None and bed_regions_list is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        # open all files; fall back to bigWig when a file is not a valid BAM
        bam_handles = []
        for fname in self.bamFilesList:
            try:
                bam_handles.append(bamHandler.openBam(fname))
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                bam_handles.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            if self.bed_and_bin:
                # coverage per bin of size binLength inside each BED region
                transcriptsToConsider.append([(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list])
            else:
                transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                # consecutive bins: a single (start, end, binLength) tuple
                # covers the whole interval
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    # drop bins overlapping blacklisted regions
                    if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for bam in bam_handles:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None and not self.bed_and_bin:
                    # one value per BED region: the summed coverage
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        # rows = bins, columns = bam files (column-major fill matches the
        # per-file loop order above)
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        if self.save_data:
            # write a bedgraph-like line per region (or per bin)
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    # (start, end) tuples: one line per BED region
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    # (start, end, binLength) tuples: one line per bin
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, min(startPos + exon[2], exon[1])))
                            _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
+
+ def get_coverage_of_region(self, bamHandle, chrom, regions,
+ fragmentFromRead_func=None):
+ """
+ Returns a numpy array that corresponds to the number of reads
+ that overlap with each tile.
+
+ >>> test = Tester()
+ >>> import pysam
+ >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)
+
+ For this case the reads are length 36. The number of overlapping
+ read fragments is 4 and 5 for the positions tested.
+
+ >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
+ ... [(5000833, 5000834), (5000834, 5000835)])
+ array([4., 5.])
+
+ In the following example a paired read is extended to the fragment length which is 100
+ The first mate starts at 5000000 and the second at 5000064. Each mate is
+ extended to the fragment length *independently*.
+ At position 5000090-5000100 one fragment of length 100 overlaps, and after position 5000101
+ there should be zero reads.
+
+ >>> c.zerosToNans = True
+ >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
+ ... [(5000090, 5000100), (5000100, 5000110)])
+ array([ 1., nan])
+
+ In the following case the reads length is 50. Reads are not extended.
+
+ >>> c.extendReads=False
+ >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
+ array([1., 2., 2.])
+
+
+ """
+ if not fragmentFromRead_func:
+ fragmentFromRead_func = self.get_fragment_from_read
+ nbins = len(regions)
+ if len(regions[0]) == 3:
+ nbins = 0
+ for reg in regions:
+ nbins += (reg[1] - reg[0]) // reg[2]
+ if (reg[1] - reg[0]) % reg[2] > 0:
+ nbins += 1
+ coverages = np.zeros(nbins, dtype='float64')
+
+ if self.defaultFragmentLength == 'read length':
+ extension = 0
+ else:
+ extension = self.maxPairedFragmentLength
+
+ blackList = None
+ if self.blackListFileName is not None:
+ blackList = GTF(self.blackListFileName)
+
+ vector_start = 0
+ for idx, reg in enumerate(regions):
+ if len(reg) == 3:
+ tileSize = int(reg[2])
+ nRegBins = (reg[1] - reg[0]) // tileSize
+ if (reg[1] - reg[0]) % tileSize > 0:
+ # Don't eliminate small bins! Issue 887
+ nRegBins += 1
+ else:
+ nRegBins = 1
+ tileSize = int(reg[1] - reg[0])
+
+ # Blacklisted regions have a coverage of 0
+ if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
+ continue
+ regStart = int(max(0, reg[0] - extension))
+ regEnd = reg[1] + int(extension)
+
+ # If alignments are extended and there's a blacklist, ensure that no
+ # reads originating in a blacklist are fetched
+ if blackList and reg[0] > 0 and extension > 0:
+ o = blackList.findOverlaps(chrom, regStart, reg[0])
+ if o is not None and len(o) > 0:
+ regStart = o[-1][1]
+ o = blackList.findOverlaps(chrom, reg[1], regEnd)
+ if o is not None and len(o) > 0:
+ regEnd = o[0][0]
+
+ start_time = time.time()
+ # caching seems faster. TODO: profile the function
+ c = 0
+ if chrom not in bamHandle.references:
+ raise NameError("chromosome {} not found in bam file".format(chrom))
+
+ prev_pos = set()
+ lpos = None
+ # of previous processed read pair
+ for read in bamHandle.fetch(chrom, regStart, regEnd):
+ if read.is_unmapped:
+ continue
+ if self.minMappingQuality and read.mapq < self.minMappingQuality:
+ continue
+
+ # filter reads based on SAM flag
+ if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
+ continue
+ if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
+ continue
+
+ # Fragment lengths
+ tLen = deeptools.utilities.getTLen(read)
+ if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
+ continue
+ if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
+ continue
+
+ # get rid of duplicate reads that have same position on each of the
+ # pairs
+ if self.ignoreDuplicates:
+ # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
+ if tLen >= 0:
+ s = read.pos
+ e = s + tLen
+ else:
+ s = read.pnext
+ e = s - tLen
+ if read.reference_id != read.next_reference_id:
+ e = read.pnext
+ if lpos is not None and lpos == read.reference_start \
+ and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
+ continue
+ if lpos != read.reference_start:
+ prev_pos.clear()
+ lpos = read.reference_start
+ prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
+
+ # since reads can be split (e.g. RNA-seq reads) each part of the
+ # read that maps is called a position block.
+ try:
+ position_blocks = fragmentFromRead_func(read)
+ except TypeError:
+ # the get_fragment_from_read function returns None in some cases.
+ # Those cases are to be skipped, hence the continue line.
+ continue
+
+ last_eIdx = None
+ for fragmentStart, fragmentEnd in position_blocks:
+ if fragmentEnd is None or fragmentStart is None:
+ continue
+ fragmentLength = fragmentEnd - fragmentStart
+ if fragmentLength == 0:
+ continue
+ # skip reads that are not in the region being
+ # evaluated.
+ if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
+ continue
+
+ if fragmentStart < reg[0]:
+ fragmentStart = reg[0]
+ if fragmentEnd > reg[0] + len(coverages) * tileSize:
+ fragmentEnd = reg[0] + len(coverages) * tileSize
+
+ sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
+ eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
+ if last_eIdx is not None:
+ sIdx = max(last_eIdx, sIdx)
+ if sIdx >= eIdx:
+ continue
+ sIdx = int(sIdx)
+ eIdx = int(eIdx)
+ coverages[sIdx:eIdx] += 1
+ last_eIdx = eIdx
+
+ c += 1
+
+ if self.verbose:
+ endTime = time.time()
+ print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
+ multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))
+
+ vector_start += nRegBins
+
+ # change zeros to NAN
+ if self.zerosToNans:
+ coverages[coverages == 0] = np.nan
+
+ return coverages
+
+ def getReadLength(self, read):
+ return len(read)
+
+ @staticmethod
+ def is_proper_pair(read, maxPairedFragmentLength):
+ """
+ Checks if a read is a proper pair, meaning that both mates face each other, are in
+ the same chromosome, and are not too far away. The SAM flag for proper pair cannot
+ always be trusted. Note that if the fragment size is > maxPairedFragmentLength (~2kb
+ usually) that False will be returned.
+ :return: bool
+
+ >>> import pysam
+ >>> import os
+ >>> from deeptools.countReadsPerBin import CountReadsPerBin as cr
+ >>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
+ >>> bam = pysam.AlignmentFile("{}/test_proper_pair_filtering.bam".format(root))
+ >>> iter = bam.fetch()
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "keep" read
+ True
+ >>> cr.is_proper_pair(read, 200) # "keep" read, but maxPairedFragmentLength is too short
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "improper pair"
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "mismatch chr"
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "same orientation1"
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "same orientation2"
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "rev first"
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "rev first OK"
+ True
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "for first"
+ False
+ >>> read = next(iter)
+ >>> cr.is_proper_pair(read, 1000) # "for first"
+ True
+ """
+ if not read.is_proper_pair:
+ return False
+ if read.reference_id != read.next_reference_id:
+ return False
+ if abs(read.template_length) > maxPairedFragmentLength:
+ return False
+ # check that the mates face each other (inward)
+ if read.is_reverse is read.mate_is_reverse:
+ return False
+ if read.is_reverse:
+ if read.reference_start >= read.next_reference_start:
+ return True
+ else:
+ if read.reference_start <= read.next_reference_start:
+ return True
+ return False
+
+ def get_fragment_from_read(self, read):
+ """Get read start and end position of a read.
+ If given, the reads are extended as follows:
+ If reads are paired end, each read mate is extended to match
+ the fragment length, otherwise, a default fragment length
+ is used. If reads are split (given by the CIGAR string) then
+ the multiple positions of the read are returned.
+ When reads are extended the cigar information is
+ skipped.
+
+ Parameters
+ ----------
+ read: pysam object.
+
+ The following values are defined (for forward reads)::
+
+
+ |-- -- read.tlen -- --|
+ |-- read.alen --|
+ -----|===============>------------<==============|----
+ | | |
+ read.reference_start
+ read.reference_end read.pnext
+
+ and for reverse reads
+
+
+ |-- -- read.tlen -- --|
+ |-- read.alen --|
+ -----|===============>-----------<===============|----
+ | | |
+ read.pnext read.reference_start read.reference_end
+
+ This is a sketch of paired-end reads.
+
+ The function returns the fragment start and end, either
+ using the paired end information (if available) or
+ extending the read in the appropriate direction if this
+ is single-end.
+
+ Parameters
+ ----------
+ read : pysam read object
+
+
+ Returns
+ -------
+ list of tuples
+ [(fragment start, fragment end)]
+
+
+ >>> test = Tester()
+ >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True)
+ >>> c.defaultFragmentLength=100
+ >>> c.get_fragment_from_read(test.getRead("paired-forward"))
+ [(5000000, 5000100)]
+ >>> c.get_fragment_from_read(test.getRead("paired-reverse"))
+ [(5000000, 5000100)]
+ >>> c.defaultFragmentLength = 200
+ >>> c.get_fragment_from_read(test.getRead("single-forward"))
+ [(5001491, 5001691)]
+ >>> c.get_fragment_from_read(test.getRead("single-reverse"))
+ [(5001536, 5001736)]
+ >>> c.defaultFragmentLength = 'read length'
+ >>> c.get_fragment_from_read(test.getRead("single-forward"))
+ [(5001491, 5001527)]
+ >>> c.defaultFragmentLength = 'read length'
+ >>> c.extendReads = False
+ >>> c.get_fragment_from_read(test.getRead("paired-forward"))
+ [(5000000, 5000036)]
+
+ Tests for read centering.
+
+ >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True, center_read=True)
+ >>> c.defaultFragmentLength = 100
+ >>> assert c.get_fragment_from_read(test.getRead("paired-forward")) == [(5000032, 5000068)]
+ >>> c.defaultFragmentLength = 200
+ >>> assert c.get_fragment_from_read(test.getRead("single-reverse")) == [(5001618, 5001654)]
+ """
+ # if no extension is needed, use pysam get_blocks
+ # to identify start and end reference positions.
+ # get_blocks return a list of start and end positions
+ # based on the CIGAR if skipped regions are found.
+ # E.g for a cigar of 40M260N22M
+ # get blocks return two elements for the first 40 matches
+ # and the for the last 22 matches.
+ if self.defaultFragmentLength == 'read length':
+ return read.get_blocks()
+
+ else:
+ if self.is_proper_pair(read, self.maxPairedFragmentLength):
+ if read.is_reverse:
+ fragmentStart = read.next_reference_start
+ fragmentEnd = read.reference_end
+ else:
+ fragmentStart = read.reference_start
+ # the end of the fragment is defined as
+ # the start of the forward read plus the insert length
+ fragmentEnd = read.reference_start + abs(read.template_length)
+
+ # Extend using the default fragment length
+ else:
+ if read.is_reverse:
+ fragmentStart = read.reference_end - self.defaultFragmentLength
+ fragmentEnd = read.reference_end
+ else:
+ fragmentStart = read.reference_start
+ fragmentEnd = read.reference_start + self.defaultFragmentLength
+
+ if self.center_read:
+ fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2
+ fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2)
+ fragmentEnd = fragmentStart + read.infer_query_length(always=False)
+
+ assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \
+ "end for read {}".format(read.query_name)
+ return [(fragmentStart, fragmentEnd)]
+
+ def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition):
+ """
+ Given a tile index position and a tile size (length), return new indices
+ over a larger range, called the smoothRange.
+ This region is centered on the tileIndex and spans on both sides
+ to cover the smoothRange. The smoothRange is trimmed in case it is less
+ than zero or greater than maxPosition ::
+
+
+ ---------------|==================|------------------
+ tileStart
+ |--------------------------------------|
+ | <-- smoothRange --> |
+ |
+ tileStart - (smoothRange-tileSize)/2
+
+ Test for a smooth range that spans 3 tiles.
+
+ Examples
+ --------
+
+ >>> c = CountReadsPerBin([], 1, 1, 1, 0)
+ >>> c.getSmoothRange(5, 1, 3, 10)
+ (4, 7)
+
+ Test smooth range truncated on start.
+
+ >>> c.getSmoothRange(0, 10, 30, 200)
+ (0, 2)
+
+ Test smooth range truncated on start.
+
+ >>> c.getSmoothRange(1, 10, 30, 4)
+ (0, 3)
+
+ Test smooth range truncated on end.
+
+ >>> c.getSmoothRange(5, 1, 3, 5)
+ (4, 5)
+
+ Test smooth range not multiple of tileSize.
+
+ >>> c.getSmoothRange(5, 10, 24, 10)
+ (4, 6)
+ """
+ smoothTiles = int(smoothRange / tileSize)
+ if smoothTiles == 1:
+ return (tileIndex, tileIndex + 1)
+
+ smoothTilesSide = float(smoothTiles - 1) / 2
+ smoothTilesLeft = int(np.ceil(smoothTilesSide))
+ smoothTilesRight = int(np.floor(smoothTilesSide)) + 1
+
+ indexStart = max(tileIndex - smoothTilesLeft, 0)
+ indexEnd = min(maxPosition, tileIndex + smoothTilesRight)
+ return (indexStart, indexEnd)
+
+
+def remove_row_of_zeros(matrix):
+ # remove rows containing all zeros or all nans
+ _mat = np.nan_to_num(matrix)
+ to_keep = _mat.sum(1) != 0
+ return matrix[to_keep, :]
+
+
+def estimateSizeFactors(m):
+ """
+ Compute size factors in the same way as DESeq2.
+ The inverse of that is returned, as it's then compatible with bamCoverage.
+
+ m : a numpy ndarray
+
+ >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
+ >>> sf = estimateSizeFactors(m)
+ >>> assert np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4)
+ >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
+ >>> sf = estimateSizeFactors(m)
+ >>> assert np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4)
+ """
+ loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1]
+ # Mask after computing the geometric mean
+ m = np.ma.masked_where(m <= 0, m)
+ loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans)
+ # DESeq2 ratio-based size factor
+ sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0))
+ return 1. / sf
+
+
+class Tester(object):
+
+ def __init__(self):
+ """
+ The distribution of reads between the two bam files is as follows.
+
+ They cover 200 bp
+
+ 0 100 200
+ |------------------------------------------------------------|
+ A ===============
+ ===============
+
+
+ B =============== ===============
+ ===============
+ ===============
+ """
+ self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
+ # self.root = "./test/test_data/"
+ self.bamFile1 = self.root + "testA.bam"
+ self.bamFile2 = self.root + "testB.bam"
+ self.bamFile_PE = self.root + "test_paired2.bam"
+ self.chrom = '3R'
+ global debug
+ debug = 0
+
+ def getRead(self, readType):
+ """ prepare arguments for test
+ """
+ bam = bamHandler.openBam(self.bamFile_PE)
+ if readType == 'paired-reverse':
+ read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0]
+ elif readType == 'single-forward':
+ read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0]
+ elif readType == 'single-reverse':
+ read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0]
+ else: # by default a forward paired read is returned
+ read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0]
+ return read
diff --git a/deepTools/source/deeptools/deeptools_list_tools.py b/deepTools/source/deeptools/deeptools_list_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..32dcf7021f295f3515fbad4841e37dcfec480639
--- /dev/null
+++ b/deepTools/source/deeptools/deeptools_list_tools.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+from importlib.metadata import version
+
+
+def parse_arguments(args=None):
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="""
+deepTools is a suite of python tools particularly developed for the efficient analysis of
+high-throughput sequencing data, such as ChIP-seq, RNA-seq or MNase-seq.
+
+Each tool should be called by its own name as in the following example:
+
+ $ bamCoverage -b reads.bam -o coverage.bw
+
+If you find deepTools useful for your research please cite as:
+
+Ramírez, Fidel, Devon P. Ryan, Björn Grüning, Vivek Bhardwaj, Fabian Kilpert,
+Andreas S. Richter, Steffen Heyne, Friederike Dündar,
+and Thomas Manke. 2016. "deepTools2: A next Generation Web Server for Deep-Sequencing
+Data Analysis." Nucleic Acids Research, April. doi:10.1093/nar/gkw257.
+
+
+
+[ Tools for BAM and bigWig file processing ]
+ multiBamSummary compute read coverages over bam files. Output used for plotCorrelation or plotPCA
+ multiBigwigSummary extract scores from bigwig files. Output used for plotCorrelation or plotPCA
+ correctGCBias corrects GC bias from bam file. Don't use it with ChIP data
+ bamCoverage computes read coverage per bins or regions
+ bamCompare computes log2 ratio and other operations of read coverage of two samples per bins or regions
+ bigwigCompare computes log2 ratio and other operations from bigwig scores of two samples per bins or regions
+ bigwigAverage computes average from bigwig scores of multiple samples per bins or regions
+ computeMatrix prepares the data from bigwig scores for plotting with plotHeatmap or plotProfile
+ alignmentSieve filters BAM alignments according to specified parameters, optionally producing a BEDPE file
+
+
+[ Tools for QC ]
+ plotCorrelation plots heatmaps or scatterplots of data correlation
+ plotPCA plots PCA
+ plotFingerprint plots the distribution of enriched regions
+ bamPEFragmentSize returns the read length and paired-end distance from a bam file
+ computeGCBias computes and plots the GC bias of a sample
+ plotCoverage plots a histogram of read coverage
+ estimateReadFiltering estimates the number of reads that will be filtered from a BAM file or files given certain criteria
+
+
+[Heatmaps and summary plots]
+ plotHeatmap plots one or multiple heatmaps of user selected regions over different genomic scores
+ plotProfile plots the average profile of user selected regions over different genomic scores
+ plotEnrichment plots the read/fragment coverage of one or more sets of regions
+
+[Miscellaneous]
+ computeMatrixOperations Modifies the output of computeMatrix in a variety of ways.
+
+
+For more information visit: http://deeptools.readthedocs.org
+""")
+
+ parser.add_argument('--version', action='version',
+ version='%(prog)s {}'.format(version('deeptools')))
+
+ return parser
+
+
+def process_args(args=None):
+ args = parse_arguments().parse_args(args)
+
+ return args
+
+
+def main(args=None):
+ if args is None and len(sys.argv) == 1:
+ args = ["--help"]
+ process_args(args)
diff --git a/deepTools/source/deeptools/estimateReadFiltering.py b/deepTools/source/deeptools/estimateReadFiltering.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c46a384198aae9469005f3f7ff42aeae00ec9f8
--- /dev/null
+++ b/deepTools/source/deeptools/estimateReadFiltering.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python
+import argparse
+import sys
+
+from deeptools import parserCommon, bamHandler, utilities
+from deeptools.mapReduce import mapReduce
+from deeptools.utilities import smartLabels
+from importlib.metadata import version
+
+
+def parseArguments():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="""
+This tool estimates the number of reads that would be filtered given a set of
+settings and prints this to the terminal. Further, it tracks the number of singleton reads. The following metrics will always be tracked regardless of what you specify (the order output also matches this):
+
+ * Total reads (including unmapped)
+ * Mapped reads
+ * Reads in blacklisted regions (--blackListFileName)
+
+The following metrics are estimated according to the --binSize and --distanceBetweenBins parameters
+ * Estimated mapped reads filtered (the total number of mapped reads filtered for any reason)
+ * Alignments with a below threshold MAPQ (--minMappingQuality)
+ * Alignments with at least one missing flag (--samFlagInclude)
+ * Alignments with undesirable flags (--samFlagExclude)
+ * Duplicates determined by deepTools (--ignoreDuplicates)
+ * Duplicates marked externally (e.g., by picard)
+ * Singletons (paired-end reads with only one mate aligning)
+ * Wrong strand (due to --filterRNAstrand)
+
+The sum of these may be more than the total number of reads. Note that alignments are sampled from bins of size --binSize spaced --distanceBetweenBins apart.
+""",
+ usage='estimateReadFiltering -b sample1.bam sample2.bam\n'
+ 'help: estimateReadFiltering -h / estimateReadFiltering --help'
+ )
+
+ required = parser.add_argument_group('Required arguments')
+ required.add_argument('--bamfiles', '-b',
+ metavar='FILE1 FILE2',
+ help='List of indexed bam files separated by spaces.',
+ nargs='+',
+ required=True)
+
+ general = parser.add_argument_group('General arguments')
+
+ general.add_argument('--outFile', '-o',
+ type=parserCommon.writableFile,
+ help='The file to write results to. By default, results are printed to the console')
+
+ general.add_argument('--sampleLabels',
+ help='Labels for the samples. The '
+ 'default is to use the file name of the '
+ 'sample. The sample labels should be separated '
+ 'by spaces and quoted if a label itself'
+ 'contains a space E.g. --sampleLabels label-1 "label 2" ',
+ nargs='+')
+
+ general.add_argument('--smartLabels',
+ action='store_true',
+ help='Instead of manually specifying labels for the input '
+ 'BAM files, this causes deepTools to use the '
+ 'file name after removing the path and extension.')
+
+ general.add_argument('--binSize', '-bs',
+ metavar='INT',
+ help='Length in bases of the window used to sample the genome. (Default: %(default)s)',
+ default=1000000,
+ type=int)
+
+ general.add_argument('--distanceBetweenBins', '-n',
+ metavar='INT',
+ help='To reduce the computation time, not every possible genomic '
+ 'bin is sampled. This option allows you to set the distance '
+ 'between bins actually sampled from. Larger numbers are sufficient '
+ 'for high coverage samples, while smaller values are useful for '
+ 'lower coverage samples. Note that if you specify a value that '
+ 'results in too few (<1000) reads sampled, the value will be '
+ 'decreased. (Default: %(default)s)',
+ default=10000,
+ type=int)
+
+ general.add_argument('--numberOfProcessors', '-p',
+ help='Number of processors to use. Type "max/2" to '
+ 'use half the maximum number of processors or "max" '
+ 'to use all available processors. (Default: %(default)s)',
+ metavar="INT",
+ type=parserCommon.numberOfProcessors,
+ default=1,
+ required=False)
+
+ general.add_argument('--verbose', '-v',
+ help='Set to see processing messages.',
+ action='store_true')
+
+ general.add_argument('--version', action='version',
+ version='%(prog)s {}'.format(version('deeptools')))
+
+ filtering = parser.add_argument_group('Optional arguments')
+
+ filtering.add_argument('--filterRNAstrand',
+ help='Selects RNA-seq reads (single-end or paired-end) in '
+ 'the given strand. (Default: %(default)s)',
+ choices=['forward', 'reverse'],
+ default=None)
+
+ filtering.add_argument('--ignoreDuplicates',
+ help='If set, reads that have the same orientation '
+ 'and start position will be considered only '
+ 'once. If reads are paired, the mate\'s position '
+ 'also has to coincide to ignore a read.',
+ action='store_true')
+
+ filtering.add_argument('--minMappingQuality',
+ metavar='INT',
+ help='If set, only reads that have a mapping '
+ 'quality score of at least this are '
+ 'considered.',
+ type=int)
+
+ filtering.add_argument('--samFlagInclude',
+ help='Include reads based on the SAM flag. For example, '
+ 'to get only reads that are the first mate, use a flag of 64. '
+ 'This is useful to count properly paired reads only once, '
+ 'as otherwise the second mate will be also considered for the '
+ 'coverage. (Default: %(default)s)',
+ metavar='INT',
+ default=None,
+ type=int,
+ required=False)
+
+ filtering.add_argument('--samFlagExclude',
+ help='Exclude reads based on the SAM flag. For example, '
+ 'to get only reads that map to the forward strand, use '
+ '--samFlagExclude 16, where 16 is the SAM flag for reads '
+ 'that map to the reverse strand. (Default: %(default)s)',
+ metavar='INT',
+ default=None,
+ type=int,
+ required=False)
+
+ filtering.add_argument('--blackListFileName', '-bl',
+ help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
+ metavar="BED file",
+ nargs="+",
+ required=False)
+
+ return parser
+
+
+def getFiltered_worker(arglist):
+ chrom, start, end, args = arglist
+ # Fix the bounds
+ if end - start > args.binSize and end - start > args.distanceBetweenBins:
+ end -= args.distanceBetweenBins
+ if end <= start:
+ end = start + 1
+
+ o = []
+ for fname in args.bamfiles:
+ fh = bamHandler.openBam(fname)
+ chromUse = utilities.mungeChromosome(chrom, fh.references)
+ prev_pos = set()
+ lpos = None
+
+ minMapq = 0
+ samFlagInclude = 0
+ samFlagExclude = 0
+ internalDupes = 0
+ externalDupes = 0
+ singletons = 0
+ filterRNAstrand = 0
+ nFiltered = 0
+ total = 0 # This is only used to estimate the percentage affected
+ for read in fh.fetch(chromUse, start, end):
+ filtered = 0
+ if read.pos < start:
+ # ensure that we never double count (in case distanceBetweenBins == 0)
+ continue
+
+ if read.flag & 4:
+ # Ignore unmapped reads, they were counted already
+ continue
+
+ if args.minMappingQuality and read.mapq < args.minMappingQuality:
+ filtered = 1
+ minMapq += 1
+ if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
+ filtered = 1
+ samFlagInclude += 1
+ if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
+ filtered = 1
+ samFlagExclude += 1
+ if args.ignoreDuplicates:
+ # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
+ if read.tlen >= 0:
+ s = read.pos
+ e = s + read.tlen
+ else:
+ s = read.pnext
+ e = s - read.tlen
+ if read.reference_id != read.next_reference_id:
+ e = read.pnext
+ if lpos is not None and lpos == read.reference_start \
+ and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
+ filtered = 1
+ internalDupes += 1
+ if lpos != read.reference_start:
+ prev_pos.clear()
+ lpos = read.reference_start
+ prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
+ if read.is_duplicate:
+ filtered = 1
+ externalDupes += 1
+ if read.is_paired and read.mate_is_unmapped:
+ filtered = 1
+ singletons += 1
+
+ # filterRNAstrand
+ if args.filterRNAstrand:
+ if read.is_paired:
+ if args.filterRNAstrand == 'forward':
+ if read.flag & 144 == 128 or read.flag & 96 == 64:
+ pass
+ else:
+ filtered = 1
+ filterRNAstrand += 1
+ elif args.filterRNAstrand == 'reverse':
+ if read.flag & 144 == 144 or read.flag & 96 == 96:
+ pass
+ else:
+ filtered = 1
+ filterRNAstrand += 1
+ else:
+ if args.filterRNAstrand == 'forward':
+ if read.flag & 16 == 16:
+ pass
+ else:
+ filtered = 1
+ filterRNAstrand += 1
+ elif args.filterRNAstrand == 'reverse':
+ if read.flag & 16 == 0:
+ pass
+ else:
+ filtered = 1
+ filterRNAstrand += 1
+
+ total += 1
+ nFiltered += filtered
+ fh.close()
+
+ # Append a tuple to the output
+ tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand)
+ o.append(tup)
+ return o
+
+
+def main(args=None):
+ args = parseArguments().parse_args(args)
+
+ if not args.sampleLabels and args.smartLabels:
+ args.sampleLabels = smartLabels(args.bamfiles)
+
+ if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
+ sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
+ sys.exit(1)
+
+ if args.outFile is None:
+ of = sys.stdout
+ else:
+ of = open(args.outFile, "w")
+
+ bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
+ mapped = [x[1] for x in bhs]
+ unmappedList = [x[2] for x in bhs]
+ bhs = [x[0] for x in bhs]
+
+ # Get the reads in blacklisted regions
+ if args.blackListFileName:
+ blacklisted = []
+ for bh in bhs:
+ blacklisted.append(utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors))
+ else:
+ blacklisted = [0] * len(bhs)
+
+ # Get the total and mapped reads
+ total = [x + y for x, y in list(zip(mapped, unmappedList))]
+
+ chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
+ for x in bhs:
+ x.close()
+
+ # Get the remaining metrics
+ res = mapReduce([args],
+ getFiltered_worker,
+ chrom_sizes,
+ genomeChunkLength=args.binSize + args.distanceBetweenBins,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose)
+
+ totals = [0] * len(args.bamfiles)
+ nFiltered = [0] * len(args.bamfiles)
+ MAPQs = [0] * len(args.bamfiles)
+ flagIncludes = [0] * len(args.bamfiles)
+ flagExcludes = [0] * len(args.bamfiles)
+ internalDupes = [0] * len(args.bamfiles)
+ externalDupes = [0] * len(args.bamfiles)
+ singletons = [0] * len(args.bamfiles)
+ rnaStrand = [0] * len(args.bamfiles)
+ for x in res:
+ for idx, r in enumerate(x):
+ totals[idx] += r[0]
+ nFiltered[idx] += r[1]
+ MAPQs[idx] += r[2]
+ flagIncludes[idx] += r[3]
+ flagExcludes[idx] += r[4]
+ internalDupes[idx] += r[5]
+ externalDupes[idx] += r[6]
+ singletons[idx] += r[7]
+ rnaStrand[idx] += r[8]
+
+ # Print some output
+ of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
+ for idx, _ in enumerate(args.bamfiles):
+ if args.sampleLabels:
+ of.write(args.sampleLabels[idx])
+ else:
+ of.write(args.bamfiles[idx])
+ of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))
+ # nFiltered
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = blacklisted[idx] + float(nFiltered[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # MAPQ
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(MAPQs[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # samFlagInclude
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(flagIncludes[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # samFlagExclude
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(flagExcludes[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # Internally determined duplicates
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(internalDupes[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # Externally marked duplicates
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(externalDupes[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # Singletons
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(singletons[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ # filterRNAstrand
+ metric = 0.0
+ if totals[idx] > 0:
+ metric = float(rnaStrand[idx]) / float(totals[idx]) * mapped[idx]
+ of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
+ of.write("\n")
+
+ if args.outFile is not None:
+ of.close()
+
+ return 0
diff --git a/deepTools/source/deeptools/estimateScaleFactor.py b/deepTools/source/deeptools/estimateScaleFactor.py
new file mode 100644
index 0000000000000000000000000000000000000000..97869a7bdbb82099fe00c74d345c57dcc4e413b4
--- /dev/null
+++ b/deepTools/source/deeptools/estimateScaleFactor.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+
+from deeptools.SES_scaleFactor import estimateScaleFactor
+from deeptools.parserCommon import numberOfProcessors
+from importlib.metadata import version
+debug = 0
+
+
+def parseArguments(args=None):
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='Given two BAM files, this estimates scaling factors '
+ '(bigger to smaller).',
+ usage='estimateScaleFactor -b sample1.bam sample2.bam\n'
+ 'help: estimateScaleFactor -h / estimateScaleFactor --help'
+ )
+
+ # define the arguments
+ parser.add_argument('--bamfiles', '-b',
+ metavar='list of bam files',
+ help='List of indexed BAM files, space delineated',
+ nargs='+',
+ required=True)
+
+ parser.add_argument('--ignoreForNormalization', '-ignore',
+ help='A comma-separated list of chromosome names, '
+ 'limited by quotes, '
+ 'containing those '
+ 'chromosomes that should be excluded '
+ 'during normalization computations. For example, '
+ '--ignoreForNormalization "chrX, chrM" ')
+
+ parser.add_argument('--sampleWindowLength', '-l',
+ help='Length in bases for a window used to '
+ 'sample the genome and compute the size or scaling '
+ 'factors',
+ default=1000,
+ type=int)
+
+ parser.add_argument('--numberOfSamples', '-n',
+ help='Number of samplings taken from the genome '
+ 'to compute the scaling factors',
+ default=100000,
+ type=int)
+
+ parser.add_argument('--normalizationLength', '-nl',
+ help='By default, data is normalized to 1 '
+ 'fragment per 100 bases. The expected value is an '
+ 'integer. For example, if normalizationLength '
+ 'is 1000, then the resulting scaling factor '
+ 'will cause the average coverage of the BAM file to '
+ 'have on average 1 fragment per kilobase',
+ type=int,
+ default=10)
+
+ parser.add_argument('--skipZeros',
+ help='If set, then zero counts that happen for *all* '
+ 'BAM files given are ignored. This will result in a '
+ 'reduced number of read counts than that specified '
+ 'in --numberOfSamples',
+ action='store_true',
+ required=False)
+
+ parser.add_argument('--numberOfProcessors', '-p',
+ help='Number of processors to use. The default is '
+ 'to use half the maximum number of processors.',
+ metavar="INT",
+ type=numberOfProcessors,
+ default="max/2",
+ required=False)
+
+ parser.add_argument('--verbose', '-v',
+ help='Set to see processing messages.',
+ action='store_true')
+
+ parser.add_argument('--version',
+ action='version',
+ version='%(prog)s {}'.format(version('deeptools')))
+
+ args = parser.parse_args(args)
+ if args.ignoreForNormalization:
+ args.ignoreForNormalization = [
+ x.strip() for x in args.ignoreForNormalization.split(',')
+ ]
+ else:
+ args.ignoreForNormalization = []
+ return args
+
+
+def main(args=None):
+ """
+ The algorithm samples the genome a number of times as specified
+ by the --numberOfSamples parameter to estimate scaling factors of
+ between to samples
+
+ """
+ args = parseArguments(args)
+ if len(args.bamfiles) > 2:
+ print("SES method to estimate scale factors only works for two samples")
+ exit(0)
+
+ sys.stderr.write("{:,} number of samples will be computed.\n".format(args.numberOfSamples))
+ sizeFactorsDict = estimateScaleFactor(args.bamfiles, args.sampleWindowLength,
+ args.numberOfSamples,
+ args.normalizationLength,
+ numberOfProcessors=args.numberOfProcessors,
+ chrsToSkip=args.ignoreForNormalization,
+ verbose=args.verbose)
+
+ for k, v in sizeFactorsDict.items():
+ print("{}: {}".format(k, v))
diff --git a/deepTools/source/deeptools/getFragmentAndReadSize.py b/deepTools/source/deeptools/getFragmentAndReadSize.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cf2dc8eba9ce6fa8a4a43713fa93885d61e42ca
--- /dev/null
+++ b/deepTools/source/deeptools/getFragmentAndReadSize.py
@@ -0,0 +1,166 @@
+import numpy as np
+
+# own tools
+from deeptools import bamHandler
+from deeptools import mapReduce
+
+old_settings = np.seterr(all='ignore')
+
+
+def getFragmentLength_wrapper(args):
+ return getFragmentLength_worker(*args)
+
+
+def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
+ """
+ Queries the reads at the given region for the distance between
+ reads and the read length
+
+ Parameters
+ ----------
+ chrom : str
+ chromosome name
+ start : int
+ region start
+ end : int
+ region end
+ bamFile : str
+ BAM file name
+ distanceBetweenBins : int
+ the number of bases at the end of each bin to ignore
+
+ Returns
+ -------
+ np.array
+ an np.array, where first column is fragment length, the
+ second is for read length
+ """
+ bam = bamHandler.openBam(bamFile)
+ end = max(start + 1, end - distanceBetweenBins)
+ if chrom in bam.references:
+ reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
+ for r in bam.fetch(chrom, start, end)
+ if r.is_proper_pair and r.is_read1 and not r.is_unmapped])
+ if not len(reads):
+ # if the previous operation produces an empty list
+ # it could be that the data is not paired, then
+ # we try without filtering
+ reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
+ for r in bam.fetch(chrom, start, end) if not r.is_unmapped])
+ else:
+ raise NameError("chromosome {} not found in bam file".format(chrom))
+
+ if not len(reads):
+ reads = np.array([]).reshape(0, 2)
+
+ return reads
+
+
+def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
+ binSize=50000, distanceBetweenBins=1000000,
+ numberOfProcessors=None, verbose=False):
+ """
+ Estimates the fragment length and read length through sampling
+
+ Parameters
+ ----------
+ bamFile : str
+ BAM file name
+ return_lengths : bool
+ numberOfProcessors : int
+ verbose : bool
+ binSize : int
+ distanceBetweenBins : int
+
+ Returns
+ -------
+ d : dict
+ tuple of two dictionaries, one for the fragment length and the other
+for the read length. The dictionaries summarise the mean, median etc. values
+
+ """
+
+ bam_handle = bamHandler.openBam(bamFile)
+ chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))
+
+ distanceBetweenBins *= 2
+ fl = []
+
+ # Fix issue #522, allow distanceBetweenBins == 0
+ if distanceBetweenBins == 0:
+ imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
+ getFragmentLength_wrapper,
+ chrom_sizes,
+ genomeChunkLength=binSize,
+ blackListFileName=blackListFileName,
+ numberOfProcessors=numberOfProcessors,
+ verbose=verbose)
+ fl = np.concatenate(imap_res)
+
+ # Try to ensure we have at least 1000 regions from which to compute statistics, halving the intra-bin distance as needed
+ while len(fl) < 1000 and distanceBetweenBins > 1:
+ distanceBetweenBins /= 2
+ stepsize = binSize + distanceBetweenBins
+ imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
+ getFragmentLength_wrapper,
+ chrom_sizes,
+ genomeChunkLength=stepsize,
+ blackListFileName=blackListFileName,
+ numberOfProcessors=numberOfProcessors,
+ verbose=verbose)
+
+ fl = np.concatenate(imap_res)
+
+ if len(fl):
+ fragment_length = fl[:, 0]
+ read_length = fl[:, 1]
+ if fragment_length.mean() > 0:
+ fragment_len_dict = {'sample_size': len(fragment_length),
+ 'min': fragment_length.min(),
+ 'qtile25': np.percentile(fragment_length, 25),
+ 'mean': np.mean(fragment_length),
+ 'median': np.median(fragment_length),
+ 'qtile75': np.percentile(fragment_length, 75),
+ 'max': fragment_length.max(),
+ 'std': np.std(fragment_length),
+ 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
+ 'qtile10': np.percentile(fragment_length, 10),
+ 'qtile20': np.percentile(fragment_length, 20),
+ 'qtile30': np.percentile(fragment_length, 30),
+ 'qtile40': np.percentile(fragment_length, 40),
+ 'qtile60': np.percentile(fragment_length, 60),
+ 'qtile70': np.percentile(fragment_length, 70),
+ 'qtile80': np.percentile(fragment_length, 80),
+ 'qtile90': np.percentile(fragment_length, 90),
+ 'qtile99': np.percentile(fragment_length, 99)}
+ else:
+ fragment_len_dict = None
+
+ if return_lengths and fragment_len_dict is not None:
+ fragment_len_dict['lengths'] = fragment_length
+
+ read_len_dict = {'sample_size': len(read_length),
+ 'min': read_length.min(),
+ 'qtile25': np.percentile(read_length, 25),
+ 'mean': np.mean(read_length),
+ 'median': np.median(read_length),
+ 'qtile75': np.percentile(read_length, 75),
+ 'max': read_length.max(),
+ 'std': np.std(read_length),
+ 'mad': np.median(np.abs(read_length - np.median(read_length))),
+ 'qtile10': np.percentile(read_length, 10),
+ 'qtile20': np.percentile(read_length, 20),
+ 'qtile30': np.percentile(read_length, 30),
+ 'qtile40': np.percentile(read_length, 40),
+ 'qtile60': np.percentile(read_length, 60),
+ 'qtile70': np.percentile(read_length, 70),
+ 'qtile80': np.percentile(read_length, 80),
+ 'qtile90': np.percentile(read_length, 90),
+ 'qtile99': np.percentile(read_length, 99)}
+ if return_lengths:
+ read_len_dict['lengths'] = read_length
+ else:
+ fragment_len_dict = None
+ read_len_dict = None
+
+ return fragment_len_dict, read_len_dict
diff --git a/deepTools/source/deeptools/getRatio.py b/deepTools/source/deeptools/getRatio.py
new file mode 100644
index 0000000000000000000000000000000000000000..937cc7c418991e1470e5e79fd944e0aa0ba57596
--- /dev/null
+++ b/deepTools/source/deeptools/getRatio.py
@@ -0,0 +1,82 @@
+import numpy as np
+
+old_settings = np.seterr(all='ignore')
+
+
+def compute_ratio(value1, value2, args):
+ value1 = value1 + args['pseudocount'][0]
+ value2 = value2 + args['pseudocount'][1]
+
+ ratio = float(value1) / value2
+ if args['valueType'] == 'log2':
+ ratio = np.log2(ratio)
+
+ elif args['valueType'] == 'reciprocal_ratio':
+ # the reciprocal ratio of a/b
+ # is a/b if a/b > 1 else -1* b/a
+ ratio = ratio if ratio >= 1 else -1.0 / ratio
+
+ return ratio
+
+
+def getRatio(tileCoverage, args):
+ r"""
+ The mapreduce method calls this function
+ for each tile. The parameters (args) are fixed
+ in the main method.
+
+ >>> funcArgs= {'valueType': 'ratio', 'scaleFactors': (1,1), 'pseudocount': [1, 1]}
+ >>> getRatio([9, 19], funcArgs)
+ 0.5
+ >>> getRatio([0, 0], funcArgs)
+ 1.0
+ >>> getRatio([np.nan, np.nan], funcArgs)
+ nan
+ >>> getRatio([np.nan, 1.0], funcArgs)
+ nan
+ >>> funcArgs['valueType'] ='subtract'
+ >>> getRatio([20, 10], funcArgs)
+ 10
+ >>> funcArgs['scaleFactors'] = (1, 0.5)
+ >>> getRatio([10, 20], funcArgs)
+ 0.0
+
+ The reciprocal ratio is of a and b is:
+ is a/b if a/b > 1 else -1* b/a
+ >>> funcArgs['valueType'] ='reciprocal_ratio'
+ >>> funcArgs['scaleFactors'] = (1, 1)
+ >>> funcArgs['pseudocount'] = [0, 0]
+ >>> getRatio([2, 1], funcArgs)
+ 2.0
+ >>> getRatio([1, 2], funcArgs)
+ -2.0
+ >>> getRatio([1, 1], funcArgs)
+ 1.0
+ """
+
+ value1 = args['scaleFactors'][0] * tileCoverage[0]
+ value2 = args['scaleFactors'][1] * tileCoverage[1]
+
+ # if any of the two values to compare
+ # is nan, return nan
+ if np.isnan(value1) or np.isnan(value2):
+ return np.nan
+
+ # ratio case
+ if args['valueType'] in ['ratio', 'log2', 'reciprocal_ratio']:
+ bin_value = compute_ratio(value1, value2, args)
+
+ # non ratio case (diff, sum etc)
+ else:
+ if args['valueType'] == 'subtract':
+ bin_value = value1 - value2
+ elif args['valueType'] == 'add':
+ bin_value = value1 + value2
+ elif args['valueType'] == 'first':
+ bin_value = value1
+ elif args['valueType'] == 'second':
+ bin_value = value2
+ elif args['valueType'] == 'mean':
+ bin_value = (value1 + value2) / 2.0
+
+ return bin_value
diff --git a/deepTools/source/deeptools/getScaleFactor.py b/deepTools/source/deeptools/getScaleFactor.py
new file mode 100644
index 0000000000000000000000000000000000000000..541b4febdc1801bda431d1dcf98bd77e726132a6
--- /dev/null
+++ b/deepTools/source/deeptools/getScaleFactor.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import deeptools.mapReduce as mapReduce
+from deeptools import bamHandler
+from deeptools import utilities
+import sys
+
+debug = 0
+
+
+def getFractionKept_wrapper(args):
+ return getFractionKept_worker(*args)
+
+
+def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
+ """
+ Queries the BAM file and counts the number of alignments kept/found in the
+ first 50000 bases.
+ """
+ bam = bamHandler.openBam(bamFile)
+ start += offset * 50000
+ end = min(end, start + 50000)
+ tot = 0
+ filtered = 0
+
+ if end <= start:
+ return (filtered, tot)
+
+ prev_pos = set()
+ lpos = None
+ if chrom in bam.references:
+ for read in bam.fetch(chrom, start, end):
+ tot += 1
+ if read.is_unmapped:
+ continue
+
+ if args.minMappingQuality and read.mapq < args.minMappingQuality:
+ filtered += 1
+ continue
+
+ # filter reads based on SAM flag
+ if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
+ filtered += 1
+ continue
+ if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
+ filtered += 1
+ continue
+
+ # fragment length filtering
+ tLen = utilities.getTLen(read)
+ if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
+ filtered += 1
+ continue
+ if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
+ filtered += 1
+ continue
+
+ # get rid of duplicate reads that have same position on each of the
+ # pairs
+ if args.ignoreDuplicates:
+ # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
+ if tLen >= 0:
+ s = read.pos
+ e = s + tLen
+ else:
+ s = read.pnext
+ e = s - tLen
+ if read.reference_id != read.next_reference_id:
+ e = read.pnext
+ if lpos is not None and lpos == read.reference_start \
+ and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
+ filtered += 1
+ continue
+ if lpos != read.reference_start:
+ prev_pos.clear()
+ lpos = read.reference_start
+ prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
+
+ # If filterRNAstrand is in args, then filter accordingly
+ # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
+ if hasattr(args, "filterRNAstrand"):
+ if read.is_paired:
+ if args.filterRNAstrand == 'forward':
+ if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
+ filtered += 1
+ continue
+ elif args.filterRNAstrand == 'reverse':
+ if not (read.flag & 144 == 144 or read.flag & 96 == 96):
+ filtered += 1
+ continue
+ else:
+ if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
+ filtered += 1
+ continue
+ elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
+ filtered += 1
+ continue
+
+ return (filtered, tot)
+
+
+def fraction_kept(args, stats):
+ """
+ Count the following:
+ (A) The total number of alignments sampled
+ (B) The total number of alignments ignored due to any of the following:
+ --samFlagInclude
+ --samFlagExclude
+ --minMappingQuality
+ --ignoreDuplicates
+ --minFragmentLength
+ --maxFragmentLength
+
+ Black list regions are already accounted for. This works by sampling the
+ genome (by default, we'll iterate until we sample 10% or 1,000,000 alignments,
+ whichever is larger (unless there are fewer than 1,000,000 alignments, in
+ which case sample everything).
+
+ The sampling works by dividing the genome into bins and only looking at the
+ first 50000 bases. If this doesn't yield sufficient alignments then the bin
+ size is halved.
+ """
+ # Do we even need to proceed?
+ if (not args.minMappingQuality or args.minMappingQuality == 0) and \
+ (not args.samFlagInclude or args.samFlagInclude == 0) and \
+ (not args.samFlagExclude or args.samFlagExclude == 0) and \
+ (not args.minFragmentLength or args.minFragmentLength == 0) and \
+ (not args.maxFragmentLength or args.maxFragmentLength == 0):
+ if hasattr(args, "filterRNAstrand"):
+ if args.filterRNAstrand not in ["forward", "reverse"]:
+ return 1.0
+ else:
+ return 1.0
+
+ filtered = 0
+ total = 0
+ distanceBetweenBins = 2000000
+ bam_handle = bamHandler.openBam(args.bam)
+ bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
+ if bam_mapped < 1000000:
+ num_needed_to_sample = bam_mapped
+ else:
+ if 0.1 * bam_mapped >= 1000000:
+ num_needed_to_sample = 0.1 * bam_mapped
+ else:
+ num_needed_to_sample = 1000000
+ if args.exactScaling:
+ num_needed_to_sample = bam_mapped
+ if num_needed_to_sample == bam_mapped:
+ distanceBetweenBins = 55000
+ if args.ignoreForNormalization:
+ chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
+ if chrom_name not in args.ignoreForNormalization]
+ else:
+ chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))
+
+ offset = 0
+ # Iterate over bins at various non-overlapping offsets until we have enough data
+ while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
+ res = mapReduce.mapReduce((bam_handle.filename, args, offset),
+ getFractionKept_wrapper,
+ chrom_sizes,
+ genomeChunkLength=distanceBetweenBins,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose)
+
+ if len(res):
+ foo, bar = np.sum(res, axis=0)
+ filtered += foo
+ total += bar
+ offset += 1
+
+ if total == 0:
+ # This should never happen
+ total = 1
+
+ return 1.0 - float(filtered) / float(total)
+
+
+def get_num_kept_reads(args, stats):
+ """
+ Subtracts from the total number of mapped reads in a bamfile
+ the proportion of reads that fall into blacklisted regions
+ or that are filtered
+
+ :return: integer
+ """
+ if stats is None:
+ bam_handle, mapped, unmapped, stats = bamHandler.openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
+ else:
+ bam_handle = bamHandler.openBam(args.bam)
+ bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
+ if args.blackListFileName:
+ blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
+ args.blackListFileName, args.numberOfProcessors)
+ print("There are {0} alignments, of which {1} are completely "
+ "within a blacklist region.".format(bam_mapped_total, blacklisted))
+ num_kept_reads = bam_mapped_total - blacklisted
+ else:
+ num_kept_reads = bam_mapped_total
+ ftk = fraction_kept(args, stats)
+ if ftk < 1:
+ num_kept_reads *= ftk
+ print("Due to filtering, {0}% of the aforementioned alignments "
+ "will be used {1}".format(100 * ftk, num_kept_reads))
+
+ return num_kept_reads, bam_mapped_total
+
+
+def get_scale_factor(args, stats):
+ scale_factor = args.scaleFactor
+ bam_mapped, bam_mapped_total = get_num_kept_reads(args, stats)
+ if args.normalizeUsing == 'RPGC':
+ # Print output, since normalization stuff isn't printed to stderr otherwise
+ sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.effectiveGenomeSize))
+
+ # try to guess fragment length if the bam file contains paired end reads
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
+ frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
+ return_lengths=False,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose)
+ if args.extendReads:
+ if args.extendReads is True:
+ # try to guess fragment length if the bam file contains paired end reads
+ if frag_len_dict:
+ fragment_length = frag_len_dict['median']
+ else:
+ exit("*ERROR*: library is not paired-end. Please provide an extension length.")
+ if args.verbose:
+ print(("Fragment length based on paired-end data "
+ "estimated to be {}".format(frag_len_dict['median'])))
+
+ elif args.extendReads < 1:
+ exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
+ elif args.extendReads > 2000:
+ exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
+ else:
+ fragment_length = args.extendReads
+
+ else:
+ # set as fragment length the read length
+ fragment_length = int(read_len_dict['median'])
+ if args.verbose:
+ print("Estimated read length is {}".format(int(read_len_dict['median'])))
+
+ current_coverage = \
+ float(bam_mapped * fragment_length) / args.effectiveGenomeSize
+ # the scaling sets the coverage to match 1x
+ scale_factor *= 1.0 / current_coverage
+ if debug:
+ print("Estimated current coverage {}".format(current_coverage))
+ print("Scaling factor {}".format(args.scaleFactor))
+
+ elif args.normalizeUsing == 'RPKM':
+ # Print output, since normalization stuff isn't printed to stderr otherwise
+ sys.stderr.write("normalization: RPKM\n")
+
+ # the RPKM is the # reads per tile / \
+ # ( total reads (in millions) * tile length in Kb)
+ million_reads_mapped = float(bam_mapped) / 1e6
+ tile_len_in_kb = float(args.binSize) / 1000
+
+ scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)
+
+ if debug:
+ print("scale factor using RPKM is {0}".format(args.scaleFactor))
+
+ elif args.normalizeUsing == 'CPM':
+ # Print output, since normalization stuff isn't printed to stderr otherwise
+ sys.stderr.write("normalization: CPM\n")
+
+ # the CPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
+ million_reads_mapped = float(bam_mapped) / 1e6
+ scale_factor *= 1.0 / (million_reads_mapped)
+
+ if debug:
+ print("scale factor using CPM is {0}".format(args.scaleFactor))
+
+ elif args.normalizeUsing == 'BPM':
+ # Print output, since normalzation stuff isn't printed to stderr otherwise
+ sys.stderr.write("normalization: BPM\n")
+ # the BPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
+ # sampled_bins_sum = getSampledSum(args.bam)
+ tile_len_in_kb = float(args.binSize) / 1000
+ tpm_scaleFactor = (bam_mapped / tile_len_in_kb) / 1e6
+
+ scale_factor *= 1 / (tpm_scaleFactor * tile_len_in_kb)
+ if debug:
+ print("scale factor using BPM is {0}".format(args.scaleFactor))
+
+ else:
+ # Print output, since normalization stuff isn't printed to stderr otherwise
+ sys.stderr.write("normalization: none (signal scaled by the fraction of alignments kept after filtering)\n")
+
+ scale_factor *= bam_mapped / float(bam_mapped_total)
+
+ if args.verbose:
+ print("Final scaling factor: {}".format(scale_factor))
+
+ return scale_factor
diff --git a/deepTools/source/deeptools/getScorePerBigWigBin.py b/deepTools/source/deeptools/getScorePerBigWigBin.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f0ff45eae05f2f4d05fa58ef93af80be64bd450
--- /dev/null
+++ b/deepTools/source/deeptools/getScorePerBigWigBin.py
@@ -0,0 +1,322 @@
+import pyBigWig
+import numpy as np
+import os
+import sys
+import shutil
+import warnings
+
+# deepTools packages
+import deeptools.mapReduce as mapReduce
+import deeptools.utilities
+# debug = 0
+
+old_settings = np.seterr(all='ignore')
+
+
+def countReadsInRegions_wrapper(args):
+ # Using arguments unpacking!
+ return countFragmentsInRegions_worker(*args)
+
+
+def countFragmentsInRegions_worker(chrom, start, end,
+ bigWigFiles,
+ stepSize, binLength,
+ save_data,
+ bedRegions=None
+ ):
+ """ returns the average score in each bigwig file at each 'stepSize'
+ position within the interval start, end for a 'binLength' window.
+ Because the idea is to get counts for window positions at
+ different positions for sampling the bins are equally spaced
+ and *not adjacent*.
+
+ If a list of bedRegions is given, then the number of reads
+ that overlaps with each region is counted.
+
+ Test dataset with two samples covering 200 bp.
+ >>> test = Tester()
+
+ Fragment coverage.
+ >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
+ array([[1., 1., 2., 2.],
+ [1., 1., 1., 3.]])
+
+ >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
+ array([[1.5],
+ [1.5]])
+
+ BED regions:
+ >>> bedRegions = [[test.chrom, [(45, 55)]], [test.chrom, [(95, 105)]], [test.chrom, [(145, 155)]]]
+ >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False,
+ ... bedRegions=bedRegions)[0])
+ array([[1. , 1.5, 2. ],
+ [1. , 1. , 2. ]])
+ """
+ assert start < end, "start {} bigger than end {}".format(start, end)
+
+ # array to keep the scores for the regions
+ sub_score_per_bin = []
+
+ rows = 0
+
+ bigwig_handles = []
+ for foo in bigWigFiles:
+ bigwig_handles.append(pyBigWig.open(foo))
+
+ regions_to_consider = []
+ if bedRegions:
+ for reg in bedRegions:
+ regs = []
+ for exon in reg[1]:
+ regs.append((exon[0], exon[1]))
+ regions_to_consider.append(regs)
+ else:
+ for i in range(start, end, stepSize):
+ if (i + binLength) > end:
+ regions_to_consider.append([(i, end)]) # last bin (may be smaller)
+ else:
+ regions_to_consider.append([(i, i + binLength)])
+
+ if save_data:
+ _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
+ _file_name = _file.name
+ else:
+ _file_name = ''
+ warnings.simplefilter("default")
+ i = 0
+ for reg in regions_to_consider:
+ avgReadsArray = []
+ i += 1
+
+ for idx, bwh in enumerate(bigwig_handles):
+ if chrom not in bwh.chroms():
+ unmod_name = chrom
+ if chrom.startswith('chr'):
+ # remove the chr part from chromosome name
+ chrom = chrom[3:]
+ else:
+ # prefix with 'chr' the chromosome name
+ chrom = 'chr' + chrom
+ if chrom not in bwh.chroms():
+ exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))
+
+ weights = []
+ scores = []
+ for exon in reg:
+ weights.append(exon[1] - exon[0])
+ score = bwh.stats(chrom, exon[0], exon[1])
+
+ if score is None or score == [None] or np.isnan(score[0]):
+ score = [np.nan]
+ scores.extend(score)
+ avgReadsArray.append(np.average(scores, weights=weights)) # mean of fragment coverage for region
+
+ sub_score_per_bin.extend(avgReadsArray)
+ rows += 1
+ if save_data:
+ starts = []
+ ends = []
+ for exon in reg:
+ starts.append(str(exon[0]))
+ ends.append(str(exon[1]))
+ starts = ",".join(starts)
+ ends = ",".join(ends)
+ _file.write("\t".join(map(str, [chrom, starts, ends])) + "\t")
+ _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")
+
+ if save_data:
+ _file.close()
+ warnings.resetwarnings()
+
+ # the output is a matrix having as many rows as the variable 'row'
+ # and as many columns as bigwig files. The rows correspond to
+ # each of the regions processed by the worker.
+ # np.array([[score1_1, score1_2],
+ # [score2_1, score2_2]]
+ return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
+
+
+def getChromSizes(bigwigFilesList):
+ """
+ Get chromosome sizes from bigWig file with pyBigWig
+
+ Test dataset with two samples covering 200 bp.
+ >>> test = Tester()
+
+ Chromosome name(s) and size(s).
+ >>> assert getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([]))
+ """
+ def print_chr_names_and_size(chr_set):
+ sys.stderr.write("chromosome\tlength\n")
+ for name, size in chr_set:
+ sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))
+
+ bigwigFilesList = bigwigFilesList[:]
+
+ common_chr = set()
+ for fname in bigwigFilesList:
+ fh = pyBigWig.open(fname)
+ common_chr = common_chr.union(set(fh.chroms().items()))
+ fh.close()
+
+ non_common_chr = set()
+ for bw in bigwigFilesList:
+ _names_and_size = set(pyBigWig.open(bw).chroms().items())
+ if len(common_chr & _names_and_size) == 0:
+ # try to add or remove 'chr' from the chromosome name
+ _corr_names_size = set()
+ for chrom_name, size in _names_and_size:
+ if chrom_name.startswith('chr'):
+ _corr_names_size.add((chrom_name[3:], size))
+ else:
+ _corr_names_size.add(('chr' + chrom_name, size))
+ if len(common_chr & _corr_names_size) == 0:
+ message = "No common chromosomes found. Are the bigwig files " \
+ "from the same species and same assemblies?\n"
+ sys.stderr.write(message)
+ print_chr_names_and_size(common_chr)
+
+ sys.stderr.write("\nand the following is the list of the unmatched chromosome names and chromosome\n"
+ "lengths from file\n{}\n".format(bw))
+ print_chr_names_and_size(_names_and_size)
+ exit(1)
+ else:
+ _names_and_size = _corr_names_size
+
+ non_common_chr |= common_chr ^ _names_and_size
+ common_chr = common_chr & _names_and_size
+
+ if len(non_common_chr) > 0:
+ sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
+ print_chr_names_and_size(non_common_chr)
+
+ # get the list of common chromosome names and sizes
+ return sorted(common_chr), non_common_chr
+
+
+def getScorePerBin(bigWigFiles, binLength,
+ numberOfProcessors=1,
+ verbose=False, region=None,
+ bedFile=None,
+ blackListFileName=None,
+ stepSize=None,
+ chrsToSkip=[],
+ out_file_for_raw_data=None,
+ allArgs=None):
+ """
+ This function returns a matrix containing scores (median) for the coverage
+ of fragments within a region. Each row corresponds to a sampled region.
+ Likewise, each column corresponds to a bigwig file.
+
+ Test dataset with two samples covering 200 bp.
+ >>> test = Tester()
+ >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
+ array([[1., 1., 2., 2.],
+ [1., 1., 1., 3.]])
+
+ """
+ # Try to determine an optimal fraction of the genome (chunkSize)
+ # that is sent to workers for analysis. If too short, too much time
+ # is spent loading the files
+ # if too long, some processors end up free.
+ # the following is a heuristic
+
+ # get list of common chromosome names and sizes
+ chrom_sizes, non_common = getChromSizes(bigWigFiles)
+ # skip chromosome in the list. This is usually for the
+ # X chromosome which may have either one copy in a male sample
+ # or a mixture of male/female and is unreliable.
+ # Also the skip may contain heterochromatic regions and
+ # mitochondrial DNA
+ if chrsToSkip and len(chrsToSkip):
+ chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]
+
+ chrnames, chrlengths = list(zip(*chrom_sizes))
+ if stepSize is None:
+ stepSize = binLength # for adjacent bins
+
+ # set chunksize based on number of processors used
+ chunkSize = max(sum(chrlengths) / numberOfProcessors, int(1e6))
+ # make chunkSize multiple of binLength
+ chunkSize -= chunkSize % binLength
+ if verbose:
+ print("step size is {}".format(stepSize))
+
+ if region:
+ # in case a region is used, append the tilesize
+ region += ":{}".format(binLength)
+ # mapReduce( (staticArgs), func, chromSize, etc. )
+ if out_file_for_raw_data:
+ save_file = True
+ else:
+ save_file = False
+
+ # Handle GTF options
+ transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)
+
+ imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
+ countReadsInRegions_wrapper,
+ chrom_sizes,
+ genomeChunkLength=chunkSize,
+ bedFile=bedFile,
+ blackListFileName=blackListFileName,
+ region=region,
+ numberOfProcessors=numberOfProcessors,
+ transcriptID=transcriptID,
+ exonID=exonID,
+ keepExons=keepExons,
+ transcript_id_designator=transcript_id_designator)
+
+ if out_file_for_raw_data:
+ if len(non_common):
+ sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
+ "the chromosomes that were not common between the bigwig files\n")
+
+ # concatenate intermediary bedgraph files
+ ofile = open(out_file_for_raw_data, "w")
+ for _values, tempFileName in imap_res:
+ if tempFileName:
+ # concatenate all intermediate tempfiles into one
+ f = open(tempFileName, 'r')
+ shutil.copyfileobj(f, ofile)
+ f.close()
+ os.remove(tempFileName)
+
+ ofile.close()
+
+ # the matrix scores are in the first element of each of the entries in imap_res
+ score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
+ return score_per_bin
+
+
+class Tester(object):
+
+ def __init__(self):
+ """
+ The two bigWig files are as follows:
+ $ cat /tmp/testA.bg
+ 3R 0 100 1
+ 3R 100 200 2
+
+ $ cat /tmp/testB.bg
+ 3R 0 150 1
+ 3R 150 200 3
+
+ They cover 200 bp:
+
+ 0 50 100 150 200
+ |------------------------------------------------------------|
+ A 111111111111111111111111111111122222222222222222222222222222
+
+
+ B 111111111111111111111111111111111111111111111333333333333333
+
+ """
+
+ self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
+ self.bwFile1 = self.root + "testA.bw"
+ self.bwFile2 = self.root + "testB.bw"
+ self.bwFile_PE = self.root + "test_paired2.bw"
+ self.chrom = '3R'
+ # global debug
+ # debug = 0
diff --git a/deepTools/source/deeptools/heatmapper.py b/deepTools/source/deeptools/heatmapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f86b85e234db0d99b32c582efa24a0f1637e34e3
--- /dev/null
+++ b/deepTools/source/deeptools/heatmapper.py
@@ -0,0 +1,1372 @@
+import sys
+import gzip
+from collections import OrderedDict
+import numpy as np
+from copy import deepcopy
+
+import pyBigWig
+from deeptools import getScorePerBigWigBin
+from deeptools import mapReduce
+from deeptools.utilities import toString, toBytes, smartLabels
+from deeptools.heatmapper_utilities import getProfileTicks
+
+
+old_settings = np.seterr(all='ignore')
+
+
+def chopRegions(exonsInput, left=0, right=0):
+ """
+ exons is a list of (start, end) tuples. The goal is to chop these into
+ separate lists of tuples, to take care or unscaled regions. "left" and
+ "right" denote regions of a given size to exclude from the normal binning
+ process (unscaled regions).
+
+ This outputs three lists of (start, end) tuples:
+
+ leftBins: 5' unscaled regions
+ bodyBins: body bins for scaling
+ rightBins: 3' unscaled regions
+
+ In addition are two integers
+ padLeft: Number of bases of padding on the left (due to not being able to fulfill "left")
+ padRight: As above, but on the right side
+ """
+ leftBins = []
+ rightBins = []
+ padLeft = 0
+ padRight = 0
+ exons = deepcopy(exonsInput)
+ while len(exons) > 0 and left > 0:
+ width = exons[0][1] - exons[0][0]
+ if width <= left:
+ leftBins.append(exons[0])
+ del exons[0]
+ left -= width
+ else:
+ leftBins.append((exons[0][0], exons[0][0] + left))
+ exons[0] = (exons[0][0] + left, exons[0][1])
+ left = 0
+ if left > 0:
+ padLeft = left
+
+ while len(exons) > 0 and right > 0:
+ width = exons[-1][1] - exons[-1][0]
+ if width <= right:
+ rightBins.append(exons[-1])
+ del exons[-1]
+ right -= width
+ else:
+ rightBins.append((exons[-1][1] - right, exons[-1][1]))
+ exons[-1] = (exons[-1][0], exons[-1][1] - right)
+ right = 0
+ if right > 0:
+ padRight = right
+
+ return leftBins, exons, rightBins[::-1], padLeft, padRight
+
+
+def chopRegionsFromMiddle(exonsInput, left=0, right=0):
+ """
+ Like chopRegions(), above, but returns two lists of tuples on each side of
+ the center point of the exons.
+
+    The steps are as follows:
+
+ 1) Find the center point of the set of exons (e.g., [(0, 200), (300, 400), (800, 900)] would be centered at 200)
+ * If a given exon spans the center point then the exon is split
+ 2) The given number of bases at the end of the left-of-center list are extracted
+ * If the set of exons don't contain enough bases, then padLeft is incremented accordingly
+ 3) As above but for the right-of-center list
+    4) A tuple of (#2, #3, padding on the left, and padding on the right) is returned
+ """
+ leftBins = []
+ rightBins = []
+ size = sum([x[1] - x[0] for x in exonsInput])
+ middle = size // 2
+ cumulativeSum = 0
+ padLeft = 0
+ padRight = 0
+ exons = deepcopy(exonsInput)
+
+ # Split exons in half
+ for exon in exons:
+ size = exon[1] - exon[0]
+ if cumulativeSum >= middle:
+ rightBins.append(exon)
+ elif cumulativeSum + size < middle:
+ leftBins.append(exon)
+ else:
+ # Don't add 0-width exonic bins!
+ if exon[0] < exon[1] - cumulativeSum - size + middle:
+ leftBins.append((exon[0], exon[1] - cumulativeSum - size + middle))
+ if exon[1] - cumulativeSum - size + middle < exon[1]:
+ rightBins.append((exon[1] - cumulativeSum - size + middle, exon[1]))
+ cumulativeSum += size
+
+ # Trim leftBins/adjust padLeft
+ lSum = sum([x[1] - x[0] for x in leftBins])
+ if lSum > left:
+ lSum = 0
+ for i, exon in enumerate(leftBins[::-1]):
+ size = exon[1] - exon[0]
+ if lSum + size > left:
+ leftBins[-i - 1] = (exon[1] + lSum - left, exon[1])
+ break
+ lSum += size
+ if lSum == left:
+ break
+ i += 1
+ if i < len(leftBins):
+ leftBins = leftBins[-i:]
+ elif lSum < left:
+ padLeft = left - lSum
+
+ # Trim rightBins/adjust padRight
+ rSum = sum([x[1] - x[0] for x in rightBins])
+ if rSum > right:
+ rSum = 0
+ for i, exon in enumerate(rightBins):
+ size = exon[1] - exon[0]
+ if rSum + size > right:
+ rightBins[i] = (exon[0], exon[1] - rSum - size + right)
+ break
+ rSum += size
+ if rSum == right:
+ break
+ rightBins = rightBins[:i + 1]
+ elif rSum < right:
+ padRight = right - rSum
+
+ return leftBins, rightBins, padLeft, padRight
+
+
+def trimZones(zones, maxLength, binSize, padRight):
+ """
+    Given a (variable length) list of lists of (start, end) tuples, trim/remove any tuple that extends past maxLength (e.g., the end of a chromosome)
+
+ Returns the trimmed zones and padding
+ """
+ output = []
+ for zone, nbins in zones:
+ outZone = []
+ changed = False
+ for reg in zone:
+ if reg[0] >= maxLength:
+ changed = True
+ padRight += reg[1] - reg[0]
+ continue
+
+ if reg[1] > maxLength:
+ changed = True
+ padRight += reg[1] - maxLength
+ reg = (reg[0], maxLength)
+ if reg[1] > reg[0]:
+ outZone.append(reg)
+ if changed:
+ nBins = sum(x[1] - x[0] for x in outZone) // binSize
+ else:
+ nBins = nbins
+ output.append((outZone, nBins))
+ return output, padRight
+
+
+def compute_sub_matrix_wrapper(args):
+ return heatmapper.compute_sub_matrix_worker(*args)
+
+
+class heatmapper(object):
+ """
+ Class to handle the reading and
+ plotting of matrices.
+ """
+
+ def __init__(self):
+ self.parameters = None
+ self.lengthDict = None
+ self.matrix = None
+ self.regions = None
+ self.blackList = None
+ self.quiet = True
+ # These are parameters that were single values in versions <3 but are now internally lists. See issue #614
+ self.special_params = set(['unscaled 5 prime', 'unscaled 3 prime', 'body', 'downstream', 'upstream', 'ref point', 'bin size'])
+
+ def getTicks(self, idx):
+ """
+        This is essentially a wrapper around getProfileTicks to accommodate the fact that each column has its own ticks.
+ """
+ xticks, xtickslabel = getProfileTicks(self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx)
+ return xticks, xtickslabel
+
+ def computeMatrix(self, score_file_list, regions_file, parameters, blackListFileName=None, verbose=False, allArgs=None):
+ """
+ Splits into
+ multiple cores the computation of the scores
+ per bin for each region (defined by a hash '#'
+ in the regions (BED/GFF) file.
+ """
+ if parameters['body'] > 0 and \
+ parameters['body'] % parameters['bin size'] > 0:
+ exit("The --regionBodyLength has to be "
+ "a multiple of --binSize.\nCurrently the "
+ "values are {} {} for\nregionsBodyLength and "
+ "binSize respectively\n".format(parameters['body'],
+ parameters['bin size']))
+
+ # the beforeRegionStartLength is extended such that
+ # length is a multiple of binSize
+ if parameters['downstream'] % parameters['bin size'] > 0:
+ exit("Length of region after the body has to be "
+ "a multiple of --binSize.\nCurrent value "
+ "is {}\n".format(parameters['downstream']))
+
+ if parameters['upstream'] % parameters['bin size'] > 0:
+ exit("Length of region before the body has to be a multiple of "
+ "--binSize\nCurrent value is {}\n".format(parameters['upstream']))
+
+ if parameters['unscaled 5 prime'] % parameters['bin size'] > 0:
+ exit("Length of the unscaled 5 prime region has to be a multiple of "
+ "--binSize\nCurrent value is {}\n".format(parameters['unscaled 5 prime']))
+
+ if parameters['unscaled 3 prime'] % parameters['bin size'] > 0:
+            exit("Length of the unscaled 3 prime region has to be a multiple of "
+ "--binSize\nCurrent value is {}\n".format(parameters['unscaled 3 prime']))
+
+ if parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] > 0 and parameters['body'] == 0:
+ exit('Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n')
+
+ # Take care of GTF options
+ transcriptID = "transcript"
+ exonID = "exon"
+ transcript_id_designator = "transcript_id"
+ keepExons = False
+ self.quiet = False
+ if allArgs is not None:
+ allArgs = vars(allArgs)
+ transcriptID = allArgs.get("transcriptID", transcriptID)
+ exonID = allArgs.get("exonID", exonID)
+ transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator)
+ keepExons = allArgs.get("keepExons", keepExons)
+ self.quiet = allArgs.get("quiet", self.quiet)
+
+ chromSizes, _ = getScorePerBigWigBin.getChromSizes(score_file_list)
+ res, labels = mapReduce.mapReduce([score_file_list, parameters],
+ compute_sub_matrix_wrapper,
+ chromSizes,
+ self_=self,
+ bedFile=regions_file,
+ blackListFileName=blackListFileName,
+ numberOfProcessors=parameters['proc number'],
+ includeLabels=True,
+ transcriptID=transcriptID,
+ exonID=exonID,
+ transcript_id_designator=transcript_id_designator,
+ keepExons=keepExons,
+ verbose=verbose)
+ # each worker in the pool returns a tuple containing
+ # the submatrix data, the regions that correspond to the
+ # submatrix, and the number of regions lacking scores
+ # Since this is largely unsorted, we need to sort by group
+
+ # merge all the submatrices into matrix
+ matrix = np.concatenate([r[0] for r in res], axis=0)
+ regions = []
+ regions_no_score = 0
+ for idx in range(len(res)):
+ if len(res[idx][1]):
+ regions.extend(res[idx][1])
+ regions_no_score += res[idx][2]
+ groups = [x[3] for x in regions]
+ foo = sorted(zip(groups, list(range(len(regions))), regions))
+ sortIdx = [x[1] for x in foo]
+ regions = [x[2] for x in foo]
+ matrix = matrix[sortIdx]
+
+ # mask invalid (nan) values
+ matrix = np.ma.masked_invalid(matrix)
+
+ assert matrix.shape[0] == len(regions), \
+ "matrix length does not match regions length"
+
+ if len(regions) == 0:
+ sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n")
+ exit(1)
+ if regions_no_score == len(regions):
+ exit("\nERROR: None of the BED regions could be found in the bigWig"
+ "file.\nPlease check that the bigwig file is valid and "
+ "that the chromosome names between the BED file and "
+ "the bigWig file correspond to each other\n")
+
+ if regions_no_score > len(regions) * 0.75:
+ file_type = 'bigwig' if score_file_list[0].endswith(".bw") else "BAM"
+ prcnt = 100 * float(regions_no_score) / len(regions)
+ sys.stderr.write(
+ "\n\nWarning: {0:.2f}% of regions are *not* associated\n"
+ "to any score in the given {1} file. Check that the\n"
+ "chromosome names from the BED file are consistent with\n"
+ "the chromosome names in the given {2} file and that both\n"
+ "files refer to the same species\n\n".format(prcnt,
+ file_type,
+ file_type))
+
+ self.parameters = parameters
+
+ numcols = matrix.shape[1]
+ num_ind_cols = self.get_num_individual_matrix_cols()
+ sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols))
+ if allArgs is not None and allArgs['samplesLabel'] is not None:
+ sample_labels = allArgs['samplesLabel']
+ else:
+ sample_labels = smartLabels(score_file_list)
+
+ # Determine the group boundaries
+ group_boundaries = []
+ group_labels_filtered = []
+ last_idx = -1
+ for x in range(len(regions)):
+ if regions[x][3] != last_idx:
+ last_idx = regions[x][3]
+ group_boundaries.append(x)
+ group_labels_filtered.append(labels[last_idx])
+ group_boundaries.append(len(regions))
+
+ # check if a given group is too small. Groups that
+ # are too small can't be plotted and an exception is thrown.
+ group_len = np.diff(group_boundaries)
+ if len(group_len) > 1:
+ sum_len = sum(group_len)
+ group_frac = [float(x) / sum_len for x in group_len]
+ if min(group_frac) <= 0.002:
+ sys.stderr.write(
+ "One of the groups defined in the bed file is "
+ "too small.\nGroups that are too small can't be plotted. "
+ "\n")
+
+ self.matrix = _matrix(regions, matrix,
+ group_boundaries,
+ sample_boundaries,
+ group_labels_filtered,
+ sample_labels)
+
+ if parameters['skip zeros']:
+ self.matrix.removeempty()
+
+ @staticmethod
+ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, parameters, regions):
+ """
+ Returns
+ -------
+ numpy matrix
+ A numpy matrix that contains per each row the values found per each of the regions given
+ """
+ if parameters['verbose']:
+ sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
+
+ # read BAM or scores file
+ score_file_handles = []
+ for sc_file in score_file_list:
+ score_file_handles.append(pyBigWig.open(sc_file))
+
+ # determine the number of matrix columns based on the lengths
+ # given by the user, times the number of score files
+ matrix_cols = len(score_file_list) * \
+ ((parameters['downstream'] +
+ parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] +
+ parameters['upstream'] + parameters['body']) //
+ parameters['bin size'])
+
+ # create an empty matrix to store the values
+ sub_matrix = np.zeros((len(regions), matrix_cols))
+ sub_matrix[:] = np.nan
+
+ j = 0
+ sub_regions = []
+ regions_no_score = 0
+ for transcript in regions:
+ feature_chrom = transcript[0]
+ exons = transcript[1]
+ feature_start = exons[0][0]
+ feature_end = exons[-1][1]
+ feature_name = transcript[2]
+ feature_strand = transcript[4]
+ padLeft = 0
+ padRight = 0
+ padLeftNaN = 0
+ padRightNaN = 0
+ upstream = []
+ downstream = []
+
+ # get the body length
+ body_length = np.sum([x[1] - x[0] for x in exons]) - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']
+
+ # print some information
+ if parameters['body'] > 0 and \
+ body_length < parameters['bin size']:
+ if not self.quiet:
+ sys.stderr.write("A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: "
+ "({0}) {1} {2}:{3}:{4}. Skipping...\n".format((body_length - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']),
+ feature_name, feature_chrom,
+ feature_start, feature_end))
+ coverage = np.zeros(matrix_cols)
+ if not parameters['missing data as zero']:
+ coverage[:] = np.nan
+ else:
+ if feature_strand == '-':
+ if parameters['downstream'] > 0:
+ upstream = [(feature_start - parameters['downstream'], feature_start)]
+ if parameters['upstream'] > 0:
+ downstream = [(feature_end, feature_end + parameters['upstream'])]
+ unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 3 prime'], right=parameters['unscaled 5 prime'])
+ # bins per zone
+ a = parameters['downstream'] // parameters['bin size']
+ b = parameters['unscaled 3 prime'] // parameters['bin size']
+ d = parameters['unscaled 5 prime'] // parameters['bin size']
+ e = parameters['upstream'] // parameters['bin size']
+ else:
+ if parameters['upstream'] > 0:
+ upstream = [(feature_start - parameters['upstream'], feature_start)]
+ if parameters['downstream'] > 0:
+ downstream = [(feature_end, feature_end + parameters['downstream'])]
+ unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 5 prime'], right=parameters['unscaled 3 prime'])
+ a = parameters['upstream'] // parameters['bin size']
+ b = parameters['unscaled 5 prime'] // parameters['bin size']
+ d = parameters['unscaled 3 prime'] // parameters['bin size']
+ e = parameters['downstream'] // parameters['bin size']
+ c = parameters['body'] // parameters['bin size']
+
+ # build zones (each is a list of tuples)
+ # zone0: region before the region start,
+ # zone1: unscaled 5 prime region
+ # zone2: the body of the region
+ # zone3: unscaled 3 prime region
+ # zone4: the region from the end of the region downstream
+ # the format for each zone is: [(start, end), ...], number of bins
+ # Note that for "reference-point", upstream/downstream will go
+ # through the exons (if requested) and then possibly continue
+ # on the other side (unless parameters['nan after end'] is true)
+ if parameters['body'] > 0:
+ zones = [(upstream, a), (unscaled5prime, b), (body, c), (unscaled3prime, d), (downstream, e)]
+ elif parameters['ref point'] == 'TES': # around TES
+ if feature_strand == '-':
+ downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['upstream'])
+ if padRight > 0 and parameters['nan after end'] is True:
+ padRightNaN += padRight
+ elif padRight > 0:
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
+ padRight = 0
+ else:
+ unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['upstream'])
+ if padLeft > 0 and parameters['nan after end'] is True:
+ padLeftNaN += padLeft
+ elif padLeft > 0:
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
+ padLeft = 0
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
+ zones = [(upstream, a), (downstream, e)]
+ elif parameters['ref point'] == 'center': # at the region center
+ if feature_strand == '-':
+ upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['downstream'], right=parameters['upstream'])
+ else:
+ upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['upstream'], right=parameters['downstream'])
+ if padLeft > 0 and parameters['nan after end'] is True:
+ padLeftNaN += padLeft
+ elif padLeft > 0:
+ if len(upstream) > 0:
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
+ else:
+ upstream = [(downstream[0][0] - padLeft, downstream[0][0])]
+ padLeft = 0
+ if padRight > 0 and parameters['nan after end'] is True:
+ padRightNaN += padRight
+ elif padRight > 0:
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
+ padRight = 0
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
+ # It's possible for a/e to be floats or 0 yet upstream/downstream isn't empty
+ if a < 1:
+ upstream = []
+ a = 0
+ if e < 1:
+ downstream = []
+ e = 0
+ zones = [(upstream, a), (downstream, e)]
+ else: # around TSS
+ if feature_strand == '-':
+ unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['downstream'])
+ if padLeft > 0 and parameters['nan after end'] is True:
+ padLeftNaN += padLeft
+ elif padLeft > 0:
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
+ padLeft = 0
+ else:
+ downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['downstream'])
+ if padRight > 0 and parameters['nan after end'] is True:
+ padRightNaN += padRight
+ elif padRight > 0:
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
+ padRight = 0
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
+ zones = [(upstream, a), (downstream, e)]
+
+ foo = parameters['upstream']
+ bar = parameters['downstream']
+ if feature_strand == '-':
+ foo, bar = bar, foo
+ if padLeftNaN > 0:
+ expected = foo // parameters['bin size']
+ padLeftNaN = int(round(float(padLeftNaN) / parameters['bin size']))
+ if expected - padLeftNaN - a > 0:
+ padLeftNaN += 1
+ if padRightNaN > 0:
+ expected = bar // parameters['bin size']
+ padRightNaN = int(round(float(padRightNaN) / parameters['bin size']))
+ if expected - padRightNaN - e > 0:
+ padRightNaN += 1
+
+ coverage = []
+ # compute the values for each of the files being processed.
+ # "cov" is a numpy array of bins
+ for sc_handler in score_file_handles:
+ # We're only supporting bigWig files at this point
+ cov = heatmapper.coverage_from_big_wig(
+ sc_handler, feature_chrom, zones,
+ parameters['bin size'],
+ parameters['bin avg type'],
+ parameters['missing data as zero'],
+ not self.quiet)
+
+ if padLeftNaN > 0:
+ cov = np.concatenate([[np.nan] * padLeftNaN, cov])
+ if padRightNaN > 0:
+ cov = np.concatenate([cov, [np.nan] * padRightNaN])
+
+ if feature_strand == "-":
+ cov = cov[::-1]
+
+ coverage = np.hstack([coverage, cov])
+
+ if coverage is None:
+ regions_no_score += 1
+ if not self.quiet:
+ sys.stderr.write(
+ "No data was found for region "
+ "{0} {1}:{2}-{3}. Skipping...\n".format(
+ feature_name, feature_chrom,
+ feature_start, feature_end))
+
+ coverage = np.zeros(matrix_cols)
+ if not parameters['missing data as zero']:
+ coverage[:] = np.nan
+
+ try:
+ temp = coverage.copy()
+ temp[np.isnan(temp)] = 0
+ except:
+ if not self.quiet:
+ sys.stderr.write(
+ "No scores defined for region "
+ "{0} {1}:{2}-{3}. Skipping...\n".format(feature_name,
+ feature_chrom,
+ feature_start,
+ feature_end))
+ coverage = np.zeros(matrix_cols)
+ if not parameters['missing data as zero']:
+ coverage[:] = np.nan
+
+ if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']:
+ continue
+ if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']:
+ continue
+ if parameters['scale'] != 1:
+ coverage = parameters['scale'] * coverage
+
+ sub_matrix[j, :] = coverage
+
+ sub_regions.append(transcript)
+ j += 1
+
+ # remove empty rows
+ sub_matrix = sub_matrix[0:j, :]
+ if len(sub_regions) != len(sub_matrix[:, 0]):
+ sys.stderr.write("regions lengths do not match\n")
+ return sub_matrix, sub_regions, regions_no_score
+
+ @staticmethod
+ def coverage_from_array(valuesArray, zones, binSize, avgType):
+ try:
+ valuesArray[0]
+ except (IndexError, TypeError) as detail:
+ sys.stderr.write("{0}\nvalues array value: {1}, zones {2}\n".format(detail, valuesArray, zones))
+
+ cvglist = []
+ zoneEnd = 0
+ valStart = 0
+ valEnd = 0
+ for zone, nBins in zones:
+ if nBins:
+ # linspace is used to more or less evenly partition the data points into the given number of bins
+ zoneEnd += nBins
+ valStart = valEnd
+ valEnd += np.sum([x[1] - x[0] for x in zone])
+ counts_list = []
+
+ # Partition the space into bins
+ if nBins == 1:
+ pos_array = np.array([valStart])
+ else:
+ pos_array = np.linspace(valStart, valEnd, nBins, endpoint=False, dtype=int)
+ pos_array = np.append(pos_array, valEnd)
+
+ idx = 0
+ while idx < nBins:
+ idxStart = int(pos_array[idx])
+ idxEnd = max(int(pos_array[idx + 1]), idxStart + 1)
+ try:
+ counts_list.append(heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType))
+ except Exception as detail:
+ sys.stderr.write("Exception found: {0}\n".format(detail))
+ idx += 1
+ cvglist.append(np.array(counts_list))
+
+ return np.concatenate(cvglist)
+
+ @staticmethod
+ def change_chrom_names(chrom):
+ """
+ Changes UCSC chromosome names to ensembl chromosome names
+ and vice versa.
+ """
+ if chrom.startswith('chr'):
+ # remove the chr part from chromosome name
+ chrom = chrom[3:]
+ if chrom == "M":
+ chrom = "MT"
+ else:
+ # prefix with 'chr' the chromosome name
+ chrom = 'chr' + chrom
+ if chrom == "chrMT":
+ chrom = "chrM"
+
+ return chrom
+
+ @staticmethod
+ def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True):
+
+ """
+ uses pyBigWig
+ to query a region define by chrom and zones.
+ The output is an array that contains the bigwig
+ value per base pair. The summary over bins is
+ done in a later step when coverage_from_array is called.
+        This method is more reliable than querying the bins
+        directly from the bigwig, even though the latter would be more efficient.
+
+ By default, any region, even if no chromosome match is found
+ on the bigwig file, produces a result. In other words
+ no regions are skipped.
+
+ zones: array as follows zone0: region before the region start,
+ zone1: 5' unscaled region (if present)
+ zone2: the body of the region (not always present)
+ zone3: 3' unscaled region (if present)
+ zone4: the region from the end of the region downstream
+
+        each zone is a tuple of a list of (start, end) tuples and the number of bins
+
+
+        This is useful if several matrices need to be merged
+ or if the sorted BED output of one computeMatrix operation
+ needs to be used for other cases
+ """
+ nVals = 0
+ for zone, _ in zones:
+ for region in zone:
+ nVals += region[1] - region[0]
+
+ values_array = np.zeros(nVals)
+ if not nansAsZeros:
+ values_array[:] = np.nan
+ if chrom not in list(bigwig.chroms().keys()):
+ unmod_name = chrom
+ chrom = heatmapper.change_chrom_names(chrom)
+ if chrom not in list(bigwig.chroms().keys()):
+ if verbose:
+ sys.stderr.write("Warning: Your chromosome names do not match.\nPlease check that the "
+ "chromosome names in your BED file\ncorrespond to the names in your "
+ "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic "
+ "chromosome name is {0}\n\n".format(unmod_name))
+
+ # return empty nan array
+ return heatmapper.coverage_from_array(values_array, zones, binSize, avgType)
+
+ maxLen = bigwig.chroms(chrom)
+ startIdx = 0
+ endIdx = 0
+ for zone, _ in zones:
+ for region in zone:
+ startIdx = endIdx
+ if region[0] < 0:
+ endIdx += abs(region[0])
+ values_array[startIdx:endIdx] = np.nan
+ startIdx = endIdx
+ start = max(0, region[0])
+ end = min(maxLen, region[1])
+ endIdx += end - start
+ if start < end:
+ # This won't be the case if we extend off the front of a chromosome, such as (-100, 0)
+ values_array[startIdx:endIdx] = bigwig.values(chrom, start, end)
+ if end < region[1]:
+ startIdx = endIdx
+ endIdx += region[1] - end
+ values_array[startIdx:endIdx] = np.nan
+
+ # replaces nans for zeros
+ if nansAsZeros:
+ values_array[np.isnan(values_array)] = 0
+
+ return heatmapper.coverage_from_array(values_array, zones,
+ binSize, avgType)
+
+ @staticmethod
+ def my_average(valuesArray, avgType='mean'):
+ """
+ computes the mean, median, etc but only for those values
+ that are not Nan
+ """
+ valuesArray = np.ma.masked_invalid(valuesArray)
+ avg = np.ma.__getattribute__(avgType)(valuesArray)
+ if isinstance(avg, np.ma.core.MaskedConstant):
+ return np.nan
+ else:
+ return avg
+
+ def matrix_from_dict(self, matrixDict, regionsDict, parameters):
+ self.regionsDict = regionsDict
+ self.matrixDict = matrixDict
+ self.parameters = parameters
+ self.lengthDict = OrderedDict()
+ self.matrixAvgsDict = OrderedDict()
+
+ def read_matrix_file(self, matrix_file):
+ # reads a bed file containing the position
+ # of genomic intervals
+ # In case a hash sign '#' is found in the
+ # file, this is considered as a delimiter
+ # to split the heatmap into groups
+
+ import json
+ regions = []
+ matrix_rows = []
+ current_group_index = 0
+ max_group_bound = None
+
+ fh = gzip.open(matrix_file)
+ for line in fh:
+ line = toString(line).strip()
+ # read the header file containing the parameters
+ # used
+ if line.startswith("@"):
+ # the parameters used are saved using
+ # json
+ self.parameters = json.loads(line[1:].strip())
+ max_group_bound = self.parameters['group_boundaries'][1]
+ continue
+
+ # split the line into bed interval and matrix values
+ region = line.split('\t')
+ chrom, start, end, name, score, strand = region[0:6]
+ matrix_row = np.ma.masked_invalid(np.fromiter(region[6:], float))
+ matrix_rows.append(matrix_row)
+ starts = start.split(",")
+ ends = end.split(",")
+ regs = [(int(x), int(y)) for x, y in zip(starts, ends)]
+ # get the group index
+ if len(regions) >= max_group_bound:
+ current_group_index += 1
+ max_group_bound = self.parameters['group_boundaries'][current_group_index + 1]
+ regions.append([chrom, regs, name, max_group_bound, strand, score])
+
+ matrix = np.vstack(matrix_rows)
+ self.matrix = _matrix(regions, matrix, self.parameters['group_boundaries'],
+ self.parameters['sample_boundaries'],
+ group_labels=self.parameters['group_labels'],
+ sample_labels=self.parameters['sample_labels'])
+
+ if 'sort regions' in self.parameters:
+ self.matrix.set_sorting_method(self.parameters['sort regions'],
+ self.parameters['sort using'])
+
+ # Versions of computeMatrix before 3.0 didn't have an entry of these per column, fix that
+ nSamples = len(self.matrix.sample_labels)
+ h = dict()
+ for k, v in self.parameters.items():
+ if k in self.special_params and type(v) is not list:
+ v = [v] * nSamples
+ if len(v) == 0:
+ v = [None] * nSamples
+ h[k] = v
+ self.parameters = h
+
+ return
+
+ def save_matrix(self, file_name):
+ """
+ saves the data required to reconstruct the matrix
+ the format is:
+ A header containing the parameters used to create the matrix
+ encoded as:
+ @key:value\tkey2:value2 etc...
+ The rest of the file has the same first 5 columns of a
+ BED file: chromosome name, start, end, name, score and strand,
+ all separated by tabs. After the fifth column the matrix
+ values are appended separated by tabs.
+ Groups are separated by adding a line starting with a hash (#)
+ and followed by the group name.
+
+ The file is gzipped.
+ """
+ import json
+ self.parameters['sample_labels'] = self.matrix.sample_labels
+ self.parameters['group_labels'] = self.matrix.group_labels
+ self.parameters['sample_boundaries'] = self.matrix.sample_boundaries
+ self.parameters['group_boundaries'] = self.matrix.group_boundaries
+
+ # Redo the parameters, ensuring things related to ticks and labels are repeated appropriately
+ nSamples = len(self.matrix.sample_labels)
+ h = dict()
+ for k, v in self.parameters.items():
+ if type(v) is list and len(v) == 0:
+ v = None
+ if k in self.special_params and type(v) is not list:
+ v = [v] * nSamples
+ if len(v) == 0:
+ v = [None] * nSamples
+ h[k] = v
+ fh = gzip.open(file_name, 'wb')
+ params_str = json.dumps(h, separators=(',', ':'))
+ fh.write(toBytes("@" + params_str + "\n"))
+ score_list = np.ma.masked_invalid(np.mean(self.matrix.matrix, axis=1))
+ for idx, region in enumerate(self.matrix.regions):
+ # join np_array values
+ # keeping nans while converting them to strings
+ if not np.ma.is_masked(score_list[idx]):
+ float(score_list[idx])
+ matrix_values = "\t".join(
+ np.char.mod('%f', self.matrix.matrix[idx, :]))
+ starts = ["{0}".format(x[0]) for x in region[1]]
+ ends = ["{0}".format(x[1]) for x in region[1]]
+ starts = ",".join(starts)
+ ends = ",".join(ends)
+ # BEDish format (we don't currently store the score)
+ fh.write(
+ toBytes('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format(
+ region[0],
+ starts,
+ ends,
+ region[2],
+ region[5],
+ region[4],
+ matrix_values)))
+ fh.close()
+
+ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_label='TSS', end_label='TES', averagetype='mean'):
+ """
+ Saves the values averaged by col using the avg_type
+ given
+
+ Args:
+ file_handle: file name to save the file
+ reference_point_label: Name of the reference point label
+            start_label: Name of the start label
+ end_label: Name of the end label
+ averagetype: average type (e.g. mean, median, std)
+
+ """
+ # get X labels
+ w = self.parameters['bin size']
+ b = self.parameters['upstream']
+ a = self.parameters['downstream']
+ c = self.parameters.get('unscaled 5 prime', 0)
+ d = self.parameters.get('unscaled 3 prime', 0)
+ m = self.parameters['body']
+
+ xticks = []
+ xtickslabel = []
+ for idx in range(self.matrix.get_num_samples()):
+ if b[idx] < 1e5:
+ quotient = 1000
+ symbol = 'Kb'
+ else:
+ quotient = 1e6
+ symbol = 'Mb'
+
+ if m[idx] == 0:
+ last = 0
+ if len(xticks):
+ last = xticks[-1]
+ xticks.extend([last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]])
+ xtickslabel.extend(['{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol), reference_point_label,
+ '{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)])
+
+ else:
+ xticks_values = [w[idx]]
+
+ # only if upstream region is set, add a x tick
+ if b[idx] > 0:
+ xticks_values.append(b[idx])
+ xtickslabel.append('{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol))
+
+ xtickslabel.append(start_label)
+
+ if c[idx] > 0:
+ xticks_values.append(b[idx] + c[idx])
+ xtickslabel.append("")
+
+ if d[idx] > 0:
+ xticks_values.append(b[idx] + c[idx] + m[idx])
+ xtickslabel.append("")
+
+ xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx])
+ xtickslabel.append(end_label)
+
+ if a[idx] > 0:
+ xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx] + a[idx])
+ xtickslabel.append('{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol))
+
+ last = 0
+ if len(xticks):
+ last = xticks[-1]
+ xticks.extend([last + (k / w[idx]) for k in xticks_values])
+ x_axis = np.arange(xticks[-1]) + 1
+ labs = []
+ for x_value in x_axis:
+ if x_value in xticks and xtickslabel[xticks.index(x_value)]:
+ labs.append(xtickslabel[xticks.index(x_value)])
+ elif x_value in xticks:
+ labs.append("tick")
+ else:
+ labs.append("")
+
+ with open(file_handle, 'w') as fh:
+ # write labels
+ fh.write("bin labels\t\t{}\n".format("\t".join(labs)))
+ fh.write('bins\t\t{}\n'.format("\t".join([str(x) for x in x_axis])))
+
+ for sample_idx in range(self.matrix.get_num_samples()):
+ for group_idx in range(self.matrix.get_num_groups()):
+ sub_matrix = self.matrix.get_matrix(group_idx, sample_idx)
+ values = [str(x) for x in np.ma.__getattribute__(averagetype)(sub_matrix['matrix'], axis=0)]
+ fh.write("{}\t{}\t{}\n".format(sub_matrix['sample'], sub_matrix['group'], "\t".join(values)))
+
+ def save_matrix_values(self, file_name):
+ # print a header telling the group names and their length
+ fh = open(file_name, 'wb')
+ info = []
+ groups_len = np.diff(self.matrix.group_boundaries)
+ for i in range(len(self.matrix.group_labels)):
+ info.append("{}:{}".format(self.matrix.group_labels[i],
+ groups_len[i]))
+ fh.write(toBytes("#{}\n".format("\t".join(info))))
+ # add to header the x axis values
+ fh.write(toBytes("#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format(
+ self.parameters['downstream'],
+ self.parameters['upstream'],
+ self.parameters['body'],
+ self.parameters['bin size'],
+ self.parameters.get('unscaled 5 prime', 0),
+ self.parameters.get('unscaled 3 prime', 0))))
+ sample_len = np.diff(self.matrix.sample_boundaries)
+ for i in range(len(self.matrix.sample_labels)):
+ info.extend([self.matrix.sample_labels[i]] * sample_len[i])
+ fh.write(toBytes("{}\n".format("\t".join(info))))
+
+ fh.close()
+ # reopen again using append mode
+ fh = open(file_name, 'ab')
+ np.savetxt(fh, self.matrix.matrix, fmt="%.4g", delimiter="\t")
+ fh.close()
+
+ def save_BED(self, file_handle):
+ boundaries = np.array(self.matrix.group_boundaries)
+ # Add a header
+ file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group")
+ if self.matrix.silhouette is not None:
+ file_handle.write("\tsilhouette")
+ file_handle.write("\n")
+ for idx, region in enumerate(self.matrix.regions):
+ # the label id corresponds to the last boundary
+ # that is smaller than the region index.
+ # for example for a boundary array = [0, 10, 20]
+ # and labels ['a', 'b', 'c'],
+ # for index 5, the label is 'a', for
+ # index 10, the label is 'b' etc
+ label_idx = np.flatnonzero(boundaries <= idx)[-1]
+ starts = ["{0}".format(x[0]) for x in region[1]]
+ ends = ["{0}".format(x[1]) for x in region[1]]
+ starts = ",".join(starts)
+ ends = ",".join(ends)
+ file_handle.write(
+ '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0'.format(
+ region[0],
+ region[1][0][0],
+ region[1][-1][1],
+ region[2],
+ region[5],
+ region[4]))
+ file_handle.write(
+ '\t{0}\t{1}\t{2}\t{3}'.format(
+ len(region[1]),
+ ",".join([str(int(y) - int(x)) for x, y in region[1]]),
+ ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]),
+ self.matrix.group_labels[label_idx]))
+ if self.matrix.silhouette is not None:
+ file_handle.write("\t{}".format(self.matrix.silhouette[idx]))
+ file_handle.write("\n")
+ file_handle.close()
+
+ @staticmethod
+ def matrix_avg(matrix, avgType='mean'):
+ matrix = np.ma.masked_invalid(matrix)
+ return np.ma.__getattribute__(avgType)(matrix, axis=0)
+
+ def get_individual_matrices(self, matrix):
+ """In case multiple matrices are saved one after the other
+ this method splits them appart.
+ Returns a list containing the matrices
+ """
+ num_cols = matrix.shape[1]
+ num_ind_cols = self.get_num_individual_matrix_cols()
+ matrices_list = []
+ for i in range(0, num_cols, num_ind_cols):
+ if i + num_ind_cols > num_cols:
+ break
+ matrices_list.append(matrix[:, i:i + num_ind_cols])
+ return matrices_list
+
+ def get_num_individual_matrix_cols(self):
+ """
+ returns the number of columns that
+ each matrix should have. This is done because
+ the final matrix that is plotted can be composed
+ of smaller matrices that are merged one after
+ the other.
+ """
+ matrixCols = ((self.parameters['downstream'] + self.parameters['upstream'] + self.parameters['body'] + self.parameters['unscaled 5 prime'] + self.parameters['unscaled 3 prime']) //
+ self.parameters['bin size'])
+
+ return matrixCols
+
+
+def computeSilhouetteScore(d, idx, labels):
+    """
+    Given a square distance matrix with NaN diagonals, compute the silhouette score
+    of a given row (idx). Each row should have an associated label (labels).
+    """
+    # distances from row idx to all other points (the NaN diagonal drops idx itself)
+    keep = ~np.isnan(d[idx, ])
+    # summed distance from idx to each label group
+    foo = np.bincount(labels[keep], weights=d[idx, ][keep])
+    groupSizes = np.bincount(labels[keep])
+    intraIdx = labels[idx]
+    # degenerate group: return 0 rather than an uninformative ratio
+    if groupSizes[intraIdx] == 1:
+        return 0
+    # mean distance to members of idx's own cluster
+    intra = foo[labels[idx]] / groupSizes[intraIdx]
+    # smallest mean distance to any other cluster
+    interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]]
+    inter = np.min(foo[interMask] / groupSizes[interMask])
+    # standard silhouette formula, in [-1, 1]
+    return (inter - intra) / max(inter, intra)
+
+
+class _matrix(object):
+    """
+    class to hold heatmapper matrices
+    The base data is a large matrix
+    with definition to know the boundaries for row and col divisions.
+    Col divisions represent groups within a subset, e.g. Active and
+    inactive from PolII bigwig data.
+
+    Row division represent different samples, for example
+    PolII in males vs. PolII in females.
+
+    This is an internal class of the heatmapper class
+    """
+
+    def __init__(self, regions, matrix, group_boundaries, sample_boundaries,
+                 group_labels=None, sample_labels=None):
+        """
+        Parameters
+        ----------
+        regions : list
+            flat list of region records; groups are delimited by
+            `group_boundaries`
+        matrix : numpy array
+            2D array; rows are regions, columns are the bins of all
+            samples concatenated
+        group_boundaries : list of int
+            row indices delimiting each group; the last entry must equal
+            the number of matrix rows
+        sample_boundaries : list of int
+            column indices delimiting each sample; the last entry must
+            equal the number of matrix columns
+        group_labels, sample_labels : list of str or None
+            optional names; autogenerated ('group N' / 'sample N') when None
+        """
+        # simple checks
+        assert matrix.shape[0] == group_boundaries[-1], \
+            "row max do not match matrix shape"
+        assert matrix.shape[1] == sample_boundaries[-1], \
+            "col max do not match matrix shape"
+
+        self.regions = regions
+        self.matrix = matrix
+        self.group_boundaries = group_boundaries
+        self.sample_boundaries = sample_boundaries
+        # sorting state, recorded via set_sorting_method()
+        self.sort_method = None
+        self.sort_using = None
+        # filled in by computeSilhouette() after clustering
+        self.silhouette = None
+
+        if group_labels is None:
+            self.group_labels = ['group {}'.format(x)
+                                 for x in range(len(group_boundaries) - 1)]
+        else:
+            assert len(group_labels) == len(group_boundaries) - 1, \
+                "number of group labels does not match number of groups"
+            self.group_labels = group_labels
+
+        if sample_labels is None:
+            self.sample_labels = ['sample {}'.format(x)
+                                  for x in range(len(sample_boundaries) - 1)]
+        else:
+            assert len(sample_labels) == len(sample_boundaries) - 1, \
+                "number of sample labels does not match number of samples"
+            self.sample_labels = sample_labels
+
+    def get_matrix(self, group, sample):
+        """
+        Returns a sub matrix from the large
+        matrix. Group and sample are ids,
+        thus, row = 0, col=0 get the first group
+        of the first sample.
+
+        Returns
+        -------
+        dictionary containing the matrix (with nan/inf entries masked
+        via np.ma.masked_invalid), the group label and the sample label
+        """
+        # row span of the requested group
+        group_start = self.group_boundaries[group]
+        group_end = self.group_boundaries[group + 1]
+        # column span of the requested sample
+        sample_start = self.sample_boundaries[sample]
+        sample_end = self.sample_boundaries[sample + 1]
+
+        return {'matrix': np.ma.masked_invalid(self.matrix[group_start:group_end, :][:, sample_start:sample_end]),
+                'group': self.group_labels[group],
+                'sample': self.sample_labels[sample]}
+
+    def get_num_samples(self):
+        # number of samples (column divisions) in the matrix
+        return len(self.sample_labels)
+
+    def get_num_groups(self):
+        # number of groups (row divisions) in the matrix
+        return len(self.group_labels)
+
+ def set_group_labels(self, new_labels):
+ """ sets new labels for groups
+ """
+ if len(new_labels) != len(self.group_labels):
+ raise ValueError("length new labels != length original labels")
+ self.group_labels = new_labels
+
+    def set_sample_labels(self, new_labels):
+        """ sets new labels for samples
+        """
+        if len(new_labels) != len(self.sample_labels):
+            raise ValueError("length new labels != length original labels")
+        self.sample_labels = new_labels
+
+    def set_sorting_method(self, sort_method, sort_using):
+        # record how the matrix was last sorted; called by sort_groups()
+        self.sort_method = sort_method
+        self.sort_using = sort_using
+
+ def get_regions(self):
+ """Returns the regions per group
+
+ Returns
+ ------
+ list
+
+ Each element of the list is itself a list
+ of dictionaries containing the regions info:
+ chrom, start, end, strand, name etc.
+
+ Each element of the list corresponds to each
+ of the groups
+ """
+ regions = []
+ for idx in range(len(self.group_labels)):
+ start = self.group_boundaries[idx]
+ end = self.group_boundaries[idx + 1]
+ regions.append(self.regions[start:end])
+
+ return regions
+
+ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None):
+ """
+ Sorts and rearranges the submatrices according to the
+ sorting method given.
+ """
+ if sort_method == 'no':
+ return
+
+ if (sample_list is not None) and (len(sample_list) > 0):
+ # get the ids that correspond to the selected sample list
+ idx_to_keep = []
+ for sample_idx in sample_list:
+ idx_to_keep += range(self.sample_boundaries[sample_idx], self.sample_boundaries[sample_idx + 1])
+
+ matrix = self.matrix[:, idx_to_keep]
+
+ else:
+ matrix = self.matrix
+
+ # compute the row average:
+ if sort_using == 'region_length':
+ matrix_avgs = list()
+ for x in self.regions:
+ matrix_avgs.append(np.sum([bar[1] - bar[0] for bar in x[1]]))
+ matrix_avgs = np.array(matrix_avgs)
+ elif sort_using == 'mean':
+ matrix_avgs = np.nanmean(matrix, axis=1)
+ elif sort_using == 'mean':
+ matrix_avgs = np.nanmean(matrix, axis=1)
+ elif sort_using == 'median':
+ matrix_avgs = np.nanmedian(matrix, axis=1)
+ elif sort_using == 'max':
+ matrix_avgs = np.nanmax(matrix, axis=1)
+ elif sort_using == 'min':
+ matrix_avgs = np.nanmin(matrix, axis=1)
+ elif sort_using == 'sum':
+ matrix_avgs = np.nansum(matrix, axis=1)
+ else:
+ sys.exit("{} is an unsupported sorting method".format(sort_using))
+
+ # order per group
+ _sorted_regions = []
+ _sorted_matrix = []
+ for idx in range(len(self.group_labels)):
+ start = self.group_boundaries[idx]
+ end = self.group_boundaries[idx + 1]
+ order = matrix_avgs[start:end].argsort()
+ if sort_method == 'descend':
+ order = order[::-1]
+ _sorted_matrix.append(self.matrix[start:end, :][order, :])
+ # sort the regions
+ _reg = self.regions[start:end]
+ for idx in order:
+ _sorted_regions.append(_reg[idx])
+
+ self.matrix = np.vstack(_sorted_matrix)
+ self.regions = _sorted_regions
+ self.set_sorting_method(sort_method, sort_using)
+
+    def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None):
+        """
+        Clusters the matrix rows into k clusters (k-means or hierarchical
+        Ward clustering) and replaces the existing groups by the clusters,
+        ordered by decreasing cluster mean.
+
+        Parameters
+        ----------
+        k : int
+            number of clusters
+        evaluate_silhouette : bool
+            unused in this method; silhouette scoring is done by
+            computeSilhouette()
+        method : str
+            'kmeans' or 'hierarchical'
+        clustering_samples : list of int or None
+            1-based sample indices; when given, only those samples' columns
+            are used for the clustering (the whole matrix is still
+            rearranged)
+
+        Returns
+        -------
+        NOTE(review): returns `idx`, the leftover loop variable (the last
+        region index of the last cluster). Presumably callers ignore the
+        return value — confirm before relying on it.
+        """
+        matrix = np.asarray(self.matrix)
+        matrix_to_cluster = matrix
+        if clustering_samples is not None:
+            assert all(i > 0 for i in clustering_samples), \
+                "all indices should be bigger than or equal to 1."
+            assert all(i <= len(self.sample_labels) for i in
+                       clustering_samples), \
+                "each index should be smaller than or equal to {}(total "\
+                "number of samples.)".format(len(self.sample_labels))
+
+            # convert 1-based user indices to 0-based
+            clustering_samples = np.asarray(clustering_samples) - 1
+
+            samples_cols = []
+            for idx in clustering_samples:
+                samples_cols += range(self.sample_boundaries[idx],
+                                      self.sample_boundaries[idx + 1])
+
+            matrix_to_cluster = matrix_to_cluster[:, samples_cols]
+        if np.any(np.isnan(matrix_to_cluster)):
+            # replace nans for 0 otherwise kmeans produces a weird behaviour
+            sys.stderr.write("*Warning* For clustering nan values have to be replaced by zeros \n")
+            matrix_to_cluster[np.isnan(matrix_to_cluster)] = 0
+
+        if method == 'kmeans':
+            from scipy.cluster.vq import vq, kmeans
+
+            centroids, _ = kmeans(matrix_to_cluster, k)
+            # order the centroids in an attempt to
+            # get the same cluster order
+            cluster_labels, _ = vq(matrix_to_cluster, centroids)
+
+        if method == 'hierarchical':
+            # normally too slow for large data sets
+            from scipy.cluster.hierarchy import fcluster, linkage
+            Z = linkage(matrix_to_cluster, method='ward', metric='euclidean')
+            cluster_labels = fcluster(Z, k, criterion='maxclust')
+            # hierarchical clustering labels from 1 .. k
+            # while k-means labels 0 .. k -1
+            # Thus, for consistency, we subtract 1
+            cluster_labels -= 1
+
+        # sort clusters
+        _clustered_mean = []
+        _cluster_ids_list = []
+        for cluster in range(k):
+            cluster_ids = np.flatnonzero(cluster_labels == cluster)
+            _cluster_ids_list.append(cluster_ids)
+            _clustered_mean.append(matrix_to_cluster[cluster_ids, :].mean())
+
+        # reorder clusters based on mean
+        cluster_order = np.argsort(_clustered_mean)[::-1]
+        # create groups using the clustering
+        self.group_labels = []
+        self.group_boundaries = [0]
+        _clustered_regions = []
+        _clustered_matrix = []
+        cluster_number = 1
+        for cluster in cluster_order:
+            self.group_labels.append("cluster_{}".format(cluster_number))
+            cluster_number += 1
+            cluster_ids = _cluster_ids_list[cluster]
+            self.group_boundaries.append(self.group_boundaries[-1] +
+                                         len(cluster_ids))
+            _clustered_matrix.append(self.matrix[cluster_ids, :])
+            for idx in cluster_ids:
+                _clustered_regions.append(self.regions[idx])
+
+        self.regions = _clustered_regions
+        self.matrix = np.vstack(_clustered_matrix)
+
+        return idx
+
+    def computeSilhouette(self, k):
+        """
+        Computes a per-region silhouette score from the pairwise euclidean
+        distances between matrix rows and stores it in self.silhouette.
+        Assumes the rows are already arranged into k contiguous clusters
+        (i.e. hmcluster() was run). No-op when k <= 1. Note the full
+        pairwise distance matrix is materialized, which is quadratic in
+        the number of regions.
+        """
+        if k > 1:
+            from scipy.spatial.distance import pdist, squareform
+
+            silhouette = np.repeat(0.0, self.group_boundaries[-1])
+            # cluster sizes -> one label per row, in boundary order
+            groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1])
+            labels = np.repeat(np.arange(k), groupSizes)
+
+            d = pdist(self.matrix)
+            d2 = squareform(d)
+            np.fill_diagonal(d2, np.nan)  # This excludes the diagonal
+            for idx in range(len(labels)):
+                silhouette[idx] = computeSilhouetteScore(d2, idx, labels)
+            sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette)))
+            self.silhouette = silhouette
+
+    def removeempty(self):
+        """
+        removes matrix rows containing only zeros or nans
+        """
+        to_keep = []
+        # row means with nan/inf masked; an all-nan row yields a masked value
+        score_list = np.ma.masked_invalid(np.mean(self.matrix, axis=1))
+        for idx, region in enumerate(self.regions):
+            # drop rows whose mean is masked (all invalid) or exactly zero
+            if np.ma.is_masked(score_list[idx]) or float(score_list[idx]) == 0:
+                continue
+            else:
+                to_keep.append(idx)
+        self.regions = [self.regions[x] for x in to_keep]
+        self.matrix = self.matrix[to_keep, :]
+        # adjust group boundaries (rows) to account for the removed regions
+        to_keep = np.array(to_keep)
+        self.group_boundaries = [len(to_keep[to_keep < x]) for x in self.group_boundaries]
+
+ def flatten(self):
+ """
+ flatten and remove nans from matrix. Useful
+ to get max and mins from matrix.
+
+ :return flattened matrix
+ """
+ matrix_flatten = np.asarray(self.matrix.flatten())
+ # nans are removed from the flattened array
+ matrix_flatten = matrix_flatten[~np.isnan(matrix_flatten)]
+ if len(matrix_flatten) == 0:
+ num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
+ raise ValueError("matrix only contains nans "
+ "(total nans: {})".format(num_nan))
+ return matrix_flatten
diff --git a/deepTools/source/deeptools/heatmapper_utilities.py b/deepTools/source/deeptools/heatmapper_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63dfb022633f60a3b57f7f100cd32ae99c4234c
--- /dev/null
+++ b/deepTools/source/deeptools/heatmapper_utilities.py
@@ -0,0 +1,204 @@
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.colors as pltcolors
+import plotly.graph_objs as go
+
+old_settings = np.seterr(all='ignore')
+
+
+def plot_single(ax, ma, average_type, color, label, plot_type='lines'):
+    """
+    Adds a line to the plot in the given ax using the specified method
+
+    Parameters
+    ----------
+    ax : matplotlib axis
+        matplotlib axis
+    ma : numpy array
+        numpy array The data on this matrix is summarized according
+        to the `average_type` argument.
+    average_type : str
+        string values are sum mean median min max std
+    color : str
+        a valid color: either a html color name, hex
+        (e.g #002233), RGB + alpha tuple or list or RGB tuple or list
+    label : str
+        label
+    plot_type: str
+        type of plot. Either 'se' for standard error, 'std' for
+        standard deviation, 'overlapped_lines' to plot each line of the matrix,
+        fill to plot the area between the x axis and the value or any other string to
+        just plot the average line.
+
+    Returns
+    -------
+    ax
+        matplotlib axis
+
+    Examples
+    --------
+
+    >>> import matplotlib.pyplot as plt
+    >>> import os
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> matrix = np.array([[1,2,3],
+    ...                    [4,5,6],
+    ...                    [7,8,9]])
+    >>> ax = plot_single(ax, matrix -2, 'mean', color=[0.6, 0.8, 0.9], label='fill light blue', plot_type='fill')
+    >>> ax = plot_single(ax, matrix, 'mean', color='blue', label='red')
+    >>> ax = plot_single(ax, matrix + 5, 'mean', color='red', label='red', plot_type='std')
+    >>> ax = plot_single(ax, matrix + 10, 'mean', color='#cccccc', label='gray se', plot_type='se')
+    >>> ax = plot_single(ax, matrix + 20, 'mean', color=(0.9, 0.5, 0.9), label='violet', plot_type='std')
+    >>> ax = plot_single(ax, matrix + 30, 'mean', color=(0.9, 0.5, 0.9, 0.5), label='violet with alpha', plot_type='std')
+    >>> leg = ax.legend()
+    >>> plt.savefig("/tmp/test.pdf")
+    >>> plt.close()
+    >>> fig = plt.figure()
+    >>> os.remove("/tmp/test.pdf")
+
+
+    """
+    # column-wise summary using the requested np.ma statistic
+    summary = np.ma.__getattribute__(average_type)(ma, axis=0)
+    # only plot the average profiles without error regions
+    x = np.arange(len(summary))
+    # ndarray colors are converted to a hex string matplotlib accepts
+    if isinstance(color, np.ndarray):
+        color = pltcolors.to_hex(color, keep_alpha=True)
+    ax.plot(x, summary, color=color, label=label, alpha=0.9)
+    if plot_type == 'fill':
+        ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor='none')
+
+    if plot_type in ['se', 'std']:
+        if plot_type == 'se':  # standard error
+            std = np.std(ma, axis=0) / np.sqrt(ma.shape[0])
+        else:
+            std = np.std(ma, axis=0)
+
+        alpha = 0.2
+        # an alpha channel has to be added to the color to fill the area
+        # between the mean (or median etc.) and the std or se
+        f_color = pltcolors.colorConverter.to_rgba(color, alpha)
+
+        ax.fill_between(x, summary, summary + std, facecolor=f_color, edgecolor='none')
+        ax.fill_between(x, summary, summary - std, facecolor=f_color, edgecolor='none')
+
+    ax.set_xlim(0, max(x))
+
+    return ax
+
+
+def plotly_single(ma, average_type, color, label, plot_type='line'):
+    """A plotly version of plot_single. Returns a list of traces.
+
+    `color` may be a color name (converted via matplotlib.colors.to_rgb)
+    or a sequence of RGB floats in [0, 1].
+    """
+    summary = list(np.ma.__getattribute__(average_type)(ma, axis=0))
+    x = list(np.arange(len(summary)))
+    if isinstance(color, str):
+        color = list(matplotlib.colors.to_rgb(color))
+    traces = [go.Scatter(x=x, y=summary, name=label, line={'color': "rgba({},{},{},0.9)".format(color[0], color[1], color[2])}, showlegend=False)]
+    if plot_type == 'fill':
+        # NOTE(review): fillcolor receives the raw RGB list here while the
+        # line color above is an 'rgba(...)' string — confirm plotly accepts
+        # this form.
+        traces[0].update(fill='tozeroy', fillcolor=color)
+
+    if plot_type in ['se', 'std']:
+        if plot_type == 'se':  # standard error
+            std = np.std(ma, axis=0) / np.sqrt(ma.shape[0])
+        else:
+            std = np.std(ma, axis=0)
+
+        # error band drawn as a closed polygon: forward along the upper
+        # edge, then backwards along the lower edge
+        x_rev = x[::-1]
+        lower = summary - std
+        # NOTE(review): go.Line is deprecated in recent plotly releases —
+        # verify the installed plotly version still provides it.
+        trace = go.Scatter(x=x + x_rev,
+                           y=np.concatenate([summary + std, lower[::-1]]),
+                           fill='tozerox',
+                           fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]),
+                           line=go.Line(color='transparent'),
+                           showlegend=False,
+                           name=label)
+        traces.append(trace)
+
+    return traces
+
+
+def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx):
+ """
+ returns the position and labelling of the xticks that
+ correspond to the heatmap
+
+ As of deepTools 3, the various parameters can be lists, in which case we then need to index things (the idx parameter)
+
+ As of matplotlib 3 the ticks in the heatmap need to have 0.5 added to them.
+
+ As of matplotlib 3.1 there is no longer padding added to all ticks. Reference point ticks will be adjusted by width/2
+ or width for spacing and the last half of scaled ticks will be shifed by 1 bin so the ticks are at the beginning of bins.
+ """
+ w = hm.parameters['bin size']
+ b = hm.parameters['upstream']
+ a = hm.parameters['downstream']
+ if idx is not None:
+ w = w[idx]
+ b = b[idx]
+ a = a[idx]
+
+ try:
+ c = hm.parameters['unscaled 5 prime']
+ if idx is not None:
+ c = c[idx]
+ except:
+ c = 0
+ try:
+ d = hm.parameters['unscaled 3 prime']
+ if idx is not None:
+ d = d[idx]
+ except:
+ d = 0
+ m = hm.parameters['body']
+ if idx is not None:
+ m = m[idx]
+
+ if b < 1e5:
+ quotient = 1000
+ symbol = 'Kb'
+ else:
+ quotient = 1e6
+ symbol = 'Mb'
+
+ if m == 0:
+ xticks = [(k / w) for k in [0, b - 0.5 * w, b + a - w]]
+ xtickslabel = ['{0:.1f}'.format(-(float(b) / quotient)),
+ referencePointLabel,
+ '{0:.1f}{1}'.format(float(a) / quotient, symbol)]
+ else:
+ xticks_values = [0]
+ xtickslabel = []
+
+ # only if upstream region is set, add a x tick
+ if b > 0:
+ xticks_values.append(b)
+ xtickslabel.append('{0:.1f}'.format(-(float(b) / quotient)))
+
+ xtickslabel.append(startLabel)
+
+ # set the x tick for the body parameter, regardless if
+ # upstream is 0 (not set)
+ if c > 0:
+ xticks_values.append(b + c)
+ xtickslabel.append("")
+
+ if d > 0:
+ xticks_values.append(b + c + m)
+ xtickslabel.append("")
+
+ # We need to subtract the bin size from the last 2 point so they're placed at the beginning of the bin
+ xticks_values.append(b + c + m + d - w)
+ xtickslabel.append(endLabel)
+
+ if a > 0:
+ xticks_values.append(b + c + m + d + a - w)
+ xtickslabel.append('{0:.1f}{1}'.format(float(a) / quotient, symbol))
+
+ xticks = [(k / w) for k in xticks_values]
+ xticks = [max(x, 0) for x in xticks]
+
+ return xticks, xtickslabel
diff --git a/deepTools/source/deeptools/mapReduce.py b/deepTools/source/deeptools/mapReduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0b1647c9c65a2514465c645dd318b78e47453e
--- /dev/null
+++ b/deepTools/source/deeptools/mapReduce.py
@@ -0,0 +1,263 @@
+import multiprocessing
+from deeptoolsintervals import GTF
+import random
+
+debug = 0
+
+
+def mapReduce(staticArgs, func, chromSize,
+              genomeChunkLength=None,
+              region=None,
+              bedFile=None,
+              blackListFileName=None,
+              numberOfProcessors=4,
+              verbose=False,
+              includeLabels=False,
+              keepExons=False,
+              transcriptID="transcriptID",
+              exonID="exonID",
+              transcript_id_designator="transcript_id",
+              self_=None):
+    """
+    Split the genome into parts that are sent to workers using a defined
+    number of processors. Results are collected and returned.
+
+    For each genomic region the given 'func' is called using
+    the following parameters:
+
+     chrom, start, end, staticArgs
+
+    The *arg* are static, *pickable* variables that need to be sent
+    to workers.
+
+    The genome chunk length corresponds to a fraction of the genome, in bp,
+    that is sent to each of the workers for processing.
+
+    Depending on the type of process a larger or shorter regions may be
+    preferred
+
+    :param chromSize: A list of duples containing the chromosome
+                      name and its length
+    :param region: The format is chr:start:end:tileSize (see function
+                   getUserRegion)
+    :param staticArgs: tuple of arguments that are sent to the given 'func'
+
+    :param func: function to call. The function is called using the
+                 following parameters (chrom, start, end, staticArgs)
+    :param bedFile: If a bed file is given, the args to the func to be
+                    called are extended to include a list of bed
+                    defined regions.
+    :param blackListFileName: A list of regions to exclude from all computations.
+                              Note that this has genomeChunkLength resolution...
+    :param self_: In case mapreduce should make a call to an object
+                  the self variable has to be passed.
+    :param includeLabels: Pass group and transcript labels into the calling
+                          function. These are added to the static args
+                          (groupLabel and transcriptName).
+
+    If "includeLabels" is true, a tuple of (results, labels) is returned
+    """
+
+    if not genomeChunkLength:
+        genomeChunkLength = 1e5
+    genomeChunkLength = int(genomeChunkLength)
+
+    if verbose:
+        print("genome partition size for multiprocessing: {0}".format(
+            genomeChunkLength))
+
+    region_start = 0
+    region_end = None
+
+    # if a region is set, that means that the task should only cover
+    # the given genomic position
+
+    if region:
+        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, region)
+        if verbose:
+            print("chrom size: {0}, region start: {1}, region end: {2}, "
+                  "genome chunk length sent to each procesor: {3}".format(chromSize, region_start, region_end, genomeChunkLength))
+
+    if bedFile:
+        defaultGroup = None
+        # a single BED file gets a default group name
+        if len(bedFile) == 1:
+            defaultGroup = "genes"
+        bed_interval_tree = GTF(bedFile, defaultGroup=defaultGroup, transcriptID=transcriptID, exonID=exonID, transcript_id_designator=transcript_id_designator, keepExons=keepExons)
+
+    if blackListFileName:
+        blackList = GTF(blackListFileName)
+
+    TASKS = []
+    # iterate over all chromosomes
+    for chrom, size in chromSize:
+        # the start is zero unless a specific region is defined
+        start = 0 if region_start == 0 else region_start
+        for startPos in range(start, size, genomeChunkLength):
+            endPos = min(size, startPos + genomeChunkLength)
+
+            # Reject a chunk if it overlaps
+            if blackListFileName:
+                regions = blSubtract(blackList, chrom, [startPos, endPos])
+            else:
+                regions = [[startPos, endPos]]
+
+            for reg in regions:
+                if self_ is not None:
+                    argsList = [self_]
+                else:
+                    argsList = []
+
+                argsList.extend([chrom, reg[0], reg[1]])
+                # add to the argument list the static args received by the function
+                argsList.extend(staticArgs)
+
+                # if a bed file is given, append to the TASK list,
+                # a list of bed regions that overlap with the
+                # current genomeChunk.
+                if bedFile:
+                    # This effectively creates batches of intervals, which is
+                    # generally more performant due to the added overhead of
+                    # initializing additional workers.
+
+                    # TODO, there's no point in including the chromosome
+                    if includeLabels:
+                        bed_regions_list = [[chrom, x[4], x[2], x[3], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, numericGroups=True, includeStrand=True)]
+                    else:
+                        bed_regions_list = [[chrom, x[4], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, includeStrand=True)]
+
+                    if len(bed_regions_list) == 0:
+                        continue
+                    # add to argument list, the position of the bed regions to use
+                    argsList.append(bed_regions_list)
+
+                TASKS.append(tuple(argsList))
+
+    if len(TASKS) > 1 and numberOfProcessors > 1:
+        if verbose:
+            print(("using {} processors for {} "
+                   "number of tasks".format(numberOfProcessors,
+                                            len(TASKS))))
+        # shuffle so chunks of similar cost are spread across the workers
+        random.shuffle(TASKS)
+        pool = multiprocessing.Pool(numberOfProcessors)
+        res = pool.map_async(func, TASKS).get(9999999)
+        pool.close()
+        pool.join()
+    else:
+        # single task or single processor: run serially in this process
+        res = list(map(func, TASKS))
+
+    if includeLabels:
+        if bedFile:
+            return res, bed_interval_tree.labels
+        else:
+            return res, None
+    return res
+
+
+def getUserRegion(chrom_sizes, region_string, max_chunk_size=1e6):
+    r"""
+    Verifies if a given region argument, given by the user
+    is valid. The format of the region_string is chrom:start:end:tileSize
+    where start, end and tileSize are optional.
+
+    :param chrom_sizes: dictionary of chromosome/scaffold size. Key=chromosome name
+    :param region_string: a string of the form chr:start:end
+    :param max_chunk_size: upper limit for the chunk size
+    :return: tuple chrom_size for the region start, region end, chunk size
+
+    #>>> data = getUserRegion({'chr2': 1000}, "chr1:10:10")
+    #Traceback (most recent call last):
+    # ...
+    #NameError: Unknown chromosome: chr1
+    #Known chromosomes are: ['chr2']
+
+    If the region end is bigger than the chromosome size, this
+    value is used instead
+    >>> getUserRegion({'chr2': 1000}, "chr2:10:1001")
+    ([('chr2', 1000)], 10, 1000, 990)
+
+    Test chunk and regions size reduction to match tile size
+    >>> getUserRegion({'chr2': 200000}, "chr2:10:123344:3")
+    ([('chr2', 123344)], 9, 123345, 123336)
+
+    Test chromosome name mismatch
+    >>> getUserRegion({'2': 200000}, "chr2:10:123344:3")
+    ([('2', 123344)], 9, 123345, 123336)
+    >>> getUserRegion({'chrM': 200000}, "MT:10:123344:3")
+    ([('chrM', 123344)], 9, 123345, 123336)
+    """
+    region = region_string.split(":")
+    chrom = region[0]
+    chrom_sizes = dict(chrom_sizes)
+
+    # try the common chromosome-naming variants (chrM/MT, with/without
+    # the 'chr' prefix) before giving up
+    if chrom not in list(chrom_sizes.keys()):
+        if chrom == "MT":
+            chromUse = "chrM"
+        elif chrom == "chrM":
+            chromUse = "MT"
+        elif chrom[0:3] == "chr":
+            chromUse = chrom[3:]
+        else:
+            chromUse = "chr" + chrom
+        if chromUse not in list(chrom_sizes.keys()):
+            raise NameError("Unknown chromosome: %s\nKnown "
+                            "chromosomes are: %s " % (chrom, list(chrom_sizes.keys())))
+        chrom = chromUse
+    # start/end/tileSize are optional fields of the region string
+    try:
+        region_start = int(region[1])
+    except IndexError:
+        region_start = 0
+    try:
+        region_end = int(region[2]) if int(region[2]) <= chrom_sizes[chrom] \
+            else chrom_sizes[chrom]
+    except IndexError:
+        region_end = chrom_sizes[chrom]
+    if region_start > region_end or region_start < 0:
+        raise NameError("{} not valid. The format is chrom:start:end. "
+                        "Without comas, dashes or dots. ".format(region_string))
+    try:
+        tilesize = int(region[3])
+    except IndexError:
+        tilesize = None
+
+    chrom_sizes = [(chrom, region_end)]
+
+    # if tilesize is given, make region_start and region_end
+    # multiple of tileSize
+    if tilesize:
+        region_start -= region_start % tilesize
+        region_end += tilesize - (region_end % tilesize)
+
+    chunk_size = int(region_end - region_start)
+    if chunk_size > max_chunk_size:
+        chunk_size = max_chunk_size
+        # keep the capped chunk a multiple of the tile size as well
+        if tilesize and tilesize < chunk_size:
+            chunk_size -= chunk_size % tilesize
+
+    return chrom_sizes, region_start, region_end, int(chunk_size)
+
+
+def blSubtract(t, chrom, chunk):
+    """
+    If a genomic region overlaps with a blacklisted region, then subtract that region out
+
+    returns a list of lists
+
+    NOTE: `chunk` is modified in place (chunk[0] is advanced past each
+    overlap), so callers should not reuse the passed list afterwards.
+    """
+
+    # no blacklist loaded: keep the chunk as-is
+    if t is None:
+        return [chunk]
+
+    overlaps = t.findOverlaps(chrom, chunk[0], chunk[1])
+    if overlaps is not None and len(overlaps) > 0:
+        output = []
+        for o in overlaps:
+            # nothing left of the chunk to keep
+            if chunk[1] <= chunk[0]:
+                break
+            # keep the gap before this blacklisted interval, if any
+            if chunk[0] < o[0]:
+                output.append([chunk[0], o[0]])
+            # skip past the blacklisted interval
+            chunk[0] = o[1]
+        # keep whatever remains after the last overlap
+        if chunk[0] < chunk[1]:
+            output.append([chunk[0], chunk[1]])
+    else:
+        output = [chunk]
+
+    return output
diff --git a/deepTools/source/deeptools/misc.py b/deepTools/source/deeptools/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f20b22f14b13f3e1e8bd446276b91a125c62fded
--- /dev/null
+++ b/deepTools/source/deeptools/misc.py
@@ -0,0 +1,13 @@
+import os
+
+# This should force numpy to run single threaded. See issue #697
+# This module MUST be imported before numpy
+# Note that these environment variables are internal to deepTools (they won't exist on the shell after the command completes)
+# Each variable is only set when absent, so an explicit value exported by
+# the user still takes precedence.
+if 'MKL_NUM_THREADS' not in os.environ:
+    os.environ['MKL_NUM_THREADS'] = 'sequential'
+if 'NUMEXPR_NUM_THREADS' not in os.environ:
+    os.environ['NUMEXPR_NUM_THREADS'] = '1'
+if 'OMP_NUM_THREADS' not in os.environ:
+    os.environ['OMP_NUM_THREADS'] = '1'
+if 'VECLIB_MAXIMUM_THREADS' not in os.environ:
+    os.environ['VECLIB_MAXIMUM_THREADS'] = '1'
diff --git a/deepTools/source/deeptools/multiBamSummary.py b/deepTools/source/deeptools/multiBamSummary.py
new file mode 100644
index 0000000000000000000000000000000000000000..981a99e3c2ef940e3ec05234797f78a4b35a16e7
--- /dev/null
+++ b/deepTools/source/deeptools/multiBamSummary.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import argparse
+import numpy as np
+
+import deeptools.countReadsPerBin as countR
+from deeptools import parserCommon
+from deeptools.utilities import smartLabels
+from importlib.metadata import version
+old_settings = np.seterr(all='ignore')
+
+
def parse_arguments(args=None):
    """Build the top-level multiBamSummary argument parser.

    Registers the two subcommands, 'bins' (genome-wide fixed-size bins)
    and 'BED-file' (user-supplied regions), sharing the options built by
    bamcorrelate_args() plus the common parent parsers.
    """
    parser = \
        argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="""

``multiBamSummary`` computes the read coverages for genomic regions for typically two or more BAM files.
The analysis can be performed for the entire genome by running the program in 'bins' mode.
If you want to count the read coverage for specific regions only, use the ``BED-file`` mode instead.
The standard output of ``multiBamSummary`` is a compressed numpy array (``.npz``).
It can be directly used to calculate and visualize pairwise correlation values between the read coverages using the tool 'plotCorrelation'.
Similarly, ``plotPCA`` can be used for principal component analysis of the read coverages using the .npz file.
Note that using a single bigWig file is only recommended if you want to produce a bedGraph file (i.e., with the ``--outRawCounts`` option; the default output file cannot be used by ANY deepTools program if only a single file was supplied!).

A detailed sub-commands help is available by typing:

  multiBamSummary bins -h

  multiBamSummary BED-file -h


""",
            epilog='example usages:\n'
                   'multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz \n\n'
                   'multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam \n'
                   '-o results.npz'
                   ' \n\n',
            conflict_handler='resolve')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))
    subparsers = parser.add_subparsers(
        title="commands",
        dest='command',
        description='subcommands',
        help='subcommands',
        metavar='')

    parent_parser = parserCommon.getParentArgParse(binSize=False)
    read_options_parser = parserCommon.read_options()

    # bins mode options
    subparsers.add_parser(
        'bins',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[bamcorrelate_args(case='bins'),
                 parent_parser, read_options_parser,
                 parserCommon.gtf_options(suppress=True)
                 ],
        help="The coverage calculation is done for consecutive bins of equal "
             "size (10 kilobases by default). This mode is useful to assess the "
             "genome-wide similarity of BAM files. The bin size and "
             "distance between bins can be adjusted.",
        add_help=False,
        usage='%(prog)s '
              '--bamfiles file1.bam file2.bam '
              '-o results.npz \n'
              'help: multiBamSummary bins -h / multiBamSummary bins --help\n')

    # BED file arguments
    subparsers.add_parser(
        'BED-file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[bamcorrelate_args(case='BED-file'),
                 parent_parser, read_options_parser,
                 parserCommon.gtf_options()
                 ],
        help="The user provides a BED file that contains all regions "
             "that should be considered for the coverage analysis. A "
             "common use is to compare ChIP-seq coverages between two "
             "different samples for a set of peak regions.",
        # fixed copy-paste typo: this line previously advertised
        # "multiBamSummary bins --help" for the BED-file subcommand
        usage='%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz\n'
              'help: multiBamSummary BED-file -h / multiBamSummary BED-file --help\n',
        add_help=False)

    return parser
+
+
def bamcorrelate_args(case='bins'):
    """Build the argument-parser fragment shared by the multiBamSummary
    subcommands.

    ``case`` is either 'bins' (genome-wide bins; --BED is hidden) or
    'BED-file' (--BED is required; the bin options are hidden).
    Returns an argparse.ArgumentParser meant to be used via parents=[...].
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--bamfiles', '-b',
                          metavar='FILE1 FILE2',
                          help='List of indexed bam files separated by spaces.',
                          nargs='+',
                          required=True)

    required.add_argument('--outFileName', '-out', '-o',
                          # fixed duplicated word ("or or") in the help text
                          help='File name to save the coverage matrix. This matrix '
                               'can be subsequently plotted using plotCorrelation '
                               'or plotPCA.',
                          type=parserCommon.writableFile)

    optional = parser.add_argument_group('Optional arguments')

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")
    optional.add_argument('--labels', '-l',
                          metavar='sample1 sample2',
                          help='User defined labels instead of default labels from '
                               'file names. '
                               'Multiple labels have to be separated by a space, e.g. '
                               '--labels sample1 sample2 sample3',
                          nargs='+')
    optional.add_argument('--smartLabels',
                          action='store_true',
                          help='Instead of manually specifying labels for the input '
                               'BAM files, this causes deepTools to use the file name '
                               'after removing the path and extension.')

    optional.add_argument('--genomeChunkSize',
                          type=int,
                          default=None,
                          help='Manually specify the size of the genome provided to each processor. '
                               'The default value of None specifies that this is determined by read '
                               'density of the BAM file.')

    if case == 'bins':
        # bin-related options are user-visible; --BED is kept but hidden so
        # that downstream code can rely on args.BED always existing
        optional.add_argument('--binSize', '-bs',
                              metavar='INT',
                              help='Length in bases of the window used '
                                   'to sample the genome. (Default: %(default)s)',
                              default=10000,
                              type=int)

        optional.add_argument('--distanceBetweenBins', '-n',
                              metavar='INT',
                              help='By default, multiBamSummary considers consecutive '
                                   'bins of the specified --binSize. However, to '
                                   'reduce the computation time, a larger distance '
                                   'between bins can by given. Larger distances '
                                   'result in fewer bins considered. (Default: %(default)s)',
                              default=0,
                              type=int)

        required.add_argument('--BED',
                              help=argparse.SUPPRESS,
                              default=None)
    else:
        # BED-file mode: the bin options exist only as hidden defaults
        optional.add_argument('--binSize', '-bs',
                              help=argparse.SUPPRESS,
                              default=10000,
                              type=int)

        optional.add_argument('--distanceBetweenBins', '-n',
                              help=argparse.SUPPRESS,
                              metavar='INT',
                              default=0,
                              type=int)

        required.add_argument('--BED',
                              help='Limits the coverage analysis to '
                                   'the regions specified in these files.',
                              metavar='FILE1.bed FILE2.bed',
                              nargs='+',
                              required=True)

    group = parser.add_argument_group('Output optional options')

    group.add_argument('--outRawCounts',
                       help='Save the counts per region to a tab-delimited file.',
                       type=parserCommon.writableFile,
                       metavar='FILE')

    group.add_argument('--scalingFactors',
                       help='Compute scaling factors (in the DESeq2 manner) '
                            'compatible for use with bamCoverage and write them to a '
                            'file. The file has tab-separated columns "sample" and '
                            '"scalingFactor".',
                       type=parserCommon.writableFile,
                       metavar='FILE')

    return parser
+
+
def process_args(args=None):
    """Parse the command line and derive sample labels.

    Exits with the help text when the tool was invoked without arguments,
    and with a non-zero status when the label count does not match the
    number of BAM files.
    """
    args = parse_arguments().parse_args(args)

    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    if args.labels and len(args.bamfiles) != len(args.labels):
        # was print(...) + exit(0): report the error on stderr and return a
        # failure status so shell pipelines can detect it
        sys.exit("The number of labels does not match the number of bam files.")
    if not args.labels:
        if args.smartLabels:
            args.labels = smartLabels(args.bamfiles)
        else:
            # default labels: the file name without its directory part
            args.labels = [os.path.basename(x) for x in args.bamfiles]

    return args
+
+
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. save data for further plotting

    """
    args = process_args(args)

    # args is an argparse.Namespace; 'in' tests attribute presence
    bed_regions = args.BED if 'BED' in args else None

    if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors):
        sys.stderr.write("You've input a single BAM file and not specified "
                         "--outRawCounts or --scalingFactors. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    stepsize = args.binSize + args.distanceBetweenBins
    c = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        numberOfSamples=None,
        genomeChunkSize=args.genomeChunkSize,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        blackListFileName=args.blackListFileName,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength,
        stepSize=stepsize,
        zerosToNans=False,
        out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = c.run(allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non zero bins found.\n"
             "If using --region please check that this "
             "region is covered by reads.\n")

    # numpy will append .npz to the file name if we don't do this...
    # 'with' guarantees the handle is closed even if savez_compressed raises
    if args.outFileName:
        with open(args.outFileName, "wb") as f:
            np.savez_compressed(f,
                                matrix=num_reads_per_bin,
                                labels=args.labels)

    if args.scalingFactors:
        with open(args.scalingFactors, 'w') as f:
            f.write("sample\tscalingFactor\n")
            scalingFactors = countR.estimateSizeFactors(num_reads_per_bin)
            for sample, scalingFactor in zip(args.labels, scalingFactors):
                f.write("{}\t{:6.4f}\n".format(sample, scalingFactor))

    if args.outRawCounts:
        # append to the generated file the
        # labels
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deepTools/source/deeptools/multiBigwigSummary.py b/deepTools/source/deeptools/multiBigwigSummary.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a5bda19abf9bcbee9bb9ec51441d59ea5220a2d
--- /dev/null
+++ b/deepTools/source/deeptools/multiBigwigSummary.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import argparse
+import os.path
+import numpy as np
+from deeptools import parserCommon
+from deeptools.utilities import smartLabels
+import deeptools.getScorePerBigWigBin as score_bw
+from importlib.metadata import version
+
+old_settings = np.seterr(all='ignore')
+
+
def parse_arguments(args=None):
    """Build the top-level multiBigwigSummary argument parser.

    Registers two subcommands: 'bins' (genome-wide, fixed-size bins) and
    'BED-file' (user-supplied regions). Their shared options come from
    multiBigwigSummaryArgs() plus the common parent parsers.
    """
    parser = \
        argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="""

Given typically two or more bigWig files, ``multiBigwigSummary`` computes the average scores for each of the files in every genomic region.
This analysis is performed for the entire genome by running the program in ``bins`` mode, or for certain user selected regions in ``BED-file``
mode. Most commonly, the default output of ``multiBigwigSummary`` (a compressed numpy array, .npz) is used by other tools such as ``plotCorrelation`` or ``plotPCA`` for visualization and diagnostic purposes.

Note that using a single bigWig file is only recommended if you want to produce a bedGraph file (i.e., with the ``--outRawCounts`` option; the default output file cannot be used by ANY deepTools program if only a single file was supplied!).

A detailed sub-commands help is available by typing:

  multiBigwigSummary bins -h

  multiBigwigSummary BED-file -h


""",
            epilog='example usage:\n multiBigwigSummary bins '
                   '-b file1.bw file2.bw -o results.npz\n\n'
                   'multiBigwigSummary BED-file -b file1.bw file2.bw -o results.npz\n'
                   '--BED selection.bed'
                   ' \n\n',
            conflict_handler='resolve')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))
    subparsers = parser.add_subparsers(
        title="commands",
        dest='command',
        metavar='')

    # options shared by both subcommands (processors, region, blacklist, ...)
    parent_parser = parserCommon.getParentArgParse(binSize=False)

    # bins mode options
    subparsers.add_parser(
        'bins',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[
            multiBigwigSummaryArgs(case='bins'),
            parent_parser,
            parserCommon.gtf_options(suppress=True)
        ],
        help="The average score is based on equally sized bins "
             "(10 kilobases by default), which consecutively cover the "
             "entire genome. The only exception is the last bin of a chromosome, which "
             "is often smaller. The output of this mode is commonly used to assess the "
             "overall similarity of different bigWig files.",
        add_help=False,
        usage='multiBigwigSummary bins '
              '-b file1.bw file2.bw '
              '-o results.npz\n'
              'help: multiBigwigSummary bins -h / multiBigwigSummary bins --help\n')

    # BED file arguments
    subparsers.add_parser(
        'BED-file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[
            multiBigwigSummaryArgs(case='BED-file'),
            parent_parser,
            parserCommon.gtf_options()
        ],
        help="The user provides a BED file that contains all regions "
             "that should be considered for the analysis. A "
             "common use is to compare scores (e.g. ChIP-seq scores) between "
             "different samples over a set of pre-defined peak regions.",
        usage='multiBigwigSummary BED-file '
              '-b file1.bw file2.bw '
              '-o results.npz --BED selection.bed\n'
              'help: multiBigwigSummary BED-file -h / multiBigwigSummary BED-file --help\n',
        add_help=False)

    return parser
+
+
def process_args(args=None):
    """Parse the command line for multiBigwigSummary and derive default
    sample labels when none were supplied."""
    args = parse_arguments().parse_args(args)

    # Invoked with no arguments at all: show the help and stop.
    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    if not args.labels:
        if args.smartLabels:
            args.labels = smartLabels(args.bwfiles)
        else:
            # For remote files keep the last path component of the URL,
            # otherwise use the plain file name.
            remote = ("http://", "https://", "ftp://")
            args.labels = [f.split("/")[-1] if f.startswith(remote)
                           else os.path.basename(f)
                           for f in args.bwfiles]

    if len(args.bwfiles) != len(args.labels):
        sys.exit("The number of labels does not match the number of bigWig files.")

    return args
+
+
def multiBigwigSummaryArgs(case='bins'):
    """Build the argument-parser fragment shared by the multiBigwigSummary
    subcommands.

    ``case`` is either 'bins' (genome-wide bins; --BED hidden) or
    'BED-file' (--BED required; bin options hidden). Returned parser is
    meant to be used via parents=[...].
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--bwfiles', '-b',
                          metavar='FILE1 FILE2',
                          help='List of bigWig files, separated by spaces.',
                          nargs='+',
                          required=True)

    required.add_argument('--outFileName', '-out', '-o',
                          help='File name to save the compressed matrix file (npz format) '
                               'needed by the "plotPCA" and "plotCorrelation" tools.',
                          type=parserCommon.writableFile,
                          required=True)

    optional = parser.add_argument_group('Optional arguments')

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")
    optional.add_argument('--labels', '-l',
                          metavar='sample1 sample2',
                          help='User defined labels instead of default labels from '
                               'file names. '
                               'Multiple labels have to be separated by spaces, e.g., '
                               '--labels sample1 sample2 sample3',
                          nargs='+')
    optional.add_argument('--smartLabels',
                          action='store_true',
                          help='Instead of manually specifying labels for the input '
                               'bigWig files, this causes deepTools to use the file name '
                               'after removing the path and extension.')

    optional.add_argument('--chromosomesToSkip',
                          metavar='chr1 chr2',
                          help='List of chromosomes that you do not want to be included. '
                               ' Useful to remove "random" or "extra" chr.',
                          nargs='+')

    if case == 'bins':
        # bin options visible; --BED kept but hidden so args.BED always exists
        optional.add_argument('--binSize', '-bs',
                              metavar='INT',
                              help='Size (in bases) of the windows sampled '
                                   'from the genome. (Default: %(default)s)',
                              default=10000,
                              type=int)

        optional.add_argument('--distanceBetweenBins', '-n',
                              metavar='INT',
                              help='By default, multiBigwigSummary considers adjacent '
                                   'bins of the specified --binSize. However, to '
                                   'reduce the computation time, a larger distance '
                                   'between bins can be given. Larger distances '
                                   'results in fewer considered bins. (Default: %(default)s)',
                              default=0,
                              type=int)

        required.add_argument('--BED',
                              help=argparse.SUPPRESS,
                              default=None)
    else:
        # BED-file mode: bin options exist only as hidden defaults
        optional.add_argument('--binSize', '-bs',
                              help=argparse.SUPPRESS,
                              default=10000,
                              type=int)

        optional.add_argument('--distanceBetweenBins', '-n',
                              help=argparse.SUPPRESS,
                              metavar='INT',
                              default=0,
                              type=int)

        required.add_argument('--BED',
                              help='Limits the analysis to '
                                   'the regions specified in this file.',
                              metavar='file1.bed file2.bed',
                              nargs='+',
                              required=True)

    group = parser.add_argument_group('Output optional options')

    group.add_argument('--outRawCounts',
                       help='Save average scores per region for each bigWig file to a single tab-delimited file.',
                       type=parserCommon.writableFile,
                       metavar='FILE')

    return parser
+
+
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. compute the scores

    """
    args = process_args(args)

    # args is an argparse.Namespace; 'in' tests attribute presence
    bed_regions = args.BED if 'BED' in args else None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts,
        allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non zero bins found.\n"
             "If using --region please check that this "
             "region is covered by reads.\n")

    # 'with' guarantees the handles below are closed even on error; the
    # original closed the raw-counts handle only at the very end and
    # double-closed the npz handle when --outRawCounts was absent
    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f,
                            matrix=num_reads_per_bin,
                            labels=args.labels)

    if args.outRawCounts:
        # append to the generated file the
        # labels
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, "r+") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
diff --git a/deepTools/source/deeptools/parserCommon.py b/deepTools/source/deeptools/parserCommon.py
new file mode 100644
index 0000000000000000000000000000000000000000..9849d9c431c6acf4ad31b8cdd50f3586697a9f56
--- /dev/null
+++ b/deepTools/source/deeptools/parserCommon.py
@@ -0,0 +1,884 @@
+import argparse
+import os
+from importlib.metadata import version
+import multiprocessing
+
+
def check_float_0_1(value):
    """Argparse type checker: parse ``value`` as a float and require it to
    lie within the closed interval [0.0, 1.0]."""
    parsed = float(value)
    if not 0.0 <= parsed <= 1.0:
        raise argparse.ArgumentTypeError("%s is an invalid floating point value. It must be between 0.0 and 1.0" % value)
    return parsed
+
+
def check_list_of_comma_values(value):
    """Argparse checker for a list of 'foo,bar'-style strings.

    ``value`` is expected to be a list of strings (e.g. collected with
    nargs='+'); each element must contain at least one comma. Returns the
    list unchanged, or None when no value was supplied. Raises
    argparse.ArgumentTypeError for any element without a comma.
    """
    if value is None:
        return None
    for foo in value:
        # bug fix: split each element; the original split ``value`` itself,
        # which raises AttributeError for the list input this is meant for
        foo = foo.split(",")
        if len(foo) < 2:
            raise argparse.ArgumentTypeError("%s is an invalid element of a list of comma separated values. "
                                             "Only argument elements of the following form are accepted: 'foo,bar'" % foo)
    return value
+
+
def output(args=None):
    """Return a parser fragment with the common output options
    (--outFileName and --outFileFormat) for use via parents=[...]."""
    parser = argparse.ArgumentParser(add_help=False)
    out_group = parser.add_argument_group('Output')

    out_group.add_argument(
        '--outFileName', '-o',
        help='Output file name.',
        metavar='FILENAME',
        type=writableFile,
        required=True)

    out_group.add_argument(
        '--outFileFormat', '-of',
        help='Output file type. Either "bigwig" or "bedgraph".',
        choices=['bigwig', 'bedgraph'],
        default='bigwig')

    return parser
+
+
def read_options():
    """Common arguments related to BAM files and the interpretation
    of the read coverage

    Returns an argparse.ArgumentParser fragment (add_help=False) intended
    to be shared by several tools via parents=[...].
    """
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Read processing options')

    # nargs='?' + const=True: "-e" alone yields True (estimate fragment
    # length from the data), "-e 200" yields 200, omitting the flag -> False
    group.add_argument('--extendReads', '-e',
                       help='This parameter allows the extension of reads to '
                       'fragment size. If set, each read is '
                       'extended, without exception.\n'
                       '*NOTE*: This feature is generally NOT recommended for '
                       'spliced-read data, such as RNA-seq, as it would '
                       'extend reads over skipped regions.\n'
                       '*Single-end*: Requires a user specified value for the '
                       'final fragment length. Reads that already exceed this '
                       'fragment length will not be extended.\n'
                       '*Paired-end*: Reads with mates are always extended to '
                       'match the fragment size defined by the two read mates. '
                       'Unmated reads, mate reads that map too far apart '
                       '(>4x fragment length) or even map to different '
                       'chromosomes are treated like single-end reads. The input '
                       'of a fragment length value is optional. If '
                       'no value is specified, it is estimated from the '
                       'data (mean of the fragment size of all mate reads).\n',
                       type=int,
                       nargs='?',
                       const=True,
                       default=False,
                       metavar="INT bp")

    group.add_argument('--ignoreDuplicates',
                       help='If set, reads that have the same orientation '
                       'and start position will be considered only '
                       'once. If reads are paired, the mate\'s position '
                       'also has to coincide to ignore a read.',
                       action='store_true'
                       )

    group.add_argument('--minMappingQuality',
                       metavar='INT',
                       help='If set, only reads that have a mapping '
                       'quality score of at least this are '
                       'considered.',
                       type=int,
                       )

    group.add_argument('--centerReads',
                       help='By adding this option, reads are centered with '
                       'respect to the fragment length. For paired-end data, '
                       'the read is centered at the fragment length defined '
                       'by the two ends of the fragment. For single-end data, the '
                       'given fragment length is used. This option is '
                       'useful to get a sharper signal around enriched '
                       'regions.',
                       action='store_true')

    group.add_argument('--samFlagInclude',
                       help='Include reads based on the SAM flag. For example, '
                       'to get only reads that are the first mate, use a flag of 64. '
                       'This is useful to count properly paired reads only once, '
                       'as otherwise the second mate will be also considered for the '
                       'coverage. (Default: %(default)s)',
                       metavar='INT',
                       default=None,
                       type=int,
                       required=False)

    group.add_argument('--samFlagExclude',
                       help='Exclude reads based on the SAM flag. For example, '
                       'to get only reads that map to the forward strand, use '
                       '--samFlagExclude 16, where 16 is the SAM flag for reads '
                       'that map to the reverse strand. (Default: %(default)s)',
                       metavar='INT',
                       default=None,
                       type=int,
                       required=False)

    group.add_argument('--minFragmentLength',
                       help='The minimum fragment length needed for read/pair '
                       'inclusion. This option is primarily useful '
                       'in ATACseq experiments, for filtering mono- or '
                       'di-nucleosome fragments. (Default: %(default)s)',
                       metavar='INT',
                       default=0,
                       type=int,
                       required=False)

    # 0 means "no upper limit" downstream — TODO confirm against callers
    group.add_argument('--maxFragmentLength',
                       help='The maximum fragment length needed for read/pair '
                       'inclusion. (Default: %(default)s)',
                       metavar='INT',
                       default=0,
                       type=int,
                       required=False)

    return parser
+
+
def gtf_options(suppress=False):
    """
    Arguments present whenever a BED/GTF file can be used

    When ``suppress`` is True the very same options are registered but with
    argparse.SUPPRESS as their help text, so a parent parser can share them
    without cluttering --help. Note that ``help`` is assigned once to
    SUPPRESS and only overwritten per-option when suppress is False.
    """
    if suppress:
        parser = argparse.ArgumentParser(add_help=False)
        group = parser
    else:
        parser = argparse.ArgumentParser(add_help=False)
        group = parser.add_argument_group('GTF/BED12 options')

    if suppress:
        help = argparse.SUPPRESS
    else:
        help = 'When either a BED12 or GTF file are used to provide \
regions, perform the computation on the merged exons, \
rather than using the genomic interval defined by the \
5-prime and 3-prime most transcript bound (i.e., columns \
2 and 3 of a BED file). If a BED3 or BED6 file is used \
as input, then columns 2 and 3 are used as an exon. (Default: %(default)s)'

    # stored as args.keepExons
    group.add_argument('--metagene',
                       help=help,
                       action='store_true',
                       dest='keepExons')

    if suppress is False:
        help = 'When a GTF file is used to provide regions, only \
entries with this value as their feature (column 3) \
will be processed as transcripts. (Default: %(default)s)'

    group.add_argument('--transcriptID',
                       help=help,
                       default='transcript')

    if suppress is False:
        help = 'When a GTF file is used to provide regions, only \
entries with this value as their feature (column 3) \
will be processed as exons. CDS would be another common \
value for this. (Default: %(default)s)'

    group.add_argument('--exonID',
                       help=help,
                       default='exon')

    if suppress is False:
        help = 'Each region has an ID (e.g., ACTB) assigned to it, \
which for BED files is either column 4 (if it exists) \
or the interval bounds. For GTF files this is instead \
stored in the last column as a key:value pair (e.g., as \
\'transcript_id "ACTB"\', for a key of transcript_id \
and a value of ACTB). In some cases it can be \
convenient to use a different identifier. To do so, set \
this to the desired key. (Default: %(default)s)'

    group.add_argument('--transcript_id_designator',
                       help=help,
                       default='transcript_id')

    return parser
+
+
def normalization_options():
    """Common arguments related to read coverage normalization

    Returns an argparse.ArgumentParser fragment (add_help=False) intended
    to be shared via parents=[...].
    """
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Read coverage normalization options')

    # required in practice when --normalizeUsing RPGC is selected
    group.add_argument('--effectiveGenomeSize',
                       help='The effective genome size is the portion '
                       'of the genome that is mappable. Large fractions of '
                       'the genome are stretches of NNNN that should be '
                       'discarded. Also, if repetitive regions were not '
                       'included in the mapping of reads, the effective '
                       'genome size needs to be adjusted accordingly. '
                       'A table of values is available here: '
                       'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .',
                       default=None,
                       type=int,
                       required=False)

    group.add_argument('--normalizeUsing',
                       help='Use one of the entered methods to '
                       'normalize the number of reads per bin. By default, no normalization is performed. '
                       'RPKM = Reads Per Kilobase per Million mapped reads; '
                       'CPM = Counts Per Million mapped reads, same as CPM in RNA-seq; '
                       'BPM = Bins Per Million mapped reads, same as TPM in RNA-seq; '
                       'RPGC = reads per genomic content (1x normalization); '
                       'Mapped reads are considered after blacklist filtering (if applied). '
                       'RPKM (per bin) = number of reads per bin / '
                       '(number of mapped reads (in millions) * bin length (kb)). '
                       'CPM (per bin) = number of reads per bin / '
                       'number of mapped reads (in millions). '
                       'BPM (per bin) = number of reads per bin / '
                       'sum of all reads per bin (in millions). '
                       'RPGC (per bin) = number of reads per bin / '
                       'scaling factor for 1x average coverage. '
                       'None = the default and equivalent to not setting this option at all. '
                       'This scaling factor, in turn, is determined from the '
                       'sequencing depth: (total number of mapped reads * fragment length) / '
                       'effective genome size.\nThe scaling factor used '
                       'is the inverse of the sequencing depth computed '
                       'for the sample to match the 1x coverage. This option requires --effectiveGenomeSize. '
                       'Each read is considered independently, '
                       'if you want to only count one mate from a pair in '
                       'paired-end data, then use the --samFlagInclude/--samFlagExclude options. (Default: %(default)s)',
                       choices=['RPKM', 'CPM', 'BPM', 'RPGC', 'None'],
                       default=None,
                       required=False)

    group.add_argument('--exactScaling',
                       help='Instead of computing scaling factors based on a sampling of the reads, '
                       'process all of the reads to determine the exact number that will be used in '
                       'the output. This requires significantly more time to compute, but will '
                       'produce more accurate scaling factors in cases where alignments that are '
                       'being filtered are rare and lumped together. In other words, this is only '
                       'needed when region-based sampling is expected to produce incorrect results.',
                       action='store_true')

    group.add_argument('--ignoreForNormalization', '-ignore',
                       help='A list of space-delimited chromosome names '
                       'containing those chromosomes that should be excluded '
                       'for computing the normalization. This is useful when considering '
                       'samples with unequal coverage across chromosomes, like male '
                       'samples. An usage examples is --ignoreForNormalization chrX chrM.',
                       nargs='+')

    group.add_argument('--skipNonCoveredRegions', '--skipNAs',
                       help='This parameter determines if non-covered regions '
                       '(regions without overlapping reads) in a BAM file should '
                       'be skipped. The default is to treat those regions as having a value of zero. '
                       'The decision to skip non-covered regions '
                       'depends on the interpretation of the data. Non-covered regions '
                       'may represent, for example, repetitive regions that should be skipped.',
                       action='store_true')

    group.add_argument('--smoothLength',
                       metavar="INT bp",
                       help='The smooth length defines a window, larger than '
                       'the binSize, to average the number of reads. For '
                       'example, if the --binSize is set to 20 and the '
                       '--smoothLength is set to 60, then, for each '
                       'bin, the average of the bin and its left and right '
                       'neighbors is considered. Any value smaller than '
                       '--binSize will be ignored and no smoothing will be '
                       'applied.',
                       type=int)

    return parser
+
+
def getParentArgParse(args=None, binSize=True, blackList=True):
    """
    Typical arguments for several tools

    Parameters
    ----------
    args : unused, kept for interface compatibility
    binSize : bool — include the --binSize option when True
    blackList : bool — include the --blackListFileName option when True

    Returns an argparse.ArgumentParser fragment (add_help=False) meant to
    be shared via parents=[...].
    """

    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group('Optional arguments')

    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))

    if binSize:
        optional.add_argument('--binSize', '-bs',
                              help='Size of the bins, in bases, for the output '
                              'of the bigwig/bedgraph file. (Default: %(default)s)',
                              metavar="INT bp",
                              type=int,
                              default=50)

    # genomicRegion normalizes the region string (strips whitespace etc.)
    optional.add_argument('--region', '-r',
                          help='Region of the genome to limit the operation '
                          'to - this is useful when testing parameters to '
                          'reduce the computing time. The format is '
                          'chr:start:end, for example --region chr10 or '
                          '--region chr10:456700:891000.',
                          metavar="CHR:START:END",
                          required=False,
                          type=genomicRegion)

    if blackList:
        optional.add_argument('--blackListFileName', '-bl',
                              help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
                              metavar="BED file",
                              nargs="+",
                              required=False)

    # numberOfProcessors also accepts the strings "max" and "max/2"
    optional.add_argument('--numberOfProcessors', '-p',
                          help='Number of processors to use. Type "max/2" to '
                          'use half the maximum number of processors or "max" '
                          'to use all available processors. (Default: %(default)s)',
                          metavar="INT",
                          type=numberOfProcessors,
                          default=1,
                          required=False)

    optional.add_argument('--verbose', '-v',
                          help='Set to see processing messages.',
                          action='store_true')

    return parser
+
+
def numberOfProcessors(string):
    """Argparse type: turn "max", "max/2" or an integer string into a
    processor count, capped at the number of usable CPUs."""
    try:
        # won't work on macOS or windows
        # limit threads to what is available (e.g. grid submissions, issue #1199)
        availProc = len(os.sched_getaffinity(0))
    except AttributeError:
        availProc = multiprocessing.cpu_count()

    if string == "max/2":  # default case
        # by default half of the available processors are used
        requested = int(availProc * 0.5)
    elif string == "max":
        # use all available processors
        requested = availProc
    else:
        try:
            requested = int(string)
        except ValueError:
            raise argparse.ArgumentTypeError(
                "{} is not a valid number of processors".format(string))

        except Exception as e:
            raise argparse.ArgumentTypeError("the given value {} is not valid. "
                                             "Error message: {}\nThe number of "
                                             "available processors in your "
                                             "computer is {}.".format(string, e, availProc))

    # never hand out more workers than the machine can actually run
    return min(requested, availProc)
+
+
def genomicRegion(string):
    """
    argparse type callable that normalizes a genomic region string.

    Whitespace and decoration characters (",;|!{}()") are removed and a
    'chr:start-end' style range is rewritten as 'chr:start:end'. An empty
    string maps to None; a string containing nothing but decoration
    characters raises argparse.ArgumentTypeError.
    """
    # remove whitespaces using split,join trick
    region = ''.join(string.split())
    if region == '':
        return None
    # remove undesired characters that may be present and replace - by :
    # so that e.g. chr1:100-200 becomes chr1:100:200.
    # NOTE: the old Python-2 translate(None, ...) call chained the
    # replace(); the Python-3 branch used to drop it by mistake.
    region = region.translate({ord(c): None for c in ",;|!{}()"}).replace("-", ":")
    if len(region) == 0:
        raise argparse.ArgumentTypeError(
            "{} is not a valid region".format(string))
    return region
+
+
def writableFile(string):
    """
    argparse type callable that tests whether `string` is a writable path.

    Returns the path unchanged if it can be opened for writing, otherwise
    raises argparse.ArgumentTypeError. Unlike a plain open(..., 'w') probe,
    an already-existing file is neither truncated nor deleted by the check.
    """
    existed = os.path.exists(string)
    try:
        # append mode creates the file if needed but keeps existing content
        open(string, 'a').close()
    except (OSError, TypeError):
        msg = "{} file can't be opened for writing".format(string)
        raise argparse.ArgumentTypeError(msg)
    if not existed:
        # remove the file that was created purely for this check
        os.remove(string)
    return string
+
+
+"""
+Arguments used by heatmapper and profiler
+"""
+
+
def heatmapperMatrixArgs(args=None):
    """
    Build the parser fragment holding the arguments shared by plotHeatmap
    and plotProfile that describe the input matrix and the output image.
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')
    required.add_argument('--matrixFile', '-m',
                          help='Matrix file from the computeMatrix tool.',
                          type=argparse.FileType('r'),
                          # this option lives in the "Required arguments"
                          # group and the tools cannot run without it, so
                          # mark it required like --outFileName below
                          required=True)

    required.add_argument('--outFileName', '-out', '-o',
                          help='File name to save the image to. The file '
                          'ending will be used to determine the image '
                          'format. The available options are: "png", '
                          '"eps", "pdf" and "svg", e.g., MyHeatmap.png.',
                          type=writableFile,
                          required=True)
    return parser
+
+
def heatmapperOutputArgs(args=None,
                         mode=['heatmap', 'profile'][0]):
    """
    Build the 'Output options' argument group shared by plotHeatmap
    (mode='heatmap') and plotProfile (mode='profile').
    """
    parser = argparse.ArgumentParser(add_help=False)
    output = parser.add_argument_group('Output options')

    output.add_argument(
        '--outFileSortedRegions',
        help='File name into which the regions are saved '
        'after skipping zeros or min/max threshold values. The '
        'order of the regions in the file follows the sorting '
        'order selected. This is useful, for example, to '
        'generate other heatmaps while keeping the sorting of the '
        'first heatmap. Example: Heatmap1sortedRegions.bed',
        metavar='FILE',
        type=argparse.FileType('w'))

    if mode == 'heatmap':
        output.add_argument('--outFileNameMatrix',
                            help='If this option is given, then the matrix '
                            'of values underlying the heatmap will be saved '
                            'using this name, e.g. MyMatrix.gz.',
                            metavar='FILE',
                            type=writableFile)

        output.add_argument('--interpolationMethod',
                            help='If the heatmap image contains a large number of columns '
                            'it is usually better to use an interpolation method to produce '
                            'better results (see '
                            'https://matplotlib.org/examples/images_contours_and_fields/interpolation_methods.html). '
                            'By default, plotHeatmap uses the method `nearest` if the number of columns is 1000 or '
                            'less. Otherwise it uses the bilinear method. This default behaviour can be changed by '
                            'using any of the following options: "nearest", "bilinear", "bicubic", '
                            '"gaussian"',
                            choices=['auto', 'nearest', 'bilinear', 'bicubic', 'gaussian'],
                            metavar='STR',
                            default='auto')
    elif mode == 'profile':
        output.add_argument('--outFileNameData',
                            help='File name to save the data '
                            'underlying the average profile, e.g. '
                            'myProfile.tab.',
                            type=writableFile)

    # --dpi applies to both modes
    output.add_argument(
        '--dpi',
        help='Set the DPI to save the figure.',
        type=int,
        default=200)

    return parser
+
+
def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]):
    """
    Build the 'Clustering arguments' and 'Optional arguments' groups shared
    by plotHeatmap (mode='heatmap') and plotProfile (mode='profile').
    """
    parser = argparse.ArgumentParser(add_help=False)
    cluster = parser.add_argument_group('Clustering arguments')
    cluster.add_argument(
        '--kmeans',
        help='Number of clusters to compute. When this '
        'option is set, the matrix is split into clusters '
        'using the k-means algorithm. Only works for data that '
        'is not grouped, otherwise only the first group will '
        'be clustered. If more specific clustering methods '
        'are required, then save the underlying matrix '
        'and run the clustering using other software. The plotting '
        'of the clustering may fail with an error if a '
        'cluster has very few members compared to the total number '
        'of regions.',
        type=int)
    cluster.add_argument(
        '--hclust',
        help='Number of clusters to compute. When this '
        'option is set, then the matrix is split into clusters '
        'using the hierarchical clustering algorithm, using "ward linkage". '
        'Only works for data that is not grouped, otherwise only the first '
        'group will be clustered. --hclust could be very slow if you have '
        '>1000 regions. In those cases, you might prefer --kmeans or if more '
        'clustering methods are required you can save the underlying matrix and run '
        'the clustering using other software. The plotting of the clustering may '
        'fail with an error if a cluster has very few members compared to the '
        'total number of regions.',
        type=int)
    cluster.add_argument(
        '--silhouette',
        help='Compute the silhouette score for regions. This is only'
        ' applicable if clustering has been performed. The silhouette score'
        ' is a measure of how similar a region is to other regions in the'
        ' same cluster as opposed to those in other clusters. It will be reported'
        ' in the final column of the BED file with regions. The '
        'silhouette evaluation can be very slow when you have more '
        'than 100 000 regions.',
        action='store_true'
    )

    optional = parser.add_argument_group('Optional arguments')

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")
    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))
    if mode == 'profile':
        optional.add_argument(
            '--averageType',
            default='mean',
            choices=["mean", "median", "min", "max", "std", "sum"],
            help='The type of statistic that should be used for the '
            'profile. The options are: "mean", "median", "min", "max", '
            '"sum" and "std".')

        optional.add_argument('--plotHeight',
                              help='Plot height in cm.',
                              type=float,
                              default=7)

        optional.add_argument('--plotWidth',
                              help='Plot width in cm. The minimum value is 1 cm.',
                              type=float,
                              default=11)

        optional.add_argument(
            '--plotType',
            help='"lines" will plot the profile line based '
            'on the average type selected. "fill" '
            'fills the region between zero and the profile '
            'curve. The fill in color is semi transparent to '
            'distinguish different profiles. "se" and "std" '
            'color the region between the profile and the '
            'standard error or standard deviation of the data. '
            'As in the case of '
            'fill, a semi-transparent color is used. '
            '"overlapped_lines" plots each region\'s value, one on '
            'top of the other. "heatmap" plots a '
            'summary heatmap.',
            choices=['lines', 'fill', 'se', 'std', 'overlapped_lines', 'heatmap'],
            default='lines')

        optional.add_argument('--colors',
                              help='List of colors to use '
                              'for the plotted lines (N.B., not applicable to \'--plotType overlapped_lines\'). Color names '
                              'and html hex strings (e.g., #eeff22) '
                              'are accepted. The color names should '
                              'be space separated. For example, '
                              '--colors red blue green ',
                              nargs='+')

        optional.add_argument('--numPlotsPerRow',
                              help='Number of plots per row',
                              type=int,
                              default=8)

        optional.add_argument('--clusterUsingSamples',
                              help='List of sample numbers (order as in '
                              'matrix), that are used for clustering by '
                              '--kmeans or --hclust if not given, all samples '
                              'are taken into account for clustering. '
                              'Example: --clusterUsingSamples 1 3',
                              type=int, nargs='+')

    elif mode == 'heatmap':
        optional.add_argument(
            '--plotType',
            help='"lines" will plot the profile line based '
            'on the average type selected. "fill" '
            'fills the region between zero and the profile '
            'curve. The fill in color is semi transparent to '
            'distinguish different profiles. "se" and "std" '
            'color the region between the profile and the '
            'standard error or standard deviation of the data.',
            choices=['lines', 'fill', 'se', 'std'],
            default='lines')
        optional.add_argument('--sortRegions',
                              help='Whether the heatmap should present '
                              'the regions sorted. The default is '
                              'to sort in descending order based on '
                              'the mean value per region. Note that "keep" and "no" are the same thing.',
                              choices=["descend", "ascend", "no", "keep"],
                              default='descend')

        optional.add_argument('--sortUsing',
                              help='Indicate which method should be used for '
                              'sorting. For each row the method is computed. '
                              'For region_length, a dashed line is drawn at '
                              'the end of the region (reference point TSS and '
                              'center) or the beginning of the region '
                              '(reference point TES) as appropriate.',
                              choices=["mean", "median", "max", "min", "sum",
                                       "region_length"],
                              default='mean')

        optional.add_argument('--sortUsingSamples',
                              help='List of sample numbers (order as in matrix), '
                              'which are used by --sortUsing for sorting. '
                              'If no value is set, it uses all samples. '
                              'Example: --sortUsingSamples 1 3',
                              type=int, nargs='+')

        optional.add_argument('--linesAtTickMarks',
                              help='Draw dashed lines from all tick marks through the heatmap. '
                              'This is then similar to the dashed line draw at region bounds '
                              'when using a reference point and --sortUsing region_length',
                              action='store_true')

        optional.add_argument('--clusterUsingSamples',
                              help='List of sample numbers (order as in '
                              'matrix), that are used for clustering by '
                              '--kmeans or --hclust if not given, all samples '
                              'are taken into account for clustering. '
                              'Example: --clusterUsingSamples 1 3',
                              type=int, nargs='+')

        optional.add_argument(
            '--averageTypeSummaryPlot',
            default='mean',
            choices=["mean", "median", "min",
                     "max", "std", "sum"],
            help='Define the type of statistic that should be plotted in the '
            'summary image above the heatmap. The options are: "mean", '
            '"median", "min", "max", "sum" and "std".')

        optional.add_argument(
            '--missingDataColor',
            default='black',
            help='If --missingDataAsZero was not set, such cases '
            'will be colored in black by default. Using this '
            'parameter, a different color can be set. A value '
            'between 0 and 1 will be used for a gray scale '
            '(black is 0). For a list of possible color '
            'names see: http://packages.python.org/ete2/'
            'reference/reference_svgcolors.html. '
            'Other colors can be specified using the #rrggbb '
            'notation.')

        # imported lazily so that plotProfile does not pay the cost
        import matplotlib.pyplot as plt
        color_options = "', '".join([x for x in plt.colormaps() if not x.endswith('_r')])

        optional.add_argument(
            '--colorMap',
            help='Color map to use for the heatmap. If more than one heatmap is being plotted the color '
            'of each heatmap can be entered individually (e.g. `--colorMap Reds Blues`). Color maps '
            'are recycled if the number of color maps is smaller than the number of heatmaps being '
            'plotted. Available values can be seen here: http://matplotlib.org/users/colormaps.html '
            'The available options are: \'' + color_options + '\'',
            default=['RdYlBu'],
            nargs='+')

        optional.add_argument(
            '--alpha',
            default=1.0,
            type=check_float_0_1,
            help='The alpha channel (transparency) to use for the heatmaps. The default is 1.0 and values '
            'must be between 0 and 1.')

        optional.add_argument(
            '--colorList',
            help='List of colors to use to create a colormap. For example, if `--colorList black,yellow,blue` '
            'is set (colors separated by commas) then a color map that starts with black, continues to '
            'yellow and finishes in blue is created. If this option is selected, it overrides the --colorMap '
            'chosen. The list of valid color names can be seen here: '
            'http://matplotlib.org/examples/color/named_colors.html '
            'Hex colors are valid (e.g #34a2b1). If individual colors for different heatmaps '
            'need to be specified they need to be separated by space as for example: '
            '`--colorList "white,#cccccc" "white,darkred"` '
            'As for --colorMap, the color lists are recycled if their number is smaller than the number of '
            'plotted heatmaps. '
            'The number of transitions is defined by the --colorNumber option.',
            type=check_list_of_comma_values,
            nargs='+')

        optional.add_argument(
            '--colorNumber',
            help='N.B., --colorList is required for an effect. This controls the '
            'number of transitions from one color to the other. If --colorNumber is '
            'the number of colors in --colorList then there will be no transitions '
            'between the colors.',
            type=int,
            default=256)

        optional.add_argument('--zMin', '-min',
                              default=None,
                              help='Minimum value for the heatmap intensities. Multiple values, separated by '
                              'spaces can be set for each heatmap. If the number of zMin values is smaller than '
                              'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set '
                              'to the first percentile of the matrix values.',
                              type=str,
                              nargs='+')
        optional.add_argument('--zMax', '-max',
                              default=None,
                              help='Maximum value for the heatmap intensities. Multiple values, separated by '
                              'spaces can be set for each heatmap. If the number of zMax values is smaller than '
                              'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set '
                              'to the 98th percentile of the matrix values.',
                              type=str,
                              nargs='+')
        optional.add_argument('--heatmapHeight',
                              help='Plot height in cm. The default for the heatmap '
                              'height is 28. The minimum value is '
                              '3 and the maximum is 100.',
                              type=float,
                              default=28)

        optional.add_argument('--heatmapWidth',
                              help='Plot width in cm. The default value is 4 '
                              'The minimum value is 1 and the '
                              'maximum is 100.',
                              type=float,
                              default=4)
        optional.add_argument(
            '--whatToShow',
            help='The default is to include a summary or profile plot on top '
            'of the heatmap and a heatmap colorbar. Other options are: '
            '"plot and heatmap", "heatmap only", "heatmap and '
            'colorbar", and the default "plot, heatmap and '
            'colorbar".',
            choices=["plot, heatmap and colorbar",
                     "plot and heatmap", "heatmap only",
                     "heatmap and colorbar"],
            default='plot, heatmap and colorbar')

        optional.add_argument(
            '--boxAroundHeatmaps',
            help='By default black boxes are plotted around heatmaps. This can be turned off '
            'by setting --boxAroundHeatmaps no',
            default='yes')

        optional.add_argument('--xAxisLabel', '-x',
                              default='gene distance (bp)',
                              help='Description for the x-axis label.')

    # end elif -- arguments below apply to both modes
    optional.add_argument('--startLabel',
                          default='TSS',
                          help='[only for scale-regions mode] Label shown '
                          'in the plot for the start of '
                          'the region. Default is TSS (transcription '
                          'start site), but could be changed to anything, '
                          'e.g. "peak start". '
                          'Same for the --endLabel option. See below.')
    optional.add_argument('--endLabel',
                          default='TES',
                          help='[only for scale-regions mode] Label '
                          'shown in the plot for the region '
                          'end. Default is TES (transcription end site).')
    optional.add_argument('--refPointLabel',
                          help='[only for reference-point mode] Label '
                          'shown in the plot for the '
                          'reference-point. Default '
                          'is the same as the reference point selected '
                          '(e.g. TSS), but could be anything, e.g. '
                          '"peak start".',
                          default=None)

    optional.add_argument('--labelRotation',
                          dest='label_rotation',
                          help='Rotation of the X-axis labels in degrees. The default is 0, positive values denote a counter-clockwise rotation.',
                          type=float,
                          default=0.0)

    # hidden legacy option
    optional.add_argument('--nanAfterEnd',
                          help=argparse.SUPPRESS,
                          default=False)

    optional.add_argument('--regionsLabel', '-z',
                          help='Labels for the regions plotted in the '
                          'heatmap. If more than one region is being '
                          'plotted, a list of labels separated by spaces is required. '
                          'If a label itself contains a space, then quotes are '
                          'needed. For example, --regionsLabel label_1 "label 2". ',
                          nargs='+')

    optional.add_argument('--samplesLabel',
                          help='Labels for the samples plotted. The '
                          'default is to use the file name of the '
                          'sample. The sample labels should be separated '
                          'by spaces and quoted if a label itself '
                          'contains a space E.g. --samplesLabel label-1 "label 2" ',
                          nargs='+')

    optional.add_argument('--plotTitle', '-T',
                          help='Title of the plot, to be printed on top of '
                          'the generated image. Leave blank for no title.',
                          default='')

    optional.add_argument('--yAxisLabel', '-y',
                          default='',
                          help='Y-axis label for the top panel.')

    optional.add_argument('--yMin',
                          default=None,
                          nargs='+',
                          help='Minimum value for the Y-axis. Multiple values, separated by '
                          'spaces can be set for each profile. If the number of yMin values is smaller than '
                          'the number of plots, the values are recycled.')
    optional.add_argument('--yMax',
                          default=None,
                          nargs='+',
                          help='Maximum value for the Y-axis. Multiple values, separated by '
                          'spaces can be set for each profile. If the number of yMax values is smaller than '
                          'the number of plots, the values are recycled.')

    optional.add_argument('--legendLocation',
                          default='best',
                          choices=['best',
                                   'upper-right',
                                   'upper-left',
                                   'upper-center',
                                   'lower-left',
                                   'lower-right',
                                   'lower-center',
                                   'center',
                                   'center-left',
                                   'center-right',
                                   'none'
                                   ],
                          help='Location for the legend in the summary plot. '
                          'Note that "none" does not work for the profiler.')

    optional.add_argument('--perGroup',
                          help='The default is to plot all groups of regions by '
                          'sample. Using this option instead plots all samples by '
                          'group of regions. Note that this is only useful if you '
                          'have multiple groups of regions.',
                          action='store_true')

    optional.add_argument('--plotFileFormat',
                          metavar='',
                          help='Image format type. If given, this '
                          'option overrides the '
                          'image format based on the plotFile ending. '
                          'The available options are: "png", '
                          '"eps", "pdf", "plotly" and "svg"',
                          choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    optional.add_argument('--verbose',
                          help='If set, warning messages and '
                          'additional information are given.',
                          action='store_true')
    return parser
+
+
def requiredLength(minL, maxL):
    """
    This is an optional action that can be given to argparse.add_argument(..., nargs='+')
    to allow a specified numeric range of arguments (e.g., "only 1 or 2 arguments").

    minL and maxL are the minimum and maximum length (inclusive).
    """
    # https://stackoverflow.com/questions/4194948/python-argparse-is-there-a-way-to-specify-a-range-in-nargs
    class RequiredLength(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if not minL <= len(values) <= maxL:
                msg = 'argument "{}" requires between {} and {} arguments'.format(self.dest, minL, maxL)
                # argparse only converts ArgumentTypeError raised by *type*
                # callables into clean error messages; raised from an Action
                # it surfaces as a traceback, so report via the parser.
                parser.error(msg)
            setattr(args, self.dest, values)
    return RequiredLength
diff --git a/deepTools/source/deeptools/plotCorrelation.py b/deepTools/source/deeptools/plotCorrelation.py
new file mode 100644
index 0000000000000000000000000000000000000000..988cf559e249de492a6df925f2d0f36cffae9c59
--- /dev/null
+++ b/deepTools/source/deeptools/plotCorrelation.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import argparse
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+from importlib.metadata import version
+from deeptools.correlation import Correlation
+from deeptools.parserCommon import writableFile
+
+old_settings = np.seterr(all='ignore')
+
+
def parse_arguments(args=None):
    """Assemble the complete plotCorrelation command-line parser."""
    parent_parsers = [plot_correlation_args(),
                      heatmap_options(),
                      scatterplot_options()]

    description = """
Tool for the analysis and visualization of sample correlations based on the output of multiBamSummary or
multiBigwigSummary. Pearson or Spearman methods are available to compute correlation
coefficients. Results can be saved as multiple
scatter plots depicting the pairwise correlations or as a clustered heatmap,
where the colors represent the correlation coefficients and the clusters are constructed using complete linkage.
Optionally, the values can be saved as tables, too.


detailed help:

  plotCorrelation -h

"""

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
        epilog='example usages:\n'
               'plotCorrelation -in results_file --whatToPlot heatmap --corMethod pearson -o heatmap.png\n\n'
               ' \n\n',
        parents=parent_parsers,
        usage='plotCorrelation -in matrix.gz -c spearman -p heatmap -o plot.png\n'
              'help: plotCorrelation -h / plotCorrelation --help\n')

    return parser
+
+
def plot_correlation_args():
    """
    Build the required/optional argument groups for plotCorrelation.
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--corData', '-in',
                          metavar='FILE',
                          help='Compressed matrix of values generated by multiBigwigSummary or multiBamSummary',
                          required=True)

    required.add_argument('--corMethod', '-c',
                          help="Correlation method.",
                          choices=['spearman', 'pearson'],
                          required=True)

    required.add_argument('--whatToPlot', '-p',
                          help="Choose between a heatmap or pairwise scatter plots",
                          choices=['heatmap', 'scatterplot'],
                          required=True)

    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--plotFile', '-o',
                          help='File to save the heatmap to. The file extension determines the format, '
                          'so heatmap.pdf will save the heatmap in PDF format. '
                          'The available formats are: .png, '
                          '.eps, .pdf and .svg.',
                          type=writableFile,
                          metavar='FILE')

    optional.add_argument('--skipZeros',
                          help='By setting this option, genomic regions '
                          'that have zero or missing (nan) values in all samples '
                          'are excluded.',
                          action='store_true',
                          required=False)

    optional.add_argument('--labels', '-l',
                          metavar='sample1 sample2',
                          help='User defined labels instead of default labels from '
                          'file names. '
                          'Multiple labels have to be separated by spaces, e.g. '
                          '--labels sample1 sample2 sample3',
                          nargs='+')

    optional.add_argument('--plotTitle', '-T',
                          help='Title of the plot, to be printed on top of '
                          'the generated image. Leave blank for no title. (Default: %(default)s)',
                          default='')

    optional.add_argument('--plotFileFormat',
                          metavar='FILETYPE',
                          help='Image format type. If given, this option '
                          'overrides the image format based on the plotFile '
                          'ending. The available options are: png, '
                          'eps, pdf and svg.',
                          choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    optional.add_argument(
        '--removeOutliers',
        help='If set, bins with very large counts are removed. '
        'Bins with abnormally high read counts artificially increase '
        'pearson correlation; that\'s why, multiBamSummary tries '
        'to remove outliers using the median absolute deviation (MAD) '
        'method applying a threshold of 200 to only consider extremely '
        'large deviations from the median. The ENCODE blacklist page '
        '(https://sites.google.com/site/anshulkundaje/projects/blacklists) '
        'contains useful information about regions with unusually high counts '
        'that may be worth removing.',
        action='store_true')

    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))

    group = parser.add_argument_group('Output optional options')

    group.add_argument('--outFileCorMatrix',
                       help='Save matrix with pairwise correlation values to a tab-separated file.',
                       metavar='FILE',
                       type=writableFile)

    return parser
+
+
def scatterplot_options():
    """
    Build the argument group with options that only apply to the pairwise
    scatter-plot output of plotCorrelation.
    """
    parser = argparse.ArgumentParser(add_help=False)
    grp = parser.add_argument_group('Scatter plot options')

    # the x/y range options only differ in the axis they mention
    range_help = 'The {} axis range. The default scales these such that the full range of dots is displayed.'
    grp.add_argument('--xRange', nargs=2, type=int, default=None,
                     help=range_help.format('X'))
    grp.add_argument('--yRange', nargs=2, type=int, default=None,
                     help=range_help.format('Y'))

    grp.add_argument('--log1p', action='store_true',
                     help='Plot the natural log of the scatter plot after adding 1. '
                          'Note that this is ONLY for plotting, the correlation is unaffected.')

    return parser
+
+
def heatmap_options():
    """
    Options for generating the correlation heatmap
    """
    parser = argparse.ArgumentParser(add_help=False)
    heatmap = parser.add_argument_group('Heatmap options')

    heatmap.add_argument('--plotHeight',
                         help='Plot height in cm. (Default: %(default)s)',
                         type=float,
                         default=9.5)

    heatmap.add_argument('--plotWidth',
                         help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)',
                         type=float,
                         default=11)

    heatmap.add_argument('--zMin', '-min',
                         default=None,
                         help='Minimum value for the heatmap intensities. '
                         'If not specified, the value is set automatically',
                         type=float)

    heatmap.add_argument('--zMax', '-max',
                         default=None,
                         help='Maximum value for the heatmap intensities. '
                         'If not specified, the value is set automatically',
                         type=float)

    heatmap.add_argument(
        '--colorMap', default='jet',
        metavar='',
        help='Color map to use for the heatmap. Available values can be '
        'seen here: '
        'http://matplotlib.org/examples/color/colormaps_reference.html')

    heatmap.add_argument('--plotNumbers',
                         help='If set, then the correlation number is plotted '
                         'on top of the heatmap. This option is only valid when plotting a heatmap.',
                         action='store_true',
                         required=False)

    return parser
+
+
def main(args=None):
    """
    plotCorrelation entry point: parse the command line, compute the
    correlation and write the requested plot and/or matrix outputs.
    """
    args = parse_arguments().parse_args(args)

    if args.plotFile is None and args.outFileCorMatrix is None:
        sys.exit("At least one of --plotFile and --outFileCorMatrix must be specified!\n")

    corr = Correlation(args.corData,
                       args.corMethod,
                       labels=args.labels,
                       remove_outliers=args.removeOutliers,
                       skip_zeros=args.skipZeros)

    if args.corMethod == 'pearson':
        # test if there are outliers and write a message recommending the removal
        if len(corr.get_outlier_indices(np.asarray(corr.matrix).flatten())) > 0:
            if args.removeOutliers:
                sys.stderr.write("\nOutliers were detected in the data. They "
                                 "will be removed to avoid bias "
                                 "in the pearson correlation.\n")
            else:
                sys.stderr.write("\nOutliers were detected in the data. Consider "
                                 "using the --removeOutliers parameter to avoid a bias "
                                 "in the pearson correlation.\n")

    if args.colorMap:
        try:
            # validate the colormap before doing any expensive plotting
            plt.get_cmap(args.colorMap)
        except ValueError as error:
            sys.stderr.write(
                "A problem was found. Message: {}\n".format(error))
            # exit with a non-zero status so callers can detect the failure
            # (a bare exit() would report success and needs the site module)
            sys.exit(1)

    if args.plotFile is not None:
        if args.whatToPlot == 'scatterplot':
            corr.plot_scatter(args.plotFile,
                              plot_title=args.plotTitle,
                              image_format=args.plotFileFormat,
                              xRange=args.xRange,
                              yRange=args.yRange,
                              log1p=args.log1p)
        else:
            corr.plot_correlation(args.plotFile,
                                  vmax=args.zMax,
                                  vmin=args.zMin,
                                  colormap=args.colorMap,
                                  plot_title=args.plotTitle,
                                  image_format=args.plotFileFormat,
                                  plot_numbers=args.plotNumbers,
                                  plotWidth=args.plotWidth,
                                  plotHeight=args.plotHeight)

    if args.outFileCorMatrix:
        # context manager guarantees the handle is closed even on error
        with open(args.outFileCorMatrix, "w") as o:
            o.write("#plotCorrelation --outFileCorMatrix\n")
            corr.save_corr_matrix(o)
diff --git a/deepTools/source/deeptools/plotCoverage.py b/deepTools/source/deeptools/plotCoverage.py
new file mode 100644
index 0000000000000000000000000000000000000000..464375c7a3ba6d406d39e05c05ad601d6b3e7770
--- /dev/null
+++ b/deepTools/source/deeptools/plotCoverage.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import argparse
+import numpy as np
+
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+import plotly.offline as py
+import plotly.graph_objs as go
+from importlib.metadata import version
+import deeptools.countReadsPerBin as countR
+from deeptools import parserCommon
+from deeptools.utilities import smartLabels
+
+old_settings = np.seterr(all='ignore')
+
+
+def parse_arguments(args=None):
+ parent_parser = parserCommon.getParentArgParse(binSize=False)
+ read_options_parser = parserCommon.read_options()
+
+ parser = \
+ argparse.ArgumentParser(
+ parents=[required_args(), parent_parser, read_options_parser],
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ add_help=False,
+ description="""
+
+This tool is useful to assess the sequencing depth of a given sample.
+It samples 1 million bp, counts the number of overlapping reads and can report
+a histogram that tells you how many bases are covered how many times.
+Multiple BAM files are accepted, but they all should correspond to the same genome assembly.
+
+detailed usage help:
+ $ plotCoverage -h
+
+""",
+ epilog='example usages:\nplotCoverage '
+ '--bamfiles file1.bam file2.bam -o results.png\n\n'
+ ' \n\n',
+ conflict_handler='resolve',
+ usage='plotCoverage -b sample1.bam sample2.bam -o coverage.png \n'
+ 'help: plotCoverage -h / plotCoverage --help\n')
+
+ parser.add_argument('--version', action='version',
+ version='%(prog)s {}'.format(version('deeptools')))
+
+ return parser
+
+
+def process_args(args=None):
+ args = parse_arguments().parse_args(args)
+
+ if not args.labels:
+ if args.smartLabels:
+ args.labels = smartLabels(args.bamfiles)
+ else:
+ args.labels = [os.path.basename(x) for x in args.bamfiles]
+ if args.labels and len(args.bamfiles) != len(args.labels):
+ sys.exit("The number of labels does not match the number of BAM files.")
+
+ return args
+
+
+def required_args():
+ parser = argparse.ArgumentParser(add_help=False)
+ required = parser.add_argument_group('Required arguments')
+
+ required.add_argument('--bamfiles', '-b',
+ metavar='FILE1 FILE2',
+ help='List of indexed BAM files separated by spaces.',
+ nargs='+',
+ required=True)
+
+ optional = parser.add_argument_group('Optional arguments')
+
+ optional.add_argument("--help", "-h", action="help",
+ help="show this help message and exit")
+
+ optional.add_argument('--plotFile', '-o',
+ type=parserCommon.writableFile,
+ help='File name to save the plot to.')
+
+ optional.add_argument('--labels', '-l',
+ metavar='sample1 sample2',
+ help='User defined labels instead of default labels from '
+ 'file names. '
+ 'Multiple labels have to be separated by spaces, e.g. '
+ '--labels sample1 sample2 sample3',
+ nargs='+')
+
+ optional.add_argument('--smartLabels',
+ action='store_true',
+ help='Instead of manually specifying labels for the input '
+ 'BAM files, this causes deepTools to use the file name '
+ 'after removing the path and extension.')
+
+ optional.add_argument('--plotTitle', '-T',
+ help='Title of the plot, to be printed on top of '
+ 'the generated image. Leave blank for no title. (Default: %(default)s)',
+ default='')
+
+ optional.add_argument('--skipZeros',
+ help='By setting this option, genomic regions '
+ 'that have zero or nan values in _all_ samples '
+ 'are excluded.',
+ action='store_true',
+ required=False)
+
+ optional.add_argument('--numberOfSamples', '-n',
+ help='Number of 1 bp regions to sample. (Default: %(default)s)',
+ required=False,
+ type=int,
+ default=1000000)
+
+ optional.add_argument('--BED',
+ help='Limits the coverage analysis to '
+ 'the regions specified in these files. This overrides --numberOfSamples. '
+ 'Due to memory requirements, it is not advisable to combine this with '
+ '--outRawCounts or many tens of thousands of regions, as per-base '
+ 'coverage is used!',
+ metavar='FILE1.bed FILE2.bed',
+ nargs='+')
+
+ optional.add_argument('--outRawCounts',
+ help='Save raw counts (coverages) to file.',
+ type=parserCommon.writableFile,
+ metavar='FILE')
+
+ optional.add_argument('--outCoverageMetrics',
+ help='Save percentage of bins/regions above the specified thresholds to '
+ 'the specified file. The coverage thresholds are specified by '
+ '--coverageThresholds. If no coverage thresholds are specified, the file '
+ 'will be empty.',
+ type=parserCommon.writableFile,
+ metavar='FILE')
+
+ optional.add_argument('--coverageThresholds', '-ct',
+ type=int,
+ action="append",
+ help='The percentage of reported bins/regions with signal at least as '
+ 'high as the given threshold. This can be specified multiple times.')
+
+ optional.add_argument('--plotHeight',
+ help='Plot height in cm. (Default: %(default)s)',
+ type=float,
+ default=5.0)
+
+ optional.add_argument('--plotWidth',
+ help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)',
+ type=float,
+ default=15.0)
+
+ optional.add_argument('--plotFileFormat',
+ metavar='FILETYPE',
+ help='Image format type. If given, this option '
+ 'overrides the image format based on the plotFile '
+ 'ending. The available options are: png, '
+ 'eps, pdf, svg and plotly.',
+ default=None,
+ choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
+
+ return parser
+
+
+def main(args=None):
+ args = process_args(args)
+
+ if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
+ sys.exit("At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n")
+
+ if 'BED' in args:
+ bed_regions = args.BED
+ else:
+ bed_regions = None
+
+ cr = countR.CountReadsPerBin(args.bamfiles,
+ binLength=1,
+ bedFile=bed_regions,
+ numberOfSamples=args.numberOfSamples,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose,
+ region=args.region,
+ blackListFileName=args.blackListFileName,
+ extendReads=args.extendReads,
+ minMappingQuality=args.minMappingQuality,
+ ignoreDuplicates=args.ignoreDuplicates,
+ center_read=args.centerReads,
+ samFlag_include=args.samFlagInclude,
+ samFlag_exclude=args.samFlagExclude,
+ minFragmentLength=args.minFragmentLength,
+ maxFragmentLength=args.maxFragmentLength,
+ bed_and_bin=True,
+ out_file_for_raw_data=args.outRawCounts)
+
+ num_reads_per_bin = cr.run()
+
+ if args.outCoverageMetrics and args.coverageThresholds:
+ args.coverageThresholds.sort() # Galaxy in particular tends to give things in a weird order
+ of = open(args.outCoverageMetrics, "w")
+ of.write("Sample\tThreshold\tPercent\n")
+ nbins = float(num_reads_per_bin.shape[0])
+ for thresh in args.coverageThresholds:
+ vals = np.sum(num_reads_per_bin >= thresh, axis=0)
+ for lab, val in zip(args.labels, vals):
+ of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100. * val / nbins))
+ of.close()
+
+ if args.outRawCounts:
+ # append to the generated file the
+ # labels
+ header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
+ header += "'" + "'\t'".join(args.labels) + "'\n"
+ f = open(args.outRawCounts, 'r+')
+ content = f.read()
+ f.seek(0, 0)
+ f.write(header + content)
+ f.close()
+
+ if num_reads_per_bin.shape[0] < 2:
+ exit("ERROR: too few non-zero bins found.\n"
+ "If using --region please check that this "
+ "region is covered by reads.\n")
+
+ if args.skipZeros:
+ num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)
+
+ if args.plotFile:
+ if args.plotFileFormat == 'plotly':
+ fig = go.Figure()
+ fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'coverage (#reads per base)'}
+ fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'anchor': 'x2', 'title': 'coverage (#reads per base)'}
+ fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'fraction of bases sampled'}
+ fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'fraction of bases sampled >= coverage'}
+ fig['layout'].update(title=args.plotTitle)
+ else:
+ fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight))
+ plt.suptitle(args.plotTitle)
+
+ # plot up to two std from mean
+ num_reads_per_bin = num_reads_per_bin.astype(int)
+ sample_mean = num_reads_per_bin.mean(axis=0)
+ sample_std = num_reads_per_bin.std(axis=0)
+ sample_max = num_reads_per_bin.max(axis=0)
+ sample_min = num_reads_per_bin.min(axis=0)
+ sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
+ sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
+ sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)
+
+ # use the largest 99th percentile from all samples to set the x_max value
+ x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))
+ # plot coverage
+ # print headers for text output
+ print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")
+ # the determination of a sensible value for y_max of the first plot (fraction of bases sampled vs.
+ # coverage) is important because, depending on the data,
+ # it becomes very difficult to see the lines in the plot. For example, if the coverage of a sample
+ # is a nice gaussian curve with a large mean of 50. Then a sensible range for the y axis (fraction of
+ # reads having coverage=x) is (0, 0.02) which nicely shows the coverage curve. If instead the coverage is
+ # very poor and centers close to 1 then a good y axis range is (0,1).
+
+ # the current implementation aims to find the y_value for which 50% of the reads >= x (coverage) and
+ # sets that as the x_axis range.
+ y_max = []
+ data = []
+ # We need to manually set the line colors so they're shared between the two plots.
+ plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
+ plotly_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
+ for idx, col in enumerate(num_reads_per_bin.T):
+ if args.plotFile:
+ frac_reads_per_coverage = np.bincount(col.astype(int)).astype(float) / num_reads_per_bin.shape[0]
+ csum = np.bincount(col.astype(int))[::-1].cumsum()
+ csum_frac = csum.astype(float)[::-1] / csum.max()
+ if args.plotFileFormat == 'plotly':
+ color = plotly_colors[idx % len(plotly_colors)]
+ dash = plotly_styles[idx % len(plotly_styles)]
+ trace = go.Scatter(x=np.arange(0, int(x_max) - 1),
+ y=frac_reads_per_coverage[:int(x_max)],
+ mode='lines',
+ xaxis='x1',
+ yaxis='y1',
+ line=dict(color=color, dash=dash),
+ name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]),
+ legendgroup="{}".format(idx))
+ data.append(trace)
+ trace = go.Scatter(x=np.arange(0, int(x_max) - 1),
+ y=csum_frac[:int(x_max)],
+ mode='lines',
+ xaxis='x2',
+ yaxis='y2',
+ line=dict(color=color, dash=dash),
+ name=args.labels[idx],
+ showlegend=False,
+ legendgroup="{}".format(idx))
+ data.append(trace)
+ else:
+ axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
+ axs[1].plot(csum_frac, label=args.labels[idx])
+ # find the indexes (i.e. the x values) for which the cumulative distribution 'fraction of bases
+ # sampled >= coverage' where fraction of bases sampled = 50%: `np.flatnonzero(csum_frac>0.5)`
+ # then find the fraction of bases sampled that have the largest x
+ y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])
+ print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
+ sample_mean[idx],
+ sample_std[idx],
+ sample_min[idx],
+ sample_25[idx],
+ sample_50[idx],
+ sample_75[idx],
+ sample_max[idx],
+ ))
+
+ if args.plotFile:
+ # Don't clip plots
+ y_max = max(y_max)
+ if args.plotFileFormat == "plotly":
+ fig.add_traces(data)
+ fig['layout']['yaxis1'].update(range=[0.0, min(1, y_max + (y_max * 0.10))])
+ fig['layout']['yaxis2'].update(range=[0.0, 1.0])
+ py.plot(fig, filename=args.plotFile, auto_open=False)
+ else:
+ axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
+ axs[0].set_xlim(0, x_max)
+ axs[0].set_xlabel('coverage (#reads per bp)')
+ axs[0].legend(fancybox=True, framealpha=0.5)
+ axs[0].set_ylabel('fraction of bases sampled')
+ # plot cumulative coverage
+ axs[1].set_xlim(0, x_max)
+ axs[1].set_xlabel('coverage (#reads per bp)')
+ axs[1].set_ylabel('fraction of bases sampled >= coverage')
+ axs[1].legend(fancybox=True, framealpha=0.5)
+ plt.savefig(args.plotFile, format=args.plotFileFormat)
+ plt.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deepTools/source/deeptools/plotEnrichment.py b/deepTools/source/deeptools/plotEnrichment.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbd53f90d5ec98b4ebf9194b680dcc34b30c7777
--- /dev/null
+++ b/deepTools/source/deeptools/plotEnrichment.py
@@ -0,0 +1,588 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import argparse
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+
+import plotly.offline as py
+import plotly.graph_objs as go
+
+from deeptools.mapReduce import mapReduce, getUserRegion, blSubtract
+from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
+from deeptools.utilities import getCommonChrNames, mungeChromosome, getTLen, smartLabels
+from deeptools.bamHandler import openBam
+from deeptoolsintervals import Enrichment, GTF
+from deeptools.countReadsPerBin import CountReadsPerBin as cr
+from deeptools import parserCommon
+
+
+old_settings = np.seterr(all='ignore')
+
+
+def parse_arguments(args=None):
+ basic_args = plot_enrichment_args()
+
+ # --region, --blackListFileName, -p and -v
+ parent_parser = parserCommon.getParentArgParse(binSize=False)
+
+ # --extend reads and such
+ read_options = parserCommon.read_options()
+
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="""
+Tool for calculating and plotting the signal enrichment in either regions in BED
+format or feature types (column 3) in GTF format. The underlying datapoints can also be output.
+Metrics are plotted as a fraction of total reads. Regions in a BED file are assigned to the 'peak' feature.
+
+detailed help:
+
+ plotEnrichment -h
+
+""",
+ epilog='example usages:\n'
+ 'plotEnrichment -b file1.bam file2.bam --BED peaks.bed -o enrichment.png\n\n'
+ ' \n\n',
+ parents=[basic_args, parent_parser, read_options],
+ usage='plotEnrichment -b sample1.bam sample2.bam --BED peaks.bed -o enrichment.png\n'
+ 'help: plotEnrichment -h / plotEnrichment --help\n')
+
+ return parser
+
+
+def plot_enrichment_args():
+ parser = argparse.ArgumentParser(add_help=False)
+ required = parser.add_argument_group('Required arguments')
+
+ # define the arguments
+ required.add_argument('--bamfiles', '-b',
+ metavar='file1.bam file2.bam',
+ help='List of indexed bam files separated by spaces.',
+ nargs='+',
+ required=True)
+
+ required.add_argument('--BED',
+ help='Limits the enrichment analysis to '
+ 'the regions specified in these BED/GTF files. Enrichment '
+ 'is calculated as the number of reads overlapping each '
+ 'feature type. The feature type is column 3 in a GTF file '
+ 'and "peak" for BED files.',
+ metavar='FILE1.bed FILE2.bed',
+ nargs='+',
+ required=True)
+
+ optional = parser.add_argument_group('Optional arguments')
+
+ optional.add_argument('--plotFile', '-o',
+ help='File to save the plot to. The file extension determines the format, '
+ 'so heatmap.pdf will save the heatmap in PDF format. '
+ 'The available formats are: .png, '
+ '.eps, .pdf and .svg.',
+ type=parserCommon.writableFile,
+ metavar='FILE')
+
+ optional.add_argument('--attributeKey',
+ help='Instead of deriving labels from the feature column in a GTF file, '
+ 'use the given attribute key, such as gene_biotype. For BED files or '
+ 'entries without the attribute key, None is used as the label.')
+
+ optional.add_argument('--labels', '-l',
+ metavar='sample1 sample2',
+ help='User defined labels instead of default labels from '
+ 'file names. '
+ 'Multiple labels have to be separated by spaces, e.g. '
+ '--labels sample1 sample2 sample3',
+ nargs='+')
+
+ optional.add_argument('--smartLabels',
+ action='store_true',
+ help='Instead of manually specifying labels for the input '
+ 'BAM/BED/GTF files, this causes deepTools to use the file name '
+ 'after removing the path and extension. For BED/GTF files, the '
+ 'eventual region name will be overridden if specified inside '
+ 'the file.')
+
+ optional.add_argument('--regionLabels',
+ metavar="region1 region2",
+ help="For BED files, the label given to its region is "
+ "the file name, but this can be overridden by providing "
+ "a custom label. For GTF files this is ignored. Note "
+ "that if you provide labels, you MUST provide one for each "
+ "BED/GTF file, even though it will be ignored for GTF files.",
+ nargs='+')
+
+ optional.add_argument('--plotTitle', '-T',
+ help='Title of the plot, to be printed on top of '
+ 'the generated image. Leave blank for no title. (Default: %(default)s)',
+ default='')
+
+ optional.add_argument('--plotFileFormat',
+ metavar='FILETYPE',
+ help='Image format type. If given, this option '
+ 'overrides the image format based on the plotFile '
+ 'ending. The available options are: png, '
+ 'eps, pdf, plotly and svg.',
+ choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
+
+ optional.add_argument('--outRawCounts',
+ help='Save the counts per region to a tab-delimited file.',
+ type=parserCommon.writableFile,
+ metavar='FILE')
+
+ optional.add_argument('--perSample',
+ help='Group the plots by sample, rather than by feature type (the default).',
+ action='store_true')
+
+ optional.add_argument('--variableScales',
+ help='By default, the y-axes are always 0-100. This allows the axis range to be restricted.',
+ action='store_true')
+
+ optional.add_argument('--plotHeight',
+ help='Plot height in cm. (Default: %(default)s)',
+ type=float,
+ default=20)
+
+ optional.add_argument('--plotWidth',
+ help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)',
+ type=float,
+ default=20)
+
+ optional.add_argument('--colors',
+ help='List of colors to use '
+ 'for the plotted lines. Color names '
+ 'and html hex strings (e.g., #eeff22) '
+ 'are accepted. The color names should '
+ 'be space separated. For example, '
+ '--colors red blue green ',
+ nargs='+')
+
+ optional.add_argument('--numPlotsPerRow',
+ help='Number of plots per row (Default: %(default)s)',
+ type=int,
+ default=4)
+
+ optional.add_argument('--alpha',
+ default=0.9,
+ type=parserCommon.check_float_0_1,
+ help='The alpha channel (transparency) to use for the bars. '
+ 'The default is 0.9 and values must be between 0 and 1.')
+
+ optional.add_argument('--Offset',
+ help='Uses this offset inside of each read as the signal. This is useful in '
+ 'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the '
+ 'start of the read. This can be paired with the --filterRNAstrand option. '
+ 'Note that negative values indicate offsets from the end of each read. A value '
+ 'of 1 indicates the first base of the alignment (taking alignment orientation '
+ 'into account). Likewise, a value of -1 is the last base of the alignment. An '
+ 'offset of 0 is not permitted. If two values are specified, then they will be '
+ 'used to specify a range of positions. Note that specifying something like '
+ '--Offset 5 -1 will result in the 5th through last position being used, which '
+ 'is equivalent to trimming 4 bases from the 5-prime end of alignments.',
+ metavar='INT',
+ type=int,
+ nargs='+',
+ required=False)
+
+ bed12 = parser.add_argument_group('BED12 arguments')
+
+ bed12.add_argument('--keepExons',
+ help="For BED12 files, use each exon as a region, rather than columns 2/3",
+ action="store_true")
+
+ return parser
+
+
+def getBAMBlocks(read, defaultFragmentLength, centerRead, offset=None):
+ """
+ This is basically get_fragment_from_read from countReadsPerBin
+ """
+ blocks = None
+ maxPairedFragmentLength = 0
+ if defaultFragmentLength != "read length":
+ maxPairedFragmentLength = 4 * defaultFragmentLength
+
+ if defaultFragmentLength == 'read length':
+ blocks = read.get_blocks()
+ else:
+ if cr.is_proper_pair(read, maxPairedFragmentLength):
+ if read.is_reverse:
+ fragmentStart = read.next_reference_start
+ fragmentEnd = read.reference_end
+ else:
+ fragmentStart = read.reference_start
+ # the end of the fragment is defined as
+ # the start of the forward read plus the insert length
+ fragmentEnd = read.reference_start + abs(read.template_length)
+ # Extend using the default fragment length
+ else:
+ if read.is_reverse:
+ fragmentStart = read.reference_end - defaultFragmentLength
+ fragmentEnd = read.reference_end
+ else:
+ fragmentStart = read.reference_start
+ fragmentEnd = read.reference_start + defaultFragmentLength
+ if centerRead:
+ fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2
+ fragmentStart = fragmentCenter - read.infer_query_length(always=False) / 2
+ fragmentEnd = fragmentStart + read.infer_query_length(always=False)
+
+ assert fragmentStart < fragmentEnd, "fragment start greater than fragment " \
+ "end for read {}".format(read.query_name)
+ blocks = [(int(fragmentStart), int(fragmentEnd))]
+
+ # Handle read offsets, if needed
+ if offset is not None:
+ rv = [(None, None)]
+ if len(offset) > 1:
+ if offset[0] > 0:
+ offset[0] -= 1
+ if offset[1] < 0:
+ offset[1] += 1
+ else:
+ if offset[0] > 0:
+ offset[0] -= 1
+ offset = [offset[0], offset[0] + 1]
+ else:
+ offset = [offset[0], None]
+ if offset[1] == 0:
+ # -1 gets switched to 0, which screws things up
+ offset = (offset[0], None)
+
+ stretch = []
+ # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40]
+ # Then subset accordingly
+ for block in blocks:
+ stretch.extend(range(block[0], block[1]))
+ if read.is_reverse:
+ stretch = stretch[::-1]
+ try:
+ foo = stretch[offset[0]:offset[1]]
+ except:
+ return rv
+
+ if len(foo) == 0:
+ return rv
+ if read.is_reverse:
+ foo = foo[::-1]
+ # Convert the stretch back to a list of tuples
+ foo = np.array(foo)
+ d = foo[1:] - foo[:-1]
+ idx = np.argwhere(d > 1).flatten().tolist() # This now holds the interval bounds as a list
+ idx.append(-1)
+ last = 0
+ blocks = []
+ for i in idx:
+ blocks.append((foo[last].astype("int"), foo[i].astype("int") + 1))
+ last = i + 1
+ return blocks
+
+
+def getEnrichment_worker(arglist):
+ """
+ This is the worker function of plotEnrichment.
+
+ In short, given a region, iterate over all reads **starting** in it.
+ Filter/extend them as requested and check each for an overlap with
+ findOverlaps. For each overlap, increment the counter for that feature.
+ """
+ chrom, start, end, args, defaultFragmentLength = arglist
+ if args.verbose:
+ sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
+
+ olist = []
+ total = [0] * len(args.bamfiles)
+ for idx, f in enumerate(args.bamfiles):
+ odict = dict()
+ for x in gtf.features:
+ odict[x] = 0
+ fh = openBam(f)
+
+ chrom = mungeChromosome(chrom, fh.references)
+
+ lpos = None
+ prev_pos = set()
+ for read in fh.fetch(chrom, start, end):
+ # Filter
+ if read.pos < start:
+ # Ensure that a given alignment is processed only once
+ continue
+ if read.flag & 4:
+ continue
+ if args.minMappingQuality and read.mapq < args.minMappingQuality:
+ continue
+ if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
+ continue
+ if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
+ continue
+ tLen = getTLen(read)
+ if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
+ continue
+ if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
+ continue
+ if args.ignoreDuplicates:
+ # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
+ if tLen >= 0:
+ s = read.pos
+ e = s + tLen
+ else:
+ s = read.pnext
+ e = s - tLen
+ if read.reference_id != read.next_reference_id:
+ e = read.pnext
+ if lpos is not None and lpos == read.reference_start \
+ and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
+ continue
+ if lpos != read.reference_start:
+ prev_pos.clear()
+ lpos = read.reference_start
+ prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
+ total[idx] += 1
+
+ # Get blocks, possibly extending
+ features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))
+
+ if features is not None and len(features) > 0:
+ for x in features:
+ odict[x] += 1
+ olist.append(odict)
+ return olist, gtf.features, total
+
+
+def plotEnrichment(args, featureCounts, totalCounts, features):
+ # get the number of rows and columns
+ if args.perSample:
+ totalPlots = len(args.bamfiles)
+ barsPerPlot = len(features)
+ else:
+ totalPlots = len(features)
+ barsPerPlot = len(args.bamfiles)
+ cols = min(args.numPlotsPerRow, totalPlots)
+ rows = np.ceil(totalPlots / float(args.numPlotsPerRow)).astype(int)
+
+ # Handle the colors
+ if not args.colors:
+ cmap_plot = plt.get_cmap('jet')
+ args.colors = cmap_plot(np.arange(barsPerPlot, dtype=float) / float(barsPerPlot))
+ if args.plotFileFormat == 'plotly':
+ args.colors = range(barsPerPlot)
+ elif len(args.colors) < barsPerPlot:
+ sys.exit("Error: {0} colors were requested, but {1} were needed!".format(len(args.colors), barsPerPlot))
+
+ data = []
+ if args.plotFileFormat == 'plotly':
+ fig = go.Figure()
+ fig['layout'].update(title=args.plotTitle)
+ domainWidth = .9 / cols
+ domainHeight = .9 / rows
+ bufferHeight = 0.0
+ if rows > 1:
+ bufferHeight = 0.1 / (rows - 1)
+ bufferWidth = 0.0
+ if cols > 1:
+ bufferWidth = 0.1 / (cols - 1)
+ else:
+ grids = gridspec.GridSpec(rows, cols)
+ plt.rcParams['font.size'] = 10.0
+
+ # convert cm values to inches
+ fig = plt.figure(figsize=(args.plotWidth / 2.54, args.plotHeight / 2.54))
+ fig.suptitle(args.plotTitle, y=(1 - (0.06 / args.plotHeight)))
+
+ for i in range(totalPlots):
+ col = i % cols
+ row = np.floor(i / float(args.numPlotsPerRow)).astype(int)
+
+ if args.perSample:
+ xlabels = features
+ ylabel = "% alignments in {0}".format(args.labels[i])
+ vals = [featureCounts[i][foo] for foo in features]
+ vals = 100 * np.array(vals, dtype='float64') / totalCounts[i]
+ else:
+ xlabels = args.labels
+ ylabel = "% {0}".format(features[i])
+ vals = [foo[features[i]] for foo in featureCounts]
+ vals = 100 * np.array(vals, dtype='float64') / np.array(totalCounts, dtype='float64')
+
+ if args.plotFileFormat == 'plotly':
+ xanchor = 'x{}'.format(i + 1)
+ yanchor = 'y{}'.format(i + 1)
+ base = row * (domainHeight + bufferHeight)
+ domain = [base, base + domainHeight]
+ fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor}
+ base = col * (domainWidth + bufferWidth)
+ domain = [base, base + domainWidth]
+ fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': xanchor, 'title': ylabel}
+ if args.variableScales is False:
+ fig['layout']['yaxis{}'.format(i + 1)].update(range=[0, 100])
+ trace = go.Bar(x=xlabels,
+ y=vals,
+ opacity=args.alpha,
+ orientation='v',
+ showlegend=False,
+ xaxis=xanchor,
+ yaxis=yanchor,
+ name=ylabel,
+ marker={'color': args.colors, 'line': {'color': args.colors}})
+ data.append(trace)
+ else:
+ ax = plt.subplot(grids[row, col])
+ ax.bar(np.arange(vals.shape[0]), vals, width=1.0, bottom=0.0, align='center', color=args.colors, edgecolor=args.colors, alpha=args.alpha)
+ ax.set_ylabel(ylabel)
+ ax.set_xticks(np.arange(vals.shape[0]))
+ ax.set_xticklabels(xlabels, rotation='vertical')
+ if args.variableScales is False:
+ ax.set_ylim(0.0, 100.0)
+
+ if args.plotFileFormat == 'plotly':
+ fig.add_traces(data)
+ py.plot(fig, filename=args.plotFile, auto_open=False)
+ # colors
+ else:
+ plt.subplots_adjust(wspace=0.05, hspace=0.3, bottom=0.15, top=0.80)
+ plt.tight_layout()
+ plt.savefig(args.plotFile, dpi=200, format=args.plotFileFormat)
+ plt.close()
+
+
+def getChunkLength(args, chromSize):
+ """
+ There's no point in parsing the GTF time over and over again needlessly.
+ Empirically, it seems that adding ~4x the number of workers is ideal, since
+ coverage is non-uniform. This is a heuristic way of approximating that.
+
+ Note that if there are MANY small contigs and a few large ones (e.g., the
+ max and median lengths are >10x different, then it's best to take a
+ different tack.
+ """
+
+ if args.region:
+ chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, args.region)
+ rv = np.ceil((region_start - region_end) / float(4 * args.numberOfProcessors)).astype(int)
+ return max(1, rv)
+
+ bl = None
+ if args.blackListFileName:
+ bl = GTF(args.blackListFileName)
+
+ lengths = []
+ for k, v in chromSize:
+ regs = blSubtract(bl, k, [0, v])
+ for reg in regs:
+ lengths.append(reg[1] - reg[0])
+
+ if len(lengths) >= 4 * args.numberOfProcessors:
+ rv = np.median(lengths).astype(int)
+ # In cases like dm6 or GRCh38, there are a LOT of really small contigs, which will cause the median to be small and performance to tank
+ if np.max(lengths) >= 10 * rv:
+ rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)
+ else:
+ rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)
+
+ return max(1, rv)
+
+
+def main(args=None):
+
+ args = parse_arguments().parse_args(args)
+
+ if not args.outRawCounts and not args.plotFile:
+ sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")
+
+ if args.labels is None:
+ args.labels = args.bamfiles
+ if args.smartLabels:
+ args.labels = smartLabels(args.bamfiles)
+ if len(args.labels) != len(args.bamfiles):
+ sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))
+
+ # Ensure that if we're given an attributeKey that it's not empty
+ if args.attributeKey and args.attributeKey == "":
+ args.attributeKey = None
+
+ global gtf
+ if not args.regionLabels and args.smartLabels:
+ args.regionLabels = smartLabels(args.BED)
+ gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)
+
+ # Get fragment size and chromosome dict
+ fhs = [openBam(x) for x in args.bamfiles]
+ chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
+ for fh in fhs:
+ fh.close()
+
+ frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
+ return_lengths=False,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose)
+ if args.extendReads:
+ if args.extendReads is True:
+ # try to guess fragment length if the bam file contains paired end reads
+ if frag_len_dict:
+ defaultFragmentLength = frag_len_dict['median']
+ else:
+ sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
+ if args.verbose:
+ print("Fragment length based on paired-end data "
+ "estimated to be {0}".format(frag_len_dict['median']))
+ elif args.extendReads < read_len_dict['median']:
+ sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
+ "Reads will not be extended.\n".format(int(read_len_dict['median'])))
+ defaultFragmentLength = 'read length'
+ elif args.extendReads > 2000:
+ sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
+ else:
+ defaultFragmentLength = args.extendReads
+ else:
+ defaultFragmentLength = 'read length'
+
+ # Get the chunkLength
+ chunkLength = getChunkLength(args, chromSize)
+
+ # Map reduce to get the counts/file/feature
+ res = mapReduce([args, defaultFragmentLength],
+ getEnrichment_worker,
+ chromSize,
+ genomeChunkLength=chunkLength,
+ region=args.region,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose)
+
+ features = res[0][1]
+ featureCounts = []
+ for i in list(range(len(args.bamfiles))):
+ d = dict()
+ for x in features:
+ d[x] = 0
+ featureCounts.append(d)
+
+ # res is a list, with each element a list (length len(args.bamfiles)) of dicts
+ totalCounts = [0] * len(args.bamfiles)
+ for x in res:
+ for i, y in enumerate(x[2]):
+ totalCounts[i] += y
+ for i, y in enumerate(x[0]):
+ for k, v in y.items():
+ featureCounts[i][k] += v
+
+ # Make a plot
+ if args.plotFile:
+ plotEnrichment(args, featureCounts, totalCounts, features)
+
+ # Raw counts
+ if args.outRawCounts:
+ of = open(args.outRawCounts, "w")
+ of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
+ for i, x in enumerate(args.labels):
+ for k, v in featureCounts[i].items():
+ of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
+ of.close()
diff --git a/deepTools/source/deeptools/plotFingerprint.py b/deepTools/source/deeptools/plotFingerprint.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5a468802a17fa4dc8a6bfb118a195c1f3f5d0c9
--- /dev/null
+++ b/deepTools/source/deeptools/plotFingerprint.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import argparse
+import sys
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+import matplotlib.pyplot as plt
+from scipy import interpolate
+from scipy.stats import poisson
+
+import plotly.offline as py
+import plotly.graph_objs as go
+
+import deeptools.countReadsPerBin as countR
+import deeptools.sumCoveragePerBin as sumR
+from deeptools import parserCommon
+from deeptools.utilities import smartLabels
+
+old_settings = np.seterr(all='ignore')
+MAXLEN = 10000000
+
+
+def parse_arguments(args=None):
+    """Assemble the plotFingerprint command-line parser from the shared parent parsers."""
+    parent_parser = parserCommon.getParentArgParse(binSize=False)
+    required_args = get_required_args()
+    output_args = get_output_args()
+    optional_args = get_optional_args()
+    read_options_parser = parserCommon.read_options()
+    # conflict_handler='resolve' lets later parents override options from earlier ones
+    parser = argparse.ArgumentParser(
+        parents=[required_args, output_args, read_options_parser,
+                 optional_args, parent_parser],
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='This tool samples indexed BAM files '
+        'and plots a profile of cumulative read coverages for each. '
+        'All reads overlapping a window (bin) of the '
+        'specified length are counted; '
+        'these counts are sorted '
+        'and the cumulative sum is finally plotted. ',
+        conflict_handler='resolve',
+        usage='plotFingerprint -b treatment.bam control.bam '
+        '-plot fingerprint.png\n'
+        'help: plotFingerprint -h / plotFingerprint --help',
+        add_help=False)
+
+    return parser
+
+
+def process_args(args=None):
+
+ args = parse_arguments().parse_args(args)
+
+ if args.JSDsample is not None and args.JSDsample not in args.bamfiles:
+ args.bamfiles.append(args.JSDsample)
+ if args.labels and len(args.bamfiles) == len(args.labels) - 1:
+ args.labels.append(args.JSDsample)
+
+ if not args.labels:
+ if args.smartLabels:
+ args.labels = smartLabels(args.bamfiles)
+ else:
+ args.labels = args.bamfiles
+
+ if len(args.bamfiles) != len(args.labels):
+ sys.exit("The number of labels does not match the number of BAM files.")
+
+ return args
+
+
+def get_required_args():
+    """Return a parser fragment holding the mandatory --bamfiles option."""
+    parser = argparse.ArgumentParser(add_help=False)
+    required = parser.add_argument_group('Required arguments')
+
+    # define the arguments
+    required.add_argument('--bamfiles', '-b',
+                          metavar='bam files',
+                          nargs='+',
+                          help='List of indexed BAM files',
+                          required=True)
+    return parser
+
+
+def get_optional_args():
+    """Return a parser fragment with the optional plotFingerprint options."""
+    parser = argparse.ArgumentParser(add_help=False,
+                                     conflict_handler='resolve')
+    optional = parser.add_argument_group('Optional arguments')
+    optional.add_argument("--help", "-h", action="help",
+                          help="show this help message and exit")
+
+    optional.add_argument('--labels', '-l',
+                          metavar='',
+                          help='List of labels to use in the output. '
+                          'If not given, the file names will be used instead. '
+                          'Separate the labels by spaces.',
+                          nargs='+')
+
+    optional.add_argument('--smartLabels',
+                          action='store_true',
+                          help='Instead of manually specifying labels for the input '
+                          'BAM/bigWig files, this causes deepTools to use the file name '
+                          'after removing the path and extension.')
+
+    optional.add_argument('--binSize', '-bs',
+                          help='Window size in base pairs to '
+                          'sample the genome. This times --numberOfSamples should be less than the genome size. (Default: %(default)s)',
+                          default=500,
+                          type=int)
+
+    optional.add_argument('--numberOfSamples', '-n',
+                          help='The number of bins that are sampled from the genome, '
+                          'for which the overlapping number of reads is computed. (Default: %(default)s)',
+                          default=5e5,
+                          type=int)
+
+    optional.add_argument('--plotFileFormat',
+                          metavar='',
+                          help='image format type. If given, this option '
+                          'overrides the image format based on the ending '
+                          'given via --plotFile '
+                          'ending. The available options are: "png", '
+                          '"eps", "pdf", "plotly" and "svg"',
+                          choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
+
+    optional.add_argument('--plotTitle', '-T',
+                          help='Title of the plot, to be printed on top of '
+                          'the generated image. Leave blank for no title. (Default: %(default)s)',
+                          default='')
+
+    optional.add_argument('--skipZeros',
+                          help='If set, then regions with zero overlapping reads'
+                          'for *all* given BAM files are ignored. This '
+                          'will result in a reduced number of read '
+                          'counts than that specified in --numberOfSamples',
+                          action='store_true')
+
+    optional.add_argument('--outQualityMetrics',
+                          help='Quality metrics can optionally be output to '
+                          'this file. The file will have one row per input BAM '
+                          'file and columns containing a number of metrics. '
+                          'Please see the online documentation for a longer '
+                          'explanation: http://deeptools.readthedocs.io/en/latest/content/feature/plotFingerprint_QC_metrics.html .',
+                          type=parserCommon.writableFile,
+                          metavar='FILE.txt')
+
+    optional.add_argument('--JSDsample',
+                          help='Reference sample against which to compute the '
+                          'Jensen-Shannon distance and the CHANCE statistics. '
+                          'If this is not specified, '
+                          'then these will not be calculated. If '
+                          '--outQualityMetrics is not specified then this will '
+                          'be ignored. The Jensen-Shannon implementation is '
+                          'based on code from Sitanshu Gakkhar at BCGSC. The '
+                          'CHANCE implementation is based on code from Matthias '
+                          'Haimel.',
+                          metavar='sample.bam')
+
+    return parser
+
+
+def get_output_args():
+    """Return a parser fragment with the output-file options (--plotFile, --outRawCounts)."""
+    parser = argparse.ArgumentParser(add_help=False)
+    group = parser.add_argument_group('Output')
+    group.add_argument('--plotFile', '-plot', '-o',
+                       help='File name of the output figure. The file '
+                       'ending will be used to determine the image '
+                       'format. The available options are typically: "png", '
+                       '"eps", "pdf" and "svg", e.g. : fingerprint.png.',
+                       type=parserCommon.writableFile,
+                       metavar='')
+
+    group.add_argument('--outRawCounts',
+                       help='Output file name to save the read counts per bin.',
+                       type=parserCommon.writableFile,
+                       metavar='')
+
+    return parser
+
+
+def binRelEntropy(p, q):
+    """
+    Return the binary relative entropy (Kullback-Leibler divergence) between
+    two Bernoulli distributions with success probabilities p and q.
+
+    Terms where p == 0 or p == 1 contribute 0, and the result is clamped
+    to be non-negative.
+    """
+    x1 = 0
+    x2 = 0
+    if p > 0:
+        x1 = p * np.log2(p / q)
+    if p < 1:
+        x2 = (1 - p) * np.log2((1 - p) / (1 - q))
+    return np.fmax(0.0, x1 + x2)
+
+
+def getCHANCE(args, idx, mat):
+    """
+    Compute the CHANCE statistics for sample column `idx` of `mat` against
+    the --JSDsample reference column.
+
+    1) In short, sort IP from lowest to highest, cosorting input at the same time.
+    2) Choose the argmax of the difference of the cumsum() of the above
+    3) Determine a scale factor according to the ratio at the position at step 2.
+
+    Returns [percent of genome enriched, differential percentage enrichment,
+    CHANCE divergence]; a list of NaNs when no valid reference sample exists
+    (or the sample is compared against itself), and zeros when the IP shows
+    no enrichment over the input.
+    """
+    # Get the index of the reference sample
+    if args.JSDsample not in args.bamfiles:
+        return [np.nan, np.nan, np.nan]
+    refIdx = args.bamfiles.index(args.JSDsample)
+    if refIdx == idx:
+        return [np.nan, np.nan, np.nan]
+
+    # Work on a copy so the caller's matrix is not reordered
+    subMatrix = np.copy(mat[:, [idx, refIdx]])
+    subMatrix[np.isnan(subMatrix)] = 0
+    subMatrix = subMatrix[subMatrix[:, 0].argsort(), :]
+
+    # Find the CHANCE statistic, which is the point of maximum difference
+    cs = np.cumsum(subMatrix, axis=0)
+    normed = cs / np.max(cs, axis=0).astype(float)
+    csdiff = normed[:, 1] - normed[:, 0]
+    k = np.argmax(csdiff)
+    if csdiff[k] < 1e-6:
+        # Don't bother with negative values
+        return [0, 0, 0]
+    p = normed[k, 0]  # Percent enrichment in IP
+    q = normed[k, 1]  # Percent enrichment in input
+    pcenrich = 100 * (len(csdiff) - k) / float(len(csdiff))
+    diffenrich = 100.0 * (q - p)
+
+    # CHANCE's JS divergence with binary entropy
+    # Its p value is a ztest of this, which is largely useless IMO
+    M = (p + q) / 2.0
+    CHANCEdivergence = 0.5 * (binRelEntropy(p, M) + binRelEntropy(q, M))
+    CHANCEdivergence = np.sqrt(CHANCEdivergence)
+
+    return [pcenrich, diffenrich, CHANCEdivergence]
+
+
+def getSyntheticJSD(vec):
+ """
+ This is largely similar to getJSD, with the 'input' sample being a Poisson distribution with lambda the average coverage in the IP bins
+ """
+ lamb = np.mean(vec) # Average coverage
+ coverage = np.sum(vec)
+
+ chip = np.zeros(MAXLEN, dtype=int)
+ for val in vec:
+ # N.B., we need to clip past the end of the array
+ if val >= MAXLEN:
+ val = MAXLEN - 1
+ # This effectively removes differences due to coverage percentages
+ if val > 0:
+ chip[int(val)] += 1
+ input = coverage * poisson.pmf(np.arange(1, MAXLEN), lamb)
+ if chip[-1] > 0:
+ print("{} bins had coverage over the maximum value of {} during synthetic JSD computation".format(chip[-1], MAXLEN))
+
+ return getJSDcommon(chip, input)
+
+
+def getJSD(args, idx, mat):
+ """
+ Computes the Jensen-Shannon distance between two samples. This is essentially
+ a symmetric version of Kullback-Leibler divergence. The implementation
+ presented here is based on code from Sitanshu Gakkhar at BCGSC.
+
+ Note that the interpolation has the effect of removing zero count coverage
+ bins, which ends up being needed for the JSD calculation.
+
+ args: The input arguments
+ idx: The column index of the current sample
+ mat: The matrix of counts
+ """
+
+ # Get the index of the reference sample
+ if args.JSDsample not in args.bamfiles:
+ return np.nan
+ refIdx = args.bamfiles.index(args.JSDsample)
+ if refIdx == idx:
+ return np.nan
+
+ # These will hold the coverage histograms
+ chip = np.zeros(MAXLEN, dtype=int)
+ input = np.zeros(MAXLEN, dtype=int)
+ for row in mat:
+ # ChIP
+ val = row[idx]
+ # N.B., we need to clip past the end of the array
+ if val >= MAXLEN:
+ val = MAXLEN - 1
+ # This effectively removes differences due to coverage percentages
+ if val > 0:
+ chip[int(val)] += 1
+
+ # Input
+ val = row[refIdx]
+ if val >= MAXLEN:
+ val = MAXLEN - 1
+ if val > 0:
+ input[int(val)] += 1
+ if input[-1] > 0:
+ print("{} bins had coverage over the maximum value of {} in the input sample".format(input[-1], MAXLEN))
+ if chip[-1] > 0:
+ print("{} bins had coverage over the maximum value of {} in the ChIP sample".format(chip[-1], MAXLEN))
+
+ return getJSDcommon(chip, input)
+
+
+def getJSDcommon(chip, input):
+    """
+    This is a continuation of getJSD to allow getSyntheticJSD to reuse code
+
+    chip:  per-coverage-value histogram (or expected counts) of the ChIP sample
+    input: per-coverage-value histogram (or expected counts) of the input sample
+
+    Returns the square root of the Jensen-Shannon divergence of the two
+    distributions (rounded to 15 decimals), or np.nan if either derived PMF
+    fails the sanity check.
+    """
+    def signalAndBinDist(x):
+        # Build the cumulative signal and bin distributions plus a linear
+        # interpolator mapping bin fraction -> signal fraction.
+        x = np.array(x)
+        (n,) = x.shape
+        signalValues = np.array(list(range(n)))
+        totalSignal = x * signalValues
+        normalizedTotalSignal = np.cumsum(totalSignal) / np.sum(totalSignal).astype("float")
+        binDist = np.cumsum(x).astype("float") / sum(x)
+        interpolater = interpolate.interp1d(binDist, normalizedTotalSignal, kind='linear', bounds_error=False, fill_value=(0, 1))
+        return (binDist, normalizedTotalSignal, interpolater)
+
+    # Interpolate the signals to evenly spaced bins, which also removes 0-coverage bins
+    chipSignal = signalAndBinDist(chip)
+    inputSignal = signalAndBinDist(input)
+
+    # These are basically CDFs
+    inputSignalInterp = inputSignal[2](np.arange(0, 1.00001, 0.00001))
+    chipSignalInterp = chipSignal[2](np.arange(0, 1.00001, 0.00001))
+
+    # If there are no low coverage bins then you can get nan as the first interpolated value.
+    # That should instead be some small value
+    if np.isnan(inputSignalInterp[0]):
+        inputSignalInterp[0] = 1e-12
+    if np.isnan(chipSignalInterp[0]):
+        chipSignalInterp[0] = 1e-12
+
+    # Differentiate to PMFs, do some sanity checking
+    PMFinput = np.ediff1d(inputSignalInterp)
+    PMFchip = np.ediff1d(chipSignalInterp)
+
+    if abs(sum(PMFinput) - 1) > 0.01 or abs(sum(PMFchip) - 1) > 0.01:
+        sys.stderr.write("Warning: At least one PMF integral is significantly different from 1! The JSD will not be returned")
+        return np.nan
+
+    # Compute the JSD from the PMFs
+    M = (PMFinput + PMFchip) / 2.0
+    JSD = 0.5 * (np.nansum(PMFinput * np.log2(PMFinput / M))) + 0.5 * (np.nansum(PMFchip * np.log2(PMFchip / M)))
+    # Round sqrt of JSD to 15 decimals, as planemo test has issue with rounding ?
+    return round(np.sqrt(JSD), 15)
+
+
+def getExpected(mu):
+    """
+    Given a mean coverage mu, determine the AUC, X-intercept, and elbow point
+    of a Poisson-distributed perfectly behaved input sample with the same coverage
+
+    Returns the tuple (AUC, X-intercept, elbow point).
+    """
+    x = np.arange(round(poisson.interval(0.99999, mu=mu)[1] + 1))  # This will be an appropriate range
+    pmf = poisson.pmf(x, mu=mu)
+    cdf = poisson.cdf(x, mu=mu)
+    cs = np.cumsum(pmf * x)
+    cs /= max(cs)
+    # First CDF value at which the cumulative signal becomes non-zero
+    XInt = cdf[np.nonzero(cs)[0][0]]
+    AUC = sum(poisson.pmf(x, mu=mu) * cs)
+    # CDF value at the largest gap between coverage CDF and cumulative signal
+    elbow = cdf[np.argmax(cdf - cs)]
+    return (AUC, XInt, elbow)
+
+
+def main(args=None):
+ args = process_args(args)
+
+ if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
+ sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
+ sys.exit(1)
+
+ cr = sumR.SumCoveragePerBin(
+ args.bamfiles,
+ args.binSize,
+ args.numberOfSamples,
+ blackListFileName=args.blackListFileName,
+ numberOfProcessors=args.numberOfProcessors,
+ verbose=args.verbose,
+ region=args.region,
+ extendReads=args.extendReads,
+ minMappingQuality=args.minMappingQuality,
+ ignoreDuplicates=args.ignoreDuplicates,
+ center_read=args.centerReads,
+ samFlag_include=args.samFlagInclude,
+ samFlag_exclude=args.samFlagExclude,
+ minFragmentLength=args.minFragmentLength,
+ maxFragmentLength=args.maxFragmentLength)
+
+ num_reads_per_bin = cr.run()
+ if num_reads_per_bin.sum() == 0:
+ import sys
+ sys.stderr.write(
+ "\nNo reads were found in {} regions sampled. Check that the\n"
+ "min mapping quality is not overly high and that the \n"
+ "chromosome names between bam files are consistent.\n"
+ "For small genomes, decrease the --numberOfSamples.\n"
+ "\n".format(num_reads_per_bin.shape[0]))
+ exit(1)
+
+ if args.skipZeros:
+ num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)
+
+ total = len(num_reads_per_bin[:, 0])
+ x = np.arange(total).astype('float') / total # normalize from 0 to 1
+
+ if args.plotFile is not None:
+ i = 0
+ # matplotlib won't iterate through line styles by itself
+ pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
+ plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
+ plotly_line_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
+ data = []
+ for i, reads in enumerate(num_reads_per_bin.T):
+ count = np.cumsum(np.sort(reads))
+ count = count / count[-1] # to normalize y from 0 to 1
+ if args.plotFileFormat == 'plotly':
+ trace = go.Scatter(x=x, y=count, mode='lines', name=args.labels[i])
+ trace['line'].update(dash=plotly_line_styles[i % 36], color=plotly_colors[i % 6])
+ data.append(trace)
+ else:
+ j = i % len(pyplot_line_styles)
+ plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])
+ plt.xlabel('rank')
+ plt.ylabel('fraction w.r.t. bin with highest coverage')
+ # set the plotFileFormat explicitly to None to trigger the
+ # format from the file-extension
+ if not args.plotFileFormat:
+ args.plotFileFormat = None
+
+ if args.plotFileFormat == 'plotly':
+ fig = go.Figure()
+ fig.add_traces(data)
+ fig['layout'].update(title=args.plotTitle)
+ fig['layout']['xaxis1'].update(title="rank")
+ fig['layout']['yaxis1'].update(title="fraction w.r.t bin with highest coverage")
+ py.plot(fig, filename=args.plotFile, auto_open=False)
+ else:
+ plt.legend(loc='upper left')
+ plt.suptitle(args.plotTitle)
+ plt.savefig(args.plotFile, bbox_inches=0, format=args.plotFileFormat)
+ plt.close()
+
+ if args.outRawCounts is not None:
+ of = open(args.outRawCounts, "w")
+ of.write("#plotFingerprint --outRawCounts\n")
+ of.write("'" + "'\t'".join(args.labels) + "'\n")
+ fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
+ for row in num_reads_per_bin:
+ of.write(fmt % tuple(row))
+ of.close()
+
+ if args.outQualityMetrics is not None:
+ of = open(args.outQualityMetrics, "w")
+ of.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
+ if args.JSDsample:
+ of.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
+ else:
+ of.write("\tSynthetic JS Distance")
+ of.write("\n")
+ line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
+ for idx, reads in enumerate(num_reads_per_bin.T):
+ counts = np.cumsum(np.sort(reads))
+ counts = counts / float(counts[-1])
+ AUC = np.sum(counts) / float(len(counts))
+ XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
+ elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
+ expected = getExpected(np.mean(reads)) # A tuple of expected (AUC, XInt, elbow)
+ of.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
+ if args.JSDsample:
+ JSD = getJSD(args, idx, num_reads_per_bin)
+ syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
+ CHANCE = getCHANCE(args, idx, num_reads_per_bin)
+ of.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
+ else:
+ syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
+ of.write("\t{0}".format(syntheticJSD))
+ of.write("\n")
+ of.close()
+
+
+# Allow the module to be executed directly as a script
+if __name__ == "__main__":
+    main()
diff --git a/deepTools/source/deeptools/plotHeatmap.py b/deepTools/source/deeptools/plotHeatmap.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2149f829918ba5a95a751e1997aa1e410460e69
--- /dev/null
+++ b/deepTools/source/deeptools/plotHeatmap.py
@@ -0,0 +1,893 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import division
+
+import argparse
+from collections import OrderedDict
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+import matplotlib.pyplot as plt
+from matplotlib.font_manager import FontProperties
+import matplotlib.gridspec as gridspec
+from matplotlib import ticker
+import copy
+import sys
+import plotly.offline as py
+import plotly.graph_objs as go
+
+# own modules
+from deeptools import cm # noqa: F401
+from deeptools import parserCommon
+from deeptools import heatmapper
+from deeptools.heatmapper_utilities import plot_single, plotly_single
+from deeptools.utilities import convertCmap
+from deeptools.computeMatrixOperations import filterHeatmapValues
+
+debug = 0
+old_settings = np.seterr(all='ignore')
+plt.ioff()
+
+
+def parse_arguments(args=None):
+    """Assemble the plotHeatmap command-line parser from the shared heatmapper parsers."""
+    parser = argparse.ArgumentParser(
+        parents=[parserCommon.heatmapperMatrixArgs(),
+                 parserCommon.heatmapperOutputArgs(mode='heatmap'),
+                 parserCommon.heatmapperOptionalArgs(mode='heatmap')],
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='This tool creates a heatmap for '
+        'scores associated with genomic regions. '
+        'The program requires a matrix file '
+        'generated by the tool ``computeMatrix``.',
+        epilog='An example usage is: plotHeatmap -m matrix.gz',
+        usage='plotHeatmap -m matrix.gz\n'
+        'help: plotHeatmap -h / plotHeatmap --help',
+        add_help=False)
+
+    return parser
+
+
+def process_args(args=None):
+ args = parse_arguments().parse_args(args)
+
+ args.heatmapHeight = args.heatmapHeight if args.heatmapHeight > 3 and args.heatmapHeight <= 100 else 10
+
+ if not matplotlib.colors.is_color_like(args.missingDataColor):
+ exit("The value {0} for --missingDataColor is not valid".format(args.missingDataColor))
+
+ args.boxAroundHeatmaps = True if args.boxAroundHeatmaps == 'yes' else False
+
+ return args
+
+
+def prepare_layout(hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGroup, colorbar_position, fig):
+    """
+    prepare the plot layout
+    as a grid having as many columns as samples
+    (+1 when the colorbar sits at the side)
+    and as many rows as groups (or clusters)
+    (+2 for a summary plot plus spacer, +2 for a colorbar placed below)
+    """
+    heatmapwidth, heatmapheight = heatmapsize
+
+    numcols = hm_matrix.get_num_samples()
+    numrows = hm_matrix.get_num_groups()
+    if perGroup:
+        numcols, numrows = numrows, numcols
+
+    # the rows have different size depending
+    # on the number of regions contained in the heatmap
+    if perGroup:
+        # per-group mode: every row gets the height of the largest group
+        height_ratio = np.array([np.amax(np.diff(hm_matrix.group_boundaries))] * numrows)
+        # scale ratio to sum = heatmapheight
+        height_ratio = heatmapheight * (height_ratio.astype(float) / height_ratio.sum())
+    else:
+        # per-sample mode: row height proportional to each group's region count
+        height_ratio = np.diff(hm_matrix.group_boundaries)
+        # scale ratio to sum = heatmapheight
+        height_ratio = heatmapheight * (height_ratio.astype(float) / height_ratio.sum())
+
+    # convert the height_ratio from numpy array back to list
+    height_ratio = height_ratio.tolist()
+    # the width ratio is equal for all heatmaps
+    width_ratio = [heatmapwidth] * numcols
+
+    if showColorbar:
+        if colorbar_position == 'below':
+            numrows += 2 # a spacer needs to be added to avoid overlaps
+            height_ratio += [4 / 2.54] # spacer
+            height_ratio += [1 / 2.54]
+        else:
+            numcols += 1
+            width_ratio += [1 / 2.54]
+
+    if showSummaryPlot:
+        numrows += 2 # plus 2 because a spacer is added
+        # make height of summary plot
+        # proportional to the width of heatmap
+        sumplot_height = heatmapwidth
+        spacer_height = heatmapwidth / 8
+        # scale height_ratios to convert from row
+        # numbers to heatmapheigt fractions
+        height_ratio = np.concatenate([[sumplot_height, spacer_height], height_ratio])
+
+    grids = gridspec.GridSpec(numrows, numcols, height_ratios=height_ratio, width_ratios=width_ratio, figure=fig)
+
+    return grids
+
+
+def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, wspace, hspace, colorbar_position, label_rotation=0.0):
+    """
+    A function to add profile plots to the given figure, possibly in a custom grid subplot which mimics a tight layout (if wspace and hspace are not None)
+
+    iterNum is the number of profile panels (samples, or groups when perGroup
+    is True) and iterNum2 the number of lines drawn per panel.
+    Returns the list of created profile axes.
+    """
+    if wspace is not None and hspace is not None:
+        if colorbar_position == 'side':
+            gridsSub = gridspec.GridSpecFromSubplotSpec(1, iterNum, subplot_spec=grids[0, :-1], wspace=wspace, hspace=hspace)
+        else:
+            gridsSub = gridspec.GridSpecFromSubplotSpec(1, iterNum, subplot_spec=grids[0, :], wspace=wspace, hspace=hspace)
+
+    ax_list = []
+    globalYmin = np.inf
+    globalYmax = -np.inf
+    for sample_id in range(iterNum):
+        if perGroup:
+            title = hm.matrix.group_labels[sample_id]
+            tickIdx = sample_id % hm.matrix.get_num_samples()
+        else:
+            title = hm.matrix.sample_labels[sample_id]
+            tickIdx = sample_id
+        # When a single yMin/yMax applies to all panels, later panels share
+        # the y scale and have their tick labels hidden below
+        if sample_id > 0 and len(yMin) == 1 and len(yMax) == 1:
+            ax_profile = fig.add_subplot(grids[0, sample_id])
+        else:
+            if wspace is not None and hspace is not None:
+                ax_profile = fig.add_subplot(gridsSub[0, sample_id])
+            else:
+                ax_profile = fig.add_subplot(grids[0, sample_id])
+
+        ax_profile.set_title(title)
+        for group in range(iterNum2):
+            if perGroup:
+                sub_matrix = hm.matrix.get_matrix(sample_id, group)
+                line_label = sub_matrix['sample']
+            else:
+                sub_matrix = hm.matrix.get_matrix(group, sample_id)
+                line_label = sub_matrix['group']
+            plot_single(ax_profile, sub_matrix['matrix'],
+                        averageType,
+                        color_list[group],
+                        line_label,
+                        plot_type=plot_type)
+
+        if sample_id > 0 and len(yMin) == 1 and len(yMax) == 1:
+            plt.setp(ax_profile.get_yticklabels(), visible=False)
+
+        if sample_id == 0 and yAxisLabel != '':
+            ax_profile.set_ylabel(yAxisLabel)
+        xticks, xtickslabel = hm.getTicks(tickIdx)
+        # Rescale tick positions when they do not already span the matrix width
+        if np.ceil(max(xticks)) != float(sub_matrix['matrix'].shape[1] - 1):
+            tickscale = float(sub_matrix['matrix'].shape[1] - 1) / max(xticks)
+            xticks_use = [x * tickscale for x in xticks]
+            ax_profile.axes.set_xticks(xticks_use)
+        else:
+            ax_profile.axes.set_xticks(xticks)
+        ax_profile.axes.set_xticklabels(xtickslabel, rotation=label_rotation)
+        ax_list.append(ax_profile)
+
+        # align the first and last label
+        # such that they don't fall off
+        # the heatmap sides
+        ticks = ax_profile.xaxis.get_major_ticks()
+        ticks[0].label1.set_horizontalalignment('left')
+        ticks[-1].label1.set_horizontalalignment('right')
+
+        globalYmin = min(float(globalYmin), ax_profile.get_ylim()[0])
+        globalYmax = max(globalYmax, ax_profile.get_ylim()[1])
+
+    # It turns out that set_ylim only takes float64s
+    for sample_id, subplot in enumerate(ax_list):
+        localYMin = yMin[sample_id % len(yMin)]
+        localYMax = yMax[sample_id % len(yMax)]
+        lims = [globalYmin, globalYmax]
+        if localYMin:
+            if localYMax:
+                lims = (float(localYMin), float(localYMax))
+            else:
+                lims = (float(localYMin), lims[1])
+        elif localYMax:
+            lims = (lims[0], float(localYMax))
+        # Guard against an empty/inverted range
+        if lims[0] >= lims[1]:
+            lims = (lims[0], lims[0] + 1)
+        ax_list[sample_id].set_ylim(lims)
+    return ax_list
+
+
+def plotlyMatrix(hm,
+ outFilename,
+ yMin=[None], yMax=[None],
+ zMin=[None], zMax=[None],
+ showSummaryPlot=False,
+ cmap=None, colorList=None, colorBarPosition='side',
+ perGroup=False,
+ averageType='median', yAxisLabel='', xAxisLabel='',
+ plotTitle='',
+ showColorbar=False,
+ label_rotation=0.0):
+ label_rotation *= -1.0
+ if colorBarPosition != 'side':
+ sys.error.write("Warning: It is not currently possible to have multiple colorbars with plotly!\n")
+
+ nRows = hm.matrix.get_num_groups()
+ nCols = hm.matrix.get_num_samples()
+ if perGroup:
+ nRows, nCols = nCols, nRows
+
+ profileHeight = 0.0
+ profileBottomBuffer = 0.0
+ if showSummaryPlot:
+ profileHeight = 0.2
+ profileBottomBuffer = 0.05
+ profileSideBuffer = 0.
+ profileWidth = 1. / nCols
+ if nCols > 1:
+ profileSideBuffer = 0.1 / (nCols - 1)
+ profileWidth = 0.9 / nCols
+
+ dataSummary = []
+ annos = []
+ fig = go.Figure()
+ fig['layout'].update(title=plotTitle)
+ xAxisN = 1
+ yAxisN = 1
+
+ # Summary plots at the top (if appropriate)
+ if showSummaryPlot:
+ yMinLocal = np.inf
+ yMaxLocal = -np.inf
+ for i in range(nCols):
+ xanchor = 'x{}'.format(xAxisN)
+ yanchor = 'y{}'.format(yAxisN)
+ xBase = i * (profileSideBuffer + profileWidth)
+ yBase = 1 - profileHeight
+ xDomain = [xBase, xBase + profileWidth]
+ yDomain = [yBase, 1.0]
+ for j in range(nRows):
+ if perGroup:
+ mat = hm.matrix.get_matrix(i, j)
+ xTicks, xTicksLabels = hm.getTicks(i)
+ label = mat['sample']
+ else:
+ mat = hm.matrix.get_matrix(j, i)
+ xTicks, xTicksLabels = hm.getTicks(j)
+ label = mat['group']
+ if j == 0:
+ fig['layout']['xaxis{}'.format(xAxisN)] = dict(domain=xDomain, anchor=yanchor, range=[0, mat['matrix'].shape[1]], tickmode='array', tickvals=xTicks, ticktext=xTicksLabels, tickangle=label_rotation)
+ fig['layout']['yaxis{}'.format(yAxisN)] = dict(anchor=xanchor, domain=yDomain)
+ trace = plotly_single(mat['matrix'], averageType, colorList[j], label)[0]
+ trace.update(xaxis=xanchor, yaxis=yanchor, legendgroup=label)
+ if min(trace['y']) < yMinLocal:
+ yMinLocal = min(trace['y'])
+ if max(trace['y']) > yMaxLocal:
+ yMaxLocal = max(trace['y'])
+ if i == 0:
+ trace.update(showlegend=True)
+ dataSummary.append(trace)
+
+ # Add the column label
+ if perGroup:
+ title = hm.matrix.group_labels[i]
+ else:
+ title = hm.matrix.sample_labels[i]
+ titleX = xBase + 0.5 * profileWidth
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': 1.0, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})
+ xAxisN += 1
+ yAxisN += 1
+
+ # Adjust y-bounds as appropriate:
+ for i in range(1, yAxisN):
+ yMinUse = yMinLocal
+ if yMin[(i - 1) % len(yMin)] is not None:
+ yMinUse = yMin[(i - 1) % len(yMin)]
+ yMaxUse = yMaxLocal
+ if yMax[(i - 1) % len(yMax)] is not None:
+ yMaxUse = yMax[(i - 1) % len(yMax)]
+ fig['layout']['yaxis{}'.format(i)].update(range=[yMinUse, yMaxUse])
+ fig['layout']['yaxis1'].update(title=yAxisLabel)
+
+ # Add the heatmap
+ dataHeatmap = []
+ zMinLocal = np.inf
+ zMaxLocal = -np.inf
+ heatmapWidth = 1. / nCols
+ heatmapSideBuffer = 0.0
+ if nCols > 1:
+ heatmapWidth = .9 / nCols
+ heatmapSideBuffer = 0.1 / (nCols - 1)
+ heatmapHeight = 1.0 - profileHeight - profileBottomBuffer
+
+ for i in range(nCols):
+ xanchor = 'x{}'.format(xAxisN)
+ xBase = i * (heatmapSideBuffer + heatmapWidth)
+
+ # Determine the height of each heatmap, they have no buffer
+ lengths = [0.0]
+ for j in range(nRows):
+ if perGroup:
+ mat = hm.matrix.get_matrix(i, j)
+ else:
+ mat = hm.matrix.get_matrix(j, i)
+ lengths.append(mat['matrix'].shape[0])
+ fractionalHeights = heatmapHeight * np.cumsum(lengths).astype(float) / np.sum(lengths).astype(float)
+ xDomain = [xBase, xBase + heatmapWidth]
+ fig['layout']['xaxis{}'.format(xAxisN)] = dict(domain=xDomain, anchor='free', position=0.0, range=[0, mat['matrix'].shape[1]], tickmode='array', tickvals=xTicks, ticktext=xTicksLabels, title=xAxisLabel)
+
+ # Start adding the heatmaps
+ for j in range(nRows):
+ if perGroup:
+ mat = hm.matrix.get_matrix(i, j)
+ label = mat['sample']
+ start = hm.matrix.group_boundaries[i]
+ end = hm.matrix.group_boundaries[i + 1]
+ else:
+ mat = hm.matrix.get_matrix(j, i)
+ label = mat['group']
+ start = hm.matrix.group_boundaries[j]
+ end = hm.matrix.group_boundaries[j + 1]
+ regs = hm.matrix.regions[start:end]
+ regs = [x[2] for x in regs]
+ yanchor = 'y{}'.format(yAxisN)
+ yDomain = [heatmapHeight - fractionalHeights[j + 1], heatmapHeight - fractionalHeights[j]]
+ visible = False
+ if i == 0:
+ visible = True
+ fig['layout']['yaxis{}'.format(yAxisN)] = dict(domain=yDomain, anchor=xanchor, visible=visible, title=label, tickmode='array', tickvals=[], ticktext=[])
+ if np.min(mat['matrix']) < zMinLocal:
+ zMinLocal = np.min(mat['matrix'])
+ if np.max(mat['matrix']) < zMaxLocal:
+ zMaxLocal = np.max(mat['matrix'])
+
+ trace = go.Heatmap(z=np.flipud(mat['matrix']),
+ y=regs[::-1],
+ xaxis=xanchor,
+ yaxis=yanchor,
+ showlegend=False,
+ name=label,
+ showscale=False)
+
+ dataHeatmap.append(trace)
+ yAxisN += 1
+ xAxisN += 1
+ if showColorbar:
+ dataHeatmap[-1].update(showscale=True)
+ dataHeatmap[-1]['colorbar'].update(len=heatmapHeight, y=0, yanchor='bottom', ypad=0.0)
+
+ # Adjust z bounds and colorscale
+ for trace in dataHeatmap:
+ zMinUse = zMinLocal
+ zMaxUse = zMaxLocal
+ if zMin[0] is not None:
+ zMinUse = zMin[0]
+ if zMax[0] is not None:
+ zMaxUse = zMax[0]
+ trace.update(zmin=zMinUse, zmax=zMaxUse, colorscale=convertCmap(cmap[0], vmin=zMinUse, vmax=zMaxUse))
+
+ dataSummary.extend(dataHeatmap)
+ fig.add_traces(dataSummary)
+ fig['layout']['annotations'] = annos
+ py.plot(fig, filename=outFilename, auto_open=False)
+
+
+def plotMatrix(hm, outFileName,
+ colorMapDict={'colorMap': ['binary'], 'missingDataColor': 'black', 'alpha': 1.0},
+ plotTitle='',
+ xAxisLabel='', yAxisLabel='', regionsLabel='',
+ zMin=None, zMax=None,
+ yMin=None, yMax=None,
+ averageType='median',
+ reference_point_label=None,
+ startLabel='TSS', endLabel="TES",
+ heatmapHeight=25,
+ heatmapWidth=7.5,
+ perGroup=False, whatToShow='plot, heatmap and colorbar',
+ plot_type='lines',
+ linesAtTickMarks=False,
+ image_format=None,
+ legend_location='upper-left',
+ box_around_heatmaps=True,
+ label_rotation=0.0,
+ dpi=200,
+ interpolation_method='auto'):
+
+ hm.reference_point_label = hm.parameters['ref point']
+ if reference_point_label is not None:
+ hm.reference_point_label = [reference_point_label] * hm.matrix.get_num_samples()
+ hm.startLabel = startLabel
+ hm.endLabel = endLabel
+
+ matrix_flatten = None
+ if zMin is None:
+ matrix_flatten = hm.matrix.flatten()
+ # try to avoid outliers by using np.percentile
+ zMin = np.percentile(matrix_flatten, 1.0)
+ if np.isnan(zMin):
+ zMin = [None]
+ else:
+ zMin = [zMin] # convert to list to support multiple entries
+ elif 'auto' in zMin:
+ matrix_flatten = hm.matrix.flatten()
+ auto_min = np.percentile(matrix_flatten, 1.0)
+ if np.isnan(auto_min):
+ auto_min = None
+ new_mins = [float(x) if x != 'auto' else auto_min for x in zMin]
+ zMin = new_mins
+ else:
+ new_mins = [float(x) for x in zMin]
+ zMin = new_mins
+
+ if zMax is None:
+ if matrix_flatten is None:
+ matrix_flatten = hm.matrix.flatten()
+ # try to avoid outliers by using np.percentile
+ zMax = np.percentile(matrix_flatten, 98.0)
+ if np.isnan(zMax) or zMax <= zMin[0]:
+ zMax = [None]
+ else:
+ zMax = [zMax]
+ elif 'auto' in zMax:
+ matrix_flatten = hm.matrix.flatten()
+ auto_max = np.percentile(matrix_flatten, 98.0)
+ if np.isnan(auto_max):
+ auto_max = None
+ new_maxs = [float(x) if x != 'auto' else auto_max for x in zMax]
+ zMax = new_maxs
+ else:
+ new_maxs = [float(x) for x in zMax]
+ zMax = new_maxs
+ if (len(zMin) > 1) & (len(zMax) > 1):
+ for index, value in enumerate(zMax):
+ if value <= zMin[index]:
+ sys.stderr.write("Warnirng: In bigwig {}, the given zmin ({}) is larger than "
+ "or equal to the given zmax ({}). Thus, it has been set "
+ "to None. \n".format(index + 1, zMin[index], value))
+ zMin[index] = None
+
+ if yMin is None:
+ yMin = [None]
+ if yMax is None:
+ yMax = [None]
+ if not isinstance(yMin, list):
+ yMin = [yMin]
+ if not isinstance(yMax, list):
+ yMax = [yMax]
+
+ plt.rcParams['font.size'] = 8.0
+ fontP = FontProperties()
+
+ showSummaryPlot = False
+ showColorbar = False
+
+ if whatToShow == 'plot and heatmap':
+ showSummaryPlot = True
+ elif whatToShow == 'heatmap and colorbar':
+ showColorbar = True
+ elif whatToShow == 'plot, heatmap and colorbar':
+ showSummaryPlot = True
+ showColorbar = True
+
+ # colormap for the heatmap
+ if colorMapDict['colorMap']:
+ cmap = []
+ for color_map in colorMapDict['colorMap']:
+ copy_cmp = copy.copy(plt.get_cmap(color_map))
+ cmap.append(copy_cmp)
+ cmap[-1].set_bad(colorMapDict['missingDataColor']) # nans are printed using this color
+
+ if colorMapDict['colorList'] and len(colorMapDict['colorList']) > 0:
+ # make a cmap for each color list given
+ cmap = []
+ for color_list in colorMapDict['colorList']:
+ cmap.append(matplotlib.colors.LinearSegmentedColormap.from_list(
+ 'my_cmap', color_list.replace(' ', '').split(","), N=colorMapDict['colorNumber']))
+ cmap[-1].set_bad(colorMapDict['missingDataColor']) # nans are printed using this color
+
+ if len(cmap) > 1 or len(zMin) > 1 or len(zMax) > 1:
+ # position color bar below heatmap when more than one
+ # heatmap color is given
+ colorbar_position = 'below'
+ else:
+ colorbar_position = 'side'
+
+ # figsize: w,h tuple in inches
+ figwidth = heatmapWidth / 2.54
+ figheight = heatmapHeight / 2.54
+ if showSummaryPlot:
+ # the summary plot ocupies a height
+ # equal to the fig width
+ figheight += figwidth
+
+ numsamples = hm.matrix.get_num_samples()
+ if perGroup:
+ num_cols = hm.matrix.get_num_groups()
+ else:
+ num_cols = numsamples
+ total_figwidth = figwidth * num_cols
+ if showColorbar:
+ if colorbar_position == 'below':
+ figheight += 1 / 2.54
+ else:
+ total_figwidth += 1 / 2.54
+
+ fig = plt.figure(figsize=(total_figwidth, figheight), constrained_layout=True)
+ fig.suptitle(plotTitle, y=1 - (0.06 / figheight))
+
+ grids = prepare_layout(
+ hm.matrix,
+ (heatmapWidth, heatmapHeight),
+ showSummaryPlot,
+ showColorbar,
+ perGroup,
+ colorbar_position,
+ fig
+ )
+
+ # color map for the summary plot (profile) on top of the heatmap
+ cmap_plot = plt.get_cmap('jet')
+ numgroups = hm.matrix.get_num_groups()
+ if perGroup:
+ color_list = cmap_plot(np.arange(hm.matrix.get_num_samples()) / hm.matrix.get_num_samples())
+ else:
+ color_list = cmap_plot(np.arange(numgroups) / numgroups)
+ alpha = colorMapDict['alpha']
+ if image_format == 'plotly':
+ return plotlyMatrix(hm,
+ outFileName,
+ yMin=yMin, yMax=yMax,
+ zMin=zMin, zMax=zMax,
+ showSummaryPlot=showSummaryPlot, showColorbar=showColorbar,
+ cmap=cmap, colorList=color_list, colorBarPosition=colorbar_position,
+ perGroup=perGroup,
+ averageType=averageType, plotTitle=plotTitle,
+ xAxisLabel=xAxisLabel, yAxisLabel=yAxisLabel,
+ label_rotation=label_rotation)
+
+ # check if matrix is reference-point based using the upstream >0 value
+ # and is sorted by region length. If this is
+ # the case, prepare the data to plot a border at the regions end
+ regions_length_in_bins = [None] * len(hm.parameters['upstream'])
+ if hm.matrix.sort_using == 'region_length' and hm.matrix.sort_method != 'no':
+ for idx in range(len(hm.parameters['upstream'])):
+ if hm.parameters['ref point'][idx] is None:
+ regions_length_in_bins[idx] = None
+ continue
+
+ _regions = hm.matrix.get_regions()
+ foo = []
+ for _group in _regions:
+ _reg_len = []
+ for ind_reg in _group:
+ if isinstance(ind_reg, dict):
+ _len = ind_reg['end'] - ind_reg['start']
+ else:
+ _len = sum([x[1] - x[0] for x in ind_reg[1]])
+ if hm.parameters['ref point'][idx] == 'TSS':
+ _reg_len.append((hm.parameters['upstream'][idx] + _len) / hm.parameters['bin size'][idx])
+ elif hm.parameters['ref point'][idx] == 'center':
+ _len *= 0.5
+ _reg_len.append((hm.parameters['upstream'][idx] + _len) / hm.parameters['bin size'][idx])
+ elif hm.parameters['ref point'][idx] == 'TES':
+ _reg_len.append((hm.parameters['upstream'][idx] - _len) / hm.parameters['bin size'][idx])
+ foo.append(_reg_len)
+ regions_length_in_bins[idx] = foo
+
+ # plot the profiles on top of the heatmaps
+ if showSummaryPlot:
+ if perGroup:
+ iterNum = numgroups
+ iterNum2 = hm.matrix.get_num_samples()
+ else:
+ iterNum = hm.matrix.get_num_samples()
+ iterNum2 = numgroups
+ ax_list = addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, None, None, colorbar_position, label_rotation)
+
+ if legend_location != 'none':
+ ax_list[-1].legend(loc=legend_location.replace('-', ' '), ncol=1, prop=fontP,
+ frameon=False, markerscale=0.5)
+
+ first_group = 0 # helper variable to place the title per sample/group
+ for sample in range(hm.matrix.get_num_samples()):
+ sample_idx = sample
+ for group in range(numgroups):
+ group_idx = group
+ # add the respective profile to the
+ # summary plot
+ sub_matrix = hm.matrix.get_matrix(group, sample)
+ if showSummaryPlot:
+ if perGroup:
+ sample_idx = sample + 2 # plot + spacer
+ else:
+ group += 2 # plot + spacer
+ first_group = 1
+
+ if perGroup:
+ ax = fig.add_subplot(grids[sample_idx, group])
+ # the remainder (%) is used to iterate
+ # over the available color maps (cmap).
+ # if the user only provided, lets say two
+ # and there are 10 groups, colormaps they are reused every
+ # two groups.
+ cmap_idx = group_idx % len(cmap)
+ zmin_idx = group_idx % len(zMin)
+ zmax_idx = group_idx % len(zMax)
+ else:
+ ax = fig.add_subplot(grids[group, sample])
+ # see above for the use of '%'
+ cmap_idx = sample % len(cmap)
+ zmin_idx = sample % len(zMin)
+ zmax_idx = sample % len(zMax)
+
+ if group == first_group and not showSummaryPlot and not perGroup:
+ title = hm.matrix.sample_labels[sample]
+ ax.set_title(title)
+
+ if box_around_heatmaps is False:
+ # Turn off the boxes around the individual heatmaps
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.spines['bottom'].set_visible(False)
+ ax.spines['left'].set_visible(False)
+ rows, cols = sub_matrix['matrix'].shape
+ # if the number of rows is too large, then the 'nearest' method simply
+ # drops rows. A better solution is to relate the threshold to the DPI of the image
+ if interpolation_method == 'auto':
+ if rows >= 1000:
+ interpolation_method = 'bilinear'
+ else:
+ interpolation_method = 'nearest'
+
+ # if np.clip is not used, then values of the matrix that exceed the zmax limit are
+ # highlighted. Usually, a significant amount of pixels are equal or above the zmax and
+ # the default behaviour produces images full of large highlighted dots.
+ # If interpolation='nearest' is used, this has no effect
+ sub_matrix['matrix'] = np.clip(sub_matrix['matrix'], zMin[zmin_idx], zMax[zmax_idx])
+ img = ax.imshow(sub_matrix['matrix'],
+ aspect='auto',
+ interpolation=interpolation_method,
+ origin='upper',
+ vmin=zMin[zmin_idx],
+ vmax=zMax[zmax_idx],
+ cmap=cmap[cmap_idx],
+ alpha=alpha,
+ extent=[0, cols, rows, 0])
+ img.set_rasterized(True)
+ # plot border at the end of the regions
+ # if ordered by length
+ if regions_length_in_bins[sample] is not None:
+ x_lim = ax.get_xlim()
+ y_lim = ax.get_ylim()
+
+ ax.plot(regions_length_in_bins[sample][group_idx],
+ np.arange(len(regions_length_in_bins[sample][group_idx])),
+ '--', color='black', linewidth=0.5, dashes=(3, 2))
+ ax.set_xlim(x_lim)
+ ax.set_ylim(y_lim)
+
+ if perGroup:
+ ax.axes.set_xlabel(sub_matrix['group'])
+ if sample < hm.matrix.get_num_samples() - 1:
+ ax.axes.get_xaxis().set_visible(False)
+ else:
+ ax.axes.get_xaxis().set_visible(False)
+ ax.axes.set_xlabel(xAxisLabel)
+ ax.axes.set_yticks([])
+ if perGroup and group == 0:
+ ax.axes.set_ylabel(sub_matrix['sample'])
+ elif not perGroup and sample == 0:
+ ax.axes.set_ylabel(sub_matrix['group'])
+
+ # Plot vertical lines at tick marks if desired
+ if linesAtTickMarks:
+ xticks_heat, xtickslabel_heat = hm.getTicks(sample)
+ xticks_heat = [x + 0.5 for x in xticks_heat] # There's an offset of 0.5 compared to the profile plot
+ if np.ceil(max(xticks_heat)) != float(sub_matrix['matrix'].shape[1]):
+ tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks_heat)
+ xticks_heat_use = [x * tickscale for x in xticks_heat]
+ else:
+ xticks_heat_use = xticks_heat
+ for x in xticks_heat_use:
+ ax.axvline(x=x, color='black', linewidth=0.5, dashes=(3, 2))
+
+ # add labels to last block in a column
+ if (perGroup and sample == numsamples - 1) or \
+ (not perGroup and group_idx == numgroups - 1):
+
+ # add xticks to the bottom heatmap (last group)
+ ax.axes.get_xaxis().set_visible(True)
+ xticks_heat, xtickslabel_heat = hm.getTicks(sample)
+ xticks_heat = [x + 0.5 for x in xticks_heat] # There's an offset of 0.5 compared to the profile plot
+ if np.ceil(max(xticks_heat)) != float(sub_matrix['matrix'].shape[1]):
+ tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks_heat)
+ xticks_heat_use = [x * tickscale for x in xticks_heat]
+ ax.axes.set_xticks(xticks_heat_use)
+ else:
+ ax.axes.set_xticks(xticks_heat)
+ ax.axes.set_xticklabels(xtickslabel_heat, size=8)
+
+ # align the first and last label
+ # such that they don't fall off
+ # the heatmap sides
+ ticks = ax.xaxis.get_major_ticks()
+ ticks[0].label1.set_horizontalalignment('left')
+ ticks[-1].label1.set_horizontalalignment('right')
+
+ ax.get_xaxis().set_tick_params(
+ which='both',
+ top=False,
+ direction='out')
+
+ if showColorbar and colorbar_position == 'below':
+ # draw a colormap per each heatmap below the last block
+ if perGroup:
+ col = group_idx
+ else:
+ col = sample
+ ax = fig.add_subplot(grids[-1, col])
+ tick_locator = ticker.MaxNLocator(nbins=3)
+ cbar = fig.colorbar(img, cax=ax, orientation='horizontal', ticks=tick_locator)
+ labels = cbar.ax.get_xticklabels()
+ ticks = cbar.ax.get_xticks()
+ if ticks[0] == 0:
+ # if the label is at the start of the colobar
+ # move it a bit inside to avoid overlapping
+ # with other labels
+ labels[0].set_horizontalalignment('left')
+ if ticks[-1] == 1:
+ # if the label is at the end of the colobar
+ # move it a bit inside to avoid overlapping
+ # with other labels
+ labels[-1].set_horizontalalignment('right')
+ # cbar.ax.set_xticklabels(labels, rotation=90)
+
+ if showColorbar and colorbar_position != 'below':
+ if showSummaryPlot:
+ # we don't want to colorbar to extend
+ # over the profiles and spacer top rows
+ grid_start = 2
+ else:
+ grid_start = 0
+
+ ax = fig.add_subplot(grids[grid_start:, -1])
+ fig.colorbar(img, cax=ax)
+
+ if box_around_heatmaps:
+ fig.get_layout_engine().set(wspace=0.10, hspace=0.025, rect=(0.04, 0, 0.96, 0.85))
+ else:
+ # When no box is plotted the space between heatmaps is reduced
+ fig.get_layout_engine().set(wspace=0.05, hspace=0.01, rect=(0.04, 0, 0.96, 0.85))
+
+ plt.savefig(outFileName, bbox_inches='tight', pad_inches=0.1, dpi=dpi, format=image_format)
+ plt.close()
+
+
def mergeSmallGroups(matrixDict):
    """
    Merge consecutive groups that are individually too small (at most 1%
    of the total number of regions) into a single group so that the
    visualization is not impaired by tiny heatmap blocks.

    Args:
        matrixDict: ordered mapping of group label -> 2D numpy array whose
            rows are regions. Iteration order defines group adjacency.

    Returns:
        OrderedDict mapping (possibly space-joined) labels to row-wise
        concatenated arrays. Large groups are passed through unchanged.
    """
    group_lengths = [len(x) for x in matrixDict.values()]
    # a group is considered "small" if it holds <= 1% of all regions
    min_group_length = sum(group_lengths) * 0.01

    to_merge = []
    _mergedHeatMapDict = OrderedDict()

    for i, (label, ma) in enumerate(matrixDict.items()):
        # merge small groups together
        # otherwise visualization is impaired
        if group_lengths[i] > min_group_length:
            if to_merge:
                # flush the accumulated small groups together with this one
                to_merge.append(label)
                new_label = " ".join(to_merge)
                new_ma = np.concatenate([matrixDict[item]
                                         for item in to_merge], axis=0)
            else:
                new_label = label
                new_ma = ma

            _mergedHeatMapDict[new_label] = new_ma
            to_merge = []
        else:
            to_merge.append(label)

    # Flush any trailing run of small groups. The previous implementation
    # dropped a single trailing small group and crashed on multiple ones
    # (np.array() without arguments is a TypeError, and concatenating an
    # empty 1-D array with 2-D matrices fails anyway).
    if to_merge:
        new_label = " ".join(to_merge)
        _mergedHeatMapDict[new_label] = np.concatenate(
            [matrixDict[item] for item in to_merge], axis=0)

    return _mergedHeatMapDict
+
+
def main(args=None):
    """
    Entry point for plotHeatmap: read a computeMatrix output file,
    optionally filter/cluster/sort its regions, and render the heatmap.

    Args:
        args: optional list of command-line arguments; defaults to sys.argv.
    """
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file)

    # apply min/max value filtering (recorded by computeMatrix) before
    # any clustering or sorting
    if hm.parameters['min threshold'] is not None or hm.parameters['max threshold'] is not None:
        filterHeatmapValues(hm, hm.parameters['min threshold'], hm.parameters['max threshold'])

    if args.sortRegions == 'keep':
        args.sortRegions = 'no'  # These are the same thing

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples)
    elif args.hclust is not None:
        print("Performing hierarchical clustering. "
              "Please note that it might be very slow for large datasets.\n")
        hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples)

    # warn about groups that hold < 0.5% of all regions; matplotlib tends to
    # error out on such tiny blocks
    group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, you might want to remove it. "
                         "There will likely be an error message from matplotlib regarding this "
                         "below.\n".format(hm.matrix.group_labels[problem[0]]))

    if args.regionsLabel:
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    if args.sortRegions != 'no':
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            # --sortUsingSamples is 1-based on the command line
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortSamples is not valid. Only values from 1 to {1} are allowed.".format(args.sortUsingSamples, hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions,
                              sample_list=sortUsingSamples)

    if args.silhouette:
        if args.kmeans is not None:
            hm.matrix.computeSilhouette(args.kmeans)
        elif args.hclust is not None:
            # BUGFIX: was args.args.hclust, which raised AttributeError
            # whenever --silhouette was combined with --hclust
            hm.matrix.computeSilhouette(args.hclust)

    if args.outFileNameMatrix:
        hm.save_matrix(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    colormap_dict = {'colorMap': args.colorMap,
                     'colorList': args.colorList,
                     'colorNumber': args.colorNumber,
                     'missingDataColor': args.missingDataColor,
                     'alpha': args.alpha}

    plotMatrix(hm,
               args.outFileName,
               colormap_dict, args.plotTitle,
               args.xAxisLabel, args.yAxisLabel, args.regionsLabel,
               args.zMin, args.zMax,
               args.yMin, args.yMax,
               args.averageTypeSummaryPlot,
               args.refPointLabel,
               args.startLabel,
               args.endLabel,
               args.heatmapHeight,
               args.heatmapWidth,
               args.perGroup,
               args.whatToShow,
               linesAtTickMarks=args.linesAtTickMarks,
               plot_type=args.plotType,
               image_format=args.plotFileFormat,
               legend_location=args.legendLocation,
               box_around_heatmaps=args.boxAroundHeatmaps,
               label_rotation=args.label_rotation,
               dpi=args.dpi,
               interpolation_method=args.interpolationMethod)
diff --git a/deepTools/source/deeptools/plotPCA.py b/deepTools/source/deeptools/plotPCA.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc17ed32b18ab94486ed447b75f5c309bc827ed7
--- /dev/null
+++ b/deepTools/source/deeptools/plotPCA.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import argparse
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+from deeptools import cm # noqa: F401
+from importlib.metadata import version
+from deeptools.correlation import Correlation
+from deeptools.parserCommon import writableFile
+
+
def parse_arguments(args=None):
    """Assemble the top-level plotPCA argument parser on top of the shared options."""
    description = """
Tool for generating a principal component analysis (PCA)
plot from multiBamSummary or multiBigwigSummary output. By default, the loadings for each sample in each principal component is plotted. If the data is transposed, the projections of each sample on the requested principal components is plotted instead.

Detailed help:

  plotPCA -h

"""
    epilog = ('example usages:\n'
              'plotPCA -in coverages.npz -o pca.png\n\n'
              ' \n\n')
    usage = ('plotPCA -in coverage.npz -o pca.png\n'
             'help: plotPCA -h / plotPCA --help\n')
    return argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
        epilog=epilog,
        parents=[plotCorrelationArgs()],
        usage=usage)
+
+
def plotCorrelationArgs():
    """
    Build the argument parser holding every plotPCA option.

    Returns:
        argparse.ArgumentParser created with add_help=False, intended to be
        passed via ``parents=`` to the top-level parser in parse_arguments().
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--corData', '-in',
                          metavar='FILE',
                          help='Coverage file (generated by multiBamSummary or multiBigwigSummary)',
                          required=True)

    # -- output options ---------------------------------------------------
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--plotFile', '-o',
                          help='File name to save the plot to. '
                               'The extension determines the file format. '
                               'For example: '
                               'pca.pdf will save the PCA plot in PDF format. '
                               'The available options are: .png, '
                               '.eps, .pdf and .svg. If this option is omitted, then you MUST specify --outFileNameData',
                          type=writableFile,
                          metavar='FILE')

    optional.add_argument('--labels', '-l',
                          metavar='sample1 sample2',
                          help='User defined labels instead of default labels from '
                               'file names. '
                               'Multiple labels have to be separated by spaces, e.g. '
                               '--labels sample1 sample2 sample3',
                          nargs='+')

    optional.add_argument('--plotTitle', '-T',
                          help='Title of the plot, to be printed on top of '
                               'the generated image. Leave blank for no title. (Default: %(default)s)',
                          default='')

    optional.add_argument('--plotFileFormat',
                          metavar='FILETYPE',
                          help='Image format type. If given, this option '
                               'overrides the image format based on the plotFile '
                               'ending. The available options are: png, '
                               'eps, pdf, plotly and svg.',
                          choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    # plot dimensions are given in cm and converted downstream
    optional.add_argument('--plotHeight',
                          help='Plot height in cm. (Default: %(default)s)',
                          type=float,
                          default=10)

    optional.add_argument('--plotWidth',
                          help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)',
                          type=float,
                          default=10)

    optional.add_argument('--outFileNameData',
                          metavar='file.tab',
                          type=writableFile,
                          help='File name to which the data underlying the plot '
                               'should be saved, such as myPCA.tab. For untransposed '
                               'data, this is the loading per-sample and PC as well '
                               'as the eigenvalues. For transposed data, this is the '
                               'rotation per-sample and PC and the eigenvalues. The '
                               'projections are truncated to the number of '
                               'eigenvalues for transposed data.')

    # -- PCA computation options ------------------------------------------
    optional.add_argument('--ntop',
                          help='Use only the top N most variable rows in the '
                               'original matrix. Specifying 0 will result in all '
                               'rows being used. If the matrix is to be transposed, '
                               'rows with 0 variance are always excluded, even if a '
                               'values of 0 is specified. The default is 1000. (Default: %(default)s)',
                          type=int,
                          default=1000)

    optional.add_argument('--PCs',
                          help='The principal components to plot. If specified, '
                               'you must provide two different integers, greater '
                               'than zero, separated by a space. An example (and the default) is "1 2". (Default: %(default)s)',
                          type=int,
                          nargs=2,
                          default=[1, 2])

    optional.add_argument('--log2',
                          help='log2 transform the datapoints prior to computing '
                               'the PCA. Note that 0.01 is added to all values to '
                               'prevent 0 values from becoming -infinity. Using this '
                               'option with input that contains negative values will '
                               'result in an error.',
                          action='store_true')

    # -- symbol appearance -------------------------------------------------
    optional.add_argument('--colors',
                          metavar="COLORS",
                          nargs='+',
                          help="A list of colors for the symbols. Color names and html hex string (e.g., #eeff22) are accepted. The color names should be space separated. For example, --colors red blue green. If not specified, the symbols will be given automatic colors.")

    optional.add_argument('--markers',
                          metavar="MARKERS",
                          nargs='+',
                          help="A list of markers for the symbols. (e.g., '<','>','o') are accepted. The marker values should be space separated. For example, --markers 's' 'o' 's' 'o'. If not specified, the symbols will be given automatic shapes.")

    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))

    # --transpose and --rowCenter are mutually exclusive: row centering is
    # meaningless once the matrix has been transposed
    optionalEx = optional.add_mutually_exclusive_group()
    optionalEx.add_argument('--transpose',
                            help='Perform the PCA on the transposed matrix, (i.e., on the '
                                 'matrix where rows are samples and columns are '
                                 'bins/features. This then matches what is typically '
                                 'done in R.',
                            action='store_true')

    optionalEx.add_argument('--rowCenter',
                            help='When specified, each row (bin, gene, etc.) '
                                 'in the matrix is centered at 0 before the PCA is '
                                 'computed. This is useful only if you have a strong '
                                 'bin/gene/etc. correlation and the resulting '
                                 'principal component has samples stacked vertically. This option is not applicable if --transpose is specified.',
                            action='store_true')

    return parser
+
+
def main(args=None):
    """
    Entry point for plotPCA: validate the options, compute/plot the PCA via
    the Correlation helper, and optionally write the underlying data.

    Args:
        args: optional list of command-line arguments; defaults to sys.argv.
    """
    args = parse_arguments().parse_args(args)

    if args.plotFile is None and args.outFileNameData is None:
        sys.exit("At least one of --plotFile and --outFileNameData must be specified!\n")

    if args.ntop < 0:
        sys.exit("The value specified for --ntop must be >= 0!\n")

    if args.PCs[0] == args.PCs[1]:
        sys.exit("You must specify different principal components!\n")
    if args.PCs[0] <= 0 or args.PCs[1] <= 0:
        sys.exit("The specified principal components must be at least 1!\n")

    corr = Correlation(args.corData,
                       labels=args.labels,)

    # these options are consumed by plot_pca through instance attributes
    corr.rowCenter = args.rowCenter
    corr.transpose = args.transpose
    corr.ntop = args.ntop
    corr.log2 = args.log2

    Wt, eigenvalues = corr.plot_pca(args.plotFile,
                                    PCs=args.PCs,
                                    plot_title=args.plotTitle,
                                    image_format=args.plotFileFormat,
                                    plotWidth=args.plotWidth,
                                    plotHeight=args.plotHeight,
                                    cols=args.colors,
                                    marks=args.markers)

    if args.outFileNameData is not None:
        # context manager guarantees the file is closed even if a write fails
        # (the previous open()/close() pair leaked the handle on error)
        with open(args.outFileNameData, "w") as of:
            of.write("#plotPCA --outFileNameData\n")
            of.write("Component\t{}\tEigenvalue\n".format("\t".join(corr.labels)))
            n = eigenvalues.shape[0]
            for i in range(n):
                of.write("{}\t{}\t{}\n".format(i + 1, "\t".join(["{}".format(x) for x in Wt[i, :]]), eigenvalues[i]))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deepTools/source/deeptools/plotProfile.py b/deepTools/source/deeptools/plotProfile.py
new file mode 100644
index 0000000000000000000000000000000000000000..7497875f207cfd4ea2855a55c320a6cd05355de1
--- /dev/null
+++ b/deepTools/source/deeptools/plotProfile.py
@@ -0,0 +1,973 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+import sys
+
+import argparse
+import numpy as np
+from math import ceil
+import matplotlib
+matplotlib.use('Agg')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['svg.fonttype'] = 'none'
+import deeptools.cm # noqa: F401
+import matplotlib.pyplot as plt
+from matplotlib.font_manager import FontProperties
+from matplotlib import colors as pltcolors
+import matplotlib.gridspec as gridspec
+
+import plotly.offline as py
+import plotly.graph_objs as go
+
+# own modules
+from deeptools import parserCommon
+from deeptools import heatmapper
+from deeptools.heatmapper_utilities import plot_single, plotly_single, getProfileTicks
+from deeptools.computeMatrixOperations import filterHeatmapValues
+
+
+debug = 0
+old_settings = np.seterr(all='ignore')
+plt.ioff()
+
+
def parse_arguments(args=None):
    """Assemble the plotProfile argument parser from the shared heatmapper option groups."""
    parent_parsers = [
        parserCommon.heatmapperMatrixArgs(),
        parserCommon.heatmapperOutputArgs(mode='profile'),
        parserCommon.heatmapperOptionalArgs(mode='profile'),
    ]
    description = ('This tool creates a profile plot for '
                   'scores over sets of genomic regions. '
                   'Typically, these regions are genes, but '
                   'any other regions defined in BED '
                   ' will work. A matrix generated '
                   'by computeMatrix is required.')
    return argparse.ArgumentParser(
        parents=parent_parsers,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=description,
        epilog='An example usage is: plotProfile -m matrix.gz',
        add_help=False,
        usage='plotProfile -m matrix.gz\n'
              'help: plotProfile -h / plotProfile --help')
+
+
def process_args(args=None):
    """
    Parse and normalize plotProfile command-line arguments.

    Ensures yMin/yMax are lists (of floats, or [None] when unset/empty)
    and clamps the plot height to the supported range.

    Args:
        args: optional list of command-line arguments; defaults to sys.argv.

    Returns:
        The parsed argparse.Namespace with normalized yMin/yMax/plotHeight.
    """
    args = parse_arguments().parse_args(args)

    # Ensure that yMin/yMax are there and a list. getattr covers both a
    # missing attribute and an explicit None; the previous
    # "try: assert ... except:" swallowed every exception and silently
    # stopped working under `python -O`.
    if getattr(args, 'yMin', None) is None:
        args.yMin = [None]
    if getattr(args, 'yMax', None) is None:
        args.yMax = [None]

    # Sometimes Galaxy sends --yMax '' and --yMin ''
    if args.yMin == ['']:
        args.yMin = [None]
    if args.yMax == ['']:
        args.yMax = [None]

    # Convert to floats
    if args.yMin != [None]:
        args.yMin = [float(x) for x in args.yMin]
    if args.yMax != [None]:
        args.yMax = [float(x) for x in args.yMax]

    # clamp the plot height (cm) into the supported [0.5, 100] range
    if args.plotHeight < 0.5:
        args.plotHeight = 0.5
    elif args.plotHeight > 100:
        args.plotHeight = 100

    return args
+
+
+class Profile(object):
+
    def __init__(self, hm, out_file_name,
                 plot_title='', y_axis_label='',
                 y_min=None, y_max=None,
                 averagetype='median',
                 reference_point_label=None,
                 start_label='TSS', end_label='TES',
                 plot_height=7,
                 plot_width=11,
                 per_group=False,
                 plot_type='lines',
                 image_format=None,
                 color_list=None,
                 legend_location='best',
                 plots_per_row=8,
                 label_rotation=0,
                 dpi=200):
        """
        Using the hm matrix, makes a line plot
        either per group or per sample
        using the specified parameters.

        Also sets up the matplotlib figure and GridSpec that the plot_*
        methods draw into, and normalizes the axis labels to one entry
        per sample.

        Args:
            hm: heatmapper object holding the computeMatrix data
            out_file_name: string, path the plot is saved to
            plot_title: string printed as the figure suptitle
            y_axis_label: list
            y_min: list of per-column lower y limits (or None)
            y_max: list of per-column upper y limits (or None)
            averagetype: mean, sum, median
            reference_point_label: string; when None, the labels recorded
                by computeMatrix ('ref point') are used instead
            start_label: string
            end_label: string
            plot_height: in cm
            plot_width: in cm
            per_group: bool; when True, one subplot per group with one line
                per sample, otherwise one subplot per sample
            plot_type: string
            image_format: string
            color_list: list
            legend_location:
            plots_per_row: int, subplots per figure row
            label_rotation: float, x tick label rotation in degrees

        Returns:

        """
        self.hm = hm
        self.out_file_name = out_file_name
        self.plot_title = plot_title
        self.y_axis_label = y_axis_label
        self.y_min = y_min
        self.y_max = y_max
        self.averagetype = averagetype
        self.reference_point_label = reference_point_label
        self.start_label = start_label
        self.end_label = end_label
        self.plot_height = plot_height
        self.plot_width = plot_width
        self.per_group = per_group
        self.plot_type = plot_type
        self.image_format = image_format
        self.color_list = color_list
        self.legend_location = legend_location
        self.plots_per_row = plots_per_row
        self.label_rotation = label_rotation
        self.dpi = dpi

        # Honor reference point labels from computeMatrix
        if reference_point_label is None:
            self.reference_point_label = hm.parameters['ref point']

        # decide how many plots are needed
        if self.per_group:
            self.numplots = self.hm.matrix.get_num_groups()
            self.numlines = self.hm.matrix.get_num_samples()
        else:
            self.numplots = self.hm.matrix.get_num_samples()
            self.numlines = self.hm.matrix.get_num_groups()

        # lay the subplots out in rows of at most plots_per_row columns
        if self.numplots > self.plots_per_row:
            rows = np.ceil(self.numplots / float(self.plots_per_row)).astype(int)
            cols = self.plots_per_row
        else:
            rows = 1
            cols = self.numplots
        self.grids = gridspec.GridSpec(rows, cols)

        plt.rcParams['font.size'] = 10.0
        self.font_p = FontProperties()
        self.font_p.set_size('small')

        # convert cm values to inches
        plot_height_inches = rows * self.cm2inch(self.plot_height)[0]
        self.fig = plt.figure(figsize=self.cm2inch(cols * self.plot_width, rows * self.plot_height))
        # nudge the suptitle so it sits just above the subplots regardless
        # of the figure height
        self.fig.suptitle(self.plot_title, y=(1 - (0.06 / plot_height_inches)))

        # Ensure that the labels are vectors (one entry per sample) so the
        # per-column getTicks() lookup can index them uniformly
        nSamples = len(self.hm.matrix.sample_labels)
        if not isinstance(self.reference_point_label, list):
            self.reference_point_label = [self.reference_point_label] * nSamples
        if not isinstance(self.start_label, list):
            self.start_label = [self.start_label] * nSamples
        if not isinstance(self.end_label, list):
            self.end_label = [self.end_label] * nSamples
+
+ def getTicks(self, idx):
+ """
+ This is essentially a wrapper around getProfileTicks to accomdate the fact that each column has its own ticks.
+ """
+ xticks, xtickslabel = getProfileTicks(self.hm, self.reference_point_label[idx], self.start_label[idx], self.end_label[idx], idx)
+ return xticks, xtickslabel
+
+ @staticmethod
+ def cm2inch(*tupl):
+ inch = 2.54
+ if isinstance(tupl[0], tuple):
+ return tuple(i / inch for i in tupl[0])
+ else:
+ return tuple(i / inch for i in tupl)
+
    def plot_hexbin(self):
        """
        Render one hexbin density panel per plot (sample or group,
        depending on self.per_group), with one sub-panel per line and a
        shared colorbar per panel, then save the figure.

        The data is drawn twice: a first throw-away pass collects the
        global color limits (vmin/vmax) across all sub-panels of a plot,
        and a second pass draws the real axes with those shared limits.
        """
        from matplotlib import cm
        cmap = cm.coolwarm
        # missing values (NaN) are rendered in black
        cmap.set_bad('black')

        # plotly output is handled by a dedicated method
        if self.image_format == "plotly":
            return self.plotly_hexbin()

        for plot in range(self.numplots):
            # position of this panel within the plots_per_row grid
            col = plot % self.plots_per_row
            row = int(plot / float(self.plots_per_row))
            localYMin = None
            localYMax = None

            # split the ax to make room for the colorbar and for each of the
            # groups
            sub_grid = gridspec.GridSpecFromSubplotSpec(self.numlines, 2, subplot_spec=self.grids[row, col],
                                                        width_ratios=[0.92, 0.08], wspace=0.05, hspace=0.1)

            # temporary axis used only to probe the color limits; it is
            # removed again with delaxes() below
            ax = self.fig.add_subplot(sub_grid[0, 0])

            ax.tick_params(
                axis='y',
                which='both',
                left=False,
                right=False,
                labelleft=True)

            if self.per_group:
                title = self.hm.matrix.group_labels[plot]
            else:
                title = self.hm.matrix.sample_labels[plot]

            # first pass: find the common vmin/vmax over all lines of
            # this panel so every sub-panel shares one color scale
            vmin = np.inf
            vmax = -np.inf
            for data_idx in range(self.numlines):
                # get the max and min
                if self.per_group:
                    _row, _col = plot, data_idx
                else:
                    _row, _col = data_idx, plot

                sub_matrix = self.hm.matrix.get_matrix(_row, _col)
                ma = sub_matrix['matrix']
                x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1))
                img = ax.hexbin(x_values.flatten(), ma.flatten(), cmap=cmap, mincnt=1)
                _vmin, _vmax = img.get_clim()
                if _vmin < vmin:
                    vmin = _vmin
                if _vmax > vmax:
                    vmax = _vmax

                # user-requested y limits for this column override the
                # automatic ones (None entries leave the auto limits)
                if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin:
                    localYMin = self.y_min[col % len(self.y_min)]
                if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax:
                    localYMax = self.y_max[col % len(self.y_max)]
            # discard the probe axis; the real axes are drawn next
            self.fig.delaxes(ax)

            # iterate again after having computed the vmin and vmax
            ax_list = []
            # drawn bottom-up so ax_list[0] ends up being the bottom
            # sub-panel, which carries the x tick labels
            for data_idx in range(self.numlines)[::-1]:
                ax = self.fig.add_subplot(sub_grid[data_idx, 0])
                if data_idx == 0:
                    ax.set_title(title)
                if data_idx != self.numlines - 1:
                    # only the bottom sub-panel shows x tick labels
                    plt.setp(ax.get_xticklabels(), visible=False)

                if self.per_group:
                    _row, _col = plot, data_idx
                else:
                    _row, _col = data_idx, plot

                sub_matrix = self.hm.matrix.get_matrix(_row, _col)

                if self.per_group:
                    label = sub_matrix['sample']
                else:
                    label = sub_matrix['group']

                ma = sub_matrix['matrix']
                try:
                    # matplotlib 2.0
                    ax.set_facecolor('black')
                except:
                    # matplotlib <2.0
                    ax.set_axis_bgcolor('black')
                x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1))
                img = ax.hexbin(x_values.flatten(), ma.flatten(), cmap=cmap, mincnt=1, vmin=vmin, vmax=vmax)

                if plot == 0:
                    ax.axes.set_ylabel(label)

                ax_list.append(ax)

                # apply the user y limits; guard against an empty/inverted
                # range which matplotlib would reject
                lims = ax.get_ylim()
                if localYMin is not None:
                    lims = (localYMin, lims[1])
                if localYMax is not None:
                    lims = (lims[0], localYMax)
                if lims[0] >= lims[1]:
                    lims = (lims[0], lims[0] + 1)
                ax.set_ylim(lims)

            # rescale the tick positions when they don't already span the
            # matrix width
            xticks, xtickslabel = self.getTicks(plot)
            if np.ceil(max(xticks)) != float(ma.shape[1] - 1):
                tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks)
                xticks_use = [x * tickscale for x in xticks]
                ax_list[0].axes.set_xticks(xticks_use)
            else:
                ax_list[0].axes.set_xticks(xticks)
            ax_list[0].axes.set_xticklabels(xtickslabel, rotation=self.label_rotation)
            # align the first and last label
            # such that they don't fall off
            # the heatmap sides
            ticks = ax_list[-1].xaxis.get_major_ticks()
            ticks[0].label1.set_horizontalalignment('left')
            ticks[-1].label1.set_horizontalalignment('right')

            # one shared colorbar in the right-hand column of the sub-grid
            cax = self.fig.add_subplot(sub_grid[:, 1])
            self.fig.colorbar(img, cax=cax)

        plt.subplots_adjust(wspace=0.05, hspace=0.3)
        plt.tight_layout()
        plt.savefig(self.out_file_name, dpi=self.dpi, format=self.image_format)
        plt.close()
+
+ def plotly_hexbin(self):
+ """plot_hexbin, but for plotly. it's annoying that we have to have sub-subplots"""
+ fig = go.Figure()
+ cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+ rows = np.ceil(self.numplots / float(cols)).astype(int)
+ fig['layout'].update(title=self.plot_title)
+ domainWidth = .9 / cols
+ domainHeight = .9 / rows
+ bufferHeight = 0.0
+ if rows > 1:
+ bufferHeight = 0.1 / (rows - 1)
+ else:
+ domainHeight = 1.0
+ bufferWidth = 0.0
+ if cols > 1:
+ bufferWidth = 0.1 / (cols - 1)
+ else:
+ domainWidth = 1.0
+ subHeight = domainHeight / float(self.numlines)
+ if self.per_group:
+ sideLabels = self.hm.matrix.sample_labels
+ else:
+ sideLabels = self.hm.matrix.group_labels
+
+ data = []
+ annos = []
+ vmin = np.inf
+ vmax = -np.inf
+ for i in range(self.numplots):
+ row = rows - i / self.plots_per_row - 1
+ col = i % self.plots_per_row
+
+ if self.per_group:
+ title = self.hm.matrix.group_labels[i]
+ else:
+ title = self.hm.matrix.sample_labels[i]
+
+ base = row * (domainHeight + bufferHeight)
+ domain = [base, base + domainHeight]
+ titleY = base + domainHeight
+ base = col * (domainWidth + bufferWidth)
+ domain = [base, base + domainWidth]
+ titleX = base + 0.5 * domainWidth
+ xanchor = 'x{}'.format(i + 1)
+ fig['layout']['xaxis{}'.format(i + 1)] = dict(domain=domain)
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})
+
+ # set yMin/yMax
+ yMin = np.inf
+ yMax = -np.inf
+ for j in range(self.numlines):
+ # get the max and min
+ if self.per_group:
+ _row, _col = i, j
+ else:
+ _row, _col = j, i
+
+ ma = self.hm.matrix.get_matrix(_row, _col)['matrix']
+ if np.min(ma) < yMin:
+ yMin = np.min(ma)
+ if np.max(ma) > yMax:
+ yMax = np.max(ma)
+ if self.y_min[i % len(self.y_min)] is not None:
+ yMin = self.y_min[i % len(self.y_min)]
+ if self.y_max[i % len(self.y_max)] is not None:
+ yMax = self.y_max[i % len(self.y_max)]
+
+ for j in range(self.numlines):
+ if self.per_group:
+ _row, _col = i, j
+ else:
+ _row, _col = j, i
+ foo = i * self.numlines + j + 1
+ yanchor = 'y{}'.format(foo)
+ base = row * (domainHeight + bufferHeight) + j * subHeight
+ domain = [base, base + subHeight]
+ fig['layout']['yaxis{}'.format(foo)] = {'domain': domain, 'title': self.y_axis_label, 'anchor': xanchor, 'range': [yMin, yMax]}
+ if j == 0:
+ _ = "xaxis{}".format(xanchor[1:])
+ fig['layout'][_].update(anchor='y{}'.format(foo))
+ if col == 0:
+ titleY = base + 0.5 * subHeight
+ annos.append({'yanchor': 'middle', 'xref': 'paper', 'xanchor': 'left', 'yref': 'paper', 'text': sideLabels[j], 'y': titleY, 'x': -0.03, 'font': {'size': 16}, 'showarrow': False, 'textangle': -90})
+
+ sub_matrix = self.hm.matrix.get_matrix(_row, _col)
+ ma = self.hm.matrix.get_matrix(_row, _col)['matrix']
+
+ fig['layout']['xaxis{}'.format(i + 1)].update(range=[0, ma.shape[1]])
+
+ if self.per_group:
+ label = sub_matrix['sample']
+ else:
+ label = sub_matrix['group']
+
+ # Manually compute the 2D histogram with 100x100 bins
+ x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1))
+ z, xe, ye = np.histogram2d(x_values.flatten(), ma.flatten(), bins=100, range=[[0, ma.shape[1]], [yMin, yMax]])
+
+ _vmin = np.min(z)
+ _vmax = np.max(z)
+ if _vmin < vmin:
+ vmin = _vmin
+ if _vmax > vmax:
+ vmax = _vmax
+
+ trace = go.Contour(z=z.T, x=xe, y=ye, xaxis=xanchor, yaxis=yanchor, name=label, connectgaps=False)
+ data.append(trace)
+
+ # Assume the bounds for the last graph are correct
+ totalWidth = ma.shape[1]
+ xticks, xtickslabel = self.getTicks(i)
+ if np.ceil(max(xticks)) != float(totalWidth):
+ tickscale = float(totalWidth) / max(xticks)
+ xticks_use = [x * tickscale for x in xticks]
+ else:
+ xticks_use = xticks
+ xticks_use = [np.ceil(x) for x in xticks_use]
+ fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation)
+
+ for trace in data:
+ trace.update(zmin=vmin, zmax=vmax)
+
+ fig.add_traces(data)
+ fig['layout']['annotations'] = annos
+ py.plot(fig, filename=self.out_file_name, auto_open=False)
+
+ def plot_heatmap(self):
+ cmap = ['RdYlBu_r']
+ if self.color_list is not None: # check the length to be equal to the numebr of plots otherwise multiply it!
+ cmap = self.color_list
+ if len(cmap) < self.numplots:
+ all_colors = cmap
+ for i in range(ceil(self.numplots / len(cmap))):
+ cmap.extend(all_colors)
+ matrix_flatten = None
+ if self.y_min == [None]:
+ matrix_flatten = self.hm.matrix.flatten()
+ # try to avoid outliers by using np.percentile
+ self.y_min = [np.percentile(matrix_flatten, 1.0)]
+ if np.isnan(self.y_min[0]):
+ self.y_min = [None]
+
+ if self.y_max == [None]:
+ if matrix_flatten is None:
+ matrix_flatten = self.hm.matrix.flatten()
+ # try to avoid outliers by using np.percentile
+ self.y_max = [np.percentile(matrix_flatten, 98.0)]
+ if np.isnan(self.y_max[0]):
+ self.y_max = [None]
+
+ if self.image_format == "plotly":
+ return self.plotly_heatmap()
+
+ ax_list = []
+ # turn off y ticks
+ for plot in range(self.numplots):
+ labels = []
+ col = plot % self.plots_per_row
+ row = int(plot / float(self.plots_per_row))
+ localYMin = None
+ localYMax = None
+
+ # split the ax to make room for the colorbar
+ sub_grid = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=self.grids[row, col],
+ width_ratios=[0.92, 0.08], wspace=0.05)
+
+ ax = self.fig.add_subplot(sub_grid[0])
+ cax = self.fig.add_subplot(sub_grid[1])
+
+ ax.tick_params(
+ axis='y',
+ which='both',
+ left=False,
+ right=False,
+ labelleft=True)
+
+ if self.per_group:
+ title = self.hm.matrix.group_labels[plot]
+ tickIdx = plot % self.hm.matrix.get_num_samples()
+ else:
+ title = self.hm.matrix.sample_labels[plot]
+ tickIdx = plot
+ ax.set_title(title)
+ mat = [] # when drawing a heatmap (in contrast to drawing lines)
+ for data_idx in range(self.numlines):
+ if self.per_group:
+ row, col = plot, data_idx
+ else:
+ row, col = data_idx, plot
+ if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin:
+ localYMin = self.y_min[col % len(self.y_min)]
+ if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax:
+ localYMax = self.y_max[col % len(self.y_max)]
+
+ sub_matrix = self.hm.matrix.get_matrix(row, col)
+
+ if self.per_group:
+ label = sub_matrix['sample']
+ else:
+ label = sub_matrix['group']
+ labels.append(label)
+ mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0))
+ img = ax.imshow(np.vstack(mat), interpolation='nearest',
+ cmap=cmap[plot], aspect='auto', vmin=localYMin, vmax=localYMax)
+ self.fig.colorbar(img, cax=cax)
+
+ totalWidth = np.vstack(mat).shape[1]
+ xticks, xtickslabel = self.getTicks(tickIdx)
+ if np.ceil(max(xticks)) != float(totalWidth - 1):
+ tickscale = float(totalWidth) / max(xticks)
+ xticks_use = [x * tickscale for x in xticks]
+ ax.axes.set_xticks(xticks_use)
+ else:
+ ax.axes.set_xticks(xticks)
+ ax.axes.set_xticklabels(xtickslabel, rotation=self.label_rotation)
+ # align the first and last label
+ # such that they don't fall off
+ # the heatmap sides
+ ticks = ax.xaxis.get_major_ticks()
+ ticks[0].label1.set_horizontalalignment('left')
+ ticks[-1].label1.set_horizontalalignment('right')
+
+ # add labels as y ticks labels
+ ymin, ymax = ax.axes.get_ylim()
+ pos, distance = np.linspace(ymin, ymax, len(labels), retstep=True, endpoint=False)
+ d_half = float(distance) / 2
+ yticks = [x + d_half for x in pos]
+
+ # TODO: make rotation a parameter
+ # ax.axes.set_yticklabels(labels[::-1], rotation='vertical')
+ if plot == 0:
+ ax.axes.set_yticks(yticks)
+ ax.axes.set_yticklabels(labels[::-1])
+ else:
+ ax.axes.set_yticklabels([])
+ # matplotlib 3.1.1 (and likely some earlier versions) will change the ylim if you change the tick locations!
+ ax.axes.set_ylim([ymin, ymax])
+
+ ax_list.append(ax)
+
+ plt.subplots_adjust(wspace=0.05, hspace=0.3)
+ plt.tight_layout()
+ plt.savefig(self.out_file_name, dpi=self.dpi, format=self.image_format)
+ plt.close()
+
+ def plotly_heatmap(self):
+ """plot_heatmap, but with plotly output"""
+ fig = go.Figure()
+ cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+ rows = np.ceil(self.numplots / float(cols)).astype(int)
+ fig['layout'].update(title=self.plot_title)
+ domainWidth = .9 / cols
+ domainHeight = .9 / rows
+ bufferHeight = 0.0
+ if rows > 1:
+ bufferHeight = 0.1 / (rows - 1)
+ else:
+ domainHeight = 1.0
+ bufferWidth = 0.0
+ if cols > 1:
+ bufferWidth = 0.1 / (cols - 1)
+ else:
+ domainWidth = 1.0
+
+ data = []
+ annos = []
+ zmin = np.inf
+ zmax = -np.inf
+ for i in range(self.numplots):
+ row = rows - i / self.plots_per_row - 1
+ col = i % self.plots_per_row
+
+ if self.per_group:
+ title = self.hm.matrix.group_labels[i]
+ else:
+ title = self.hm.matrix.sample_labels[i]
+
+ base = row * (domainHeight + bufferHeight)
+ domain = [base, base + domainHeight]
+ titleY = base + domainHeight
+ xanchor = 'x{}'.format(i + 1)
+ yanchor = 'y{}'.format(i + 1)
+ visible = False
+ if col == 0:
+ visible = True
+ fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': xanchor, 'visible': visible}
+ base = col * (domainWidth + bufferWidth)
+ domain = [base, base + domainWidth]
+ titleX = base + 0.5 * domainWidth
+ fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor}
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})
+
+ mat = []
+ labels = []
+ for j in range(self.numlines):
+ if self.per_group:
+ row, col = i, j
+ else:
+ row, col = j, i
+
+ sub_matrix = self.hm.matrix.get_matrix(row, col)
+
+ if self.per_group:
+ label = sub_matrix['sample']
+ else:
+ label = sub_matrix['group']
+ labels.append(label)
+ mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0))
+ if np.min(mat[-1]) < zmin:
+ zmin = np.min(mat[-1])
+ if np.max(mat[-1]) > zmax:
+ zmax = np.max(mat[-1])
+ totalWidth = len(mat[-1])
+ trace = go.Heatmap(name=title, z=mat, x=range(totalWidth + 1), y=labels, xaxis=xanchor, yaxis=yanchor)
+ data.append(trace)
+
+ # Add ticks
+ xticks, xtickslabel = self.getTicks(i)
+ if np.ceil(max(xticks)) != float(totalWidth):
+ tickscale = float(totalWidth) / max(xticks)
+ xticks_use = [x * tickscale for x in xticks]
+ else:
+ xticks_use = xticks
+ xticks_use = [np.ceil(x) for x in xticks_use]
+ fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation)
+
+ # Adjust color scale limits
+ for i, trace in enumerate(data):
+ zminUse = zmin
+ zmaxUse = zmax
+ if self.y_min[i % len(self.y_min)] is not None:
+ zminUse = self.y_min[i % len(self.y_min)]
+ if self.y_max[i % len(self.y_max)] is not None:
+ zmaxUse = self.y_max[i % len(self.y_max)]
+ trace.update(zmin=zminUse, zmax=zmaxUse)
+
+ fig.add_traces(data)
+ fig['layout']['annotations'] = annos
+ py.plot(fig, filename=self.out_file_name, auto_open=False)
+
+ def plot_profile(self):
+ if self.y_min is None:
+ self.y_min = [None]
+ if self.y_max is None:
+ self.y_max = [None]
+
+ if not self.color_list:
+ cmap_plot = plt.get_cmap('jet')
+ if self.numlines > 1:
+ # kmeans, so we need to color by cluster
+ self.color_list = cmap_plot(np.arange(self.numlines, dtype=float) / float(self.numlines))
+ else:
+ self.color_list = cmap_plot(np.arange(self.numplots, dtype=float) / float(self.numplots))
+ if (self.numlines > 1 and len(self.color_list) < self.numlines) or\
+ (self.numlines == 1 and len(self.color_list) < self.numplots):
+ sys.exit("\nThe given list of colors is too small, "
+ "at least {} colors are needed\n".format(self.numlines))
+ for color in self.color_list:
+ if not pltcolors.is_color_like(color):
+ sys.exit("\nThe color name {} is not valid. Check "
+ "the name or try with a html hex string "
+ "for example #eeff22".format(color))
+
+ if self.image_format == "plotly":
+ return self.plotly_profile()
+
+ first = True
+ ax_list = []
+ globalYmin = np.inf
+ globalYmax = -np.inf
+ for plot in range(self.numplots):
+ localYMin = None
+ localYMax = None
+ col = plot % self.plots_per_row
+ row = int(plot / float(self.plots_per_row))
+ if (row == 0 and col == 0) or len(self.y_min) > 1 or len(self.y_max) > 1:
+ ax = self.fig.add_subplot(self.grids[row, col])
+ else:
+ ax = self.fig.add_subplot(self.grids[row, col])
+
+ if self.per_group:
+ title = self.hm.matrix.group_labels[plot]
+ if row != 0 and len(self.y_min) == 1 and len(self.y_max) == 1:
+ plt.setp(ax.get_yticklabels(), visible=False)
+ tickIdx = plot % self.hm.matrix.get_num_samples()
+ else:
+ title = self.hm.matrix.sample_labels[plot]
+ if col != 0 and len(self.y_min) == 1 and len(self.y_max) == 1:
+ plt.setp(ax.get_yticklabels(), visible=False)
+ tickIdx = plot
+
+ ax.set_title(title)
+ for data_idx in range(self.numlines):
+ if self.per_group:
+ _row, _col = plot, data_idx
+ else:
+ _row, _col = data_idx, plot
+ if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin:
+ localYMin = self.y_min[col % len(self.y_min)]
+ if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax:
+ localYMax = self.y_max[col % len(self.y_max)]
+
+ sub_matrix = self.hm.matrix.get_matrix(_row, _col)
+
+ if self.per_group:
+ label = sub_matrix['sample']
+ else:
+ label = sub_matrix['group']
+
+ if self.numlines > 1:
+ coloridx = data_idx
+ else:
+ coloridx = plot
+ plot_single(ax, sub_matrix['matrix'],
+ self.averagetype,
+ self.color_list[coloridx],
+ label,
+ plot_type=self.plot_type)
+ globalYmin = min(float(globalYmin), ax.get_ylim()[0])
+ globalYmax = max(globalYmax, ax.get_ylim()[1])
+
+ # Exclude ticks from all but one subplot by default
+ if col > 0 and len(self.y_min) == 1 and len(self.y_max) == 1:
+ plt.setp(ax.get_yticklabels(), visible=False)
+
+ totalWidth = sub_matrix['matrix'].shape[1]
+ xticks, xtickslabel = self.getTicks(tickIdx)
+ if np.ceil(max(xticks)) != float(totalWidth - 1):
+ tickscale = float(totalWidth) / max(xticks)
+ xticks_use = [x * tickscale for x in xticks]
+ ax.axes.set_xticks(xticks_use)
+ else:
+ ax.axes.set_xticks(xticks)
+ ax.axes.set_xticklabels(xtickslabel, rotation=self.label_rotation)
+ # align the first and last label
+ # such that they don't fall off
+ # the heatmap sides
+ ticks = ax.xaxis.get_major_ticks()
+ ticks[0].label1.set_horizontalalignment('left')
+ ticks[-1].label1.set_horizontalalignment('right')
+
+ if first and self.y_axis_label != '':
+ ax.set_ylabel(self.y_axis_label)
+ if first and self.plot_type not in ['heatmap', 'overlapped_lines']:
+ ax.legend(loc=self.legend_location.replace('-', ' '),
+ ncol=1, prop=self.font_p,
+ frameon=False, markerscale=0.5)
+ if len(self.y_min) == 1 and len(self.y_max) == 1:
+ first = False
+ ax_list.append(ax)
+
+ # It turns out that set_ylim only takes float64s
+ for sample_id, subplot in enumerate(ax_list):
+ localYMin = self.y_min[sample_id % len(self.y_min)]
+ localYMax = self.y_max[sample_id % len(self.y_max)]
+ lims = [globalYmin, globalYmax]
+ if localYMin is not None:
+ if localYMax is not None:
+ lims = (float(localYMin), float(localYMax))
+ else:
+ lims = (float(localYMin), lims[1])
+ elif localYMax is not None:
+ lims = (lims[0], float(localYMax))
+ if lims[0] >= lims[1]:
+ lims = (lims[0], lims[0] + 1)
+ ax_list[sample_id].set_ylim(lims)
+
+ plt.subplots_adjust(wspace=0.05, hspace=0.3)
+ plt.tight_layout()
+ plt.savefig(self.out_file_name, dpi=self.dpi, format=self.image_format)
+ plt.close()
+
    def plotly_profile(self):
        """
        plot_profile for plotly output

        y_min, y_max, and color_list are set already
        """
        fig = go.Figure()
        cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
        rows = np.ceil(self.numplots / float(cols)).astype(int)
        fig['layout'].update(title=self.plot_title)
        # Paper-coordinate extent of each subplot; 10% of each dimension is
        # reserved for padding whenever there is more than one row/column.
        domainWidth = .9 / cols
        domainHeight = .9 / rows
        bufferHeight = 0.0
        if rows > 1:
            bufferHeight = 0.1 / (rows - 1)
        bufferWidth = 0.0
        if cols > 1:
            bufferWidth = 0.1 / (cols - 1)

        data = []
        annos = []
        yMin = None
        yMax = None
        for i in range(self.numplots):
            # NOTE(review): `row` is a float here (np.floor) and rows are laid
            # out top-down, unlike the other plotly_* methods which flip the
            # (integer) row index -- confirm this difference is intended.
            row = np.floor(i / self.plots_per_row)
            # row = rows - i / self.plots_per_row - 1
            col = i % self.plots_per_row
            xanchor = 'x{}'.format(i + 1)
            yanchor = 'y{}'.format(i + 1)
            base = row * (domainHeight + bufferHeight)
            domain = [base, base + domainHeight]
            titleY = base + domainHeight
            fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'title': self.y_axis_label, 'anchor': xanchor, 'autorange': False}
            base = col * (domainWidth + bufferWidth)
            domain = [base, base + domainWidth]
            titleX = base + 0.5 * domainWidth
            fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor}

            if self.per_group:
                title = self.hm.matrix.group_labels[i]
            else:
                title = self.hm.matrix.sample_labels[i]
            annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})

            for j in range(self.numlines):
                if self.per_group:
                    _row, _col = i, j
                else:
                    _row, _col = j, i

                sub_matrix = self.hm.matrix.get_matrix(_row, _col)
                fig['layout']['xaxis{}'.format(i + 1)].update(range=[0, sub_matrix['matrix'].shape[1]])

                if self.per_group:
                    label = sub_matrix['sample']
                else:
                    label = sub_matrix['group']

                # One color per cluster/line when several lines exist,
                # otherwise one color per subplot.
                if self.numlines > 1:
                    coloridx = j
                else:
                    coloridx = i
                color = self.color_list[coloridx]
                traces = plotly_single(sub_matrix['matrix'],
                                       self.averagetype,
                                       color,
                                       label,
                                       plot_type=self.plot_type)
                # Track the global y range over all traces; per-plot overrides
                # are applied in the loop after this one.
                for trace in traces:
                    trace.update(xaxis=xanchor, yaxis=yanchor)
                    if yMin is None or min(trace['y']) < yMin:
                        yMin = min(trace['y'])
                    if yMax is None or max(trace['y']) > yMax:
                        yMax = max(trace['y'])
                if row == col == 0:
                    # Only the first subplot contributes a legend entry.
                    traces[0].update(showlegend=True)
                data.extend(traces)
            totalWidth = sub_matrix['matrix'].shape[1]
            xticks, xtickslabel = self.getTicks(i)
            if np.ceil(max(xticks)) != float(totalWidth):
                tickscale = float(totalWidth) / max(xticks)
                xticks_use = [x * tickscale for x in xticks]
            else:
                xticks_use = xticks
            xticks_use = [np.ceil(x) for x in xticks_use]
            fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation)

        # Set the y limits
        for i in range(self.numplots):
            yaxis = 'yaxis{}'.format(i + 1)
            yRange = [yMin, yMax]
            if self.y_min[i % len(self.y_min)] is not None:
                yRange[0] = self.y_min[i % len(self.y_min)]
            if self.y_max[i % len(self.y_max)] is not None:
                yRange[1] = self.y_max[i % len(self.y_max)]
            fig['layout'][yaxis].update(range=yRange)

        fig.add_traces(data)
        fig['layout']['annotations'] = annos
        py.plot(fig, filename=self.out_file_name, auto_open=False)
+
+
def main(args=None):
    """Entry point: load a computeMatrix output file, optionally filter,
    cluster and relabel it, write any requested side outputs, and render
    the plot in the requested style."""
    args = process_args(args)

    hmap = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hmap.read_matrix_file(matrix_file)

    min_thresh = hmap.parameters['min threshold']
    max_thresh = hmap.parameters['max threshold']
    if min_thresh is not None or max_thresh is not None:
        filterHeatmapValues(hmap, min_thresh, max_thresh)

    if args.kmeans is not None:
        hmap.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples)
    elif args.hclust is not None:
        print("Performing hierarchical clustering."
              "Please note that it might be very slow for large datasets.\n")
        hmap.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples)

    # Warn when a group holds fewer than 0.5% of all regions.
    group_len_ratio = np.diff(hmap.matrix.group_boundaries) / float(len(hmap.matrix.regions))
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, you might want to remove it. \n".format(hmap.matrix.group_labels[problem[0]]))

    if args.regionsLabel:
        hmap.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hmap.matrix.set_sample_labels(args.samplesLabel)

    if args.outFileNameData:
        hmap.save_tabulated_values(args.outFileNameData, reference_point_label=args.refPointLabel,
                                   start_label=args.startLabel,
                                   end_label=args.endLabel,
                                   averagetype=args.averageType)

    if args.outFileSortedRegions:
        hmap.save_BED(args.outFileSortedRegions)

    prof = Profile(hmap, args.outFileName,
                   plot_title=args.plotTitle,
                   y_axis_label=args.yAxisLabel,
                   y_min=args.yMin, y_max=args.yMax,
                   averagetype=args.averageType,
                   reference_point_label=args.refPointLabel,
                   start_label=args.startLabel,
                   end_label=args.endLabel,
                   plot_height=args.plotHeight,
                   plot_width=args.plotWidth,
                   per_group=args.perGroup,
                   plot_type=args.plotType,
                   image_format=args.plotFileFormat,
                   color_list=args.colors,
                   legend_location=args.legendLocation,
                   plots_per_row=args.numPlotsPerRow,
                   label_rotation=args.label_rotation,
                   dpi=args.dpi)

    # Dispatch on the requested plot style; anything else is a line profile.
    renderers = {'heatmap': prof.plot_heatmap,
                 'overlapped_lines': prof.plot_hexbin}
    renderers.get(args.plotType, prof.plot_profile)()
diff --git a/deepTools/source/deeptools/sumCoveragePerBin.py b/deepTools/source/deeptools/sumCoveragePerBin.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cde45552247cbfd071ada7e7bffb8582580dec4
--- /dev/null
+++ b/deepTools/source/deeptools/sumCoveragePerBin.py
@@ -0,0 +1,240 @@
+import numpy as np
+import multiprocessing
+import time
+
+from deeptools import countReadsPerBin
+from deeptools.utilities import getTLen
+from deeptoolsintervals import GTF
+
+
class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin):
    r"""This is an extension of CountReadsPerBin for use with plotFingerprint.
    There, we need to sum the per-base coverage.
    """
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested. Note that reads are
        NOT extended, due to there being a 0 length input list of BAM files!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([2., 4., 4.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        # One output bin per region, unless regions carry an explicit tile
        # size as a third element, in which case each region is subdivided.
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        # Reads are only extended when an explicit fragment length is in use.
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # Offset of the current region's first bin inside `coverages`.
        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            # Widen the fetch window by the extension so extended reads that
            # start just outside the region are still seen.
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            try:
                # BAM input
                if chrom not in bamHandle.references:
                    raise NameError("chromosome {} not found in bam file".format(chrom))
            except:
                # bigWig input, as used by plotFingerprint
                if bamHandle.chroms(chrom):
                    # Mean per-bin coverage times bin width approximates the
                    # summed per-base coverage; NaN (no data) counts as 0.
                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=float)
                    _[np.isnan(_)] = 0.0
                    _ = _ * tileSize
                    coverages += _
                    continue
                else:
                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

            # Fragment-bound positions of the previously processed reads,
            # used to drop duplicates sharing the same coordinates.
            prev_pos = set()
            lpos = None
            # of previous processed read pair
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                # Add the per-base overlap of each block to the bins it spans:
                # a partial amount in the first and last bins, a full tileSize
                # in every interior bin.
                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    # NOTE(review): the upper clip uses len(coverages) (the
                    # total bin count across ALL regions) rather than
                    # nRegBins -- confirm this is intended for multi-region
                    # input.
                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                    if sIdx >= eIdx:
                        continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    # Interior bins are fully covered.
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    # Last bin receives the remaining partial overlap.
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
+
+
class Tester(object):

    def __init__(self):
        """
        The distribution of reads between the two bam files is as follows.

        They cover 200 bp

          0                              100                           200
          |------------------------------------------------------------|
        A                       ===============
                                               ===============


        B                       ===============               ===============
                                                              ===============
                                                              ===============
        """
        import os
        # Resolve the bundled test-data directory relative to this file.
        here = os.path.dirname(os.path.abspath(__file__))
        self.root = here + "/test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        self.bamFile_PE = self.root + "test_paired2.bam"
        self.chrom = '3R'
diff --git a/deepTools/source/deeptools/test/__init__.py b/deepTools/source/deeptools/test/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/deepTools/source/deeptools/test/skiptest_heatmapper_images.py b/deepTools/source/deeptools/test/skiptest_heatmapper_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1026504ba013e7e9673848779bc3a899c62029
--- /dev/null
+++ b/deepTools/source/deeptools/test/skiptest_heatmapper_images.py
@@ -0,0 +1,140 @@
+import os
+import matplotlib
+matplotlib.use('Agg')
+from matplotlib.testing.compare import compare_images
+from tempfile import NamedTemporaryFile
+
+import deeptools.computeMatrix
+import deeptools.plotHeatmap
+import deeptools.plotProfile
+import deeptools.utilities
+
+__author__ = 'Fidel'
+
+ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_heatmapper/"
+tolerance = 30
+
+
def test_plotHeatmap_simple_plot():
    """
    Test a simple plot generated using a matrix from
    the following command:

    computeMatrix reference-point -a 100 -b 100 -S {test_path}/test.bw \
    -R {test_path}/test.bed -o /tmp/mat.gz -bs 25

    """
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/master.mat.gz', '--outFileName', out_png.name])
    # Compare against the stored master image within the shared tolerance.
    res = compare_images(ROOT + '/master.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotHeatmap_rename_labels():
    # Same master matrix, but with the region (group) labels renamed.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/master.mat.gz', '--outFileName', out_png.name,
                                '--regionsLabel', 'uno', 'dos'])
    res = compare_images(ROOT + '/master_relabeled.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotHeatmap_scale_regions():
    # Heatmap from a scale-regions (rather than reference-point) matrix.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/master_scale_reg.mat.gz', '--outFileName', out_png.name])
    res = compare_images(ROOT + '/master_scale_reg.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotHeatmap_multi_bigwig_pergroup():
    # Multi-sample matrix plotted per group with relabeled samples.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/master_multi.mat.gz', '--perGroup',
                                '--samplesLabel', 'file1', 'file2', 'file3', 'file4',
                                '--outFileName', out_png.name])
    res = compare_images(ROOT + '/heatmap_master_multi_pergroup.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotHeatmap_multiple_colors_muti_scales():
    # Per-sample color lists together with per-sample z-scale limits.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/master_multi.mat.gz',
                                '--colorList', 'white,blue', 'white,red',
                                '--zMin', '1', '0', '--zMax', '4', '5',
                                '--outFileName', out_png.name])
    res = compare_images(ROOT + '/heatmap_master_multi_color.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotHeatmap_multiple_colormap_no_boxes():
    # Several named colormaps and no box drawn around the heatmaps.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/master_multi.mat.gz',
                                '--colorMap', 'Reds', 'binary', 'terrain',
                                '--boxAroundHeatmaps', 'no',
                                '--outFileName', out_png.name])
    res = compare_images(ROOT + '/heatmap_master_multi_colormap_no_box.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotHeatmap_interpolation():
    # Bilinear interpolation on a large matrix.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotHeatmap.main(['-m', ROOT + '/large_matrix.mat.gz',
                                '--interpolation', 'bilinear',
                                '--outFileName', out_png.name])
    res = compare_images(ROOT + '/heatmap_master_interpolation_bilinear.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotProfiler():
    # Standard-deviation profile plot with relabeled regions.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotProfile.main(['-m', ROOT + '/master.mat.gz', '--outFileName', out_png.name,
                                '--regionsLabel', 'uno', 'dos', '--plotType', 'std'])
    res = compare_images(ROOT + '/profile_master.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotProfiler_heatmap():
    # plotProfile rendered in its heatmap style.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotProfile.main(['-m', ROOT + '/master.mat.gz', '--outFileName', out_png.name,
                                '--plotType', 'heatmap'])
    res = compare_images(ROOT + '/profile_master_heatmap.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotProfiler_overlapped_lines():
    # plotProfile rendered as overlapped lines (hexbin) with a fixed y minimum.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotProfile.main(['-m', ROOT + '/master.mat.gz', '--outFileName', out_png.name,
                                '--plotType', 'overlapped_lines', '--yMin', '-1'])
    res = compare_images(ROOT + '/profile_master_overlap_lines.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotProfiler_multibigwig():
    # Multi-sample profile laid out two plots per row with a y ceiling.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotProfile.main(['-m', ROOT + '/master_multi.mat.gz', '--outFileName', out_png.name,
                                '--numPlotsPerRow', '2', '--yMax', '1.5'])
    res = compare_images(ROOT + '/profile_master_multi.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
+
+
def test_plotProfiler_multibigwig_pergroup():
    # Multi-sample profile grouped per region set with a y ceiling.
    out_png = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False)
    deeptools.plotProfile.main(['-m', ROOT + '/master_multi.mat.gz', '--outFileName', out_png.name,
                                '--perGroup', '--yMax', '1.5'])
    res = compare_images(ROOT + '/profile_master_multi_pergroup.png', out_png.name, tolerance)
    assert res is None, res
    os.remove(out_png.name)
diff --git a/deepTools/source/deeptools/test/test_bamCoverage_and_bamCompare.py b/deepTools/source/deeptools/test/test_bamCoverage_and_bamCompare.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac1f23ce2084f3c2b27a0703ca0d5a8d21bb414f
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_bamCoverage_and_bamCompare.py
@@ -0,0 +1,462 @@
+import deeptools.bamCoverage as bam_cov
+import deeptools.bamCompare as bam_comp
+import deeptools.getScaleFactor as gs
+import os.path
+import filecmp
+from os import unlink
+
+ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
+BAMFILE_A = ROOT + "testA.bam"
+BAMFILE_B = ROOT + "testB.bam"
+BAMFILE_FILTER1 = ROOT + "test_filtering.bam"
+BAMFILE_FILTER2 = ROOT + "test_filtering2.bam"
+CRAMFILE_A = ROOT + "testA.cram"
+CRAMFILE_B = ROOT + "testB.cram"
+CRAMFILE_FILTER1 = ROOT + "test_filtering.cram"
+CRAMFILE_FILTER2 = ROOT + "test_filtering2.cram"
+BEDFILE_FILTER = ROOT + "test_filtering.blacklist.bed"
+
+
+"""
+The distribution of reads for the bam file is:
+
+ 0 100 200
+ |------------------------------------------------------------|
+testA.bam 3R ==============>
+ <==============
+
+
+testB.bam 3R <============== ==============>
+ ==============>
+ ==============>
+ """
+
+
+def test_bam_coverage_arguments():
+ """
+ Test minimal command line args for bamCoverage
+ """
+ outfile = '/tmp/test_file.bg'
+ for fname in [BAMFILE_B, CRAMFILE_B]:
+ args = "--bam {} -o {} --outFileFormat bedgraph".format(fname, outfile).split()
+ bam_cov.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t150\t1\n', '3R\t150\t200\t2\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_coverage_extend():
+ outfile = '/tmp/test_file.bg'
+ for fname in [BAMFILE_B, CRAMFILE_B]:
+ args = "-b {} -o {} --extendReads 100 --outFileFormat bedgraph".format(fname, outfile).split()
+ bam_cov.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t150\t1\n', '3R\t150\t200\t3\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_coverage_extend_and_normalizeUsingRPGC():
+
+ outfile = '/tmp/test_file.bg'
+ for fname in [BAMFILE_B, CRAMFILE_B]:
+ args = "-b {} -o {} --normalizeUsing RPGC --effectiveGenomeSize 200 --extendReads 100 " \
+ "--outFileFormat bedgraph".format(fname, outfile).split()
+ bam_cov.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ # the scale factor should be 0.5, thus the result is similar to
+ # that of the previous test divided by 0.5
+ expected = ['3R\t0\t150\t0.5\n', '3R\t150\t200\t1.5\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_coverage_skipnas():
+ outfile = '/tmp/test_file.bg'
+ for fname in [BAMFILE_B, CRAMFILE_B]:
+ args = "--bam {} -o {} --outFileFormat bedgraph --skipNAs".format(fname, outfile).split()
+ bam_cov.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t50\t150\t1\n', '3R\t150\t200\t2\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_coverage_filtering():
+ outfile = '/tmp/test_file.bg'
+ for fname in [BAMFILE_B, CRAMFILE_B]:
+ args = "--bam {} -o {} --outFileFormat bedgraph --ignoreDuplicates --verbose".format(fname, outfile).split()
+ bam_cov.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t200\t1\n']
+ assert resp == expected, "{} != {}".format(resp, expected)
+ unlink(outfile)
+
+
+def test_bam_compare_arguments():
+ """
+    Test minimal command line args for bamCompare. The ratio
+ between the same file is taken, therefore, the expected value
+ is 1.0 for all bins.
+ """
+ outfile = '/tmp/test_file.bg'
+ for fname in [BAMFILE_B, CRAMFILE_B]:
+ args = "--bamfile1 {} --bamfile2 {} " \
+ "-o {} -p 1 --outFileFormat bedgraph --operation ratio".format(fname, fname, outfile).split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t200\t1\n']
+ assert resp == expected, "{} != {}".format(resp, expected)
+ unlink(outfile)
+
+
+def test_bam_compare_diff_files():
+ """
+ Test with two different files
+ """
+ outfile = '/tmp/test_file.bg'
+ for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
+ args = "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " \
+ "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t-1\n', '3R\t100\t150\t0\n', '3R\t150\t200\t-1\n']
+ assert resp == expected, "{} != {}".format(resp, expected)
+ unlink(outfile)
+
+
+def test_bam_compare_pseudocounts():
+ """
+ Test with different pseudocounts
+ """
+ outfile = '/tmp/test_file.bg'
+ args = "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " \
+ "--pseudocount 1 0".format(BAMFILE_A, BAMFILE_B, outfile).split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\tinf\n', '3R\t50\t100\t0\n', '3R\t100\t150\t1\n', '3R\t150\t200\t0\n']
+ assert resp == expected, "{} != {}".format(resp, expected)
+ unlink(outfile)
+
+
+def test_bam_compare_ZoverZ():
+ """
+ Ensure --skipZeroOverZero works in bamCompare
+ """
+ outfile = '/tmp/test_file.bg'
+ args = "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " \
+ "--skipZeroOverZero".format(BAMFILE_A, BAMFILE_B, outfile).split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t50\t100\t-1\n', '3R\t100\t150\t0\n', '3R\t150\t200\t-0.584963\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_get_num_kept_reads():
+ """
+ Test the scale factor functions
+ """
+ for fname in [BAMFILE_A, CRAMFILE_A]:
+ args = "--bam {} -o /tmp/test".format(fname).split()
+
+ args = bam_cov.process_args(args)
+ num_kept_reads, total_reads = gs.get_num_kept_reads(args, None)
+
+ # bam file 1 has 2 reads in 3R and 2 read in chr_cigar
+ assert num_kept_reads == 3, "num_kept_reads is wrong"
+ assert total_reads == 3, "num total reads is wrong"
+
+ # ignore chr_cigar to count the total number of reads
+ args = "--bam {} --ignoreForNormalization chr_cigar -o /tmp/test".format(fname).split()
+ args = bam_cov.process_args(args)
+ num_kept_reads, total_reads = gs.get_num_kept_reads(args, None)
+
+ # the number of kept reads should be 2 as the read on chr_cigar is skipped
+ assert num_kept_reads == 2, "num_kept_reads is wrong ({})".format(num_kept_reads)
+
+ # test filtering by read direction. Only forward reads are kept
+ args = "--bam {} -o /tmp/test --samFlagExclude 16 --ignoreForNormalization chr_cigar ".format(fname).split()
+
+ args = bam_cov.process_args(args)
+ num_kept_reads, total_reads = gs.get_num_kept_reads(args, None)
+
+ # only one forward read is expected in
+ assert num_kept_reads == 1, "num_kept_reads is wrong"
+
+
+def test_bam_compare_diff_files_skipnas():
+ """
+ Test skipnas
+    Compared to the previous tests, any region that does not have coverage (in either of the bam files)
+ is not included in the bedgraph file.
+ """
+ outfile = '/tmp/test_file.bg'
+ for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
+ args = "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " \
+ "-o {} -p 1 --outFileFormat bedgraph --skipNAs".format(A, B, outfile).split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t100\t150\t0\n', '3R\t150\t200\t-1\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_compare_extend():
+ """
+ Test read extension
+ """
+ outfile = '/tmp/test_file.bg'
+ for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
+ args = "--bamfile1 {} --bamfile2 {} --extend 100 --scaleFactors 1:1 --operation subtract " \
+ "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t100\t-1\n', '3R\t100\t150\t1\n', '3R\t150\t200\t-1\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_compare_scale_factors_ratio():
+ """
+ Test scale factor
+ """
+ outfile = '/tmp/test_file.bg'
+ for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
+ args = "--bamfile1 {} --bamfile2 {} --operation ratio --ignoreForNormalization chr_cigar " \
+ "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split()
+ bam_comp.main(args)
+
+ # The scale factors are [ 1. 0.5] because BAMFILE_B has double the amount of reads (4) compared to BAMFILE_A
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+
+ """
+ The distribution of reads for the bam file is:
+
+ 0 100 200
+ |------------------------------------------------------------|
+ testA.bam 3R ==============>
+ <==============
+
+
+ testB.bam 3R <============== ==============>
+ ==============>
+ ==============>
+
+ ------------------------------------------------------------------------------
+
+ ratio: 0 (0+1)/(1*0.5+1)=0.67 (1+1)/(1+2*0.5)=1
+ (scale factors [1,0.5]) (1+1)/(1+1*0.5)=1.33
+ """
+
+ expected = ['3R\t0\t50\t1\n', '3R\t50\t100\t0.666667\n', '3R\t100\t150\t1.33333\n', '3R\t150\t200\t1\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_compare_scale_factors_subtract():
+ """
+ Test scale factor
+ """
+ outfile = '/tmp/test_file.bg'
+ for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
+ args = "--bamfile1 {} --bamfile2 {} --operation subtract --ignoreForNormalization chr_cigar " \
+ "-o {} -p 1 --outFileFormat bedgraph --scaleFactorsMethod None --normalizeUsing CPM".format(A, B, outfile).split()
+ bam_comp.main(args)
+
+    # The scale factors are [ 1. 0.5] because BAMFILE_B has double the amount of reads (4) compared to BAMFILE_A
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+
+ """
+ The distribution of reads for the bam file is:
+
+ 0 100 200
+ |------------------------------------------------------------|
+ testA.bam 3R ==============>
+ <==============
+
+
+ testB.bam 3R <============== ==============>
+ ==============>
+ ==============>
+
+ ------------------------------------------------------------------------------
+
+ subtract: After applying CPM normalization, the scale factors are [500000,250000]
+
+ after applying factors: 0 -25k 25k 0
+
+ """
+
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t-250000\n', '3R\t100\t150\t250000\n', '3R\t150\t200\t0\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_coverage_filter_blacklist():
+ """
+ Test --samFlagInclude --samFlagExclude --minMappingQuality --ignoreDuplicates and --blackListFileName
+ """
+ outfile = '/tmp/test_file_filter.bg'
+ for fname in [BAMFILE_FILTER1, CRAMFILE_FILTER1]:
+ args = "--bam {} --normalizeUsing RPGC --effectiveGenomeSize 1400 -p 1 -o {} -of bedgraph --samFlagInclude 512 " \
+ "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " \
+ "--blackListFileName {}".format(fname, outfile, BEDFILE_FILTER)
+ args = args.split()
+ bam_cov.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+
+ expected = ['3R\t0\t100\t0\n', '3R\t100\t150\t1.42338\n',
+ '3R\t150\t250\t4.88017\n', '3R\t250\t300\t3.05011\n',
+ '3R\t300\t400\t2.23675\n', '3R\t400\t450\t3.86347\n',
+ '3R\t450\t500\t4.06681\n', '3R\t500\t550\t2.03341\n',
+ '3R\t550\t600\t2.44009\n', '3R\t600\t650\t4.47349\n',
+ '3R\t650\t700\t3.45679\n', '3R\t700\t750\t3.66013\n',
+ '3R\t750\t800\t4.06681\n', '3R\t900\t950\t2.44009\n',
+ '3R\t950\t1000\t1.62672\n', '3R\t1000\t1050\t0.813362\n',
+ '3R\t1050\t1500\t0\n']
+
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bam_coverage_offset1():
+ """
+ Test -bs 1 --Offset 1
+ """
+ outfile = '/tmp/test_offset.bw'
+ for fname in [BAMFILE_A, CRAMFILE_A]:
+ args = "--Offset 1 --bam {} -p 1 -bs 1 -o {}".format(fname, outfile)
+ args = args.split()
+ bam_cov.main(args)
+ try:
+ # python 3 only
+ filecmp.clear_cache()
+ except:
+ pass
+ assert filecmp.cmp(outfile, "{}testA_offset1.bw".format(ROOT)) is True
+ unlink(outfile)
+
+
+def test_bam_coverage_offset1_10():
+ """
+ Test -bs 1 --Offset 1 10
+ """
+ outfile = '/tmp/test_offset.bw'
+ for fname in [BAMFILE_A, CRAMFILE_A]:
+ args = "--Offset 1 10 -b {} -p 1 -bs 1 -o {}".format(fname, outfile)
+ args = args.split()
+ bam_cov.main(args)
+ try:
+ # python 3 only
+ filecmp.clear_cache()
+ except:
+ pass
+ assert filecmp.cmp(outfile, "{}testA_offset1_10.bw".format(ROOT)) is True
+ unlink(outfile)
+
+
+def test_bam_coverage_offset_minus1():
+ """
+ Test -bs 1 --Offset -1
+ """
+ outfile = '/tmp/test_offset.bw'
+ for fname in [BAMFILE_A, CRAMFILE_A]:
+ args = "--Offset -1 -b {} -p 1 -bs 1 -o {}".format(fname, outfile)
+ args = args.split()
+ bam_cov.main(args)
+ try:
+ # python 3 only
+ filecmp.clear_cache()
+ except:
+ pass
+ assert filecmp.cmp(outfile, "{}testA_offset-1.bw".format(ROOT)) is True
+ unlink(outfile)
+
+
+def test_bam_coverage_offset20_minus4():
+ """
+ Test -bs 1 --Offset 20 -4
+ """
+ outfile = '/tmp/test_offset.bw'
+ for fname in [BAMFILE_A, CRAMFILE_A]:
+ args = "--Offset 20 -4 -b {} -p 1 -bs 1 -o {}".format(fname, outfile)
+ args = args.split()
+ bam_cov.main(args)
+ try:
+ # python 3 only
+ filecmp.clear_cache()
+ except:
+ pass
+ assert filecmp.cmp(outfile, "{}testA_offset20_-4.bw".format(ROOT)) is True
+ unlink(outfile)
+
+
+def test_bam_compare_filter_blacklist():
+ """
+ Test --samFlagInclude --samFlagExclude --minMappingQuality --ignoreDuplicates and --blackListFileName
+ """
+ outfile = '/tmp/test_file_filter.bg'
+ for A, B in [(BAMFILE_FILTER1, BAMFILE_FILTER2), (CRAMFILE_FILTER1, CRAMFILE_FILTER2)]:
+ args = "-b1 {} -b2 {} -p 1 -o {} -of bedgraph --samFlagInclude 512 " \
+ "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " \
+ "--blackListFileName {}".format(A, B, outfile, BEDFILE_FILTER)
+ args = args.split()
+ bam_comp.main(args)
+
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t100\t0\n', '3R\t100\t150\t-0.220909\n',
+ '3R\t150\t200\t-0.159356\n', '3R\t200\t250\t-0.0718929\n',
+ '3R\t250\t300\t0.135883\n', '3R\t300\t350\t0.103093\n',
+ '3R\t350\t400\t-0.0895516\n', '3R\t400\t450\t0.0308374\n',
+ '3R\t450\t500\t0.0989418\n', '3R\t500\t550\t0.207044\n',
+ '3R\t550\t600\t0.0198996\n', '3R\t600\t650\t-0.0957241\n',
+ '3R\t650\t700\t0.00968255\n', '3R\t700\t750\t-0.040642\n',
+ '3R\t750\t800\t-0.123451\n', '3R\t900\t950\t0.212545\n',
+ '3R\t950\t1000\t0.199309\n', '3R\t1000\t1050\t0.167945\n',
+ '3R\t1050\t1500\t0\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
diff --git a/deepTools/source/deeptools/test/test_bigwigAverage.py b/deepTools/source/deeptools/test/test_bigwigAverage.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f8632912550445bcdac4dff4e89ec06a7236e60
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_bigwigAverage.py
@@ -0,0 +1,83 @@
+import deeptools.bigwigAverage as bwAve
+
+import os.path
+from os import unlink
+
+ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
+BIGWIG_A = ROOT + "testA_skipNAs.bw"
+BIGWIG_B = ROOT + "testB_skipNAs.bw"
+BIGWIG_C = ROOT + "test1.bw.bw"
+
+
+"""
+The distribution of reads for the bam file is:
+
+ 0 100 200
+ |------------------------------------------------------------|
+testA.bam 3R ==============>
+ <==============
+
+
+testB.bam 3R <============== ==============>
+ ==============>
+ ==============>
+
+The resulting bigwig files are as follows:
+
+testA_skipNas:
+ 3R 100 200 1
+ chr_cigar 0 50 2
+
+testB_skipNas:
+ 3R 50 150 1
+ 3R 150 200 2
+"""
+
+
+def test_bigwigAverage():
+ outfile = '/tmp/result.bg'
+ args = "--bigwigs {} {} -o {} --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwAve.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t0.5\n', '3R\t100\t150\t1\n', '3R\t150\t200\t1.5\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bigwigAverage_skipnas():
+ outfile = '/tmp/result.bg'
+ args = "--bigwigs {} {} -o {} --skipNAs " \
+ "--outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwAve.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t100\t150\t1\n', '3R\t150\t200\t1.5\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bigwigAverageWithScale():
+ outfile = '/tmp/result.bg'
+ args = "--bigwigs {} {} -o {} --outFileFormat bedgraph --scaleFactors 1:0.5".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwAve.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t0.25\n', '3R\t100\t150\t0.75\n', '3R\t150\t200\t1\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bigwigAverageThree():
+ outfile = '/tmp/result.bg'
+ args = "--bigwigs {} {} {} -o {} --outFileFormat bedgraph --scaleFactors 0.75:0.75:.75".format(BIGWIG_A, BIGWIG_A, BIGWIG_B, outfile).split()
+ bwAve.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t0.25\n', '3R\t100\t150\t0.75\n', '3R\t150\t200\t1\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
diff --git a/deepTools/source/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py b/deepTools/source/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py
new file mode 100644
index 0000000000000000000000000000000000000000..076baa21953acc6494efc49655cb33787d7e2f02
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py
@@ -0,0 +1,136 @@
+import deeptools.bigwigCompare as bwComp
+import deeptools.multiBigwigSummary as bwCorr
+import numpy as np
+import numpy.testing as nt
+
+import os.path
+from os import unlink
+
+ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
+BIGWIG_A = ROOT + "testA_skipNAs.bw"
+BIGWIG_B = ROOT + "testB_skipNAs.bw"
+BIGWIG_C = ROOT + "test1.bw.bw"
+
+
+"""
+The distribution of reads for the bam file is:
+
+ 0 100 200
+ |------------------------------------------------------------|
+testA.bam 3R ==============>
+ <==============
+
+
+testB.bam 3R <============== ==============>
+ ==============>
+ ==============>
+
+The resulting bigwig files are as follows:
+
+testA_skipNas:
+ 3R 100 200 1
+ chr_cigar 0 50 2
+
+testB_skipNas:
+ 3R 50 150 1
+ 3R 150 200 2
+"""
+
+
+def test_bigwigCompare():
+ outfile = '/tmp/result.bg'
+ args = "-b1 {} -b2 {} -o {} --operation add --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwComp.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t1\n', '3R\t100\t150\t2\n', '3R\t150\t200\t3\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bigwigCompare_skipnas():
+ outfile = '/tmp/result.bg'
+ args = "-b1 {} -b2 {} -o {} --operation add --skipNAs " \
+ "--outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwComp.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t100\t150\t2\n', '3R\t150\t200\t3\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_bigwigCompare_skipZeroOverZero():
+    outfile = '/tmp/result.bg'
+ args = "-b1 {} -b2 {} -o {} --skipZeroOverZero --pseudocount 1 3 --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_A, outfile).split()
+ bwComp.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.readlines()
+ _foo.close()
+ expected = ['3R\t100\t200\t-1\n']
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+
+
+def test_multiBigwigSummary():
+ outfile = '/tmp/result.bg'
+ args = "bins -b {} {} --binSize 50 -o {}".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwCorr.main(args)
+ resp = np.load(outfile)
+ matrix = resp['matrix']
+ labels = resp['labels']
+ nt.assert_equal(matrix, np.array([[np.nan, np.nan],
+ [np.nan, 1.],
+ [1., 1.],
+ [1., 2.]]))
+ nt.assert_equal(labels, ['testA_skipNAs.bw', 'testB_skipNAs.bw'])
+ unlink(outfile)
+
+
+def test_multiBigwigSummary_outrawcounts():
+ """
+ Test multiBigwigSummary raw counts output
+ """
+ outfile = '/tmp/result.bg'
+ args = "bins -b {} {} --binSize 50 -o /tmp/null --outRawCounts {} ".format(BIGWIG_A, BIGWIG_B, outfile).split()
+ bwCorr.main(args)
+ _foo = open(outfile, 'r')
+ resp = _foo.read()
+ _foo.close()
+ expected = """#'chr' 'start' 'end' 'testA_skipNAs.bw' 'testB_skipNAs.bw'
+3R 0 50 nan nan
+3R 50 100 nan 1.0
+3R 100 150 1.0 1.0
+3R 150 200 1.0 2.0
+"""
+ assert f"{resp}" == f"{expected}", f"{resp} != {expected}"
+ unlink(outfile)
+ unlink("/tmp/null")
+
+
+def test_multiBigwigSummary_gtf():
+ outfile = '/tmp/_test.npz'
+ args = "BED-file -b {0} {0} --BED {1}/test.gtf -o {2}".format(BIGWIG_C, ROOT, outfile).split()
+ bwCorr.main(args)
+ resp = np.load(outfile)
+ matrix = resp['matrix']
+ labels = resp['labels']
+ nt.assert_equal(labels, ['test1.bw.bw', 'test1.bw.bw'])
+ nt.assert_allclose(matrix, np.array([[27.475, 27.475],
+ [27.31248719, 27.31248719]]))
+ unlink(outfile)
+
+
+def test_multiBigwigSummary_metagene():
+ outfile = '/tmp/_test.npz'
+ args = "BED-file --metagene -b {0} {0} --BED {1}/test.gtf -o {2}".format(BIGWIG_C, ROOT, outfile).split()
+ bwCorr.main(args)
+ resp = np.load(outfile)
+ matrix = resp['matrix']
+ labels = resp['labels']
+ nt.assert_equal(labels, ['test1.bw.bw', 'test1.bw.bw'])
+ nt.assert_allclose(matrix, np.array([[20.28956028, 20.28956028],
+ [22.1923501, 22.1923501]]))
+ unlink(outfile)
diff --git a/deepTools/source/deeptools/test/test_computeMatrixOperations.py b/deepTools/source/deeptools/test/test_computeMatrixOperations.py
new file mode 100644
index 0000000000000000000000000000000000000000..c253431d7d6ce4354d872f7cbc902b3b34b97161
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_computeMatrixOperations.py
@@ -0,0 +1,165 @@
+# from unittest import TestCase
+
+import deeptools.computeMatrixOperations as cmo
+import os
+import hashlib
+import gzip
+import json
+
+__author__ = 'Devon'
+
+
+def getHeader(fp):
+ s = fp.readline()
+ if isinstance(s, bytes):
+ s = s.decode()
+ s = s[1:]
+ return json.loads(s)
+
+
+class TestComputeMatrixOperations(object):
+ root = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
+ matrix = root + "computeMatrixOperations.mat.gz"
+ bed = root + "computeMatrixOperations.bed"
+ rbindMatrix1 = root + "somegenes.txt.gz"
+ rbindMatrix2 = root + "othergenes.txt.gz"
+
+ def testSubset(self):
+ """
+ computeMatrixOperations subset
+ """
+
+ dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0], "body": [1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward"], "downstream": [0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10], "upstream": [0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400], "max threshold": None, "ref point": [None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False}
+ oname = "/tmp/subset.mat.gz"
+ args = "subset -m {} --sample SRR648667.forward SRR648668.forward SRR648669.forward SRR648670.forward -o {}".format(self.matrix, oname)
+ args = args.split()
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = 'edb3c8506c3f27ebb8c7ddf94d5ba594'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
+
+ def testRelabel(self):
+ """
+ computeMatrixOperations relabel
+ """
+ dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["first", "sec ond", "3rd", "4th", "5th", "6th", "7th", "8th"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["foo bar"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False}
+ oname = "/tmp/relabeled.mat.gz"
+ args = "relabel -m {} -o {} --sampleLabels first sec_ond 3rd 4th 5th 6th 7th 8th --groupLabels foo_bar".format(self.matrix, oname)
+ args = args.split()
+ args[7] = 'sec ond' # split mucks up spaces
+ args[-1] = 'foo bar'
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f)
+ assert d == dCorrect
+ f.close()
+ os.remove(oname)
+
+ def testfilterStrand(self):
+ """
+ computeMatrixOperations filterStrand
+ """
+ dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 107], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False}
+ oname = "/tmp/filterStrand1.mat.gz"
+ args = "filterStrand -m {} -o {} --strand +".format(self.matrix, oname)
+ args = args.split(' ')
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = '300f8000be5b5f51e803b57ef08f1c9e'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
+
+ dCorrect = {u'verbose': True, u'scale': 1, u'skip zeros': False, u'nan after end': False, u'sort using': u'mean', u'unscaled 5 prime': [0, 0, 0, 0, 0, 0, 0, 0], u'body': [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], u'sample_labels': [u'SRR648667.forward', u'SRR648668.forward', u'SRR648669.forward', u'SRR648670.forward', u'SRR648667.reverse', u'SRR648668.reverse', u'SRR648669.reverse', u'SRR648670.reverse'], u'downstream': [0, 0, 0, 0, 0, 0, 0, 0], u'unscaled 3 prime': [0, 0, 0, 0, 0, 0, 0, 0], u'group_labels': [u'genes'], u'bin size': [10, 10, 10, 10, 10, 10, 10, 10], u'upstream': [0, 0, 0, 0, 0, 0, 0, 0], u'group_boundaries': [0, 89], u'sample_boundaries': [0, 100, 200, 300, 400, 500, 600, 700, 800], u'missing data as zero': False, u'ref point': [None, None, None, None, None, None, None, None], u'min threshold': None, u'sort regions': u'no', u'proc number': 20, u'bin avg type': u'mean', u'max threshold': None}
+ oname = "/tmp/filterStrand2.mat.gz"
+ args = "filterStrand -m {} -o {} --strand -".format(self.matrix, oname)
+ args = args.split()
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = '0a6ca070a5ba4564f1ab950ac3b7c8f1'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
+
+ def testrbind(self):
+ """
+ computeMatrixOperations rbind
+ """
+ dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 392], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False}
+ oname = "/tmp/rbind.mat.gz"
+ args = "rbind -m {0} {0} -o {1}".format(self.matrix, oname)
+ args = args.split()
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = '3dd96c7b05e0ca5ada21212defe57fba'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
+
+ def testrbind2(self):
+ """
+ computeMatrixOperations rbind with different groups
+ """
+ dCorrect = {"verbose": False, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0], "body": [2], "sample_labels": ["signal"], "downstream": [1], "unscaled 3 prime": [0], "group_labels": ["somegenes", "othergenes"], "bin size": [1], "upstream": [1], "group_boundaries": [0, 3, 7], "sample_boundaries": [0, 4], "max threshold": None, "ref point": [None], "min threshold": None, "sort regions": "keep", "proc number": 1, "bin avg type": "mean", "missing data as zero": True}
+ oname = "/tmp/rbind2.mat.gz"
+ args = "rbind -m {0} {1} -o {2}".format(self.rbindMatrix1, self.rbindMatrix2, oname)
+ args = args.split()
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = '5d8b1517fc4c63d000b6b37f70ee163b'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
+
+ def testcbind(self):
+ """
+ computeMatrixOperations cbind
+ """
+ dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse", "SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False}
+ oname = "/tmp/filterStrand.mat.gz"
+ args = "cbind -m {0} {0} -o {1}".format(self.matrix, oname)
+ args = args.split()
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = 'e55d89704bb16a11f366663a8fd90a47'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
+
+ def testsort(self):
+ """
+ computeMatrixOperations sort
+ """
+ dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False}
+ oname = "/tmp/sorted.mat.gz"
+ args = "sort -m {} -o {} -R {}".format(self.matrix, oname, self.bed)
+ args = args.split()
+ cmo.main(args)
+ f = gzip.GzipFile(oname)
+ d = getHeader(f) # Skip the header, which can be in a different order
+ h = hashlib.md5(f.read()).hexdigest()
+ f.close()
+ assert d == dCorrect
+ expectedh = '10ea07d1aa58f44625abe2142ef76094'
+ assert f'{h}' == f'{expectedh}'
+ os.remove(oname)
diff --git a/deepTools/source/deeptools/test/test_corrGC/R_gc b/deepTools/source/deeptools/test/test_corrGC/R_gc
new file mode 100644
index 0000000000000000000000000000000000000000..cd33a11349f54116d3dd85847824c8a62a33b2f2
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/R_gc
@@ -0,0 +1,201 @@
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+3.307656666280550661e-02
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+4.410208888374068242e-02
+1.323062666512220265e-01
+1.000000000000000000e+00
+3.307656666280550661e-02
+1.653828333140275331e-02
+6.615313332561101323e-02
+6.615313332561101323e-02
+3.307656666280550661e-02
+2.205104444187034121e-02
+4.134570832850689021e-02
+4.134570832850689021e-02
+3.307656666280550661e-02
+7.560358094355544567e-02
+3.113088627087577664e-02
+7.028770415846170849e-02
+5.292250666048881752e-02
+5.197746189869437150e-02
+6.339675277037723489e-02
+1.036995062942010770e-01
+8.820417776748137872e-02
+1.035983031325606596e-01
+1.033178149692127318e-01
+1.504450290146960578e-01
+1.947324628880662545e-01
+1.609302185709576005e-01
+1.740871929621342629e-01
+2.137790729575008708e-01
+2.485379401579031200e-01
+2.669751452069302200e-01
+2.805382876215726795e-01
+2.961386359029306026e-01
+3.090365352437303703e-01
+3.312250633872607652e-01
+3.799211198630577657e-01
+3.649828045550952971e-01
+4.913611021356240061e-01
+4.709473062942309274e-01
+5.058536003401513659e-01
+5.815793868441754277e-01
+6.173010406256145277e-01
+6.081702422862803603e-01
+7.693364394163653142e-01
+8.164577035606058741e-01
+8.815917563596734619e-01
+9.460360674886332255e-01
+9.823026416119507997e-01
+9.144433123649089445e-01
+1.051462126168338562e+00
+1.176055703566418309e+00
+1.455368933163442513e+00
+1.183844878468770512e+00
+1.445746659225171982e+00
+1.509884015255844369e+00
+1.584432399161448402e+00
+1.626152022259152785e+00
+2.090608636508092033e+00
+1.675879377582145890e+00
+2.215676862755054266e+00
+2.345517712471297145e+00
+2.070960590498989617e+00
+2.173893735130848270e+00
+2.361992203002964086e+00
+2.245898876404494704e+00
+2.871647378452660160e+00
+2.895853411328622506e+00
+2.284394369946100234e+00
+2.596510483030232574e+00
+3.175350399629329523e+00
+3.018787984092050181e+00
+4.022110506197150492e+00
+3.322691469309099421e+00
+3.175350399629329079e+00
+6.805503590872233666e+00
+2.590417431276558435e+00
+3.969187999536661682e+00
+3.340733232943356779e+00
+4.041355054073691555e+00
+3.170625175820356745e+00
+4.308222807830418333e+00
+5.634041854897872348e+00
+7.822608015753503174e+00
+3.327502606278234243e+00
+3.274580099617745876e+00
+7.298895710259083458e+00
+3.517141588478319481e+00
+6.830311015869337865e+00
+6.400315649252866557e+00
+3.929496119541294963e+00
+7.839146299084906566e+00
+2.037516506428819429e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.488445499826248186e+00
+1.000000000000000000e+00
+2.249206533070774672e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
diff --git a/deepTools/source/deeptools/test/test_corrGC/R_gc_paired.txt b/deepTools/source/deeptools/test/test_corrGC/R_gc_paired.txt
new file mode 100644
index 0000000000000000000000000000000000000000..08cc627f4c05755da8ceaaf2756cd6e75044ccfb
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/R_gc_paired.txt
@@ -0,0 +1,301 @@
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+9.999910797344536695e-01
+9.999881063479697518e-01
+9.999960354178876187e-01
+9.999986784691363706e-01
+9.999995594893240636e-01
+9.999998531630649445e-01
+9.999999510543501335e-01
+9.999910634195275927e-01
+9.999881009096933671e-01
+9.999960336051001430e-01
+9.999986778648706220e-01
+9.999906390302180093e-01
+9.999790393966496715e-01
+9.999751729119278343e-01
+9.999828040490971182e-01
+9.999853477873693608e-01
+9.999683558235907821e-01
+9.999626919639887923e-01
+9.999429650515958556e-01
+9.998917955244356337e-01
+9.999014952266391809e-01
+9.999404043573286716e-01
+9.998998605246807170e-01
+9.998774284063762785e-01
+9.998610333117045057e-01
+9.997396508489226896e-01
+9.996546236674602737e-01
+9.996084574474356499e-01
+9.995039467350594897e-01
+9.994334724531075409e-01
+9.993832544833378684e-01
+9.992952489888378143e-01
+9.992124766060953567e-01
+9.991937947471972725e-01
+9.991341353337095388e-01
+9.989361911898771984e-01
+9.985677120637509452e-01
+9.981249142589175838e-01
+9.978708020679103452e-01
+9.976973257405287177e-01
+9.972845087879461667e-01
+9.967037002953756941e-01
+9.959790394434188121e-01
+9.951101586761411655e-01
+9.941062991879133781e-01
+9.924612331344023763e-01
+9.904680476116898280e-01
+9.885747079662892123e-01
+9.864643712263454489e-01
+9.833182778577946870e-01
+9.775190664032900045e-01
+9.700848144057120370e-01
+9.672094029191796727e-01
+9.615246458671455887e-01
+9.508348296088652285e-01
+9.391160320609912571e-01
+9.242509022112956885e-01
+9.066909222689595316e-01
+8.882577371730345783e-01
+8.671542759947431511e-01
+8.484839583712968647e-01
+8.326341029616983747e-01
+8.143221578913033554e-01
+7.888729155613086741e-01
+7.605389670367882493e-01
+7.393839281847264244e-01
+7.186395445767683743e-01
+6.932633298695518587e-01
+6.675563656468179730e-01
+6.557166368114221155e-01
+6.504490236624866917e-01
+6.351631261409090845e-01
+6.094480944384946808e-01
+5.959239597618474216e-01
+6.062711267642100221e-01
+6.096765092929627983e-01
+6.012419563708105708e-01
+5.950445568540655428e-01
+5.915286238538426389e-01
+5.945652918963458822e-01
+6.066585583688769434e-01
+6.232366849715897938e-01
+6.207735893523381732e-01
+6.174292194441698411e-01
+6.219755658323700143e-01
+6.333643704897149451e-01
+6.462121265298644834e-01
+6.507620909354857597e-01
+6.567693207017699653e-01
+6.666917142803674423e-01
+6.720857286686742205e-01
+6.809026098441173236e-01
+6.978556999377046877e-01
+7.049648706604161319e-01
+7.127873732353485758e-01
+7.296378668166592085e-01
+7.403554022845679761e-01
+7.463937996388737561e-01
+7.554663302694013538e-01
+7.613302447567783515e-01
+7.725250193054987724e-01
+7.874507966249127966e-01
+7.950448485277744615e-01
+8.123925157445656131e-01
+8.351176112325804368e-01
+8.448163899633056584e-01
+8.486626022941238245e-01
+8.518629775517951863e-01
+8.673459557442074752e-01
+8.867932780337125509e-01
+8.970672882946463256e-01
+9.121570405649662705e-01
+9.252104843400752454e-01
+9.347096521350021225e-01
+9.511757790014061520e-01
+9.749130560843786153e-01
+9.871340056274372499e-01
+9.861327645317359281e-01
+9.956404917161891799e-01
+1.013828711491180368e+00
+1.030610357269147626e+00
+1.044959547295003288e+00
+1.069485810055249120e+00
+1.085347970221164804e+00
+1.089971470505272988e+00
+1.102264255644627378e+00
+1.118238256270866371e+00
+1.139355768843890893e+00
+1.156758081791598780e+00
+1.169924492290504325e+00
+1.173819244193573974e+00
+1.171493467719889825e+00
+1.184597612914873865e+00
+1.200345579387997130e+00
+1.211459832379051527e+00
+1.238295300414279598e+00
+1.247966504099611296e+00
+1.240167911695359004e+00
+1.246792585479475024e+00
+1.262948060117081939e+00
+1.276655219603586966e+00
+1.275103640549761330e+00
+1.281133866408310507e+00
+1.296069180560731615e+00
+1.299133483576072612e+00
+1.300451108519138810e+00
+1.301216709707966990e+00
+1.304273174129296997e+00
+1.310916470284052782e+00
+1.318653145359068635e+00
+1.326424179836547923e+00
+1.332809104899952857e+00
+1.330874322483974304e+00
+1.331227188570605735e+00
+1.320629859072630419e+00
+1.310655178935595355e+00
+1.324967409044265132e+00
+1.305711227011654652e+00
+1.300056688446822761e+00
+1.325284998407550541e+00
+1.314578390711100164e+00
+1.286193460421670043e+00
+1.286803733181254294e+00
+1.280205307605654985e+00
+1.260356435274226472e+00
+1.237936479140162138e+00
+1.216672030278138861e+00
+1.213736531974581645e+00
+1.187420877435089617e+00
+1.163243401233964081e+00
+1.154202888945774275e+00
+1.138357408768514656e+00
+1.138048286269977227e+00
+1.128822718258669422e+00
+1.092812688452169168e+00
+1.074694290821570641e+00
+1.087005990409082079e+00
+1.067480924916803131e+00
+1.031726291161634945e+00
+1.023230841280303949e+00
+1.013539088603025107e+00
+1.016605799520342357e+00
+1.023192913724467523e+00
+1.026343655721627224e+00
+1.032122198652947631e+00
+1.027953032235908948e+00
+1.024294852853029569e+00
+1.013620755330080891e+00
+1.008819214784733242e+00
+1.018096662438548350e+00
+1.021338264269237017e+00
+1.022289328406943865e+00
+1.022615666885497099e+00
+1.037514057530589895e+00
+1.033582667958585999e+00
+1.004774416770266354e+00
+9.954468909407750976e-01
+1.001248555328404288e+00
+1.017317615127176555e+00
+1.008286166216193047e+00
+9.931907316083085080e-01
+1.003277482727201075e+00
+9.987250406848885431e-01
+9.882493937498659786e-01
+9.847709265094811704e-01
+9.876623465877873986e-01
+9.931650085164636099e-01
+9.888285353549343126e-01
+9.871859278030299389e-01
+9.873418023156780299e-01
+9.862213970957208753e-01
+9.804238547456385344e-01
+9.770518017386508047e-01
+9.838955984354510464e-01
+9.842054715700987444e-01
+9.822685046843998569e-01
+9.852919858875200942e-01
+9.870299766344043935e-01
+9.890557790764858970e-01
+9.893290115293941200e-01
+9.846980430143793539e-01
+9.821084715098751250e-01
+9.823189389533665272e-01
+9.848355849484369262e-01
+9.883068206567093839e-01
+9.866950501930423778e-01
+9.844581038612434387e-01
+9.876883213054056254e-01
+9.901331292641196713e-01
+9.898927843589159226e-01
+9.915692829348887738e-01
+9.940747967543506203e-01
+9.938446205266897593e-01
+9.934512733871044832e-01
+9.942493761614623615e-01
+9.952191479438154964e-01
+9.954819871703829426e-01
+9.938617138262522133e-01
+9.938404117603623078e-01
+9.945385622671887305e-01
+9.952572980347597076e-01
+9.966992800530753982e-01
+9.976503126506753860e-01
+9.976504786622593102e-01
+9.980590675708043147e-01
+9.983731660360567473e-01
+9.984779194266618640e-01
+9.986551642616506852e-01
+9.990079613462254926e-01
+9.993393758732639354e-01
+9.995033624355047497e-01
+9.996026008989237210e-01
+9.997070006299922929e-01
+9.997685533215920906e-01
+9.998336574703498014e-01
+9.998999515503660396e-01
+9.999220515307329915e-01
+9.999204994015705728e-01
+9.999556589695512976e-01
+9.999852192195868295e-01
+9.999950730246457420e-01
+9.999894374332408686e-01
+9.999786388732953002e-01
+9.999839593845165631e-01
+9.999857329008385864e-01
+9.999863240743439574e-01
+9.999954413165518252e-01
+9.999984804342324507e-01
+9.999994934775644495e-01
+9.999998311591312694e-01
+9.999999437197039098e-01
+9.999999812399005261e-01
+9.999999937466335087e-01
+9.999999979155446139e-01
+9.999999993051814640e-01
+9.999999997683938213e-01
+9.999999999227978664e-01
+9.999999999742660295e-01
+9.999999999914219728e-01
+9.999999999971407316e-01
+9.999999999990469846e-01
+9.999999999996823652e-01
+9.999999999998941957e-01
+9.999999999999646949e-01
+9.999999999999883427e-01
+9.999999999999961142e-01
+9.999999999999986677e-01
+9.999999999999995559e-01
+9.999999999999997780e-01
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
+1.000000000000000000e+00
diff --git a/deepTools/source/deeptools/test/test_corrGC/extra_sampling.bed b/deepTools/source/deeptools/test/test_corrGC/extra_sampling.bed
new file mode 100644
index 0000000000000000000000000000000000000000..b68bec53ad78daf79df01f12210ca552992f9515
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/extra_sampling.bed
@@ -0,0 +1 @@
+chr2L 1 4 In1 0 +
\ No newline at end of file
diff --git a/deepTools/source/deeptools/test/test_corrGC/filter_out.bed b/deepTools/source/deeptools/test/test_corrGC/filter_out.bed
new file mode 100644
index 0000000000000000000000000000000000000000..7ee4efd04f9cb368a6dc2428cec1470b10d1d7e7
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/filter_out.bed
@@ -0,0 +1 @@
+chr2L 4 8 OUT1 0 +
diff --git a/deepTools/source/deeptools/test/test_corrGC/frequencies_data.txt b/deepTools/source/deeptools/test/test_corrGC/frequencies_data.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7ba4f356f8f6c9bbd04184001cb805b38e58cd61
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/frequencies_data.txt
@@ -0,0 +1,11 @@
+0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00
+3.000000000000000000e+00 2.500000000000000000e+01 6.848780487804877470e-01
+4.000000000000000000e+00 3.700000000000000000e+01 6.170072511535926729e-01
+1.900000000000000000e+01 9.400000000000000000e+01 1.153606642449403141e+00
+2.300000000000000000e+01 1.020000000000000000e+02 1.286944045911047274e+00
+1.600000000000000000e+01 1.240000000000000000e+02 7.364280094413846456e-01
+9.000000000000000000e+00 5.900000000000000000e+01 8.706076891277387819e-01
+7.000000000000000000e+00 1.700000000000000000e+01 2.350071736011477341e+00
+1.000000000000000000e+00 1.000000000000000000e+01 5.707317073170731225e-01
+0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00
+0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00
diff --git a/deepTools/source/deeptools/test/test_corrGC/mappability.bg b/deepTools/source/deeptools/test/test_corrGC/mappability.bg
new file mode 100644
index 0000000000000000000000000000000000000000..a412b7ee1a60dd5e9ec4cf32d2fc779916fae994
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/mappability.bg
@@ -0,0 +1,3 @@
+chr2L 0 100 1
+chr2L 100 110 0.5
+chr2L 110 1000 1
diff --git a/deepTools/source/deeptools/test/test_corrGC/mappability.bw b/deepTools/source/deeptools/test/test_corrGC/mappability.bw
new file mode 100644
index 0000000000000000000000000000000000000000..03cefb8aafb27cf855c33e81bcb41c308af4ffc7
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/mappability.bw differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/paired.bam b/deepTools/source/deeptools/test/test_corrGC/paired.bam
new file mode 100644
index 0000000000000000000000000000000000000000..4fda03bcf496e3282ab94d391f34ee45d4691d92
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/paired.bam
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:291bd506f21f93b66e15b20454194e7284377725fe9f5f934546ff9e5db5ee4d
+size 512290
diff --git a/deepTools/source/deeptools/test/test_corrGC/paired.bam.bai b/deepTools/source/deeptools/test/test_corrGC/paired.bam.bai
new file mode 100644
index 0000000000000000000000000000000000000000..9859091e60053fa7dcacdb87731c78743e7f7da0
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/paired.bam.bai differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/sequence.2bit b/deepTools/source/deeptools/test/test_corrGC/sequence.2bit
new file mode 100644
index 0000000000000000000000000000000000000000..9fad041e7efd98ea197e79fe4f254e52e503a08c
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/sequence.2bit differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/sequence.fa b/deepTools/source/deeptools/test/test_corrGC/sequence.fa
new file mode 100644
index 0000000000000000000000000000000000000000..6c3649ce1047b9a664587dc50a453701aa2a99c9
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/sequence.fa
@@ -0,0 +1,21 @@
+>chr2L
+GAGTATCAGGAAGACCCAGAAATGTTGCTTGACCTCATGAATCGTATTGC
+CAAGGGATACCAAAATAACCCTGATCTACGACTGACTTGGTTGGAAAATA
+TGGCTAAAAAACACCGCGAGCGAGCAAATCACACGGAAGCAGCCATGTGT
+TATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTC
+ACAAAAACATTTGCCTGTTGGAGCTGTAAGTTTTCAACGAATTTCTCCCA
+ACACACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAA
+GATGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCCTT
+GCTGGAAGAAGCCTCCAATTCTTTTCAAGTTGCTGGCATGTATGAAGCAA
+TGAACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTAACAGAGAT
+TTTCAAAAGCTAAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCG
+AATATCCCAACTACAGGTAACAATATTGTGTAAATTTTACCAACGGAAAA
+TATATACATATTTATAAACAGGGTAAGAGAGTTTTTGGAACATACTTTCG
+TGTTGGCTTCTATGGCGGAAAATTTGGGGACTTGGATCAGCAGGAATTCA
+TTTATAAAGAGCCAACATTGACGAAGTTGCCCGAAATATTTAGTCGGCTT
+CAGGTATATATTGCAAATTGGAAAAAATAGAACTAATCAATTTTGTTTCA
+ACATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTCTGTGCAT
+ATCATTAAAGATTCCAATACCGTTGATATTAATAGCTTGGATCCCGATAA
+GGCTTACATTCAAATTACTTATGTTGAACCCTACTTTGAAACATATGAAA
+TGCGTCATCGTGAGACATACTTTGAGCGGAATTTCAATATAAGTATGATA
+TGAATTAAACAGATAATTTAAATCGaaatttaaaattataattttaaCAT
diff --git a/deepTools/source/deeptools/test/test_corrGC/sequence.fa.fai b/deepTools/source/deeptools/test/test_corrGC/sequence.fa.fai
new file mode 100644
index 0000000000000000000000000000000000000000..0de5601d45dc96edc3e09b8dc8edc436b69d394a
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/sequence.fa.fai
@@ -0,0 +1 @@
+chr2L 1000 7 50 51
diff --git a/deepTools/source/deeptools/test/test_corrGC/sizes b/deepTools/source/deeptools/test/test_corrGC/sizes
new file mode 100644
index 0000000000000000000000000000000000000000..7aa1a5605c00e33acb9205d1d05af566c7e470d9
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/sizes
@@ -0,0 +1,2 @@
+chr2L 1000
+
diff --git a/deepTools/source/deeptools/test/test_corrGC/test.bam b/deepTools/source/deeptools/test/test_corrGC/test.bam
new file mode 100644
index 0000000000000000000000000000000000000000..609e4e3b3f03d8042902806f1b18edf6d8c588f8
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/test.bam differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/test.bam.bai b/deepTools/source/deeptools/test/test_corrGC/test.bam.bai
new file mode 100644
index 0000000000000000000000000000000000000000..b2f00c94b5319b9f77da9d53188ea3e3072daa73
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/test.bam.bai differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/test.sam b/deepTools/source/deeptools/test/test_corrGC/test.sam
new file mode 100644
index 0000000000000000000000000000000000000000..27d683e970e72c813b1dec4a85fa584c3771bcfb
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/test.sam
@@ -0,0 +1,203 @@
+@HD VN:1.0 SO:unsorted
+@SQ SN:2L LN:23011544
+DD61XKN1:97:COBJ7ACXX:6:1304:14465:187787 0 2L 2 255 51M * 0 0 AGTATCAGGAAGACCCAGAAATGTTGCTTGACCTCATGAATCGTATTGCCA @@?DDBD?G XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2105:11023:179557 16 2L 12 255 51M * 0 0 AGACCCAGAAATGTTGCTTGACCTCATGAATCGTATTGCCAAGGGATACCA BG?CEHGIF@GGJIIGB?<9CIHGHHHHGAEDFIHFGF?;?AFDDEDD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1304:5656:150699 0 2L 17 255 51M * 0 0 CAGAAATGTTGCTTGACCTCATGAATCGTATTGCCAAGGGATACCAAAATA CCCFFFFFHHHHHJJGIJJJJJJIHGJJGGJJJIJIJJJJIIJJJJJIIJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1304:8200:170238 0 2L 19 255 51M * 0 0 GAAATGTTGCTTGACCTCATGAATCGTATTGCCAAGGGATACCAAAATAAC ?@@FBDDDHHGHAHHIJCGGDFHIHGHGIJJJGGIGIII3DHIGGGHIIJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1305:18486:172550 16 2L 21 255 51M * 0 0 AATGTTGCTTGACCTCATGAATCGTATTGCCAAGGGATACCAAAATAACCC JIGJJIJJIIHDJJJIHBJIIIHHIIIJJJIIJJIIHGFHHHHFFDDDCCB XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2307:2321:170436 0 2L 27 255 51M * 0 0 GCTTGACCTCATGAATCGTATTGCCAAGGGATACCAAAATAACCCTGATCT CC@FFFDFHHHHHIFIJJEBGGHGIIJJIJGGEGEHIGIJJJAHHIIHIII XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1207:5255:137547 0 2L 28 255 51M * 0 0 CTTGACCTCATGAATCGTATTGCCAAGGGATACCAAAATAACCCTGATCTA CCCFFFFFHGHHGIIGJHIHIHGIIJIIIGGIJJIJJIIJJEHJJJJJJJD XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1304:3130:106740 0 2L 32 255 51M * 0 0 ACCTCATGAATCGTATTGCCAAGGGATACCAAAATAACCCTGATCTACGAC CCCFFFFFHHGHHJJJJJJIIJJJJGHIJJJJJJJIJIJJJJIIIIGJJJI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2306:8891:14840 0 2L 32 255 51M * 0 0 ACCTCATGAATCGTATTGCCAAGGGATACCAAAATAACCCTGATCTACGAC =;?BBBDDFHDFHFBBD4B>@@G@?3A;DG3CBCEAGIG@?99DH*?D6 XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2207:9080:146208 0 2L 34 255 51M * 0 0 CTCNTGGATCGTATTGCCAAGGGATACCAAAATAANCCTGATCTACGNCTG ?@@#4A22AFHFHEHIJIIJJJJIJIJJJJJJIJJ#0?FCHHJIIJJ#-CED XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1205:9633:61290 0 2L 105 255 51M * 0 0 TAAAAAACACCGCGAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGT @@@DDDFDFDHHFI1C:DDG>EHIIICEHIIFGIAEHAA?9?CFF>CADC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2108:16953:173164 0 2L 105 255 51M * 0 0 TAAAAAACACCGCGAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGT CCCFFFFFHHHHHJIIIJHJJIJJJJJJJJJJIIJHHHHFEFFDBCEEEED XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2306:11917:55163 16 2L 105 255 51M * 0 0 TAAAAAACACCGCGAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGT =ACGHC@FGHGHHGGGGGC@@ECC=CEBE;=.;;@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1108:18039:70045 0 2L 112 255 51M * 0 0 CACCGCGAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGTACATGAG @@@FFFFFDDHFH>DBFGGIIIIIIIIIGGHE@6=F=DHGEAA7@CCHCFH XA:i:0 MD:Z:49C0T0 NM:i:2
+DD61XKN1:97:COBJ7ACXX:6:2302:8293:92003 16 2L 113 255 51M * 0 0 ACCGCGAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGTACATGCTG @JJIJJJJJIIIGJJIHG>IHJJJJJJIJJJJJJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1206:17788:76420 0 2L 117 255 51M * 0 0 CGAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGTACATGCTAGATC @@@DBDFFGHGHFIGGGIIIEH:ADGHEIGD*??DEA23A:C=2+B48== XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2301:9302:70963 16 2L 118 255 51M * 0 0 GAGCGAGCAAATCACACGGAAGCAGCCATGTGTTATGTACATGCTGCTTCT JJIGJIIIJJIGHFIGJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1108:5503:182542 16 2L 120 255 51M * 0 0 GCGAGCAAATCACACGGAAGCAGCCATGTGTTATGTACATGCTGCTTCTTT 0?DED>GFB@GB8CDDFFF?FEGIGIIIAHGIIIGIIIIIIII9EGAGIEBCGEGIGB XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1102:12979:59008 16 2L 138 255 51M * 0 0 AGCAGCCATGTGTTATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAG JJJIJJIIJJJJJIJJIGHJJJJJJJJJJJIJJIJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1206:9329:69872 16 2L 138 255 51M * 0 0 AGCAGCCATGTGTTATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAG HIIGIIGJIJJIJIJJIGIJJIHIJJIIGJIJIJJJIIGHGGHFFDFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2103:16418:25318 16 2L 144 255 51M * 0 0 CATGTGTTATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTT H>JJJJJJJJJHIJJJJJIIIJIJIJJJJJJJJJJJJJHHGHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2306:20617:39460 16 2L 147 255 51M * 0 0 GTGTTATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGA JJJJIJJJIIJJJJJJJJJJIIJJJJJJJJJJJJIJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1306:11689:51296 0 2L 148 255 51M * 0 0 TGTTATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAG @@@FDDADBFF8FGIIGIDCFHIGBHHGIJJIJE@GGIJJIIJEIAHDHHI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1304:20062:21791 16 2L 151 255 51M * 0 0 TATGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTCA JJJIIJIJIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2102:19935:152156 16 2L 153 255 51M * 0 0 TGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTCACA JIIIJIGHHGIIIGJIJJJJJJGJIJJJIIJJJIJJJIHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2306:17010:6596 16 2L 153 255 51M * 0 0 TGTACATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTCACA JIIJJJJJIJJJJJIJJJJJJJJJJJJJJJJJIJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1206:8566:197557 16 2L 158 255 51M * 0 0 ATGCTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTCACAAACAC EJJJJIIGGGGGIGHIHEHBGIIIJIIJIGGJIJIHEIGHHHHDFFDD@@? XA:i:1 MD:Z:48A2 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1307:11048:127938 0 2L 161 255 51M * 0 0 CTGCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTCACAAACACATT @@@DFFADHDHHHGGIEIHEG>IFHE?FHHHGAECFFGFEEHIIIIIEDHI XA:i:0 MD:Z:45A5 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1204:11281:93412 16 2L 163 255 51M * 0 0 GCTTCTTTAGTTTCTGAATATCTTAGCATGTTGGAGTCACAAACACATTTG JJJIJJJJIICHIJJJJIJIJJJJJJJJJJIJJJJJGJHHHHHFFFFFCCC XA:i:1 MD:Z:43A7 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1105:16101:180301 0 2L 181 255 51M * 0 0 TATCTTAGCATGTTGGAGTCACAAACACATTTGCCTGTTGGAGCTGTAAGT @FFDDDHHAEF?GHEFHEBBBD:??FB XA:i:1 MD:Z:25A25 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1307:14338:192440 0 2L 184 255 51M * 0 0 CTTAGCATGTTGGAGTCACAAACACATTTGCCTGTTGGAGCTGTAAGTTTT @DDDDDFHHFHGIGCGGGIDGGIIIIJBH>GHGGI?DHIGGDGHIJJGG XA:i:1 MD:Z:22A28 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1103:12859:144897 16 2L 186 255 51M * 0 0 TAGCATGTTGGAGTCACAAACACATTTGCCTGTTGGAGCTGTAAGTTTTCA HIEJIHGIGHFGGIIHIGGIHECEHIIJJGJJJIJIIEHHFHHFFFFFCCC XA:i:0 MD:Z:20A30 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:2107:7805:110890 16 2L 187 255 51M * 0 0 AGCATGTTGGAGTCACAAACACATTTGCCTGTTGGAGCTGTAAGTTTTCAA IIJIIJJIIIIGJJJIJJGIGJJIIJFJIJJIIIJJIIHHHHHFFFFFC@C XA:i:0 MD:Z:19A31 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1102:1861:107928 0 2L 192 255 51M * 0 0 GTTGGAGTCACAAACACATTTGCCTGTTGGAGCTGTAAGTTTTCAACGAAT @?@DDDDFFBHHHJEGIIGIGIIJGHIJJJIIGHJGHIICFHIJIIJIIJI XA:i:1 MD:Z:14A36 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:2308:20330:196190 16 2L 192 255 51M * 0 0 GTTCGAGTCACAAACACATTTGCCTGTTGGAGCTGTAAGTTTTCAACGAAT D90)JIIGHHEEFCGBJIHIIIJJIJJJJJJIHGJJIIHGHHHDFFFD@@@ XA:i:0 MD:Z:3G10A36 NM:i:2
+DD61XKN1:97:COBJ7ACXX:6:1201:12265:85344 0 2L 200 255 51M * 0 0 CACAAACACATTTGCCTGTTGGAGCTGTAAGTTTTCAACGAATTTCTCCCA CCCFFFFFHHHHHIIJJJJIJJJJJIJIIJJIJJJGIJIJJIJJIGIGGGI XA:i:1 MD:Z:6A44 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:2308:10209:45972 16 2L 201 255 51M * 0 0 ACAAACACATTTGCCTGTTGGAGCTCTAAGTTTTCAACGAATTTCTCCCAA GHGIIHDIIIIHG=GGIIG@JIGHCFFHIIAH:DDGGGDFECGFBG@AGH>>D XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1203:19077:54250 16 2L 245 255 51M * 0 0 ATCTCAACACCCTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCG ??=)6(-;.@4CBB@=B@60)AADDE?@?DFDEE?AC:DCAFGGIGIIIFIIBEE XA:i:0 MD:Z:48G0A1 NM:i:2
+DD61XKN1:97:COBJ7ACXX:6:1201:1270:36522 16 2L 251 255 51M * 0 0 ACACACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAG EEEEEFFFFFFFHHIGJJJJJIIJJJJJJJJJJJJJJJHHHHHFFFDFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1208:4869:36164 16 2L 251 255 51M * 0 0 ACACACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAG B@AA;D>FFEFFHHJJIJJIGGGIJJJJIFJIJIJIIJGHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2302:8813:117992 16 2L 251 255 51M * 0 0 ACACACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAG EEEDCFDFFFFFHHJIJJJIJJJJJJJJJJJJIJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2306:2156:189168 16 2L 251 255 51M * 0 0 ACACACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAG EECEADBD@FFFHEGGBIIJJJJIJJIIJJIJJJJJJJHHHFHFFFDDCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2108:20676:200337 16 2L 253 255 51M * 0 0 CCCTTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGAT ####@EDBFFEHEE;>IIIGHBJJIHHBIIIIEJIJIHHHHFC@+FFD?@? XA:i:0 MD:Z:0A1A0C47 NM:i:3
+DD61XKN1:97:COBJ7ACXX:6:2303:4499:87063 16 2L 253 255 51M * 0 0 ACACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGAT @EEEEEDFFFFEAJIGHIHHBCJJIGGGGIJJIIIGGHHHDHHFFFFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2301:12873:140362 0 2L 254 255 51M * 0 0 CACTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGATG CCCFFFFFHHHHHIJJJJJJJJJJIIJIJIIIGIJJIIIJJJJIIHFFDEE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2307:13419:129801 0 2L 256 255 51M * 0 0 CTAATGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGGTGGG @@@FFFDDFBDHHBIIBAHGHEHCH??D?DAG@FC?DBDGGFHAA>DFEEIIJIHGJIHGIHG?GGGGIIGGJJJIJJJJHHHHHFFDFF@CC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1208:21153:74484 0 2L 260 255 51M * 0 0 TGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCT @C@FFDEFAHGHFHGIJGGHEDGGIIGIEEGHGGGIIGGGFEBBDD@CC>B XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2101:13440:96989 0 2L 260 255 51M * 0 0 TGGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCT CCCFFFFFHHHHHJJJIJIJJIJJIJIIJJJIIIIJJIIJHGFFFF>CEEE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1104:11297:23938 0 2L 261 255 51M * 0 0 GGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCTG ?@@D1BD8DHDAFGFCBGGCG8:@4@A@# XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1302:16303:101241 0 2L 261 255 51M * 0 0 GGAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCTG @@@DDDDFHGHFGIIIIGIIJIJJJIIIGGFBHIIHEHBHEG@BF;3?;B> XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2103:10328:49800 16 2L 262 255 51M * 0 0 GAGTCGGCCGTATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCTGC @C@BDBBD=FFFFC=.888BCIFF@B90FEEFCF=CFAEIB=,B1;;; XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1107:9046:65823 16 2L 273 255 51M * 0 0 ATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCTGCCTAGGAAATCA DJHGGHGEJIIG@JIIGGDEJGHHFDJJHGHFHGHHGBHBAHFD8FFF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1205:16702:86065 16 2L 273 255 51M * 0 0 ATCGGATGATGTGCTAAGTCCCGGCGAAGATGGTATCTGCCTAGGAAATCA IHIIFIIIIIGCIGGIHF@GIIIIIIIHEFCIIHFB?GHFDGHFFFDFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2207:9123:81043 0 2L 275 255 51M * 0 0 CGGNTGGTGTGCTAAGTCCCGGCGAAGATGGTATCNGCCTAGGAAATNATT ?@@#4A22AAFHFGEGFHGGIGDGHIIIIII9DBF#-DFF4?<@C<8=FC@F9CF=7 XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2307:1821:56847 16 2L 291 255 51M * 0 0 TCCCGGCGAAGATGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTT JIHIJJJJIJJJJIJJJJJJJJJJIJJJJJIJJJJJJIHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1108:6855:132514 16 2L 292 255 51M * 0 0 CCCGGCGAAGATGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTG JIJGJJJJJJIJJJIJIJJIJJIHJJJIHBJJGJJJIJGHHGHFFFFF@@B XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1307:18891:60022 0 2L 298 255 51M * 0 0 GAAGATGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCC @@@FDDEFHHHHHJJJJJJJIIJJIJIIGJGHJJJGIIGIIJDHHJJJJJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1101:19397:5078 0 2L 299 255 51M * 0 0 AAGATGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCCT @@CDDFFFHHHHHJJJJJJIJJJJJJJJGDHIJJJJJJIJJIIJJJJJHII XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1205:3808:46142 16 2L 302 255 51M * 0 0 ATGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCCTTGC GGHGGIGIIIIIIIIGIIGEGGGHEHABIIIIIIGHGIHHDDHD;?DD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2301:2533:6592 16 2L 303 255 51M * 0 0 TGGTATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCCTTGCT GHIIJJJIJJJJJJIJJJIJJJJJIIJJJJJIJJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2304:17062:148239 16 2L 306 255 51M * 0 0 TATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCCTTGCTGGA IJIJIEIIJJIJIGIHIGIJGIGJGJJIIJJIGGGGGIHB=HHFDFFF@CC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2308:2095:33008 0 2L 307 255 51M * 0 0 ATCTGCCTAGGAAATCATTTCACTGAAACTGGGTTGAAGGCCTTGCTGGAA ;8@DDDDDBH>?FHF:F>CHH@EHIGIIIIIEFCF3:B?GGGBIHAHHAHDFFDF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1107:15167:131846 16 2L 366 255 51M * 0 0 CAATTCTTTTCAAGTTGCTGGCATGTATGAAGCAATGAACGAAGTGTACAA GEFBHG@IHEIGIIJIIIIGIGIFCIIHCGIGIEIIGJHGHGHFFFFF@CC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1108:1279:80195 0 2L 387 255 51M * 0 0 CATGTATGAAGCAATGAACGAAGTGTACAAAATTCTAATACCCATATGCGA @@CFFFFFHHHHHIJJJJJIJJJHIIGIIJJIJJIIJJJIJJJJJJJIIJI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1206:17772:153344 0 2L 387 255 51M * 0 0 CATGTATGAAGCAATGAACGAAGTGTACAAAATTCTAATACCCATATGCGA @<@DDFFFFDGHHJJGIJJIJIIDFIHIIJIIJJIJIJJIIIIJJIJI@GH XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2102:5591:82195 0 2L 392 255 51M * 0 0 ATGAAGCAATGAACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTA CCCFFFFFHHHHHJJJJJJJJJJJJJJJJJJIJJJJJJIJJJJJJIJJJJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2301:16432:177637 0 2L 394 255 51M * 0 0 GAAGCAATGAACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTAAC @@@DDDFFDDDBHHGIG>HCFEHEFHIDHHGH@HIEFG>FIHEHHGHCEGB XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2305:10443:61453 0 2L 395 255 51M * 0 0 AAGCAATGAACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTAAGA CCCFFFFFHHHHHJIHIHGIJJJJJJJJIJJJJJJIJIIJJJJJJJIJIII XA:i:0 MD:Z:49C1 NM:i:1
+DD61XKN1:97:COBJ7ACXX:6:1305:12300:195423 16 2L 396 255 51M * 0 0 AGCAATGAACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTAACAG IGGGHGADBF@IHBGHD;IEIHBCIIHF?AAIHFBGA)DF?HF?ADDD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1301:2252:57537 16 2L 403 255 51M * 0 0 AACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTAACAGAGATTTT JJIIJJJIHHJJJJJIFHFCIIGHJJJIHIIJJIHJIHBHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2307:15958:126504 0 2L 404 255 51M * 0 0 ACGAAGTGTACAAAATTCTAATACCCATATGCGAGGCTAACAGAGATTTTC @@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJIJJJIIIJJJJJJJJJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1106:8340:77588 0 2L 414 255 51M * 0 0 CAAAATTCTAATACCCATATGCGAGGCTAACAGAGATTTTCAAAAGCTAAG @CCFFFFFHHHHHJJIJJJJJJJIIIIJJJIIJJJIIIJJJJJIJJJJJJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1206:20077:11499 16 2L 416 255 51M * 0 0 AAATTCTAATACCCATATGCGAGGCTAACAGAGATTTTCAAAAGCTAAGCA IJHIGEJJIHDJJJHGIGJIJIGIHHJJIJJJIJIIIGHHHHFBFDFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1203:17030:154608 0 2L 425 255 51M * 0 0 TACCCATATGCGAGGCTAACAGAGATTTTCAAAAGCTAAGCAAAGTTCATG CCCFFFFFHHHH1FHJJGIJJJJIIJJJJJJJJJJJJIIJIGGGHHGIHIJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1107:20142:132171 16 2L 429 255 51M * 0 0 CATATGCGAGGCTAACAGAGATTTTCAAAAGCTAAGCAAAGTTCATGGCAA JJJIGGGGIIIICIJIIEHGIJIJIJJIGHGIGJIGJJGHFHHFFDFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2105:20333:148776 0 2L 435 255 51M * 0 0 CGAGGCTAACAGAGATTTTCAAAAGCTAAGCAAAGTTCATGGCAAATTGCA @C@FFFFFHHHHGJHIJJJIJJJJJIIJIJJJJJJGHIEIJJIJJIHIIII XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1207:11431:60533 16 2L 441 255 51M * 0 0 TAACAGAGATTTTCAAAAGCTAAGCAAAGTTCATGGCAAATTGCAGGAGGC HIIJJJIIJJJIHHIJJIIHFJIIGIJJHJJJIIJJJJHFGHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2108:9521:60699 16 2L 450 255 51M * 0 0 TTTTCAAAAGCTAAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCG JJIIGJIHIJJIIJJJJJJGIJJIIJJJJJJJJJIJIJGHHHHFFFFFC@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1202:8970:170602 0 2L 453 255 51M * 0 0 TCAAAAGCTAAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCGAAT CCCFFFFFHHHHHJJJJJJJJJJIJIJJIJJIJJIJJJJJJJJBIIGGHJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2107:8693:164773 0 2L 457 255 51M * 0 0 AAGCTAAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCGAATATCC @@@FFFFFHHHDHFHGGGJJJJJIJJJIJIHGHFIIJJJJIGGIIIGGIBF XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2307:18419:157090 16 2L 460 255 51M * 0 0 CTAAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCGAATATCCCAA IJJJJJJJJJJIJJJJJJJJIJJJJJJJJJJJJIIJJJHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1104:16588:94026 16 2L 462 255 51M * 0 0 AAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCGAATATCCCAACT JJJIJJJJIGGIGJJJJIIIJJJJJJIJIJIJIJJJJJHHHFHFFFFDBB? XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1207:4817:5073 16 2L 462 255 51M * 0 0 AAGCAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCGAATATCCCAACT B9B4B4@B:**?4EEDG?1A@F?ABFF>FA?=03A=:=; XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2107:17894:130557 0 2L 465 255 51M * 0 0 CAAAGTTCATGGCAAATTGCAGGAGGCATTTAATCGAATATCCCAACTACA BCCFFDFFGHGHHJJJJJJJJJJJJJJJJJJJJIIIJJJJJJJJJJJJJJI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2303:2851:98881 0 2L 555 255 51M * 0 0 TACATATTTATAAACAGGGTAAGAGAGTTTTTGGAACATACTTTCGTGTTG @BCFFFFFHHHHHJJJJJJAFGIJJIIGIIJJJJIJJJJJIJJJJJGIIJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2304:12883:92184 16 2L 563 255 51M * 0 0 TATAAACAGGGTAAGAGAGTTTTTGGAACATACTTTCGTGTTGGCTTCTAT EECDDBDEEEEEDBCCDD:DEAEEDEFCFEC9@@:1DE?D:DDFGGGGGBGEHDDH3;DGGIE;DD XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2106:16981:166296 0 2L 743 255 51M * 0 0 TTGTTTCAACATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTC ?@@AADBDHFDFDGHFAF;@BECHHII@EEHIGGDBFEFF;FHEGII;AF; XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2108:11645:6736 16 2L 744 255 51M * 0 0 TGTTTCAACATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTCT JIJJJIIFIIHHGIIGIGGIJIIIHGGIHEIHGIJIJIHHHHHFFFFFC@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1207:13167:93216 16 2L 747 255 51M * 0 0 TTCAACATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTCTGTG BCEDGHEFDDGFBGHFBIIIGHGG?GDFDBFBGGGHGGFHDHDDFFFD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1307:17649:158836 0 2L 749 255 51M * 0 0 CAACATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTCTGTGCA CCCFFFFFHFHHHIGEIGJJIJGJJJJIF@GHGIIJIIIHJGGJJJJ@FGI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1101:12267:163597 0 2L 752 255 51M * 0 0 CATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTCTGTGCATAT @@@DFF?DHHDHFHIIJJJIJGHIJJIIJJJJJGHIGIGHEGHI=CHIDGI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2102:12127:14270 16 2L 752 255 51M * 0 0 CATACGTTAGAACTTTTACACTGAACGATTCGGACCGGACTCTGTGCATAT JIIGJJJIIIIHJJJIGHIJJIIJJJJJJIJJJIJJJJHHHGHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1105:19675:113812 16 2L 771 255 51M * 0 0 ACTGAACGATTCGGACCGGACTCTGTGCATATCATTAAAGATTCCAATACC CGGJIJIIJJIJIGGDIIHHG@IHAAJIGIJIIIGIGHFHFHHFDDDD=@B XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:2308:3713:14646 0 2L 772 255 51M * 0 0 CTGAACGATTCGGACCGGACTCTGTGCATATCATTAAAGATTCCAATACCG @@CFFDDFFFGFFEFDBGGHDDGEDDFGGGGGEHGCHGGEGIIJJJIEIGG XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1202:10930:167853 16 2L 773 255 51M * 0 0 TGAACGATTCGGACCGGACTCTGTGCATATCATTAAAGATTCCAATACCGT GHCD@HGGFHEGHGIHFFIIIGFGHDGF*D8?DG7BGFF@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:97:COBJ7ACXX:6:1207:3046:69048 0 2L 784 255 51M * 0 0 GACCGGACTCTGTGCATATCATTAAAGATTCCAATACCGTTGATATTAATA CCCFFFFFHHHHHJJJJGIIJIJJJJJJJIJIJJJIJJJIIJJJJJJJJII XA:i:0 MD:Z:51 NM:i:0
diff --git a/deepTools/source/deeptools/test/test_corrGC/test_paired.bam b/deepTools/source/deeptools/test/test_corrGC/test_paired.bam
new file mode 100644
index 0000000000000000000000000000000000000000..f3962bfa3783766de0539285bd4a22be7933c9ba
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/test_paired.bam differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/test_paired.bam.bai b/deepTools/source/deeptools/test/test_corrGC/test_paired.bam.bai
new file mode 100644
index 0000000000000000000000000000000000000000..0df7203a23a423a24f0435915097bb958fa65c5c
Binary files /dev/null and b/deepTools/source/deeptools/test/test_corrGC/test_paired.bam.bai differ
diff --git a/deepTools/source/deeptools/test/test_corrGC/test_paired.sam b/deepTools/source/deeptools/test/test_corrGC/test_paired.sam
new file mode 100644
index 0000000000000000000000000000000000000000..4a9f7f1600a5e374965770f2bc57b909fd431c16
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_corrGC/test_paired.sam
@@ -0,0 +1,51 @@
+@HD VN:1.0 SO:unsorted
+@SQ SN:chr2 LN:5010000
+HWUSI-EAS616:7:89:1518:3543#0 99 chr2 5000027 255 36M = 5000355 364 TGTAACAATTTACTTGATTGTTCTCAAGGATGTGAT S\dbdcfaaccbbcccc^dcdc^c^YaaabaaWdaa XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:23:12543:6958#0 83 chr2 5000081 255 36M = 4999776 -341 CTTTGTTGTTCTCCTGTCCATTTCTCACAAAGCTGG hhhhhhhghfhhfhhhhhhfhhhhhhhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:6:10857:18740#0 83 chr2 5000191 255 36M = 4999866 -361 GTTAGGGGCAATCCTGATTCACAGTTAGCTTCTTAG ggggggggegggggggffcfdgggggggggggdggg XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:93:7807:15274#0 163 chr2 5000304 255 36M = 5000676 408 GAGAGGGAGAGGGAGAGGGAGAATGAAGCAGGAATG hhhhhhhhhehhhhhhhhhehhhhghhhfhhhfghh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:89:1518:3543#0 147 chr2 5000355 255 36M = 5000027 -364 AGGTCATCAGGCTGGAATTTCAGGTAAGAATTACCA R]bT]`eRd^db]baY`W_W^_\]\_XZUZVVNVHU XA:i:1 MD:Z:34A1 NM:i:1
+HWUSI-EAS616:7:21:17769:5446#0 163 chr2 5000385 255 36M = 5000496 147 TTACAATTGATGTCTGGACTCCAAATCCCTCAAGTG hhhhhhghhhhhhhhhhhhhhfhhhhhhhhhgghdh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:49:8992:4457#0 99 chr2 5000385 255 36M = 5000496 147 TTACAATTGATGTCTGGACTCCAAATCCCTCAAGTG de\dc\fffdf^cdfe\cff_ffffffdafgcgggg XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:21:17769:5446#0 83 chr2 5000496 255 36M = 5000385 -147 TACTCTCAAAGCTTTCAAATGAAAGGACCCACACAC ghhahhgghhdffccfhhhhhhhghhdfffffdfff XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:49:8992:4457#0 147 chr2 5000496 255 36M = 5000385 -147 TACTCTCAAAGCTTTCAAATGAAAGGACCCACACAC hehhhhgghhgchhghfhhfafhhffhhhhhfdfff XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:86:18020:6752#0 99 chr2 5000560 255 36M = 5000906 382 AACACCAATCACATAAAGAATGTGTCTCCACAGAAG hhgahhhhhghfhhhhhghhhh_cghfhhfhchghh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:93:7807:15274#0 83 chr2 5000676 255 36M = 5000304 -408 AAAACTTATCAGTTTTTAAAGGTTACTGAGGGCTTG hhhghhhhhghhhhhhghhhghhhhhhhdhghhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:64:3102:21101#0 99 chr2 5000737 255 36M = 5001136 435 TTGAGGTCAACCTGGGTTACATGGCAAGACCTTGGT hhhhhhghhhhhhhhhfhhhhhhhhhhhghhhhhgf XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:36:16424:18394#0 163 chr2 5000820 255 36M = 5001193 409 TGAAAAGGCATACGGAGCAGCTGATGTTTCTCCAAC hhhhhhhghhhhhhhhhfghghhfhgdhhgehhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:50:18110:21009#0 163 chr2 5000822 255 36M = 5001123 337 AAAAGGCATACGGAGCAGCTGATGTTTCTCCAACAT ffffccafacfffcfahffdfaffc`a`^`ddfdh] XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:111:14171:9899#0 163 chr2 5000822 255 36M = 5001123 337 AAAAGGCATACGGAGCAGCTGATGTTTCTCCAACAT hhhhhhhhhhhhhhhhghhhhhgefhhhhhhhhhgh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:114:18552:5841#0 163 chr2 5000822 255 36M = 5001123 337 AAAAGGCATACGGAGCAGCTGATGTTTCTCCAACAT hhhhhhhhhhhhhhghehhhhhhhhhhhfhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:8:2829:6484#0 99 chr2 5000835 255 36M = 5001214 415 AGCAGCTGATGTTTCTCCAACATCATCCTGGTGTGG hhhfhhhghhhhhhghhhhchghhghhhghhghhhh XA:i:1 MD:Z:28C7 NM:i:1
+HWUSI-EAS616:7:4:16767:6127#0 163 chr2 5000856 255 36M = 5001242 422 ATCATCCTGGTGTGGGGAGGTAGAGGCAGAGGATCA hhghhhhhhc^edeec__]_cfccfdfffW_fdad_ XA:i:1 MD:Z:7C28 NM:i:1
+HWUSI-EAS616:7:66:4289:12506#0 99 chr2 5000868 255 36M = 5001183 351 TGGGGAGGTAGAGGCAGAGGATCAGGAGTTCATGTT hhhhhhhhdhhghghggghhehhhhhehehfhhhfh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:86:18020:6752#0 147 chr2 5000906 255 36M = 5000560 -382 CCCCTAGCTTTATGCTGTCCATGGTTCATCATCTCT hchhfdhghhfghfahhhhhhhhfhhhhhhhghheh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:23:8152:13433#0 163 chr2 5000926 255 36M = 5000988 98 ATGGTTCATCATCTCTTATGGCCCATATTAGTCATT hhhhhhhhhhhhhhhhhhhghhhhhhehhhhhhehh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:103:13675:10395#0 163 chr2 5000926 255 36M = 5000988 98 ATGGTTCATCATCTCTTATGGCCCATATTAGTCATT ffffahhhhfffgfhffh]f]fddfd_fffdcfff\ XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:106:13391:17723#0 163 chr2 5000938 255 36M = 5001303 401 CTCTTATGGCCCATATTAGTCATTGTGCCATTCAAA hhhhhhhhhhhhhhhhhhhghhghhehhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:23:8152:13433#0 83 chr2 5000988 255 36M = 5000926 -98 AGAAACATGTTTCACAGCTCCTACTGTATCCTGGAC hhhhghghhhhhhehhhgdhhhhhhghhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:103:13675:10395#0 83 chr2 5000988 255 36M = 5000926 -98 AGAAACATGTTTCACAGCTCCTACTGTATCCTGGAC f_ffcfdddcaV`XZa[dadcaWcadddaccd[fcf XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:116:8653:16765#0 99 chr2 5001011 255 36M = 5001141 166 CTGTATCCTGGACCCTAGGGATGCAACAGTGGCAAG hhghghhhhfhhhhhhhhhhffhhhhgghghfhfhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:81:1971:19006#0 99 chr2 5001026 255 36M = 5001396 406 TAGGGATGCAATAGTGGCAAGATGTGGTTTCTGCTC hhhhhhhhhhhghfhhhhhhghhhhhhghhhhhghd XA:i:1 MD:Z:11C24 NM:i:1
+HWUSI-EAS616:7:39:11820:4528#0 163 chr2 5001051 255 36M = 5001401 386 GGTTTCTGCTCTTCATTGTGAGCTGACTTGGCTGAG hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:25:4767:20646#0 163 chr2 5001115 255 36M = 5001378 299 TCAGAGAGGATAACGTAAGGTAACTCAGAGATATGC ghhfhhcaahe^d`ebfddc^ad`dada[cVb`^^c XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:2:14523:20268#0 163 chr2 5001116 255 36M = 5001234 154 CAGAGAGGATAACGTAAGGTAACTCAGAGATATGCA ghghhhhhfgchghcecfchhchhhghchchehhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:79:5441:2939#0 163 chr2 5001116 255 36M = 5001234 154 CAGAGAGGATAACGTAAGGTAACTCAGAGATATGCA ghcghfhhhchhhffffaffffacffa_f[fafcaf XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:50:18110:21009#0 83 chr2 5001123 255 36M = 5000822 -337 GATAACGTAAGGTAACTCAGAGATATGCACAGGAAG \Z_aa^XY^aa\aaaa\^S^cd^dddYccffb\f_f XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:111:14171:9899#0 83 chr2 5001123 255 36M = 5000822 -337 GATAACGTAAGGTAACTCAGAGATATGCACAGGAAG hhghhhhhhhhhhhhhhhhhhhhghhhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:114:18552:5841#0 83 chr2 5001123 255 36M = 5000822 -337 GATAACGTAAGGTAACTCAGAGATATGCACAGGAAG hhhhhhghhhhgghhhhhhhhhhhhhhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:64:3102:21101#0 147 chr2 5001136 255 36M = 5000737 -435 AACTCAGAGATATGCACAGGAAGTTGCATATTTGCA afghhhhgghgehghghhhhhhhhhhfhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:116:8653:16765#0 147 chr2 5001141 255 36M = 5001011 -166 AGAGATATGCACAGGAAGTTGCATATTTGCAATAAA hhhhhhhhhgghhhhhhhhhhhhhhhhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:66:4289:12506#0 147 chr2 5001183 255 36M = 5000868 -351 CACAGCAGAACTATTAGATCCAGGCACTCACTCAAC hghhhhfffcfddfbdhhhfhhhhhghhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:36:16424:18394#0 83 chr2 5001193 255 36M = 5000820 -409 CTATTAGATCCAGGCACTCACTCAACGTGGATTCTG dbddcbfhehhehffdghhghhggghhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:8:2829:6484#0 147 chr2 5001214 255 36M = 5000835 -415 TCAACGTGGATTCTGGGCTCTTCAGCTCTGATCTCT ghhhhhfhhhhhhhhhhhhhhhhhhhhhghhfhhgg XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:3:9592:15607#0 163 chr2 5001227 255 36M = 5001568 377 TGGGCTCTTCAGCTCTGATCTCTTTAGCTCTGATTT hhhehhhhhhhhhhfhhhhghghhhhhhhhhhhhgh XA:i:2 MD:Z:24C6C4 NM:i:2
+HWUSI-EAS616:7:2:14523:20268#0 83 chr2 5001234 255 36M = 5001116 -154 TTCAGCTCTGATCTCTTCAGCTCTCATTTGCTCAGA aagggggffgaffafcafeafaggcgaggcgfeggg XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:79:5441:2939#0 83 chr2 5001234 255 36M = 5001116 -154 TTCAGCTCTGATCTCTTCGGCTCTCATTTGCTCAGA gdd`baWfafedd`ae`dKadfcfffaaffdfffff XA:i:1 MD:Z:18A17 NM:i:1
+HWUSI-EAS616:7:4:16767:6127#0 83 chr2 5001242 255 36M = 5000856 -422 TGATCTCTTTAGCTCTGATTTGCTCAGACTGTCTAT fggfggaffffcefgaffddf_^ggggeggfgcgdg XA:i:2 MD:Z:9C6C19 NM:i:2
+HWUSI-EAS616:7:106:13391:17723#0 83 chr2 5001303 255 36M = 5000938 -401 GTCTGCAGTCAACTGGCAGTCCCTCTCAGGGAAAAT hhhhhhhhhghghhhhhhhhhghhhfhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:25:4767:20646#0 83 chr2 5001378 255 36M = 5001115 -299 GAAAGTGCTCTTTGTCAGACTAAGACCTTTTAATTT gffff_ggaggfaffca^_a\ffd`afgcggggggg XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:81:1971:19006#0 147 chr2 5001396 255 36M = 5001026 -406 ACTAAGACCTTTTAATTTGTCCCATTTTAATAGTAC hhhhhhghhhhhhhhhhhhhhhhhhhhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:39:11820:4528#0 83 chr2 5001401 255 36M = 5001051 -386 GACCTTTTAATTTGTCCCATTTTAATAGTACATATG hhhhhhhhghhhhhhhhhghhhhhghhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:29:12313:10611#0 0 chr2 5001492 255 36M * 0 0 AATTGTAAGACCCCCGAAACTGGGGAGACCTCCGCT fd]aeecaaWcdfd_ffffcZ[aaa_\Z]`Z^Z___ XA:i:0 MD:Z:36 NM:i:0
+HWUSI-EAS616:7:39:11820:45288#0 16 chr2 5001701 255 36M * 0 0 GACCTTTTAATTTGTCCCATTTTAATAGTACATATG hhhhhhhhghhhhhhhhhghhhhhghhhhhhhhhhh XA:i:0 MD:Z:36 NM:i:0
diff --git a/deepTools/source/deeptools/test/test_countReadsPerBin.py b/deepTools/source/deeptools/test/test_countReadsPerBin.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb09a9684c5521f2434dc696bf6031ce7b1b391a
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_countReadsPerBin.py
@@ -0,0 +1,209 @@
+# from unittest import TestCase
+
+import deeptools.countReadsPerBin as cr
+import numpy as np
+import numpy.testing as nt
+import os.path
+import pytest
+
+__author__ = 'Fidel'
+
+
+@pytest.mark.parametrize("bc", ["bam", 'cram'])
+class TestCountReadsPerBin():
+
+ def ifiles(self, ext='bam'):
+ root = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
+ bamFile1 = root + "testA." + ext
+ bamFile2 = root + "testB." + ext
+ bamFile_PE = root + "test_paired2." + ext
+ chrom = '3R'
+ step_size = 50
+ bin_length = 25
+ c = cr.CountReadsPerBin(
+ [bamFile1, bamFile2],
+ binLength=bin_length,
+ stepSize=step_size
+ )
+ return c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length
+ """
+ The distribution of reads between the two bam files is as follows.
+
+ They cover 200 bp::
+
+ 0 100 200
+ |------------------------------------------------------------|
+ A ==============>
+ <==============
+
+
+ B <============== ==============>
+ ==============>
+ ==============>
+ """
+
+ def test_count_reads_in_region(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.skipZeros = False
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+
+ nt.assert_equal(resp, np.array([[0, 0.],
+ [0, 1.],
+ [1, 1.],
+ [1, 2.]]))
+
+ def test_count_reads_in_region_extension_1(self, bc):
+ """
+ In this case when read extension is smaller than read length
+ extension is turned off and a warning is printed.
+ """
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c = cr.CountReadsPerBin(
+ [bamFile1, bamFile2],
+ binLength=1,
+ stepSize=50,
+ extendReads=25
+ )
+
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+
+ nt.assert_equal(resp, np.array([[0, 0.],
+ [0, 1.],
+ [1, 1.],
+ [1, 2.]]))
+
+ def test_count_reads_in_region_total(self, bc):
+ """ count the reads over the whole region
+ 2 for the first case, and 4 for the second
+ """
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.skipZeros = False
+ c.stepSize = 200
+ c.binLength = 200
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_equal(resp, np.array([[2, 4.]]))
+
+ def test_countReadsInRegions_min_mapping_quality(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ # Test min mapping quality.
+ c.minMappingQuality = 40
+ c.skipZeros = False
+
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_equal(resp, np.array([[0, 0, 0, 1.],
+ [0, 0, 0, 1.]]).T)
+
+ def test_count_reads_in_region_ignore_duplicates(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ # Test ignore duplicates
+ c.skipZeros = False
+ c.ignoreDuplicates = True
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+
+ nt.assert_equal(resp, np.array([[0, 0, 1, 1.],
+ [0, 1, 1, 1.]]).T)
+
+ def test_count_reads_in_region_ignore_bed_regions(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ # Test bed regions:
+ bed_regions = [[chrom, [(10, 20)], "."], [chrom, [(150, 160)], "."]]
+ c.skipZeros = False
+ c.binLength = 10
+ resp, _ = c.count_reads_in_region(chrom, 0, 200, bed_regions_list=bed_regions)
+ nt.assert_equal(resp, np.array([[0, 1.],
+ [0, 2.]]).T)
+
+ def test_get_coverage_of_region_sam_flag_include(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.samFlag_include = 16 # include reverse reads only
+ c.bamFilesList = [bamFile1]
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_array_equal(resp, np.array([[0], [0], [0], [1]]))
+
+ def test_get_coverage_of_region_sam_flag_exclude(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.samFlag_exclude = 16 # exclude reverse reads
+ c.bamFilesList = [bamFile1]
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_array_equal(resp, np.array([[0], [0], [1], [0]]))
+
+ def test_get_coverage_of_region_large_bin(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.bamFilesList = [bamFile2]
+ c.binLength = 200
+ c.stepSize = 200
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_array_equal(resp, np.array([[4]]))
+
+ def test_get_coverage_of_region_ignore_duplicates(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.ignoreDuplicates = True
+ c.bamFilesList = [bamFile2]
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_array_equal(resp, np.array([[0.],
+ [1.],
+ [1.],
+ [1.]]))
+
+ # check zero to nans
+ c.zerosToNans = True
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+ nt.assert_array_equal(resp, np.array([[np.nan],
+ [1.],
+ [1.],
+ [1.]]))
+
+ def test_get_coverage_of_region_split_read(self, bc):
+ """
+ The bamFile1 contains a read at position 10
+ with the following CIGAR: 10S20M10N10M10S
+ that maps to a chromosome named chr_cigar.
+ """
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ # turn of read extension
+ c.extendPairedEnds = False
+ c.bamFilesList = [bamFile1]
+ c.binLength = 10
+ c.stepSize = 10
+ resp, _ = c.count_reads_in_region('chr_cigar', 0, 100)
+ nt.assert_array_equal(resp, np.array([[0.],
+ [1.],
+ [1.],
+ [0.],
+ [1.],
+ [0.],
+ [0.],
+ [0.],
+ [0.],
+ [0.]]))
+
+ def test_get_coverage_of_region_zeros_to_nan(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ c.zerosToNans = True
+ resp, _ = c.count_reads_in_region(chrom, 0, 200)
+
+ nt.assert_equal(resp, np.array([[np.nan, np.nan],
+ [np.nan, 1],
+ [1, 1],
+ [1, 2]]))
+
+ def test_bed_file(self, bc):
+ c, bamFile1, bamFile2, bamFile_PE, chrom, step_size, bin_length = self.ifiles(bc)
+ bed = "chr3R\t0\t10\nchr3R\t110\t120\nchr3R\t160\t180"
+ import tempfile
+ bed_file = tempfile.NamedTemporaryFile(suffix=".bed", delete=False, mode="w")
+ bed_file.write(bed)
+ bed_file.close()
+
+ c = cr.CountReadsPerBin(
+ [bamFile2],
+ bedFile=[bed_file.name]
+ )
+
+ resp = c.run()
+ nt.assert_equal(resp, np.array([[0.],
+ [1.],
+ [2.]]))
+
+ import os
+ os.unlink(bed_file.name)
diff --git a/deepTools/source/deeptools/test/test_data/computeMatrixOperations.bed b/deepTools/source/deeptools/test/test_data/computeMatrixOperations.bed
new file mode 100644
index 0000000000000000000000000000000000000000..9a262203562f675f05669373e3b6edab66b6bbda
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/computeMatrixOperations.bed
@@ -0,0 +1,196 @@
+6 85676989,85677423,85677794,85677953,85678135 85677191,85677492,85677875,85678032,85678736 ENST00000369605 . -
+2 132147908,132150911,132153531,132154661,132156972,132157308,132161487 132148228,132150976,132153694,132154768,132157146,132157423,132161955 ENST00000295181 . -
+11 43829708,43833609 43831590,43833917 ENST00000499066 . -
+3 48630451,48630606,48630992,48631223,48631648,48631879,48632244 48630515,48630720,48631126,48631306,48631801,48632044,48632337 ENST00000444531 . -
+22 28687762,28689134,28694031,28695126,28695709,28696900,28699837,28703504,28710005,28711908,28719394,28724976,28725242,28730419,28734402,28741768 28687986,28689215,28694117,28695242,28695873,28696987,28699937,28703566,28710059,28712017,28719485,28725124,28725367,28730548,28734727,28741838 ENST00000382580 . -
+7 76510427,76510902,76511656,76513083,76513946,76515044 76510737,76511052,76511882,76513163,76514076,76516521 ENST00000334348 . +
+6 154086506,154089825,154090951,154118682 154087785,154090178,154091472,154118969 ENST00000522555 . +
+X 153444719,153447747,153454393,153455541,153456264,153456492,153457136,153462609,153464987,153469145,153470449 153445464,153447909,153454508,153455766,153456364,153456651,153457228,153462671,153465055,153469261,153470587 ENST00000334497 . -
+3 154121002,154121941,154124409,154129573,154149388,154152771,154187684,154191288,154194643,154217868,154225855,154240369,154253115,154254719,154255330 154121235,154123075,154124449,154129719,154149445,154152932,154187837,154191418,154194718,154217958,154226010,154240579,154253183,154254824,154257827 ENST00000356448 . +
+20 21197461,21198728,21202313,21213535,21213910,21218146 21197576,21198804,21202372,21213592,21214022,21218289 ENST00000433213 . -
+18 28009803,28011845,28013682,28097062 28009872,28011992,28013909,28097229 ENST00000430882 . -
+2 215476666,215479921 215476730,215480248 ENST00000414756 . +
+16 35195778 35197544 ENST00000564059 . +
+10 73496494,73498297 73496906,73498541 ENST00000620559 . +
+14 105209285,105211121,105212112,105217543,105218997,105219150,105220068,105221647,105226068,105226250,105226633,105228819,105241264,105252506,105256517,105272720,105286295,105300445 105210588,105211293,105212164,105217800,105219053,105219232,105220130,105221914,105226161,105226290,105226760,105228913,105241414,105252579,105256549,105272894,105286376,105315589 ENST00000546474 . -
+1 68375326 68375663 ENST00000434209 . -
+20 43916166,43955147,43973366,44006546,44051305,44054298,44064776,44065711,44066729,44068649 43916208,43955272,43973432,44006792,44051545,44054526,44064857,44066107,44066857,44069616 ENST00000372999 . +
+22 20318118 20318749 ENST00000608275 . -
+15 60456764,60466593,60468060,60476062,60478567,60479002 60456794,60466713,60468162,60476167,60478650,60479107 ENST00000561087 . -
+12 101280127,101281115,101285569,101285748,101286320,101288959,101290136,101290732 101280327,101281196,101285636,101285881,101286509,101289041,101290274,101291427 ENST00000551825 . +
+19 5894676,5896452,5896904,5903611 5894854,5896575,5896997,5903756 ENST00000308961 . -
+8 88032008,88046668,88056127,88074604,88116506,88118699,88167668,88186475,88197157,88327074 88041795,88046784,88056278,88074743,88116718,88118861,88167973,88186598,88197306,88327488 ENST00000286614 . -
+2 218217140,218217371,218225919,218228737,218234351 218217254,218217544,218225954,218228850,218234398 ENST00000420104 . +
+1 150067808,150068629,150072165,150076232 150067950,150068764,150072226,150077029 ENST00000497638 . +
+1 114716915,114718612,114719578,114720538,114723882,114725220,114726210,114726982,114730257,114730507,114732603,114733731,114733988,114734441,114736757,114737470,114737962,114739691,114741526,114749820 114718216,114718745,114719742,114720717,114724002,114725333,114726386,114727090,114730422,114730648,114732816,114733857,114734117,114734523,114736855,114737563,114738072,114739890,114741673,114750190 ENST00000438362 . -
+15 51884555 51884653 ENST00000384753 . +
+11 126283105,126288442,126289664,126290461,126290802 126283153,126288573,126289833,126290585,126290897 ENST00000467006 . +
+2 241150466,241153482,241157806,241158483 241150547,241153604,241157862,241158807 ENST00000473017 . +
+12 121210064,121216954,121217133,121221912,121222093,121222946,121228532,121228724,121228962,121232413,121232610,121232996,121233522 121210298,121217002,121217281,121221984,121222166,121223043,121228613,121228866,121229099,121232507,121232676,121233092,121234070 ENST00000359949 . +
+2 58428384,58656649,58695633 58428464,58656711,58696055 ENST00000455219 . +
+17 43006724,43012178,43013046,43013266,43013475,43013775,43014107 43006968,43012277,43013194,43013373,43013662,43013882,43014456 ENST00000415816 . +
+5 38429891 38430160 ENST00000513087 . -
+2 203380307,203391045,203394699,203395658,203402575 203380384,203391143,203394846,203395780,203402734 ENST00000451591 . +
+12 122976884,122978359,122978752 122976970,122978569,122978830 ENST00000545976 . +
+19 10637643,10637855,10638226,10642366,10643278 10637747,10637929,10638315,10642451,10643528 ENST00000591194 . +
+19 39264381,39264964,39265108,39265490,39265718 39264531,39265036,39265252,39265565,39265817 ENST00000607083 . +
+11 118999040,119010763,119011220,119011938,119012148,119013231,119014213 118999109,119010830,119011311,119012003,119012254,119013262,119014346 ENST00000580556 . +
+11 83459507,83462001 83459924,83462149 ENST00000529159 . -
+12 55684857,55686201,55687970,55688201,55688843 55685288,55686314,55688096,55688300,55688891 ENST00000557555 . -
+2 207868581,207869807 207868942,207869915 ENST00000421964 . -
+10 86968191,86970199,86970475 86968605,86970262,86970915 ENST00000372013 . +
+16 71525232,71526119,71536659 71525364,71526312,71538746 ENST00000338482 . +
+1 2590925,2591556,2591931,2592654,2592832,2593813,2594384,2594789,2595275,2596008,2596560,2598206,2598653,2603883,2604146,2605557,2606247,2606973,2609338,2609669,2611280,2612126,2629330 2591089,2591633,2592027,2592720,2592966,2593962,2594443,2594893,2595359,2596107,2596689,2598300,2598790,2603973,2604281,2605623,2606366,2607069,2609419,2609831,2611340,2612204,2629494 ENST00000504800 . -
+19 39886265,39887118 39886585,39887292 ENST00000595713 . -
+1 24625410 24625513 ENST00000459380 . -
+8 12738028,12740873,12743163,12755923 12738144,12740996,12743282,12756073 ENST00000530693 . -
+17 82900541,82903404,82905935,82907760,82909284,82911757,82920555,82921500,82923651,82924938,82926399 82900731,82903478,82906053,82907821,82909307,82911789,82920618,82921577,82923733,82925057,82926439 ENST00000574422 . +
+4 158171348,158172872 158171489,158173318 ENST00000587787 . -
+2 135638617,135638738,135638895,135645378,135649086 135638655,135638789,135639122,135645527,135649331 ENST00000443537 . +
+X 74278373,74280930,74281701,74292426 74280494,74281085,74281848,74292600 ENST00000429124 . -
+3 33277465,33297663,33373091,33373259,33373577,33373846,33375287,33377272,33378102,33378684,33383988,33385500,33403233 33277515,33297725,33373160,33373355,33373704,33373921,33375418,33377333,33378147,33378741,33384201,33385728,33403662 ENST00000463736 . +
+8 46792064,46792608 46792320,46793064 ENST00000509929 . +
+1 44988233 44988725 ENST00000411837 . -
+6 44127553,44134560,44135016,44135327,44136348,44138479,44139466,44139707,44140251 44127678,44134743,44135096,44135366,44136439,44138517,44139609,44139759,44140328 ENST00000532634 . +
+15 57720294 57720928 ENST00000567865 . +
+19 21788879,21793516 21789100,21793860 ENST00000593824 . -
+10 35195146,35206894,35211253 35195214,35207051,35212923 ENST00000356917 . +
+17 8144993 8145071 ENST00000614952 . -
+1 226870183,226871261,226875364,226881887 226870505,226871404,226875550,226881970 ENST00000524196 . +
+2 2729907,2730775 2730093,2730957 ENST00000457813 . -
+16 88803212,88804544,88804761,88805439,88805723,88806020,88806485,88807050,88807280,88808114 88804059,88804667,88804898,88805637,88805869,88806121,88806674,88807203,88807482,88809258 ENST00000301019 . +
+12 6981293,6981810,6982675 6981632,6981904,6982783 ENST00000620843 . -
+16 84459258,84461694,84467303 84460893,84462999,84467361 ENST00000565700 . -
+4 77030782,77039742 77030970,77040100 ENST00000513373 . +
+2 120090482,120091554,120093248,120100243,120100698,120101588 120090516,120091661,120093276,120100286,120100814,120101627 ENST00000489017 . +
+18 71932492 71932796 ENST00000604699 . +
+8 15688913 15689562 ENST00000506768 . -
+X 49177988,49178271,49179250,49179692,49183733,49184624,49184825 49178179,49178475,49179388,49179806,49183917,49184710,49184898 ENST00000432913 . -
+6 111661759,111674498,111694374,111694627,111696276 111661947,111674630,111694528,111694704,111696524 ENST00000467921 . -
+4 77720237,77726166,77728853,77731386,77742140,77744717,77748315,77756861,77773080,77774529,77776270,77819303 77720643,77726369,77729081,77731538,77742295,77744875,77748384,77756951,77773166,77774712,77776392,77819376 ENST00000504804 . -
+1 24323039,24334644,24336481,24337077,24337635,24337991,24339667,24342114,24342693,24342891,24344896,24346552,24347467,24350057,24354373 24323117,24334706,24336827,24337151,24337789,24338103,24339762,24342273,24342772,24343025,24344931,24346641,24347553,24350122,24354488 ENST00000528064 . +
+11 10591575,10593491 10591612,10594023 ENST00000529471 . -
+18 13059181,13067830,13068093,13068358,13068851,13069088,13069737,13071038,13072754,13073008,13087016,13087530,13089451,13092376,13095502,13096183,13099475,13100304,13103508,13104983,13116373,13117584,13124631 13059312,13067956,13068237,13068422,13068991,13069181,13069856,13071212,13072845,13073185,13087277,13087646,13089565,13092527,13095681,13096307,13099581,13100512,13103588,13105079,13116503,13117643,13125034 ENST00000430049 . +
+16 19417695,19430412,19439959,19444080,19449541,19460234,19463279,19463775,19466081,19469680,19472087,19474124,19477439,19479430,19481369,19486944,19487192,19490394,19492149,19494261,19497120,19497919 19418092,19430640,19440826,19444250,19449631,19460334,19463367,19464024,19466233,19469825,19472243,19474276,19477518,19479528,19481465,19487020,19487326,19490568,19492228,19494366,19497163,19499113 ENST00000542583 . +
+14 76151934,76154353,76166662,76171842,76173545,76176622,76177891,76180763,76195877,76201690 76151991,76155025,76166727,76172019,76173625,76176690,76177931,76180849,76195972,76202788 ENST00000312858 . +
+14 104773790,104774937,104775075,104775651,104776658 104773980,104775003,104775207,104775799,104776694 ENST00000554826 . -
+5 72816671,72848384,72851243,72861807,72865595,72872638,72875614,72877227,72882466,72883063,72887069,72888077,72889785,72891809,72893138,72893376,72893615,72896457,72897055,72900005,72900973,72903708,72905302 72816752,72848498,72851319,72861914,72865729,72872720,72875737,72877346,72882527,72883232,72887222,72888303,72889957,72891896,72893246,72893535,72893703,72896556,72897151,72900081,72901073,72903783,72905429 ENST00000523768 . +
+3 149812707,149846010,149852515,149872028,149895472,149902071,149911977,149921133,149960055,149960739 149813353,149846140,149852596,149872154,149895560,149902162,149912083,149921227,149960136,149962139 ENST00000392894 . +
+2 89078009,89078738 89078310,89078784 ENST00000517571 . -
+19 37594829,37598375,37599625,37611512 37595095,37598456,37599752,37613387 ENST00000589117 . +
+5 74693474,74696692,74696995,74705218 74693704,74696739,74697106,74705257 ENST00000510820 . +
+3 14402575,14416411,14443627,14445716,14447581,14457949,14466515,14467852,14468087,14472204,14477204,14478465,14479084,14481670,14484866 14402847,14416453,14443863,14445851,14447816,14458082,14466650,14467956,14468212,14472317,14477342,14478568,14479185,14481841,14489349 ENST00000613060 . +
+7 141649130,141651524,141652786 141649333,141651609,141653065 ENST00000494053 . +
+2 74264147,74265103,74285772,74290243 74264299,74265264,74285902,74290705 ENST00000432728 . -
+9 35812973,35813444,35813643,35814898 35813338,35813549,35813784,35815021 ENST00000461169 . -
+2 169529754,169530586,169531351,169540050 169529926,169530701,169531490,169540117 ENST00000490590 . -
+16 86490267,86508654 86491004,86508877 ENST00000593604 . -
+3 157149286,157149484,157149835,157150064,157150281,157152176,157153035,157156935,157158865,157159404,157159791 157149385,157149596,157149977,157150169,157150381,157152241,157153156,157157083,157158975,157159479,157160178 ENST00000477127 . -
+12 51382334,51383460,51391600 51382509,51383582,51391675 ENST00000603482 . -
+19 11420603,11421127,11421676,11422470,11422700,11423876,11426143,11426445,11426682,11426872,11430698,11430898,11434772 11420947,11421212,11421832,11422627,11422861,11424029,11426266,11426571,11426784,11427040,11430776,11431020,11435104 ENST00000356392 . -
+2 55313828,55315927 55314123,55316051 ENST00000476903 . -
+2 86604599,86612158,86620329,86622660,86623815 86605418,86612274,86620469,86622827,86623866 ENST00000477307 . -
+19 35059058,35086955,35099204,35106228 35059778,35087100,35099387,35106304 ENST00000392227 . -
+17 4945666,4946627 4945997,4947469 ENST00000574872 . -
+10 126413868,126416799,126417715,126421786 126414738,126416911,126417808,126421879 ENST00000456514 . -
+22 30522798,30525349,30525610,30525825,30529086,30529288,30531902,30532524,30532796,30533995,30538826,30546628 30525109,30525519,30525750,30525932,30529170,30529349,30531998,30532713,30532856,30534039,30538902,30546682 ENST00000402034 . -
+5 154049601,154051736,154052919 154050017,154051943,154053002 ENST00000519928 . +
+5 93741639,93743002 93741689,93743500 ENST00000606528 . +
+22 45718422,45729424,45738730,45740368 45718493,45729590,45738839,45740800 ENST00000483549 . +
+8 114282135,114284218,114287721 114282294,114284514,114287996 ENST00000519248 . +
+16 2091822,2092046,2092479,2093543,2093810 2091906,2092188,2092592,2093738,2093884 ENST00000562425 . -
+1 154992588,154992901 154992786,154993111 ENST00000481758 . +
+6 32934628,32940752 32938965,32940984 ENST00000498020 . -
+2 109794684,109801782,109803431 109795076,109801850,109803539 ENST00000432606 . +
+1 966501,966703,970276,970520,970685,971076,971323,972074,972287,972860,973185,973499,973832,974315,974441 966614,966803,970423,970601,971006,971208,971404,972150,972424,973010,973326,973640,974051,974364,975008 ENST00000379409 . +
+6 61630232,61652254,61659116,61661281,61678886,61680886 61630501,61652317,61659150,61661364,61678947,61681049 ENST00000511849 . -
+13 30713477,30735550,30744059,30752051,30755943,30763943 30713841,30735675,30744159,30752122,30756025,30764425 ENST00000617770 . +
+1 161206408,161209192,161209482 161206597,161209313,161209727 ENST00000473321 . +
+18 28146232 28146703 ENST00000621223 . -
+16 2148623,2149940,2151568,2151694,2151855,2152819,2152988,2153318 2148978,2150051,2151610,2151761,2151908,2152885,2153222,2154110 ENST00000562735 . +
+5 175492206,175509060,175510108,175511450,175512110,175513462,175516613 175492267,175509202,175510207,175511526,175512196,175513590,175516662 ENST00000502865 . +
+22 29231017,29231457,29231592,29232255,29233378,29233613,29234136,29234304,29243444 29231140,29231508,29231682,29232402,29233468,29233666,29234199,29234349,29243489 ENST00000433143 . +
+21 27638692,27648664,27653355 27638923,27648765,27653491 ENST00000426418 . +
+17 6641026,6641752,6642248 6641227,6641834,6642357 ENST00000571957 . +
+6 41683978,41686089,41687093,41687752,41687907,41689730,41690662,41691000,41734312 41685078,41686237,41687169,41687809,41688028,41689811,41690917,41691235,41734401 ENST00000403298 . -
+9 76611376,76637534,76638185,76644738,76652482,76655422,76692073 76613373,76637549,76638288,76644912,76652683,76655502,76692200 ENST00000223609 . -
+17 1843919,1844577,1853100,1854138 1843998,1844686,1853189,1854274 ENST00000571725 . +
+11 64224800,64226049 64224970,64226234 ENST00000540472 . -
+17 41619445,41620535,41620658,41620965,41621592,41622949,41624077 41619688,41620558,41620879,41621091,41622511,41623032,41624296 ENST00000493253 . -
+20 45416109,45419294,45419503,45420135,45420331,45420527,45421382,45424215,45424495,45425573 45416343,45419395,45419590,45420223,45420429,45420693,45421583,45424381,45424579,45426042 ENST00000279035 . +
+17 41966740,41968067,41971891,41973474 41966887,41968740,41972031,41977731 ENST00000393892 . +
+9 129612268,129613106,129613434,129615483,129620602 129612454,129613251,129613597,129615644,129620743 ENST00000619117 . -
+10 132351606,132355789,132361456,132365423,132366845 132351753,132355886,132361575,132365550,132367001 ENST00000472556 . +
+12 52782649 52782839 ENST00000547968 . +
+14 23953788,23955034,23965761,23965931,23967210,23968756 23953916,23955212,23965832,23965983,23967266,23969274 ENST00000559632 . +
+14 24146809,24147522 24147221,24147570 ENST00000561103 . -
+16 69799045,69840125,69842023,69871803 69799295,69840263,69842120,69871865 ENST00000567986 . +
+9 119511668 119511760 ENST00000616497 . +
+14 102928829,102930164,102930405,102930575 102928955,102930327,102930493,102930591 ENST00000559789 . +
+15 49155770,49158844,49201161,49217189,49235850,49239220 49156016,49158968,49201250,49217313,49235941,49239330 ENST00000560654 . +
+1 19608113 19608568 ENST00000457263 . +
+8 26547668 26548463 ENST00000524123 . +
+10 37600795 37601368 ENST00000448191 . -
+17 44769985,44771583,44771755,44772266,44772398,44772856,44773013,44773260,44774294,44774491,44774697,44775211,44775393,44775583,44776126,44776744,44776898,44777165,44777471,44777694,44777951,44778151,44779221,44779738 44770048,44771669,44771831,44772333,44772466,44772931,44773085,44773427,44774379,44774582,44774749,44775311,44775465,44775676,44776207,44776795,44776962,44777265,44777601,44777863,44778066,44778242,44779239,44780610 ENST00000587773 . +
+1 58084418 58084559 ENST00000441183 . +
+14 52646191,52646374 52646287,52647124 ENST00000555069 . -
+15 71167024,71185404,71188815 71167189,71185506,71189016 ENST00000566268 . -
+2 26848423,26898495,26924886,26927252 26848497,26898760,26925045,26927401 ENST00000431402 . +
+17 49844056,49847912 49844157,49848017 ENST00000608380 . -
+14 105858334 105858412 ENST00000581354 . -
+11 4832132 4833072 ENST00000421277 . +
+6 29657209,29659318,29666151,29667642,29667903,29670173,29670700,29671171 29657297,29659666,29666265,29667663,29667924,29670221,29670721,29671185 ENST00000396704 . +
+6 27126078,27132524 27126462,27132548 ENST00000606923 . -
+3 9649504,9653620,9662266,9669431,9671047,9672684,9677316,9677983,9683177 9649742,9653769,9662375,9669492,9671170,9672758,9677387,9678058,9683728 ENST00000430020 . +
+10 131092390,131098305,131104264,131116798,131134378,131146505,131163121,131166796,131260258,131308210,131309152,131311293 131093318,131098424,131104354,131116934,131134448,131146660,131163210,131166885,131260444,131308391,131309299,131311721 ENST00000368642 . -
+X 103585562,103586218,103586653 103585624,103586291,103587526 ENST00000494801 . +
+12 104286994,104288930,104289465,104311289,104313244,104315776 104287109,104289040,104289542,104311412,104313317,104315832 ENST00000531689 . +
+9 20726285,20740235,20758089,20764868,20770031 20726348,20740340,20758191,20765073,20770112 ENST00000605031 . +
+1 26182054,26182360,26182479,26183203,26183345,26183728,26184070,26184213,26184400 26182082,26182402,26182584,26183256,26183414,26183830,26184141,26184287,26184409 ENST00000528001 . +
+7 75953988,75972412,75979450,75980338,75981516,75982223,75983519,75983737,75984776,75985057,75985578,75985922,75986158,75986336 75954180,75972461,75979579,75980488,75981606,75982322,75983636,75983856,75984958,75985207,75985849,75986068,75986241,75986854 ENST00000454934 . +
+20 19693266,19694684,19695435,19696543 19693462,19694924,19695516,19696727 ENST00000598694 . -
+2 218893220,218899190 218893267,218899581 ENST00000489887 . +
+3 9362841,9365015,9366907,9371059,9374515,9377818,9380502,9383198,9384211,9384523 9363127,9365320,9366985,9371536,9374646,9377888,9380618,9383309,9384335,9385702 ENST00000452837 . +
+X 71368665 71368968 ENST00000611704 . +
+13 27621885,27648378,27650041 27622009,27648453,27651549 ENST00000489647 . +
+3 112990446 112991153 ENST00000609673 . -
+12 111513473,111518248,111519831 111513539,111518427,111519967 ENST00000481331 . -
+1 111449483,111456085,111456629,111459456,111460916 111449581,111456249,111456755,111459636,111461026 ENST00000483994 . +
+8 103213410,103219464,103227988 103213594,103219554,103228166 ENST00000521926 . +
+7 100336078,100338178,100338772,100345864,100349730,100351251 100336220,100338264,100338889,100345968,100349887,100351900 ENST00000473757 . +
+11 66312852,66318833 66312992,66319237 ENST00000534065 . +
+7 44566001,44566447,44568117,44568902,44569129,44569808,44570014,44571491,44572346,44572573,44572889,44573582,44573835 44566079,44566524,44568223,44568992,44569203,44569903,44570128,44571736,44572437,44572744,44573050,44573744,44573925 ENST00000431640 . -
+16 11756321,11758430,11761405 11756370,11758539,11761662 ENST00000570862 . -
+5 35675601,35691036,35692569,35694287,35695734,35697689,35700495,35704553,35705650 35675992,35691256,35692724,35694348,35695796,35697793,35700752,35704662,35705791 ENST00000504054 . +
+6 32038582,32039109,32039355,32039545,32039748 32038811,32039248,32039457,32039647,32039756 ENST00000464325 . +
+16 4797741,4797937,4798070,4798568,4799685,4800497,4801502,4801913 4797840,4797987,4798184,4798667,4799781,4800578,4801585,4802184 ENST00000586336 . -
+22 38111871,38112503 38112305,38112855 ENST00000463287 . -
+X 78945420,78947814,78952192,78960507 78945495,78947863,78952335,78961954 ENST00000171757 . +
+8 143817978,143818192,143818372,143818958 143818075,143818285,143818534,143819037 ENST00000528999 . -
+19 42300088,42300170,42301949,42302231,42302436 42300092,42300287,42302039,42302325,42302777 ENST00000601865 . -
+9 136483494 136486066 ENST00000354376 . +
+8 97691018,97713661,97719048,97722878,97724599 97691188,97713769,97719189,97723035,97724652 ENST00000519293 . +
+1 152663395,152664084 152663429,152664659 ENST00000368784 . +
+11 213035,214229 213418,214516 ENST00000526557 . +
+1 43023548 43023637 ENST00000516994 . -
+2 25328632 25328744 ENST00000408518 . -
+19 45128568 45129030 ENST00000589460 . +
+3 49416778,49417817,49418970,49419259,49419709,49420210,49421491,49422103,49422360 49417718,49417973,49419151,49419405,49419788,49420342,49421572,49422271,49422753 ENST00000273588 . -
+12 133037393,133041277,133047954,133048752 133037554,133041482,133048081,133048818 ENST00000438628 . +
+11 61329956,61331542,61332907,61333603 61330074,61331691,61333008,61333775 ENST00000543627 . -
+17 43528498,43529134,43529503,43529883,43530106 43528743,43529236,43529676,43529952,43530434 ENST00000586826 . -
+2 101309449 101309534 ENST00000578474 . +
+21 28872190,28876311,28878191,28879869,28882159,28882984,28885211 28872739,28876491,28878333,28879953,28882250,28883071,28885371 ENST00000460212 . -
+17 42968725,42970288,42971606,42979169,42979377 42969186,42970342,42971696,42979268,42980349 ENST00000462157 . -
+5 90410032,90410499 90410304,90410669 ENST00000546238 . +
+10 32928192,32929821,32932505,32935491,32958144 32928264,32930044,32932600,32935558,32958227 ENST00000488494 . -
+4 185678422,185678795,185684753,185740000 185678547,185678822,185684842,185740330 ENST00000452351 . -
+1 231925833,231940927,231943748,231944922 231925950,231941050,231943959,231945233 ENST00000456782 . +
+17 551633,553379,560425,562502 551950,553462,560573,562686 ENST00000572607 . -
+6 2245776,2248836,2263600,2269697,2271815,2283498 2245930,2248926,2263684,2269799,2273417,2283774 ENST00000456943 . +
+15 56634037 56634167 ENST00000614892 . -
+Y 6837706 6838252 ENST00000433995 . +
+2 73271196,73272186,73273014,73284159 73271510,73272302,73273129,73284431 ENST00000520186 . -
+9 112380079,112403994,112405623,112418859,112438430,112441698,112454012,112459448,112470431 112380180,112404158,112405722,112418958,112438625,112441770,112454162,112459577,112472405 ENST00000398803 . +
+16 35640028 35640582 ENST00000566449 . +
+2 108493300,108499552,108507559 108497109,108499754,108509415 ENST00000480863 . +
diff --git a/deepTools/source/deeptools/test/test_data/computeMatrixOperations.mat.gz b/deepTools/source/deeptools/test/test_data/computeMatrixOperations.mat.gz
new file mode 100644
index 0000000000000000000000000000000000000000..7e273da291290945574196564c2af7bb428728c9
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/computeMatrixOperations.mat.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e80548b8e0d6f291f435836f62c5a841f473de2012e2f5ccf43c3dd537d1f8ca
+size 62917
diff --git a/deepTools/source/deeptools/test/test_data/make_test_data.sh b/deepTools/source/deeptools/test/test_data/make_test_data.sh
new file mode 100644
index 0000000000000000000000000000000000000000..77117fd4b786a1bb933c1ecb2cde30a9ff134e06
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/make_test_data.sh
@@ -0,0 +1,2 @@
+bamCoverage -b testA.bam -o testA_skipNAs.bw --skipNAs
+bamCoverage -b testB.bam -o testB_skipNAs.bw --skipNAs
diff --git a/deepTools/source/deeptools/test/test_data/othergenes.txt.gz b/deepTools/source/deeptools/test/test_data/othergenes.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d606b866c885b1a566f2269927069d030a0e615f
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/othergenes.txt.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:005a7c7126d8719b623485aa8335eaa03c7f5e571341d89cfe5a4c6d5b51a9b2
+size 329
diff --git a/deepTools/source/deeptools/test/test_data/somegenes.txt.gz b/deepTools/source/deeptools/test/test_data/somegenes.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..19e48af1970bd4cfede2b04fcdf22dd8bb32189e
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/somegenes.txt.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29b27a01e50ad69865de2451f22b58f97fdcd92985083dce8a68c11b546a9511
+size 320
diff --git a/deepTools/source/deeptools/test/test_data/test.bed3 b/deepTools/source/deeptools/test/test_data/test.bed3
new file mode 100644
index 0000000000000000000000000000000000000000..3a436173d285c6fcd01a85f512fbbec3bc638008
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test.bed3
@@ -0,0 +1,4 @@
+chr1 1 10
+chr2 1 10
+chr2 5 15
+chr2 20 30
\ No newline at end of file
diff --git a/deepTools/source/deeptools/test/test_data/test.gtf b/deepTools/source/deeptools/test/test_data/test.gtf
new file mode 100644
index 0000000000000000000000000000000000000000..18587cf68f36177262e4d20440f4ad744f8460fa
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test.gtf
@@ -0,0 +1,9 @@
+3R deepTools gene 1 1000 . . 0 gene_id "example";
+3R deepTools transcript 1 1000 . + 0 gene_id example; transcript_id "first";
+3R deepTools exon 1 50 . + 0 gene_id example; transcript_id "first";
+3R deepTools exon 400 510 . + 0 gene_id example; transcript_id "first";
+3R deepTools exon 980 1000 . + 0 gene_id example; transcript_id "first";
+3R deepTools transcript 100 1100 . - 0 gene_id example; transcript_id "second";
+3R deepTools exon 100 150 . - 0 gene_id example; transcript_id "second";
+3R deepTools exon 500 610 . - 0 gene_id example; transcript_id "second";
+3R deepTools exon 1080 1100 . - 0 gene_id example; transcript_id "second";
diff --git a/deepTools/source/deeptools/test/test_data/test1.bam b/deepTools/source/deeptools/test/test_data/test1.bam
new file mode 100644
index 0000000000000000000000000000000000000000..ba9f13eb1ee708445621faf1903fb3b384811811
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test1.bam differ
diff --git a/deepTools/source/deeptools/test/test_data/test1.bam.bai b/deepTools/source/deeptools/test/test_data/test1.bam.bai
new file mode 100644
index 0000000000000000000000000000000000000000..a846febd97e1248cd4d5498cd754fa77a1eeacd1
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test1.bam.bai differ
diff --git a/deepTools/source/deeptools/test/test_data/test1.bg b/deepTools/source/deeptools/test/test_data/test1.bg
new file mode 100644
index 0000000000000000000000000000000000000000..a4d04caaf817032281f113cc604d4f9d9d67d018
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test1.bg
@@ -0,0 +1,41 @@
+3R 0 25 0.76
+3R 25 50 1.00
+3R 50 75 0.32
+3R 75 100 1.92
+3R 100 125 3.28
+3R 125 150 2.92
+3R 150 175 7.88
+3R 175 200 18.96
+3R 200 225 15.84
+3R 225 250 8.40
+3R 250 275 6.84
+3R 275 300 4.80
+3R 300 325 5.68
+3R 325 350 3.96
+3R 350 375 3.16
+3R 375 400 4.56
+3R 400 425 8.64
+3R 425 450 10.92
+3R 450 475 9.48
+3R 475 500 7.48
+3R 500 525 4.12
+3R 525 550 4.68
+3R 550 575 5.32
+3R 575 600 7.96
+3R 600 625 12.76
+3R 625 650 8.76
+3R 650 675 5.48
+3R 675 700 6.56
+3R 700 725 8.12
+3R 725 750 12.84
+3R 750 775 10.56
+3R 775 800 6.44
+3R 800 825 4.08
+3R 825 850 1.76
+3R 850 875 7.80
+3R 875 900 22.24
+3R 900 925 21.96
+3R 925 950 9.32
+3R 950 975 4.20
+3R 975 1000 1.20
+3R 1000 1025 0.80
diff --git a/deepTools/source/deeptools/test/test_data/test1.bw.bw b/deepTools/source/deeptools/test/test_data/test1.bw.bw
new file mode 100644
index 0000000000000000000000000000000000000000..baccc5e89f08d4f6ccc5ab0fb61bf58adc87723f
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test1.bw.bw differ
diff --git a/deepTools/source/deeptools/test/test_data/test1.cram b/deepTools/source/deeptools/test/test_data/test1.cram
new file mode 100644
index 0000000000000000000000000000000000000000..6e6f5db66c12a2c46ae5b361a9126e5bc123c4c6
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test1.cram differ
diff --git a/deepTools/source/deeptools/test/test_data/test1.cram.crai b/deepTools/source/deeptools/test/test_data/test1.cram.crai
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d2733bafdf841aee3e60e750159594a659cdc
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test1.cram.crai differ
diff --git a/deepTools/source/deeptools/test/test_data/test1.fa b/deepTools/source/deeptools/test/test_data/test1.fa
new file mode 100644
index 0000000000000000000000000000000000000000..7e18693bfc6158daf7e4bbde48b6b8459320dc55
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test1.fa
@@ -0,0 +1,26 @@
+>3R
+GAATTCTCTCTTGTTGTAGTCTCTTGACAAAATGCAATGGTCAGGTAGCGTTGTTCTAAA
+CTCAAGATTTAAAGGTGAATAGTCCTGTAAGCCCTATAAACATATGTACATAGGTAGGCC
+AGTACTTAGTACTGGCACATGCCGCTGATCTGTTAGTAGATTATCCATTTCCCTTCAGCG
+CCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAGTACCTGCCTCGAGTC
+GATCGGGCAGAGAGCGAGAAATGGTAAGCAGGTGAGTGAGCGCAGAGAGCGTCTTTCGAC
+GACTCTTTCGTCGCGAGCAAACAACAAGTAGACGTCGCTCAGACACTGTCGGCCAGATTC
+ATTTTCCAGAAAGACGTCGTCGCGTTGACAAGCTTAAATTCGTAGCGGGCGCCAGTAGGA
+CGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCAGCCCAGTTACATTGCTC
+GGGAGGGGTAAAGAGCTTGACGACAGCGCGTGCGTAGAGTGAAAGTATGCAAGGAGATTC
+GCGATCAGAACCTCACGACGCCATATTTGTTTTCCAGGGCTTGCTTGTGTGTGCGTGTGT
+TTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGATAATGCCTTAACTGT
+CCCTTGTATTCGGGCTTCGTCTTCGCAAATTCGAACAACAGTATTCTTGATTGATGCAGT
+TTTACAGCGACTTTGTGTGTGCGTATGCTGTCACCACACTATGTTCGAGTGTGTGTGCGC
+TCGTGTTTCTGTGAACCCAATCGCGAACACTGTTGTGAGCCAGTGGCTCTGTCTGCGCGG
+CGAGAAATATCCGCTTACCTAAACGAAAAGTTCTCTAGCGTCGGCCGACGCACGGCACCC
+AGGCACACACACAGCCACATTTGCAGAAATACCACTACATACGAAACGAACGTGGCCAGC
+ACACAAGCGAAACCGGAAAATCCACATTTTTTAGACCTGCTCTCTGTCCCGCGTCTCTCA
+TCTCTTTTCATTACGCTCTCGACTGGAACGCAATACCAAGACCACAATCAACAGCTACAG
+CTTTAGGTGTTTGAAGGTAGATTACGTAGTTGCAATGGGCGACTCCACGCCCATTTGCCG
+ATGCCGAGTACTTTATCTGGGCAGTGCCGTGCCCCGACAGAGCAAGGATGGGCTGCAGGG
+CATCCAGGAGCCGCTGCGAAGCCTGTATCCTTCGGAAGGGGCGGTGGGCGCCAAGGGCAT
+CGACAGCTGGCTAAGCGTCTGGTCCAACGGCATCCTGCTGGAGAACGTGGACGAAAACCT
+CAAACAGATCACGCGCTTTTTCCCGATCGAGTCGCTACACTACTGCGCCGCCGTTCGCCA
+AGTGCTCATCCCAGAGCGCGGAAACACCCACCCGGAACCAAAGTTCCTCCCTCTGGACTC
+GCCTTTTGCGCGAATGCCGCGCGCTCAGCACCCACCCATTTTCGCTGCTATTCTGCGACG
diff --git a/deepTools/source/deeptools/test/test_data/test1.fa.fai b/deepTools/source/deeptools/test/test_data/test1.fa.fai
new file mode 100644
index 0000000000000000000000000000000000000000..77c0bbcbf014e308b9af87a2180e231f3abffb47
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test1.fa.fai
@@ -0,0 +1 @@
+3R 1500 4 60 61
diff --git a/deepTools/source/deeptools/test/test_data/test1.sam b/deepTools/source/deeptools/test/test_data/test1.sam
new file mode 100644
index 0000000000000000000000000000000000000000..517f5a3a53d1562cea3d22b1478ccbadc819274a
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test1.sam
@@ -0,0 +1,146 @@
+@HD VN:1.0 SO:unsorted
+@SQ SN:3R LN:1500 M5:9bad144b81e5e7369a164895691b3e73 UR:./test1.fa
+DD61XKN1:101:D0EKPACXX:2:2103:15826:166544 0 3R 7 5 51M * 0 0 TCTCTTGTTGTAGTCTCTTGACAAAATGCAATGGTCAGGTAGCGTTGTTAG @CCFFFFFHGHHHIJIIIJIJJJHJJJJJGIJJJHJJJJF*?HHHIJHEHH XA:i:0 MD:Z:49C0T0 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:1107:3353:155198 16 3R 75 9 51M * 0 0 GTGAATAGTCCTGTAAGCCCTATAAACATATGTACATAGGTAGGCCAGTAC HEGIGEGD@GHHHGHFDEIHEFGIIHGF?CEF?BGGBCHHFFFDDDDA@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2205:8714:154131 0 3R 86 5 51M * 0 0 TGTAAGCCCTATAAACATATGTACATAGGTAGGCCAGTACTTAGTACTGGC CCCFFFFFHHHHHIJJFIEHIHHHHIIHJIJJJJJJIIIIJJIIGHJJFHE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2306:16413:174917 0 3R 93 32 51M * 0 0 CCTATAAACATATGTACATAGGTAGGCCAGTACTTAGTACTGGCACATGCC @@@DDEEBHBBFHGECFGG>FCHDEGGHA?FGD XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1305:17377:158887 0 3R 119 19 51M * 0 0 CCAGTACTTAGTACTGGCACATGCCGCTGATCTGTTAGTAGATTATCCATT @@@FDFFFHHGDHHIJJJJJGEGHJEGIJJJJIIHHIJHIGHIIIIJHGHI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1308:19209:144101 16 3R 134 26 51M * 0 0 CCCACAAGCCGCCGATCTGTTAATAGAATATCCATTTCCCTTCAGCGCCTA ###############################@HFC;=IFFA6FDADAD@@@ XA:i:1 MD:Z:0G0G4T5T9G4T23 NM:i:6
+DD61XKN1:101:D0EKPACXX:2:1108:9446:127495 0 3R 149 10 51M * 0 0 TCTGTTAGTAGATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATG @@@FDFFFF:CBFDHEFGEHJJIJGIIJIIDIEGBIGIIJJJBHGIIII@G XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2305:7088:93345 0 3R 152 40 51M * 0 0 GTTAGTAGTTTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGATG @@;BDABDDFDDDII9EGGHICHE+AAHDFADFIDEE@>@FAFHIIIIEID XA:i:1 MD:Z:8A42 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1304:4948:39372 0 3R 156 25 51M * 0 0 GTAGATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGATGAAGA ?8?ADDDA4ADDDIAEEED>EE:C@DFAB XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2307:18363:54940 16 3R 160 39 51M * 0 0 ATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAG #####@@@==8'FB8DF8???F??1F@C1A+9FEFFEGD?CG@;AFB(BFEHBIGBDIGGC?A::EFDEC:IIIGEFHFD6FD?DD@<@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1305:12595:193763 0 3R 168 38 51M * 0 0 TTTCCCTTCAGCCCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATC @@@FFADFFFFH:EFHEIIIIIIIG@HIIGIBBDGGIFB@@HGFGIIGIII XA:i:1 MD:Z:12G38 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1106:7245:9884 0 3R 169 6 51M * 0 0 TTCCCTTCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCC CCCFFFFFHHGGHJIIHIGHIGGGGHIHGEGGHIIIEFGG@GCHHIJGIGE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1102:16207:31886 0 3R 171 12 51M * 0 0 CCCTTCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTA @@>B>:FABFIIEHEBFFDHBBFHGAG0B@FHHI@DA XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1206:11566:95721 16 3R 172 19 51M * 0 0 CCTTCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTAC HECFEB@@CG?00EFGIGGAF@IIIH@GHCD@HG>IEHFDDHFFEDFF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2305:9277:80019 0 3R 173 37 51M * 0 0 CTTCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACT @@@DDDDDHHH:FGGGGIIIIIIG;DFDGIGIIBDHHI8;BCFGGIHIFGE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2304:5021:30275 16 3R 175 23 51M * 0 0 TCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAG GGAFFBFD@?GHDIGHGCHHEGHGEHFHGIGGEHFHIIGGIHIGIJG XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1103:5323:97525 0 3R 182 1 51M * 0 0 CTACCTGCGTCACCAATGATGAGGTCGGGACAGAATCCTACTAGTACCTGC ;==DBD?AFFA:DEF:CF:AECG2 XA:i:1 MD:Z:27A23 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1202:2896:73043 0 3R 184 17 51M * 0 0 ACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAGTACCTGCCT CCCFFFFFHHHHHJJIGHJJJJIJJGJGIJJJIJIIJIJIIJJJDHIJJJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1208:12933:183980 0 3R 187 23 51M * 0 0 TGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAGTACCTGCCTCGA @@@?DD:DFABBF?DAFHGGE...().8@3;8BE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2204:17651:41380 16 3R 236 39 51M * 0 0 CTGTCGATCGGGCAGAGAGCGAGAAATGGTAAGCAGGTGAGTGAGCGCAGA A;EGEIIIFIIIIGHFDIIIIIIIIHEIIIGGHBIIIIHDHHHFFFFFCCC XA:i:0 MD:Z:0G0A49 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:2206:10244:13816 16 3R 236 24 51M * 0 0 CTGTCGATCGGGCAGAGAGCGAGAAATGGTAAGCAGGTGAGTGAGCGCAGA IJJIIJIGJJJJJJJJJJJIJJIJJJJJIGIJIIJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:0G0A49 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:2106:8135:102852 16 3R 270 9 51M * 0 0 AGGTGAGTGAGCGCAGAGAGCGTCTTTCGACGACTCTTTCGTCGCGAGCAA DDDDDCEDDAFFHHE?6'EBADGJIIIIIGGHFIIJJJHHHHHFFFFFB;@6?@### XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2207:4417:135858 0 3R 285 36 51M * 0 0 GAGAGCGTCTTTCGACGACTCTTTCGTCGCGAGCAAACAACAAGTAGACGT 8:=;AAD@ADFDDD:ACE@?0CGEDBBB6677@;BEC=CC?B?D;BCC>?# XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1107:10958:114349 16 3R 289 36 51M * 0 0 CGATCTTTCGACGACTCTTTCGTCGCGAGCAAACAACAAGTAGACGTCGCT ###########################BBBFCCFEA?:C4FBC8?:DD??; XA:i:0 MD:Z:0G0C0G48 NM:i:3
+DD61XKN1:101:D0EKPACXX:2:2203:1197:62431 16 3R 293 17 51M * 0 0 CTTTCGACGACTCTTTCGTCGCGAGCAAACAACAAGTAGACGTCGCTCAGA #BB;=?=8@?DBFFFE;@@GGBEC<08D@3D;D?=: XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2205:17888:164787 0 3R 304 12 51M * 0 0 TCTTTCGTCGCGAGCAAACAACAAGTAGACGTCGCTCAGACACTGTCGGCC CCCFFFFFHHHHGJJJJJJJJJJIJGIGGHJIJJJIJJJ9DCHHEHHFBDD XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1305:6255:131813 16 3R 341 9 51M * 0 0 AGACACTGTCGGCCAGATTCATTTTCCAGAAAGACGTCGTCGCGTTGACAA #######EB@=);@9EFGGB4@F??F@FBHHFB?F67';8'4));@9>9>BB?## XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2101:3335:34398 0 3R 384 15 51M * 0 0 GTTGACAAGCTTAAATTCGTAGCGGGCGCCAGTAGGACGACCCAGTGGATA @@CFBADDFFHHGGDHHEHFGIIIIIGGIIGIHIIIIIIHHFFDDEEECC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2302:4837:149731 16 3R 390 22 51M * 0 0 AAGCTTAAATTCGTAGCGGGCGCCAGTAGGACGACCCAGTGGATATCGTCA FHGGIJJJJJIJIGJIJJJJJJJIJJJJJJJJHGJJJIHGHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1206:10242:101284 0 3R 393 19 51M * 0 0 CTTAAATTCGTAGCGGGCGCCAGAAGGACGACCCAGTGGATATCGTCAGTT ?@@DDDBD?;AFB7@FAGF>ABF(/?(=B<@'@/(6.;3;7;@;A;?;?;> XA:i:1 MD:Z:23T27 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1107:9066:17687 16 3R 396 28 51M * 0 0 GAACTCGTAGCGGGCGCCAGTAGGACGACCCAGTGGATATCGTCAGTTGAA FFHGHIJIJIIJJIJJIIGHGJIGIJIIIJJJJJJIIGHFHHFFFFFFCC@ XA:i:0 MD:Z:0A2T47 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:1302:10299:198112 0 3R 402 23 51M * 0 0 GTAGCGGGCGCCAGTAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGG @@CEACA?DEE@CCCCCBBB XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2108:17719:86681 16 3R 402 29 51M * 0 0 GTAGCGGGCGCCAGTAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGG BBA>C,A33?EBDCD;;0B<;??)B3?BDB?=FFE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1108:9454:176988 0 3R 436 3 51M * 0 0 CGTCAGTTGAACCAGGGGAAACGTAGCAGCCCAGTTACATTGCTCGGGAGG =18AD:B;,A?@EHEGGI<<4?*99?C@EEHIIFGCFCHGHIIIEEGHD8@FGE';CAAACHHC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2206:21259:60113 16 3R 455 18 51M * 0 0 AACGTATCAGCCCAGTTACATTGCTCGGGAGGGGTAAAGAGCTTGACGACA #########F?9**DB0?GFHGF::FCC@:+F@CGIIHDCFFDDADDD@@@ XA:i:0 MD:Z:6G44 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:2202:20063:53581 0 3R 462 29 51M * 0 0 CAGCCCAGTTACATTGCTCGGGAGGGGTAAAGAGCTTGACGACAGCGCGTG C@@FFFDFHHHHDIIGGGGHII@FHIJ0BFDHGGIIGFCGA;FH@D=EB## XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2105:20550:77807 0 3R 467 39 51M * 0 0 CAGTTACATTGCTCGGGAGGGGGAAAGAGCTTGACGACAGCGCGGGCGTAG ?1?D=DDB,2=C4?EE?8FEEI)?@8B800?@CBC8@############## XA:i:1 MD:Z:22T21T6 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:1307:13537:69880 16 3R 472 9 51M * 0 0 ACATTGCTCGGGAGGGGTAAAGAGCTTGACGACAGCGCGTGCGTAGAGTGA 3FECHHEHIIJIHGIIH@GJHDGHEGIHGEGGEGGBEIHHHDFDDDDD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2204:10042:31362 16 3R 488 10 51M * 0 0 GTAAAGAGCTTGACGATAGCGCGTGCGTAGAGTGAAAGTATGCAAGGAGAT GECIHGC;;;BHCB?0)D?8CC<+@FBHGCEHGIIHCA?FDDDFFDDD@@@ XA:i:0 MD:Z:16C34 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1206:8347:137786 16 3R 506 38 51M * 0 0 GCGCGGGCGTAGAGTGAAAGTATGCAAGGAGATTCGCGATCAGAACCTCAC A@A;''6?DB4ED@CDDDDA48:1 XA:i:0 MD:Z:5T45 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1103:10361:180321 16 3R 524 12 51M * 0 0 GATCTTCCGATCTATTCGCGATCAGAACCTCACGACGCCATATTTGTTTTC ####################CDB?*1;;83:)@)@6@DD??48?DA7D??? XA:i:0 MD:Z:0A0G1A1G1A0A0G0G0A0G38 NM:i:10
+DD61XKN1:101:D0EKPACXX:2:2103:18603:41773 16 3R 527 40 51M * 0 0 ATGCAAGGAGATTCGCGATCAGAACCTCACGACGCCATATTTGTTTTCCAG FHEGIJJIIJJGGBBGIJJJJIIHIIGHGGHFIGJJIIHGHHHFFDFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1301:2009:157371 16 3R 537 18 51M * 0 0 ATTCGCGATCAGAACCTCACGACGCCATATTTGTTTTCCAGGGCTTGCTTG BHJJIIHIJJJIIHJIGGGIIIJIIJIGIIJJJJJJJIGHGHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2205:6958:140155 0 3R 542 23 51M * 0 0 CGATCAGAACCTCACGACGCCATATTTGTTTTCCAGGGCTTGCTTGTGTGT @@@DDDFFFHDBFEG@FD@FGED9DHGGHHIGIGAHGE@FGGCHB@=@@D= XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2304:7387:67115 16 3R 544 26 51M * 0 0 ATCAGAACCTCACGACGCCATATTTGTTTTCCAGGGCTTGCTTGTGTGTGC A>@;=EFFB8@7:??@8BG?9?DGE:?HFF?F?@GE?@F??AD==A?B:?: XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1103:12878:37036 16 3R 574 2 51M * 0 0 TCTGGGCTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGA FFHHHEHIHDJJJIJJJJJJJJJIHCJJJIHGJJJJJJHHHHHFFFFFCCC XA:i:0 MD:Z:0C1A48 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:1307:11922:110790 16 3R 575 24 51M * 0 0 CAGGGCTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAG DDFEFECD9D;@@B8F?@:FIIFG?4D9FIFF@FEA<)FFACB?ADDD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2108:11686:2993 16 3R 576 1 51M * 0 0 AGGGCTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCTAGTGTGAGG CCA@7C=@H@FFDEIEAEG@3CGIJIH@HF6>GHGGHBDBA:HDD?=D@@@ XA:i:1 MD:Z:41G9 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1106:13858:49049 0 3R 580 23 51M * 0 0 CTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGGGTGTGAGGGCAG @;?DABD?+:;E=AE< XA:i:0 MD:Z:38A12 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1201:6831:94113 0 3R 584 3 51M * 0 0 CTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGAG @CCFDADBFDHHHGFHIIJIJJIIIGGGGEFGHG@GG9?FFDCAA1FEHHE XA:i:0 MD:Z:49C0T0 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:1307:8654:112075 0 3R 588 12 51M * 0 0 TGTGTGCGTGTGTTTAAACTCCGGTGTGCGAGTGTGAGGGCAGGGCAGATC ?71?=?4ADF1CFFI,A@BDA@E?C@)?DF<:08DF>FFII?EEB1;A### XA:i:1 MD:Z:15C30T0T0T0G1 NM:i:5
+DD61XKN1:101:D0EKPACXX:2:2202:16204:102746 0 3R 588 36 51M * 0 0 TGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCCGATC ?@?DDDFDF?@FHEDGHIGGHEG>::C)?D?6)??BDFGEBGIGGG##### XA:i:0 MD:Z:46T0T0T0G1 NM:i:4
+DD61XKN1:101:D0EKPACXX:2:1305:3741:136456 16 3R 591 3 51M * 0 0 GTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAG BFIIJJIIGJJJIIIIJJJJJJJJIIJHJJIIGBJJJIHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1103:10524:6617 16 3R 602 34 51M * 0 0 TCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGATAATGCC DCCEIGGHFFDA@6CF@C3=EEHAHBCHAGF@EHFE@G8?HFABBAB1?@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1203:5457:89791 0 3R 606 9 51M * 0 0 CTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGATAATGCCTTAA @@@DDD@DDFFA)<GHGEEA XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2105:3502:9363 0 3R 606 31 51M * 0 0 CTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGATAATGCCTTAA @@CFFFDDFHHH?GCFEGBHHIGGHJJJJJJJIJGFHBGGHJIJHIJIJHD XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2306:10026:168446 0 3R 606 2 51M * 0 0 CTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGATAATGCCTTAA @@@DDDDDD;DF?FCBEHIDDDAD@?; XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1202:5681:105453 16 3R 677 18 51M * 0 0 TCGTCTTCGCAAATTCGAACAACAGTATTCTTGATTGATGCAGTTTTACAG IJIJJJJJIJJJHIIJGIIIJIJJGDGGHHGJJIIIIGAGHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1105:3468:125138 16 3R 689 35 51M * 0 0 ATTCGAACAACAGTATTCTTGATTGATGCAGTTTTACAGCGACTTTGTGTG DCHDAGEHGHGGCCCD>G@HBEEBHFF9BGGEHFC1B@:FBDHADDDD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2106:14544:82922 16 3R 689 37 51M * 0 0 ATTCGAACAACAGTATTCTTGATTGATGCAGTTTTACAGCGACTTTGTGTG HDFDB;HEGIHGF@CEGC>C@E>FEEGFC@BGHFC@:??AAAFBDDDD?8: XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1302:1975:188249 0 3R 699 2 51M * 0 0 CAGTATTCTTGATTGATGCAGTTTTACAGCGACTTTGTGTGTGCGTATGCT @@@DDFFDHHFDHIIEEHIECGIJJJIEHCHGIIJJJCGFHDFDGGGGCG> XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2102:7580:193903 0 3R 706 24 51M * 0 0 CTTGATTGATGCAGTTTTACAGCGACTTTGTGTGTGCGTATGCTGTCACCA @CCFDFFFFHHGHIIIJIIJIIJIAFHIJEHGHGGIIGIIJFIJJIIIIJC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1204:10450:4067 0 3R 709 34 51M * 0 0 GATTGATGCAGTTTTACAGCGACTTTGTGTGTGCGTATGCTGTAACCCCAC ::?B?:=AB=CFFIE?C:31::89?::??############ XA:i:0 MD:Z:43C3A3 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:1303:17605:117243 0 3R 709 35 51M * 0 0 GATTGATGCAGTTTTACAGCGACTTTGTGTGTGCGTATGCTGTAACAACAC +8?BB::DB?DFFI>DF@HGF;=HGFFDGGHEAGG=HGEHEHEDH>CB<>@@EHFBFFFFDBF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2305:13537:158359 16 3R 729 7 51M * 0 0 GACTTTGTGTGTGCGTATGCTGTCACCATACTATGTTCGAGTGTGTGTGCG 7CFFCEDIFF:EFAB9F?8@AGCBEB?2+FBBF@EIIIFFFFADDDDD@@@ XA:i:1 MD:Z:28C22 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:2108:11187:179159 0 3R 730 14 51M * 0 0 ACTTTGTGTGTGCGTATGCTGTCACCACCCTTTGTTCGAGTGTGTGTGCGA ???DBDD22=+2,?E72+A4:+GEEHFA<(GEFDDC;DGED@EFCABIFHFAD8F>DFDD@?< XA:i:0 MD:Z:17G33 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:2302:8329:199577 16 3R 767 12 51M * 0 0 GAGTGTGTGTGCGCTCGTGTTTCTGTGAACCCAATCGCGAACACTGTTGTG EFFHHHJIIGFIJIIJJIJJIIJIHFHHHJIIHGCJIIFFHHGFDFFF@@B XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2205:6343:170954 16 3R 769 16 51M * 0 0 GTGTGTGTGCGCTCGTGTTTCTGTGAACCCAATCGCGAACACTGTTGTGAG DEHHEHGIGIIIDIFIIIGHIIHDBF@IIIIHGIGGEFBFBHBFDFFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1101:12875:176255 0 3R 775 16 51M * 0 0 GTGCGCTCGTGTTTCTGTGAACCCAATCGCGAACACTGTTGTGAGCCAGTG @@@ADDADH?EGGGG?GGGGI'=CHEC<)5=CHIIG=7 XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2206:13591:47620 0 3R 824 17 51M * 0 0 TGGCTCTGTCTGCGCGGCGAGAAATATCCGCTTACCTAAACGAAAAGTTCT CCCFFFFFHFHHHJJIGIIIDFBGHIJIJJIJIIHHHFHHFFFDDDDCC>B XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1308:18519:39835 0 3R 841 31 51M * 0 0 CGAGAAATATCCGCTTACCTAAACGAAAAGTTCTCTAGCGTCGGCCGACGC @CCFFFDFFHFFHBHIJIJCHIJJJICHGCFHECGGIEIGIFIFIIIDHFC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1207:12521:101238 0 3R 842 3 51M * 0 0 GAGAAATATCCGCTTACCTAAACGAAAAGTTCTCTATCGTCGGCCGACGAC @<@BDDDD?F==69D6D?DDFEEGGEEB3@CAEA>FF:FEBFAEHB6>CC?BBB59< XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1107:3972:59070 16 3R 868 36 51M * 0 0 GATCTCTCTAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCAC #############@4;76A@:81)::EC@3GIHA@EDC8C/=D=:/BFBHF?@;ED?1BAGHEIIHEBF??3CDCFBD>AD@@@ XA:i:0 MD:Z:0A1G0T47 NM:i:3
+DD61XKN1:101:D0EKPACXX:2:1207:13576:40002 16 3R 868 2 51M * 0 0 GATCTCTCTAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCAC ##?=3C>A?<<:;/@@D6FDDA@?8AECCBBDDFDDDDD@@@ XA:i:0 MD:Z:0A1G0T47 NM:i:3
+DD61XKN1:101:D0EKPACXX:2:2106:13119:17764 16 3R 868 32 51M * 0 0 AAGTTCTCTAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCAC CCACEDDDFEFEHGIJIIGHHHGGGHGHEAIJIGGH@DFFDBFDDEDF@?? XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2204:20637:13691 16 3R 868 18 51M * 0 0 GATCTCTCTAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCAC #A=53AA>ED=EB@EHGHDDD@??:CFF<3GIHDBFCAA;/?<8:@;>BFF?<<=;'FAGEGAD6DE?6GG@AEIIGGG@EBA?CFDDDDD@?@ XA:i:0 MD:Z:0A1G0T47 NM:i:3
+DD61XKN1:101:D0EKPACXX:2:2303:4165:141511 16 3R 868 40 51M * 0 0 GATCTCTCTAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCAC #@;;(>38B=<:BAAA XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2105:7761:130218 0 3R 876 1 51M * 0 0 TAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCACATTTGCAG @@CFFDFDDDFHFFGIJJJIHAFHIGHIGHHFHBFEDA@?B@D@CDDDDEC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2106:20083:91623 0 3R 876 3 51M * 0 0 TCGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCACATTTGCAG 11144@DDA>A@ XA:i:1 MD:Z:1A49 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:2106:11915:186462 16 3R 876 39 51M * 0 0 TAGCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCACATTTTCGG BBBCDFFIIIFEBEIFDABBC9IIFFHFF@ECDEBBFEFFC<<4)04++11 XA:i:2 MD:Z:47G1A1 NM:i:2
+DD61XKN1:101:D0EKPACXX:2:2305:7910:29158 16 3R 878 32 51M * 0 0 GCGTCGGCCGACGCACGGCACCCAGGCACACACACAGCCACATTTGCAGAA CFDBGIIGIGGHGHDGFD@BGEHCGHF;GFECGEIIGHFFFHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2106:9016:64754 0 3R 880 2 51M * 0 0 GTCGGCCGACGCACGGCACCCAGGCACACACACAGCCACATTTGCAGAAAT @@@FFF>DHDHAFGIJGGIIGIGIIIIHGHIJJGGFHFFFFFFEDEEEADC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1304:4663:45784 16 3R 884 26 51M * 0 0 GCCGACGCACGGCACCCAGGCACACACACAGCCACATTTGCAGAAATACCA BGB;HEGGFIIGEECHFCIHHAHHDFFDD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:2307:5310:188136 16 3R 885 12 51M * 0 0 CCGACGCACGGCACCCAGGCACACACACAGCCACATTCGCAGAAATACCAC #@5;@@@6@@B0:GIHGGFD?;CBBECCGCEFAEF@E8DDDDBDE;C+238BEEAAED>*0?DEEC3BDDFHGIGIH XA:i:1 MD:Z:10A40 NM:i:1
+DD61XKN1:101:D0EKPACXX:2:1103:2005:94273 16 3R 919 26 51M * 0 0 ATTTGCAGAAATACCACTACATACGAAACGAACGTGGCCAGCACACAAGCG GJJIJIIGIIHEFIIIIGHIIHIIJIIIIIIJHGHJJIHHFDFDDFFF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:2:1104:3068:110926 0 3R 924 14 51M * 0 0 CAGAAATACCACTACATACGAAACGAACGTGGCCAGCACACAAGCGAAACC @@@=D>?DFB3<;CG@IEDFBDEGGHGGHHHHHEDDFFCCC XA:i:0 MD:Z:51 NM:i:0
diff --git a/deepTools/source/deeptools/test/test_data/test2.bam b/deepTools/source/deeptools/test/test_data/test2.bam
new file mode 100644
index 0000000000000000000000000000000000000000..c53dc3b90c2d2a2bcf180414d8b4b8fc45be8546
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test2.bam differ
diff --git a/deepTools/source/deeptools/test/test_data/test2.bam.bai b/deepTools/source/deeptools/test/test_data/test2.bam.bai
new file mode 100644
index 0000000000000000000000000000000000000000..6c98b7fd1a3993b859f994df9f62dd62b6b34f15
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test2.bam.bai differ
diff --git a/deepTools/source/deeptools/test/test_data/test2.bg b/deepTools/source/deeptools/test/test_data/test2.bg
new file mode 100644
index 0000000000000000000000000000000000000000..c07f71d82b4f142d77a4bf28f709e30d4e067809
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test2.bg
@@ -0,0 +1,42 @@
+3R 0 25 1.68
+3R 25 50 2.00
+3R 50 75 0.40
+3R 75 100 2.04
+3R 100 125 5.44
+3R 125 150 8.48
+3R 150 175 13.48
+3R 175 200 21.68
+3R 200 225 22.76
+3R 225 250 15.64
+3R 250 275 7.28
+3R 275 300 6.80
+3R 300 325 11.76
+3R 325 350 9.56
+3R 350 375 5.20
+3R 375 400 7.16
+3R 400 425 13.76
+3R 425 450 17.08
+3R 450 475 12.44
+3R 475 500 11.88
+3R 500 525 10.64
+3R 525 550 3.68
+3R 550 575 1.88
+3R 575 600 10.28
+3R 600 625 19.64
+3R 625 650 18.96
+3R 650 675 12.48
+3R 675 700 9.00
+3R 700 725 9.44
+3R 725 750 14.52
+3R 750 775 17.32
+3R 775 800 12.72
+3R 800 825 5.64
+3R 825 850 4.56
+3R 850 875 10.28
+3R 875 900 10.92
+3R 900 925 8.76
+3R 925 950 4.84
+3R 950 975 2.76
+3R 975 1000 2.68
+3R 1000 1025 3.76
+3R 1025 1050 2.44
diff --git a/deepTools/source/deeptools/test/test_data/test2.cram b/deepTools/source/deeptools/test/test_data/test2.cram
new file mode 100644
index 0000000000000000000000000000000000000000..eed9a442313d4ed56b109323af919034dfa1a54c
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test2.cram differ
diff --git a/deepTools/source/deeptools/test/test_data/test2.cram.crai b/deepTools/source/deeptools/test/test_data/test2.cram.crai
new file mode 100644
index 0000000000000000000000000000000000000000..30ca5ac26ff3196b19e83ebce0fcdbdb77739076
Binary files /dev/null and b/deepTools/source/deeptools/test/test_data/test2.cram.crai differ
diff --git a/deepTools/source/deeptools/test/test_data/test2.sam b/deepTools/source/deeptools/test/test_data/test2.sam
new file mode 100644
index 0000000000000000000000000000000000000000..187c08b7e139bd79a28e09fc135874e1e1fe151f
--- /dev/null
+++ b/deepTools/source/deeptools/test/test_data/test2.sam
@@ -0,0 +1,195 @@
+@HD VN:1.0 SO:unsorted
+@SQ SN:3R LN:1500 M5:9bad144b81e5e7369a164895691b3e73 UR:./test1.fa
+DD61XKN1:101:D0EKPACXX:1:1307:6042:120399 0 3R 3 26 51M * 0 0 ATTCTCTCTTGTTGTAGTCTCTTGACAAAATGCAATGGTCAGGTAGCGTTG ?@@=D4=AC2AFDE3:AFDE*:B?DGI# XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2201:2981:158995 0 3R 7 37 51M * 0 0 TCTCTTGTTGTAGTCTCTTTACAAAATGTAATGGTCAGGTAGCATTGTTCT ??GFFBHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2102:10665:57593 0 3R 88 28 51M * 0 0 TAAGCCCTATAAACATATGTACATAGGTAGGCCAGTACTTAGTGCTGGCAC CCCFFFFFDHHFFHGF;G3CBHHGHHCHGCI=BFDHDBCAH XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1308:7611:23534 16 3R 108 11 51M * 0 0 ACATAGGTAGGCCAGTACTTAGTACTGGCACATGCCGCTGATCTGTTAGTA =IFBDDADGJIHGFFGHGHHFCJGHECIIGIIG>IGHGHGFFDDDCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2305:1314:97144 16 3R 108 12 51M * 0 0 CTATAGGTAGGCCAGTACTTAGTACTGGCACATGCCGCTGATCTGTTAGTA =@IGFGHCHEECIEHGFIJJIIGGDDIHIHEFC>HAGHBDFBDEDDFF@@@ XA:i:0 MD:Z:0A0C49 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:2107:3608:70379 0 3R 127 30 51M * 0 0 TAGTACTGGCACATGCCGCTGATCTGTTAGTAGATTATCCATTTCCCTTCA CCCDFFFFHHHHHJHIJJJJIIJJJJGIIJHIJIJJIJJJJJJJJJJIJII XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1308:12105:61609 0 3R 132 28 51M * 0 0 CTGGCACATGCCGCTGATCTGTTAGTAGATTATCCATTTCCCTTCAGCGCC @@@DD?DDDBBD1EDGGICHHFEFCEFHIGI9?FBFFDEHD>?BHHH9FGGFBGIGGIII@@FD< XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1301:14071:87949 0 3R 134 33 51M * 0 0 GGCACATGCCGCTGATCTGTTAGTAGATTATCCATTTCCCTTCAGCGCCTC @@@?DBEAFHAAAHDGGCBHG?B?BHDEHEIGEEHGHGHGGH@FCGHIGG# XA:i:0 MD:Z:50A0 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1207:15012:198224 16 3R 138 30 51M * 0 0 CATGCCGCTGATCTGTTAGTAGATTATCCATTTCCCTTCAGCGCCTACCTG @@@IGABFFEEGIDFHGGFHHHDDB:1?@?@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1108:20053:52835 16 3R 143 28 51M * 0 0 CGCTGATCTGTTAGTAGATTAGCCATTTCCCTTCAGCGCCTACCTGCGTCA @==3GC;B???4GFDBAGFB:)HGF=AHD;BAGIIGF@DC2+F@6DA?@@? XA:i:0 MD:Z:21T29 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2205:3896:101741 16 3R 143 36 51M * 0 0 TGCTGATCTGTTAGTAGATTATCCATTTCCCCTCAGCGCCTACCTGCGTCA @GF;G@HGHDIGEDIIIIIIGIIIIHFDD?1)BDGGGDAAHDHFDFFF@@? XA:i:1 MD:Z:0C30T19 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:1304:17350:141266 0 3R 150 25 51M * 0 0 CTGTTAGTAGATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGA @@CDFFFDHFHHHGHIIIHJIJJGJGHGIIIIJIJJIGHIFFGEHIJJJI< XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2308:16926:146336 0 3R 150 13 51M * 0 0 CTGTTAGTAGATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGA B@@FFBDBFDBBFHJIGGJIIJIGCHIJEIHIJJIIIJDGI@GHJJJJJG@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1206:19765:24106 16 3R 154 4 51M * 0 0 TAGTAGATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGATGAG IIIIGHFIGEHCHHFB8@GF?GGEAEGCEC8@AGFC:ECDHECGIHDGD6GGFBC92HEGDIIHGADFHFBFDDDC@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2107:9512:13865 16 3R 160 8 51M * 0 0 ATTATCCATTTCCCTTCAGCGCCTACCTGCGTCACCAATGATGAGGTCGAG IHHGAHGGBB8D?9:??8:)1+22CFEA,:4A<+AEEA?CEE?@)1?9?C?DC3?D:A/(8B)==@C###### XA:i:1 MD:Z:6C37C0T1C0T0A1 NM:i:6
+DD61XKN1:101:D0EKPACXX:1:1106:21127:183692 0 3R 180 40 51M * 0 0 GCCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAGTACCT @@@DFFFFGHFFAHIJJIGDHGHEG>FHHIIHHIGIGEHC>ECDGC@BG>C XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1307:6197:2841 0 3R 181 36 51M * 0 0 CCTACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAATACCTG @@FFBE4<2+A;*CC@F?)9?@FF@DGDAIHBGFA?>?DD?A??;@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1205:8830:10322 0 3R 182 25 51M * 0 0 CTACCTGCGTCACCAATGATGAGGACGAGACAGAATCCTACTAGTACCTGC ?7?AA?DD<8CDDEED@+<:BB8)8B# XA:i:1 MD:Z:24T26 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1107:3548:177528 0 3R 184 20 51M * 0 0 ACCTGCGTCACCAATGATGAGGTCGAGACAGAATCCTACTAGTACCTGCCT ??FG>DAE>GEI@*?FGDDDD9D?*:DD??):@C@<@2<@)BD??? XA:i:1 MD:Z:42T8 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1307:3116:6430 16 3R 197 27 51M * 0 0 ATGATGAGGTCGAGACAGAATCCTACTAGTACCTGCCTCGAGTCGATCGGG @4C=)75A7=@B@A@F?9D?*DGCFFEFBC:DC8<@C+FADADDDA?D@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1107:16927:153976 0 3R 201 25 51M * 0 0 TGAGGTCGAGACAGAATCCTACTAGTACCTGCCTCGAGTCGATCGGGCAGA ?@?DB2=AA@AAFFAEACFHGDCFBFF1BG>6?@@AEB;@### XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2204:6880:127511 0 3R 207 16 51M * 0 0 CGAGACAGAATCCTACTAGTACCTGCCTCGAGTCGATCGGGCAGAGAGCGA @@@DDDDDFFFFE=CFCE< XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2101:10755:114928 0 3R 208 1 51M * 0 0 GAGACAGAATCCTACTAGTACCTGCCTCGAGTCGATCGGGCAGAGAGCGAG @@@DDDDDFDHDH@EFEH?EHGGDHGIIIFGHHGGIIHIGGIB;CAHGIHE XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1203:9513:150353 0 3R 212 30 51M * 0 0 CAGAATCCTACTAGTACCTGCCTCGAGTCGATCGGGCAGAGAGCGAGAAAT ???DDDDDADB?CBAE:4?+A??EDFEGGGGHGHEG@FHAFHE>DHGIHIGFD>F7@@DCH XA:i:1 MD:Z:4G46 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1302:13211:169414 0 3R 233 29 51M * 0 0 CTCGAGTCGATCGGGCAGAGAGCGAGAAATGGTAAGCAGGTGAGTGAGCGC @@@FBDBDFFHHHIIIGIGIF@@F>DAGBGGIH XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2108:3748:138979 0 3R 233 34 51M * 0 0 CTCGAGTCGATCGGGCAGAGAGCGAGAAATGGTAAGCAGGTGAGTGGGATC @@@FDF?DHDHFFI>HGGGIEIDGIBFHIJIF@FICHHIJ@EHIDC(6CCE XA:i:0 MD:Z:46A1C0G1 NM:i:3
+DD61XKN1:101:D0EKPACXX:1:1207:13423:102525 16 3R 271 7 51M * 0 0 GGTGAGTGAGCGCAGAGAGCGTCTTTCGACGACTCTTTCGTCGCGAGCAAA BCCC@CADFFDFHHGIGIIIIIGHBDEGDIIHDGGE6JIHGHGHFFFFF@CC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2306:9355:141323 16 3R 278 10 51M * 0 0 GAGCGCAGAGAGCGTCTTTCGACGACTCTTTCGTCGCGAGCAAACAACAAG B;?>>>>>;/8GGFBFHFB?ADD=:@ XA:i:0 MD:Z:1A49 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2208:17627:8790 16 3R 290 11 51M * 0 0 CGTCTTTCGACGACTCTTTCGTCGCGAGCAANCAACAAGTAGACGTCGCTC #C>B?BBABB>?=;>B;ADD@C(;AC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1301:13576:199290 0 3R 298 37 51M * 0 0 GACGACTCTTTCGTCGCGAGCAAACAACAAGTAGACGTCGCTCAGACAGAT ?<@DDAA8?DBB;:2BDDDDDA:??<@:?DDA3B;@DCB?*9*0?6-;''-6-7).?DAD;; XA:i:0 MD:Z:50C0 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2204:8124:34946 16 3R 306 2 51M * 0 0 TTTCGTCGCGAGCAAACAACAAGTAGACGTCACTCAGACACTGTCGGCCAG :EBHGAGABFD?9DJHIIIIGGJJIGHFF<22HEIHGHFHHHHDFFFFC@@ XA:i:1 MD:Z:31G19 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2301:13637:198207 16 3R 306 29 51M * 0 0 TTTCGTCGCGAGCAAACAACAAGTAGACGTCGCTCAGACACTGTCGGCCAG B?;A>EEECIIIIIIIIFIIIIGIHFAGE:GGGEEGEIFFHHAAFFFF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2303:3102:37681 0 3R 318 28 51M * 0 0 CAAACAACAGGTAGACGTCGCTCAGACACTGTCGGCCAGATTCATTTAGAT ???7DDDDD2:+:AC+<<)@?CD1CDD)1?9:*?D@A(=B4/=CC#### XA:i:1 MD:Z:9A37T0C0C0A0 NM:i:5
+DD61XKN1:101:D0EKPACXX:1:1202:15301:118122 16 3R 336 18 51M * 0 0 CGATCTGACACTGTCGGCCAGATTCATTTTCCAGAAAGACGTCGTCGCGTT ################HBB4@D??*4>GGGBFB?GGF:?8:8<8@8AA=:= XA:i:0 MD:Z:2C2A45 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:1307:2511:100728 16 3R 336 23 51M * 0 0 CCGATCTACACTGTCGGCCAGATTCCTTTTCCAGAAAGACGTCGTCGCGTT #######################B0*0B99*D??>DB8?C1@8<8A:41;= XA:i:1 MD:Z:1G0C0T0C0A0G18A25 NM:i:7
+DD61XKN1:101:D0EKPACXX:1:2208:19469:21286 0 3R 339 3 51M * 0 0 TCAGACACTGTCGGCCAGANTCATTTTCCAGAAAGACGTCGTCGCGTTGAC @@@DDD8DFFDFFIDF1CF#2AEGFIIF@EB=CD XA:i:1 MD:Z:19T31 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1304:18506:91672 0 3R 361 11 51M * 0 0 ATTTTCCAGAAAGACGTCGTCGCGTTGACAAGCTTAAATTCGTAGCGGGCA @@CD?DBDFHGDFGHICEFC?CFDFBG=@FH@9BFGFCCFFHDHG>:@GHG XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2305:19231:72018 0 3R 384 23 51M * 0 0 GTTGACAAGCTTAAATTCGTAGCGGGCGCCAGTAGGACGACCCAGGGGATA @@@D?DD>?DHFHG+A==@'59=(6(39;?? XA:i:0 MD:Z:45T5 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2107:15978:5397 0 3R 389 29 51M * 0 0 CAAGCTTAAATTCGTAGCGGGCGCCAGTAGGACGACCCAGTGGATATCGTC @@@FFBDDHHGHHIDFEGGIJF:F*?GDD:@@FEHDDGE7=??=;CE2=>BC6;@=; XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1303:9692:124869 16 3R 400 6 51M * 0 0 CGATCTCGGGCGCCAGTAGGACGACCCAGTGGATATCGTCAGTTGAACCAG ########A?(BBDCFFB??C:CFADD@FDFHDFE@FH6?C9?FGFFFA1)88?BF*BBB(8=CG(;8)7C XA:i:0 MD:Z:45G0A1A2 NM:i:3
+DD61XKN1:101:D0EKPACXX:1:1205:3993:177564 16 3R 408 5 51M * 0 0 GGCGCCAGTAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACG FF@=D3BB@FD@HDFFEFFCEBHGBF@FCA3CB@CHEAHFH?HDDFFD@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1201:17678:143204 0 3R 413 3 51M * 0 0 CAGTAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCA @@@DDDD>DCFAFADFBE?D@EBGG?C;@FGCFHHICEGIBHB=@C##### XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1204:6403:23100 16 3R 413 4 51M * 0 0 CAGTAGGACAACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCA ############DB?????@@?88CC9A:3+4@E;BA+A:BD>DA:=:7?1 XA:i:0 MD:Z:9G41 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2301:11427:64315 16 3R 413 10 51M * 0 0 CAGTAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCA JHF=IJIGIHGJJJIJJIJHHGIIIHFJJJJJJHJJJIHHHHHFFFFFCCC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1203:4885:175695 0 3R 416 36 51M * 0 0 TAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCAGCC =?;D?B@@FDF?F+ACGBHGGE??FA?FHEIADDG@<<(6BCH(.=@@;CA XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1301:1384:121298 0 3R 416 5 51M * 0 0 TAGGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCAGCC CCCFDEFFHHHHHGIIIJIGHIJIJIJJFCHJJJJIJIHIJIJIJJIIIJI XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1207:10462:36321 0 3R 418 31 51M * 0 0 GGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCAGCCCA @@@DD@DDABDFFGABEDBGCGGBB>FGIJ=8DHE@.@D>B(=CE>FH;;@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1307:4428:77090 16 3R 418 13 51M * 0 0 GGACGACCCAGTGGATATCGTCAGTTGAACCAGGGGAAACGTAGCAGCCCA =IHHEDGGIGF?BGA?IIGHDFBJJIHIGJJIIJJIGGF@GHDDHGHFFDFF@CC XA:i:0 MD:Z:1A49 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2203:5030:49537 16 3R 424 21 51M * 0 0 CACAGTGGATATCGTCAGTTGAACCAGGGGATACGTAGCAGCCCAGTTACA ##HGFIGHFD;FGBDDHIGIGIIIIGGHFA3+GC<4@?HHHHHDDDDD@@@ XA:i:1 MD:Z:1C29A19 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:2202:11507:147190 0 3R 436 24 51M * 0 0 CGTCAGTTGAACCAGGGGANACGTAGCAGCCCAGTTACATTGCTCGGGAGG ??;=A8DDDA?DD>EEE@E#2CC=C=3;A;;0DDD7DDFFEEH)@:@:1?DEH4DCFHGFEH;AA@ XA:i:0 MD:Z:49T0G0 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:1203:5003:27314 16 3R 474 36 51M * 0 0 AGTGCTCGGGAGGGGTAAAGAGCTTGACGCCAGCGCGTGCGTAGAGTGAAA ################DF=8G?DHGDGHGHB@AC##### XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2303:15856:134002 0 3R 478 2 51M * 0 0 CTCGGGAGGGGTAAAGAGCTTGACGACAGCGCGTGCGTAGAGTGAAAGTAT @@@FFFDFHFH>AEE>CC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2104:6954:171255 0 3R 483 38 51M * 0 0 GAGGGGTAAAGAGCTTGACGACAGCGCGTGCGTAGAGTGAAAGTATGAGAT ==?DDA8DF?D?;:CA?:3<):8?0?7?-<5;;(7.7CE>>777A#### XA:i:0 MD:Z:47C0A1G0 NM:i:3
+DD61XKN1:101:D0EKPACXX:1:2307:7881:196830 16 3R 493 36 51M * 0 0 TAGCTTGACCACAGCGCGTGCGTAGAGTGAAAGTATGCAAGGAGATTCGCG AFEE>DC;HF;HGGHGGDG@ECG@F?H?HGIIGIIIGBDF>D>DDDDD@?< XA:i:0 MD:Z:0G8G41 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:1202:12689:141385 0 3R 496 13 51M * 0 0 CTTGACGACAGCGCGTGCGTAGAGTGAAAGTATGCAAGGAGATTCGCGATC @@@D=D:DD6:?FHE?DFH??GDH9??B?D9BFCFHGEAFEAGGHDHH6=A XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2207:10715:109420 0 3R 496 25 51M * 0 0 CTTGACGACAGCGCGTGCGNAGAGTGAAAGTATGCAAGGAGATTCGCGATC @@@DFFFFH?FHFBE1CDB#08?D?FHEEE?FCGGIIGGCGAAA@EEEBCA XA:i:1 MD:Z:19T31 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2204:18234:179424 16 3R 533 5 51M * 0 0 CTAGATTCGCGATCAGAACCTCACGACGCCATATTTGTTTTCCAGGGCTTG #?=-48@6(?*/900*??9@?DAEEEFA<+EIEEBEDC>DDDAB44<;? XA:i:0 MD:Z:0G0G49 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:2304:8751:2632 0 3R 556 5 51M * 0 0 CGACGCCATATTTGTTTTCCAGGGCTTGCTTGTGTGTGCGTGTGTTTCAAC CCCFFFFFHGHHHJGIIIJIIIJBHIJIJJJFBADCGHGGCFEGGGIJJIC XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2203:1524:184251 0 3R 574 29 51M * 0 0 CCAGGGCTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGA @@@DDDDDHHHHHIG9CFH+3@CEGFHHIIICDGHGIIDFDEGI6F'=;D@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2102:8875:72443 16 3R 576 40 51M * 0 0 AGGGCTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGG EDA@DCGIIIGEGHFHEFB?IIIHG@GIIHAGEGGHEA+BBFGFDFDF@C@ XA:i:1 MD:Z:1G0G35G12 NM:i:3
+DD61XKN1:101:D0EKPACXX:1:2101:3407:141094 0 3R 580 17 51M * 0 0 CTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAG 11=DDEFFHHDFCGEHIGFFHIJHGFHIIIIIDEHGEG:(08==BFGG### XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2303:12087:40107 0 3R 580 12 51M * 0 0 CTTGCTTGTGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAA ?@@DDDDDF+IIGGAHGHDGFHDHGFFFFF@@@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2105:6289:140892 16 3R 581 39 51M * 0 0 CCGATCTTGTGTGCGTTTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGG #####@@A8'-7;7?9)BDIEDBDD@?DE?F@@1<+AABDCA??BDD:??? XA:i:0 MD:Z:0T0T1C1T0G9G34 NM:i:6
+DD61XKN1:101:D0EKPACXX:1:1103:12128:78055 16 3R 587 7 51M * 0 0 TCGTGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTG #####BEECD?;1 XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1104:5082:17741 16 3R 590 8 51M * 0 0 TGTGCGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCA AGGIGIIFIJJJIJIIIJJJJJJJJJHIIIIJIJJJJJHHHHHFFFFFCC@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1302:13170:74786 16 3R 594 18 51M * 0 0 CGTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTG 8A?IGGIIGEHGGAGHFHDIHIIIIIJIIEEIIGGHBIF>FFF?FDBD@@? XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2307:14759:121417 0 3R 595 10 51M * 0 0 GTGTGTTTCAACTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGA =+:BD?DDHIHEHHDDHDDD?D@?@ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:1305:6666:159819 0 3R 606 24 51M * 0 0 CTCCGGTGTGCGAGTGTGAGGGCAGGGCTTTGCCAGGTGATAATGCCTTAA @@CFFD@DHHHHHJFED@FHIGGIEHJJJIJJJJJJIBFEEHJIIIIJJJJ XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2201:16418:139923 0 3R 607 5 51M * 0 0 TCCGGTGTGCGAGTGTGAGCGCAGGGCTTTGCCAGGTGATAATGCCTTAAC @@@DD?D:FFF??1AFBF@?:)C:*?@ECA+3#E:?<>C?DAF;D???D@@@ XA:i:1 MD:Z:31A19 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:2207:2606:158481 16 3R 616 7 51M * 0 0 CGAGTGTGAGGGCAGGGCTTTGCCAGGTGATNATGCCTTAACTGTCCCTTG JIIIJIGIHGJJJJIJJGJJJIIHF9HFFA3#HGFIHBHDHHFDDFFFC@@ XA:i:1 MD:Z:31A19 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1304:7344:178232 0 3R 626 13 51M * 0 0 GGCAGGGCTTTGCCAGGTGATAATGCCTTAACTGTCCCTTGTATTCGGGAG @<@DFFDDFHGFHHGGEBB>AFBH@HGIGIGEGEDGIGGFCB?GDFGIF@6 XA:i:0 MD:Z:49C0T0 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:2108:6658:122648 0 3R 628 12 51M * 0 0 CAGGGCTTTGCCAGGTGATAATGCCTTAACTGTCCCTTGTATTCGGGCTAG @@@DEHBAHHGFHE?CGCDGH*?DED>D@BHCG XA:i:0 MD:Z:49T0C0 NM:i:2
+DD61XKN1:101:D0EKPACXX:1:2206:14847:13155 0 3R 628 6 51M * 0 0 CAGGGCTTTGCCAGGTGATNATGCCTTAACTGTCCCTTGTATTCGGGCTTC @@CFFFFFHGHFFGGFGGC#3:AFHHGGIJJJGHCGGIGEEGJJJIFIGID XA:i:1 MD:Z:19A31 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1301:9974:152868 16 3R 629 35 51M * 0 0 AGGGCTTTGCCAGGTGATAATGCCTTAACTGTCCCTTGTATTCGGGCTTCG >AHGFIHHGIHCGIICIDB3CIGHFGGGGGGHGJIHHHFBCC?FFDDF@@? XA:i:0 MD:Z:51 NM:i:0
+DD61XKN1:101:D0EKPACXX:1:2307:18343:74834 0 3R 636 31 51M * 0 0 TGCCAGGTGATAATGCCTTAACTGTCACTTGTATTCGGGCTTCGTCTTCGC BCCFFFFDFHHHGJJJJJJJIJJJIIIJJJJHIJJJIIIJJJJIGIIJIJF XA:i:1 MD:Z:26C24 NM:i:1
+DD61XKN1:101:D0EKPACXX:1:1105:16709:8151 0 3R 638 24 51M * 0 0 CCAGGTGATAATGCCTTAACTGTCCCTTGTATTCGGGCTTCGTCTTCGAGA ??