{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "---\n",
    "title: K-mer distribution table Hash Table\n",
    "description: Generate a k-mer distribution table using a hash table\n",
    "show-code: False\n",
    "format:\n",
    "    theme: night\n",
    "params:\n",
    "    show_string:\n",
    "        label: show the entire dna_sequence (warning, this can be very long)\n",
    "        input: checkbox\n",
    "        value: False\n",
    "    dna_string:\n",
    "        input: text\n",
    "        label: input dna_string\n",
    "        value: ACGT\n",
    "    dna_len:\n",
    "        input: numeric\n",
    "        label: input dna_length\n",
    "        value: 100\n",
    "        min: 5\n",
    "        max: 10000000000\n",
    "        step: 10\n",
    "    kmer_size:\n",
    "        input: numeric\n",
    "        label: input kmer_length\n",
    "        value: 7\n",
    "        min: 1\n",
    "        max: 1000\n",
    "        step: 1\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Edit the cell below to set the input values. Alternatively, call `test_hash_table(None, None, None)` in the last cell to be prompted for each value interactively.\n",
    "\n",
    "`show_string` prints the entire generated sequence, so keep it `False` for long sequences."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dna_string = \"ACGT\"\n",
    "show_string = False\n",
    "dna_len = 10**7\n",
    "kmer_size = 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from data_processing import generate_kmers as generator\n",
    "from data_processing import graph_distribution as graph\n",
    "from data_structures import hash_table as hashing\n",
    "from data_processing import distribution_parser as parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_table(kmer_list, hashing_function):\n",
    "    \"\"\"\n",
    "    Generates a hash table from a list of kmers and the desired hashing function\n",
    "\n",
    "    Args:\n",
    "        kmer_list: the k-mers to insert; its length is passed to the\n",
    "            ``hashing.hash_table`` constructor as the first argument.\n",
    "        hashing_function: identifier forwarded to ``hashing.hash_table``\n",
    "            (this notebook passes \"mmh\" and \"xxh\").\n",
    "\n",
    "    Returns:\n",
    "        The ``hashing.hash_table`` instance after every k-mer was inserted.\n",
    "    \"\"\"\n",
    "    table = hashing.hash_table(len(kmer_list), hashing_function)\n",
    "    for kmer in kmer_list:\n",
    "        table.insert(kmer)\n",
    "\n",
    "    return table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _timed(func, *args):\n",
    "    \"\"\"\n",
    "    Call ``func(*args)`` and return ``(result, elapsed_seconds)``.\n",
    "\n",
    "    ``time.perf_counter`` is used instead of ``time.time`` because it is a\n",
    "    monotonic, high-resolution clock intended for measuring short durations.\n",
    "    \"\"\"\n",
    "    start = time.perf_counter()\n",
    "    result = func(*args)\n",
    "    return result, time.perf_counter() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_hash_table(dna_string, dna_len, kmer_size, show_string=None):\n",
    "    \"\"\"\n",
    "    Utility function to test the hash table\n",
    "\n",
    "    Generates a DNA sequence and its k-mers, inserts the k-mers into one\n",
    "    hash table per hashing function (\"mmh\" and \"xxh\"), times insertion and\n",
    "    parsing, then hands the statistics and distributions to the ``graph``\n",
    "    helpers.\n",
    "\n",
    "    Args:\n",
    "        dna_string: alphabet used to generate the sequence; if None it is\n",
    "            read with ``input()``.\n",
    "        dna_len: length of the sequence; if None it is read with ``input()``\n",
    "            and converted with ``parse.parse_math_expression``.\n",
    "        kmer_size: length of each k-mer; if None it is read with ``input()``\n",
    "            and converted with ``parse.parse_math_expression``.\n",
    "        show_string: whether to print the generated sequence. If None, the\n",
    "            notebook-level ``show_string`` variable is used (False when it\n",
    "            is not defined), so existing three-argument calls keep working.\n",
    "    \"\"\"\n",
    "    alphabet = dna_string\n",
    "    if alphabet is None:\n",
    "        alphabet = input(\"Enter the alphabet: \")\n",
    "\n",
    "    dna_length = dna_len\n",
    "    if dna_length is None:\n",
    "        dna_length = input(\"Enter the length of the DNA string: \")\n",
    "        dna_length = parse.parse_math_expression(dna_length)\n",
    "\n",
    "    k_mer = kmer_size\n",
    "    if k_mer is None:\n",
    "        k_mer = input(\"Enter the length of the kmer: \")\n",
    "        k_mer = parse.parse_math_expression(k_mer)\n",
    "\n",
    "    if show_string is None:\n",
    "        # Backward compatibility: this flag used to be read straight from\n",
    "        # the notebook's global namespace.\n",
    "        show_string = globals().get(\"show_string\", False)\n",
    "\n",
    "    # generate the DNA sequence\n",
    "    dna_sequence = generator.create_dna_sequence(alphabet, dna_length)\n",
    "    if show_string:\n",
    "        print(\"DNA sequence: \", dna_sequence)\n",
    "\n",
    "    # generate the list of k-mers\n",
    "    kmer_list = generator.generate_kmers(dna_sequence, k_mer)\n",
    "\n",
    "    # time the insertion for each hashing function\n",
    "    mmh3_table, mmh3_insertion_time = _timed(generate_table, kmer_list, \"mmh\")\n",
    "    xxh_table, xxh_insertion_time = _timed(generate_table, kmer_list, \"xxh\")\n",
    "\n",
    "    # time the parsing of each table into a distribution; the mmh3\n",
    "    # distribution itself is discarded because only the xxh one is plotted\n",
    "    _, mmh3_parsing_time = _timed(parse.parse_table, mmh3_table, kmer_list)\n",
    "    xxh_distribution, xxh_parsing_time = _timed(parse.parse_table, xxh_table, kmer_list)\n",
    "\n",
    "    tabulated_data = {\n",
    "        \"Hash Function\": [\"mmh3\", \"xxh\"],\n",
    "        \"Insertion Time\": [mmh3_insertion_time, xxh_insertion_time],\n",
    "        \"Parsing Time\": [mmh3_parsing_time, xxh_parsing_time],\n",
    "        \"Collisions\": [mmh3_table.collision_count, xxh_table.collision_count]\n",
    "    }\n",
    "\n",
    "    run_label = alphabet + str(dna_length) + \" \" + str(k_mer) + \"mers\"\n",
    "    graph.print_hashing_statistics(tabulated_data, run_label)\n",
    "\n",
    "    unique_kmers = parse.get_unique_kmers(xxh_distribution)\n",
    "    graph.bar_graph(xxh_distribution)\n",
    "\n",
    "    if len(unique_kmers) == 0:\n",
    "        print(\"No unique k-mers found\")\n",
    "    else:\n",
    "        print(f\"Unique k-mers: {len(unique_kmers)}\")\n",
    "        graph.bar_graph(unique_kmers)\n",
    "\n",
    "    if len(xxh_distribution) > 100:\n",
    "        print(\"Most common 10% kmers: \")\n",
    "        # integer division keeps the 10% cut-off free of float rounding\n",
    "        most_common_kmers = parse.get_most_common_kmers(xxh_distribution, len(xxh_distribution) // 10)\n",
    "        graph.bar_graph(most_common_kmers)\n",
    "\n",
    "    # the distribution of one is the same as the other\n",
    "    # graph.bar_graph(mmh3_distribution)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "    \"\"\"Run the hash table test with the values from the parameters cell.\"\"\"\n",
    "    test_hash_table(dna_string, dna_len, kmer_size, show_string)\n",
    "    # # uncomment if you would like to enter your own values \n",
    "    # test_hash_table(None, None, None, show_string)\n",
    "\n",
    "main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.8 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}