{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "---\n",
    "title: K-mer distribution table Hash Table\n",
    "description: Generate a k-mer distribution table using a hash table\n",
    "show-code: False\n",
    "format:\n",
    "   theme: night\n",
    "params:\n",
    "   show_string:\n",
    "      label: show the entire dna_sequence (warning, this can be very long)\n",
    "      input: checkbox\n",
    "      value: False\n",
    "   dna_string:\n",
    "      input: text\n",
    "      label: input dna_string\n",
    "      value: ACGT\n",
    "   dna_len:\n",
    "      input: numeric\n",
    "      label: input dna_length\n",
    "      value: 100\n",
    "      min: 5\n",
    "      max: 10000000000\n",
    "      step: 10\n",
    "   kmer_size:\n",
    "      input: numeric\n",
    "      label: input kmer_length\n",
    "      value: 7\n",
    "      min: 1\n",
    "      max: 1000\n",
    "      step: 1\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Edit the cell below to input your desired values. You can also input values using the input scanner."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dna_string = \"ACGT\"\n",
    "show_string = True\n",
    "dna_len = 10**7\n",
    "kmer_size = 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_processing import generate_kmers as generator\n",
    "from data_processing import graph_distribution as graph\n",
    "from data_structures import hash_table as hashing\n",
    "from data_processing import distribution_parser as parse\n",
    "\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_table(kmer_list, hashing_function):\n",
    "    \"\"\"\n",
    "    Generates a hash table from a list of kmers and the desired hashing function\n",
    "    \"\"\"\n",
    "    list_len = len(kmer_list)\n",
    "    table = hashing.hash_table(list_len, hashing_function)\n",
    "    for kmer in kmer_list:\n",
    "        table.insert(kmer)\n",
    "\n",
    "    return table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_hash_table(dna_string, dna_len, kmer_size):\n",
    "    \"\"\"\n",
    "    Utility function to test the hash table\n",
    "    \"\"\"\n",
    "    alphabet = dna_string\n",
    "    if dna_string is None:\n",
    "        alphabet = input(\"Enter the alphabet: \")\n",
    "    \n",
    "    dna_length = dna_len\n",
    "    if dna_len is None:\n",
    "        dna_length = (input(\"Enter the length of the DNA string: \"))\n",
    "        dna_length = parse.parse_math_expression(dna_length) \n",
    "\n",
    "    k_mer = kmer_size\n",
    "    if kmer_size is None:\n",
    "        k_mer = input(\"Enter the length of the kmer: \")\n",
    "        k_mer = parse.parse_math_expression(k_mer)\n",
    "\n",
    "    # generate the DNA sequence\n",
    "    dna_sequence = generator.create_dna_sequence(alphabet, dna_length)\n",
    "    if(show_string):\n",
    "        print(\"DNA sequence: \", dna_sequence)\n",
    "\n",
    "    # generate the list of k-mers\n",
    "    kmer_list = generator.generate_kmers(dna_sequence, k_mer)\n",
    "\n",
    "    # test the mmh3 hashing function\n",
    "    start = time.time()\n",
    "    mmh3_table = generate_table(kmer_list, \"mmh\")\n",
    "    end = time.time()\n",
    "\n",
    "    mmh3_insertion_time = end - start\n",
    "\n",
    "    # test the xxh hashing function\n",
    "    start = time.time()\n",
    "    xxh_table = generate_table(kmer_list, \"xxh\")\n",
    "    end = time.time()\n",
    "\n",
    "    xxh_insertion_time = end - start\n",
    "    # generate the distributions\n",
    "    start = time.time()\n",
    "    mmh3_distribution = parse.parse_table(mmh3_table, kmer_list)\n",
    "    end = time.time()\n",
    "\n",
    "    mmh_parsing_time = end - start\n",
    "\n",
    "    start = time.time()\n",
    "    xxh_distribution = parse.parse_table(xxh_table, kmer_list)\n",
    "    end = time.time()\n",
    "\n",
    "    xxh_parsing_time = end - start\n",
    "\n",
    "    tabulated_data = {\n",
    "        \"Hash Function\": [\"mmh3\", \"xxh\"],\n",
    "        \"Insertion Time\": [mmh3_insertion_time, xxh_insertion_time],\n",
    "        \"Parsing Time\": [mmh_parsing_time, xxh_parsing_time],\n",
    "        \"Collisions\": [mmh3_table.collision_count, xxh_table.collision_count]\n",
    "    }\n",
    "\n",
    "    graph.print_hashing_statistics(tabulated_data, alphabet + str(dna_length) + \" \" + str(k_mer) + \"mers\")\n",
    "    unique_kmers = parse.get_unique_kmers(xxh_distribution)\n",
    "    graph.bar_graph(xxh_distribution)\n",
    "\n",
    "    if len(unique_kmers) == 0:\n",
    "        print(\"No unique k-mers found\")\n",
    "    else:\n",
    "        print(f\"Unique k-mers: {len(unique_kmers)}\")\n",
    "        graph.bar_graph(unique_kmers)\n",
    "\n",
    "    if (len(xxh_distribution) > 100):\n",
    "        print(\"Most common 10% kmers: \")\n",
    "        most_common_kmers = parse.get_most_common_kmers(xxh_distribution, int(len(xxh_distribution) * 0.10))\n",
    "        graph.bar_graph(most_common_kmers)\n",
    "\n",
    "    # the distribution of one is the same as the other\n",
    "    # graph.bar_graph(mmh3_distribution)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "    test_hash_table(dna_string, dna_len, kmer_size)\n",
    "    # # uncomment if you would like to enter your own values \n",
    "    # test_hash_table(None, None, None)\n",
    "\n",
    "main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.8 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}