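Edit the cell below to set your desired values. Alternatively, you can enter the values interactively at runtime: in `main()`, call `test_hash_table(None, None, None)` and you will be prompted for each one.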

In [None]:
# Alphabet passed to the DNA sequence generator
dna_string = "ACGT"
# When True, the generated DNA sequence is printed before the k-mers are hashed
show_string = True
# Length of the DNA sequence to generate
dna_len = 10**7
# Length k of each k-mer extracted from the sequence
kmer_size = 7

In [None]:
from data_processing import generate_kmers as generator
from data_processing import graph_distribution as graph
from data_structures import hash_table as hashing
from data_processing import distribution_parser as parse

import time

In [None]:
def generate_table(kmer_list, hashing_function):
    """
    Builds a hash table holding every kmer of the given list

    The table is created with one slot count argument equal to the number of
    kmers and with the requested hashing function, then each kmer is inserted
    in list order.

    kmer_list: sized iterable of kmers to insert
    hashing_function: identifier of the hashing function, forwarded as-is to
        hashing.hash_table

    Returns the populated hash table
    """
    populated = hashing.hash_table(len(kmer_list), hashing_function)
    for entry in kmer_list:
        populated.insert(entry)

    return populated

In [None]:
def _timed_call(func, *args):
    """
    Calls func(*args) and returns a (result, elapsed_seconds) tuple
    """
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() is a wall clock and may be coarse or jump when adjusted
    start = time.perf_counter()
    result = func(*args)
    return result, time.perf_counter() - start


def test_hash_table(dna_string, dna_len, kmer_size):
    """
    Utility function to test the hash table

    Generates a DNA sequence, splits it into kmers, inserts the kmers into one
    hash table per hashing function ("mmh" and "xxh"), then prints the timing
    and collision statistics and graphs the kmer distribution.

    dna_string: alphabet used to generate the sequence; if None the user is
        prompted for it
    dna_len: length of the sequence to generate; if None the user is prompted
        and the answer is run through parse.parse_math_expression
    kmer_size: length of each kmer; if None the user is prompted and the
        answer is run through parse.parse_math_expression

    Reads the module-level flag show_string to decide whether the generated
    sequence is printed. Returns nothing; all results are printed or graphed.
    """
    alphabet = dna_string
    if dna_string is None:
        alphabet = input("Enter the alphabet: ")

    dna_length = dna_len
    if dna_len is None:
        dna_length = input("Enter the length of the DNA string: ")
        dna_length = parse.parse_math_expression(dna_length)

    k_mer = kmer_size
    if kmer_size is None:
        k_mer = input("Enter the length of the kmer: ")
        k_mer = parse.parse_math_expression(k_mer)

    # generate the DNA sequence
    dna_sequence = generator.create_dna_sequence(alphabet, dna_length)
    # show_string is the notebook-level setting, not a parameter
    if show_string:
        print("DNA sequence: ", dna_sequence)

    # generate the list of k-mers
    kmer_list = generator.generate_kmers(dna_sequence, k_mer)

    # time the insertions for each hashing function
    mmh3_table, mmh3_insertion_time = _timed_call(generate_table, kmer_list, "mmh")
    xxh_table, xxh_insertion_time = _timed_call(generate_table, kmer_list, "xxh")

    # time the generation of the distributions; the mmh3 distribution is only
    # built for its timing because both tables hold the same kmers, so only
    # the xxh distribution is graphed below
    _, mmh_parsing_time = _timed_call(parse.parse_table, mmh3_table, kmer_list)
    xxh_distribution, xxh_parsing_time = _timed_call(
        parse.parse_table, xxh_table, kmer_list
    )

    tabulated_data = {
        "Hash Function": ["mmh3", "xxh"],
        "Insertion Time": [mmh3_insertion_time, xxh_insertion_time],
        "Parsing Time": [mmh_parsing_time, xxh_parsing_time],
        "Collisions": [mmh3_table.collision_count, xxh_table.collision_count],
    }

    graph.print_hashing_statistics(
        tabulated_data, alphabet + str(dna_length) + " " + str(k_mer) + "mers"
    )
    unique_kmers = parse.get_unique_kmers(xxh_distribution)
    graph.bar_graph(xxh_distribution)

    if len(unique_kmers) == 0:
        print("No unique k-mers found")
    else:
        print(f"Unique k-mers: {len(unique_kmers)}")
        graph.bar_graph(unique_kmers)

    # the top 10% is only graphed when the distribution has more than 100 entries
    if len(xxh_distribution) > 100:
        print("Most common 10% kmers: ")
        most_common_kmers = parse.get_most_common_kmers(
            xxh_distribution, int(len(xxh_distribution) * 0.10)
        )
        graph.bar_graph(most_common_kmers)


In [None]:
def main():
    """
    Runs the hash table test with the values configured at the top of the notebook
    """
    test_hash_table(
        dna_string=dna_string,
        dna_len=dna_len,
        kmer_size=kmer_size,
    )
    # To be prompted for every value instead, replace the call above with:
    # test_hash_table(None, None, None)


main()