Spaces:

Hack90
/

virus_explorer

Sleeping

App Files Files Community

virus_explorer / utils.py

Hack90

Update utils.py

7f7d36e verified 4 months ago

raw

history blame

34.5 kB

	from shiny import render
	from shiny.express import input, output, ui
	from datasets import load_dataset
	import pandas as pd
	from pathlib import Path
	import matplotlib
	import numpy as np
	import gradio as gr
	import matplotlib.pyplot as plt
	import matplotlib.style as mplstyle
	from scipy.interpolate import interp1d
	from typing import Dict, Optional
	from collections import namedtuple


	# Step vector added to the 2D walk for each nucleotide; 'N' is the zero step
	# and is what _dna_to_coordinates falls back to for unrecognised characters.
	mapping_easy = dict(
	    A=np.array([0.5, -0.8660254037844386]),
	    T=np.array([0.5, 0.8660254037844386]),
	    G=np.array([0.8660254037844386, -0.5]),
	    C=np.array([0.8660254037844386, 0.5]),
	    N=np.array([0, 0]),
	)

	# A point (x, y) in the plane.
	Coord = namedtuple("Coord", "x y")

	# State of a CGR encoding: a step counter N and the current (x, y) position.
	CGRCoords = namedtuple("CGRCoords", "N x y")

	# Corner of the square assigned to each nucleotide.
	DEFAULT_COORDS = {
	    "A": Coord(1, 1),
	    "C": Coord(-1, 1),
	    "G": Coord(-1, -1),
	    "T": Coord(1, -1),
	}

	def _dna_to_coordinates(dna_sequence, mapping):
	    """Map each character of ``dna_sequence`` to its step vector.

	    The sequence is upper-cased first; characters missing from ``mapping``
	    use ``mapping['N']``. Returns the vectors stacked in a NumPy array.
	    """
	    return np.array(
	        [mapping.get(base, mapping['N']) for base in dna_sequence.upper()]
	    )

	def _get_cumulative_coords(mapped_coords):
	    """Return the running sum of ``mapped_coords`` along axis 0."""
	    return np.asarray(mapped_coords).cumsum(axis=0)

	def plot_2d_sequences(dna_sequences, mapping=mapping_easy, single_sequence=False):
	    """Draw the cumulative 2D walk of every sequence on one new figure.

	    Parameters
	    ----------
	    dna_sequences
	        Iterable of sequences, or a single sequence when ``single_sequence`` is true.
	    mapping
	        Character -> step-vector table passed to ``_dna_to_coordinates``.
	    single_sequence
	        When true, ``dna_sequences`` is wrapped in a one-element list.

	    Returns
	    -------
	    The figure created by ``plt.subplots()``.
	    """
	    fig, ax = plt.subplots()
	    sequences = [dna_sequences] if single_sequence else dna_sequences
	    for sequence in sequences:
	        walk = _get_cumulative_coords(_dna_to_coordinates(sequence, mapping))
	        ax.plot(*walk.T)
	    return fig

	def plot_2d_comparison(dna_sequences_grouped, labels, mapping=mapping_easy):
	    """Overlay the 2D walks of several groups of sequences, one colour per group.

	    Parameters
	    ----------
	    dna_sequences_grouped
	        Sequence of groups; each group is an iterable of DNA sequences.
	    labels
	        ``labels[i]`` is the legend label for group ``i``.
	    mapping
	        Character -> step-vector table passed to ``_dna_to_coordinates``.

	    Returns
	    -------
	    The figure created by ``plt.subplots()``.
	    """
	    fig, ax = plt.subplots()
	    palette = plt.cm.rainbow(np.linspace(0, 1, len(dna_sequences_grouped)))
	    for group_idx, (group, colour) in enumerate(zip(dna_sequences_grouped, palette)):
	        for sequence in group:
	            walk = _get_cumulative_coords(_dna_to_coordinates(sequence, mapping))
	            ax.plot(*walk.T, color=colour, label=labels[group_idx])

	    # Every line in a group carries the same label; keying by label keeps one
	    # legend entry per group.
	    legend_handles, legend_labels = ax.get_legend_handles_labels()
	    unique_entries = dict(zip(legend_labels, legend_handles))
	    ax.legend(unique_entries.values(), unique_entries.keys())
	    return fig

	def plot_distrobutions(dna_sequences_grouped, labels, basepair, mapping=mapping_easy):
	    """Plot, per group, a histogram of the walk's y value at index ``basepair``.

	    Parameters
	    ----------
	    dna_sequences_grouped
	        Sequence of groups; each group is an iterable of DNA sequences.
	    labels
	        ``labels[i]`` is the legend label for group ``i``.
	    basepair
	        Index into each sequence's cumulative walk at which y is sampled.
	    mapping
	        Character -> step-vector table passed to ``_dna_to_coordinates``.

	    Returns
	    -------
	    The figure created by ``plt.subplots()``.
	    """
	    fig, ax = plt.subplots()
	    palette = plt.cm.rainbow(np.linspace(0, 1, len(dna_sequences_grouped)))
	    for group_idx, (group, colour) in enumerate(zip(dna_sequences_grouped, palette)):
	        y_at_basepair = [
	            _get_cumulative_coords(_dna_to_coordinates(sequence, mapping))[:, 1][basepair]
	            for sequence in group
	        ]
	        counts, edges = np.histogram(y_at_basepair)
	        ax.stairs(counts, edges, color=colour, label=labels[group_idx])

	    # Keying by label keeps one legend entry per distinct label.
	    legend_handles, legend_labels = ax.get_legend_handles_labels()
	    unique_entries = dict(zip(legend_labels, legend_handles))
	    ax.legend(unique_entries.values(), unique_entries.keys())
	    return fig

	############################################################# Virus Dataset ########################################################
	#ds = load_dataset('Hack90/virus_tiny')
	# Read the virus table when the module is imported, then expose every distinct
	# organism name as a name -> name mapping.
	df = pd.read_parquet('virus_ds.parquet')
	virus = {name: name for name in df['Organism_Name'].unique()}

	############################################################# Filter and Select ########################################################
	def filter_and_select(group):
	    """Return ``group.head(3)``, or ``None`` when ``group`` has fewer than three items."""
	    if len(group) < 3:
	        return None
	    return group.head(3)

	############################################################# Wens Method ########################################################
	import numpy as np

	# Weight for each 4-character match pattern built in _calculate_weighting_full,
	# expressed in sixths.
	WEIGHTS = {
	    pattern: sixths / 6
	    for pattern, sixths in (
	        ('0100', 1),
	        ('0101', 2),
	        ('1100', 3),
	        ('0110', 3),
	        ('1101', 4),
	        ('1110', 5),
	        ('0111', 5),
	        ('1111', 6),
	    )
	}
	LOWEST_LENGTH = 5000

	def _get_subsequences(sequence):
	    """Return, for each of 'A', 'C', 'T', 'G', the 1-based positions where it occurs."""
	    positions = {nucleotide: [] for nucleotide in 'ACTG'}
	    for position, base in enumerate(sequence, start=1):
	        if base in positions:
	            positions[base].append(position)
	    return positions

	def _calculate_coordinates_fixed(subsequence, L=LOWEST_LENGTH):
	    """Turn 1-based positions into ``(theta, sqrt(theta))`` pairs.

	    ``theta = 2*pi/(L - 1) * (K - 1)`` for each position ``K`` in ``subsequence``.
	    Returns a list of tuples in input order.
	    """
	    coordinates = []
	    for K in subsequence:
	        theta = (2 * np.pi / (L - 1)) * (K - 1)
	        coordinates.append((theta, np.sqrt(theta)))
	    return coordinates

	def _calculate_weighting_full(sequence, WEIGHTS, L=LOWEST_LENGTH, E=0.0375):
	    """Compute one weight per position of ``sequence``.

	    The first and last positions get 0, and so does the second-to-last. Every
	    other index ``i`` looks at ``sequence[i-1:i+3]`` and builds a 4-character
	    pattern: '1' where the neighbour equals ``sequence[i]`` (slot 1 is always
	    '1'), '0' otherwise. The weight is ``WEIGHTS.get(pattern, 0)``, multiplied
	    by ``E`` when ``i > L``.
	    """
	    length = len(sequence)
	    weightings = [0]
	    for i in range(1, length - 1):
	        if i >= length - 2:
	            # A full 4-character window does not fit here.
	            weightings.append(0)
	            continue
	        before, current, after, after_next = sequence[i - 1:i + 3]
	        pattern = "".join(
	            "1" if matches else "0"
	            for matches in (before == current, True, after == current, after_next == current)
	        )
	        weight = WEIGHTS.get(pattern, 0)
	        if i > L:
	            weight = weight * E
	        weightings.append(weight)
	    weightings.append(0)
	    return weightings

	def _centre_of_mass(polar_coordinates, weightings):
	    """Return the weighted sum of squared offsets for one set of polar points.

	    The points are converted with ``_calculate_standard_coordinates``; the result is
	    ``sum(w[i] * ((x[i] - x[i]*w[i])**2 + (y[i] - y[i]*w[i])**2))`` over the points.

	    NOTE(review): ``weightings`` is indexed by the point's position in
	    ``polar_coordinates`` (0, 1, 2, ...), not by its position in the original
	    sequence -- confirm this is intended.
	    """
	    x, y = _calculate_standard_coordinates(polar_coordinates)
	    # The multiplication and power operators had been lost from this expression,
	    # leaving it syntactically invalid; they are restored here.
	    return sum(
	        weightings[i] * ((x[i] - x[i] * weightings[i]) ** 2 + (y[i] - y[i] * weightings[i]) ** 2)
	        for i in range(len(x))
	    )

	def _normalised_moment_of_inertia(polar_coordinates, weightings):
	    """Return ``sqrt(_centre_of_mass(...) / sum(weightings))``."""
	    total_weight = sum(weightings)
	    return np.sqrt(_centre_of_mass(polar_coordinates, weightings) / total_weight)

	def _calculate_standard_coordinates(polar_coordinates):
	    """Convert ``(theta, rho)`` pairs to Cartesian; returns ``(xs, ys)`` as two lists."""
	    xs = [rho * np.cos(theta) for theta, rho in polar_coordinates]
	    ys = [rho * np.sin(theta) for theta, rho in polar_coordinates]
	    return xs, ys


	def _moments_of_inertia(polar_coordinates, weightings):
	    """Return the normalised moment of inertia of every entry of ``polar_coordinates``.

	    ``polar_coordinates`` is a dict; results follow its iteration order.
	    """
	    moments = []
	    for points in polar_coordinates.values():
	        moments.append(_normalised_moment_of_inertia(points, weightings))
	    return moments

	def moment_of_inertia(sequence, WEIGHTS, L=5000, E=0.0375):
	    """Return the list of per-nucleotide moments of inertia for ``sequence``.

	    Positions of each nucleotide are converted to polar coordinates scaled by
	    ``len(sequence)``; ``L`` and ``E`` are only used for the position weights.
	    """
	    sequence_length = len(sequence)
	    polar_coordinates = {}
	    for nucleotide, positions in _get_subsequences(sequence).items():
	        polar_coordinates[nucleotide] = _calculate_coordinates_fixed(positions, sequence_length)
	    weightings = _calculate_weighting_full(sequence, WEIGHTS, L=L, E=E)
	    return _moments_of_inertia(polar_coordinates, weightings)


	def similarity_wen(sequence1, sequence2, WEIGHTS, L=5000, E=0.0375):
	    """Return the Euclidean distance between the two sequences' moment-of-inertia lists.

	    The ``L`` argument is not used as passed: it is replaced by the length of
	    the shorter sequence before the moments are computed.
	    """
	    L = min(len(sequence1), len(sequence2))
	    first = moment_of_inertia(sequence1, WEIGHTS, L=L, E=E)
	    second = moment_of_inertia(sequence2, WEIGHTS, L=L, E=E)
	    squared_distance = sum((a - b)**2 for a, b in zip(first, second))
	    return np.sqrt(squared_distance)
	def heatmap(data, row_labels, col_labels, ax=None,
	            cbar_kw=None, cbarlabel="", **kwargs):
	    """
	    Draw ``data`` as a labelled image with a colorbar.

	    Parameters
	    ----------
	    data
	        2D array; ``data.shape`` supplies the number of rows and columns.
	    row_labels, col_labels
	        Tick labels for the y axis and the x axis respectively.
	    ax
	        Axes to draw on; ``plt.gca()`` is used when omitted.
	    cbar_kw
	        Extra keyword arguments for the figure's ``colorbar`` call.
	    cbarlabel
	        Text placed along the colorbar.
	    **kwargs
	        Forwarded to ``imshow``.

	    Returns
	    -------
	    (im, cbar)
	        The image returned by ``imshow`` and the colorbar.
	    """
	    target = plt.gca() if ax is None else ax
	    colorbar_options = {} if cbar_kw is None else cbar_kw

	    im = target.imshow(data, **kwargs)

	    cbar = target.figure.colorbar(im, ax=target, **colorbar_options)
	    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

	    # One major tick per column/row, labelled from the supplied lists.
	    target.set_xticks(np.arange(data.shape[1]), labels=col_labels)
	    target.set_yticks(np.arange(data.shape[0]), labels=row_labels)

	    # Column labels go on top of the image instead of below it.
	    target.tick_params(top=True, bottom=False,
	                       labeltop=True, labelbottom=False)
	    plt.setp(target.get_xticklabels(), rotation=-30, ha="right",
	             rotation_mode="anchor")

	    # Hide the frame; minor ticks at the cell boundaries carry a white grid.
	    target.spines[:].set_visible(False)
	    target.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
	    target.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
	    target.grid(which="minor", color="w", linestyle='-', linewidth=3)
	    target.tick_params(which="minor", bottom=False, left=False)

	    return im, cbar


	def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
	                     textcolors=("black", "white"),
	                     threshold=None, **textkw):
	    """
	    Write the value of every heatmap cell on top of the image.

	    Parameters
	    ----------
	    im
	        The image to annotate; text is added to ``im.axes``.
	    data
	        Values to print. Anything that is not a list or ndarray is replaced
	        by ``im.get_array()``.
	    valfmt
	        Either a ``str.format`` template using ``x`` (wrapped in a
	        ``StrMethodFormatter``) or a callable invoked as ``valfmt(value, None)``.
	    textcolors
	        Pair of colours: index 0 for cells at or below the threshold, index 1
	        for cells above it.
	    threshold
	        Split point in data units; it is passed through ``im.norm``. When
	        omitted, half of the normalised data maximum is used.
	    **textkw
	        Forwarded to every ``text`` call; overrides the centred alignment default.

	    Returns
	    -------
	    list
	        The created text objects, in row-major order.
	    """
	    if not isinstance(data, (list, np.ndarray)):
	        data = im.get_array()

	    if threshold is None:
	        threshold = im.norm(data.max())/2.
	    else:
	        threshold = im.norm(threshold)

	    text_options = {"horizontalalignment": "center", "verticalalignment": "center"}
	    text_options.update(textkw)

	    if isinstance(valfmt, str):
	        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

	    texts = []
	    for row in range(data.shape[0]):
	        for col in range(data.shape[1]):
	            value = data[row, col]
	            # Pick the colour by which side of the threshold the cell falls on.
	            text_options["color"] = textcolors[int(im.norm(value) > threshold)]
	            texts.append(im.axes.text(col, row, valfmt(value, None), **text_options))

	    return texts

	def wens_method_heatmap(df, virus_species):
	    """Plot a matrix of ``similarity_wen`` values between the given species.

	    Parameters
	    ----------
	    df
	        Table with ``'Organism_Name'`` and ``'Sequence'`` columns.
	    virus_species
	        Species names; they label both axes.

	    Off-diagonal cells compare the first sequence of each species. A diagonal
	    cell compares the species' first and second sequences; when the species
	    has only one sequence it is compared with itself instead of raising
	    IndexError.

	    Returns
	    -------
	    The figure created by ``plt.subplots()``.
	    """
	    # Look each species' sequences up once instead of re-filtering df per cell.
	    sequences_by_species = {
	        species: df.loc[df['Organism_Name'] == species, 'Sequence'].values
	        for species in virus_species
	    }

	    similarity_df = pd.DataFrame(index=virus_species, columns=virus_species)
	    for virus1 in virus_species:
	        sequence1 = sequences_by_species[virus1][0]
	        for virus2 in virus_species:
	            candidates = sequences_by_species[virus2]
	            if virus1 == virus2 and len(candidates) > 1:
	                sequence2 = candidates[1]
	            else:
	                sequence2 = candidates[0]
	            similarity_df.loc[virus1, virus2] = similarity_wen(sequence1, sequence2, WEIGHTS)
	    similarity_df = similarity_df.apply(pd.to_numeric)

	    fig, ax = plt.subplots()
	    im = ax.imshow(similarity_df, cmap="YlGn")
	    ax.set_xticks(np.arange(len(virus_species)), labels=virus_species)
	    ax.set_yticks(np.arange(len(virus_species)), labels=virus_species)
	    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
	    cbar = ax.figure.colorbar(im, ax=ax)
	    cbar.ax.set_ylabel("Similarity", rotation=-90, va="bottom")

	    return fig
	############################################################# Sub-Species ########################################################
	import numpy as np
	from scipy.interpolate import interp1d, CubicSpline
	import pandas as pd
	from tqdm import tqdm

	# Constants for the sub-species grouping code.
	MIN_DISTANCE = 2581
	# Step vector [dx, dy] for each nucleotide; other characters have no entry.
	VECTORS = dict(
	    A=[0.5, -0.8660254],
	    T=[0.5, 0.8660254],
	    G=[0.8660254, -0.5],
	    C=[0.8660254, 0.5],
	)

	def create_dna_representation_ew_subs(seq):
	    """Return the 2D walk of ``seq`` resampled to 2048 points.

	    Characters missing from ``VECTORS`` are dropped. The cumulative walk's x and
	    y coordinates are each fitted with a cubic spline over the step index and
	    evaluated at 2048 evenly spaced indices. The result is a list of
	    ``[x, y]`` pairs.
	    """
	    steps = np.array([VECTORS[char] for char in seq if char in VECTORS], dtype=float)
	    walk = steps.cumsum(axis=0)

	    step_index = np.arange(len(walk))
	    spline_x = CubicSpline(step_index, walk[:, 0])
	    spline_y = CubicSpline(step_index, walk[:, 1])

	    sample_at = np.linspace(0, len(walk) - 1, 2048)
	    resampled = np.column_stack([spline_x(sample_at), spline_y(sample_at)])
	    return resampled.tolist()

	def create_dna_representation_for_subs(row):
	    """Resample one sequence's 2D walk onto integer x positions.

	    Reads ``row['min_distance']`` and ``row['seq']``. The sequence is reduced to
	    the characters present in ``VECTORS`` and truncated to ``min_distance``
	    characters. The walk's y coordinate is then interpolated as a cubic
	    function of its x coordinate (extrapolating where needed) at
	    x = 0, 1, ..., ``int(min_distance * 0.66) - 1``.
	    """
	    limit = int(row['min_distance'])
	    kept = [char for char in row['seq'] if char in VECTORS][:limit]
	    sample_count = int(limit * 0.66)

	    walk = np.array([VECTORS[char] for char in kept], dtype=float).cumsum(axis=0)

	    y_of_x = interp1d(walk[:, 0], walk[:, 1], kind='cubic', fill_value='extrapolate')
	    return y_of_x(np.linspace(0, sample_count - 1, sample_count))

	def create_groups_subs(closest_matches):
	    """Group indices that are linked through ``closest_matches``.

	    Parameters
	    ----------
	    closest_matches
	        Sequence where ``closest_matches[i]`` is an iterable of the indices
	        that index ``i`` links to.

	    Returns
	    -------
	    dict
	        ``{"group_1": [...], "group_2": [...], ...}``; each value is the sorted
	        list of indices reached from the group's first unvisited index.
	        Indices that reach nothing but themselves are left out.
	    """
	    groups = {}
	    visited = set()

	    for start in range(len(closest_matches)):
	        if start in visited:
	            continue
	        group = set()
	        # Explicit stack instead of recursion, so a long chain of links cannot
	        # exceed Python's recursion limit.
	        pending = [start]
	        while pending:
	            node = pending.pop()
	            if node in visited:
	                continue
	            visited.add(node)
	            group.add(node)
	            pending.extend(closest_matches[node])
	        if len(group) > 1:  # Ignore elements with no closest match
	            groups[f"group_{len(groups) + 1}"] = sorted(group)

	    return groups

	def process_data_sub_specie(df, species, varience):
	    """Group the sequences of one species by similarity of their 2D walks.

	    Parameters
	    ----------
	    df
	        Table with ``'organism_name'``, ``'seq_len'`` and ``'seq'`` columns.
	    species
	        Value of ``'organism_name'`` to keep.
	    varience
	        Multiplier applied to the length cut-off to obtain the distance
	        threshold below which two sequences are linked.

	    Returns
	    -------
	    The filtered rows with three added columns: ``'min_distance'``, ``'group'``
	    (a group name or ``'No Group'``) and ``'two_d'`` (the 2048-point walk).
	    """
	    subset = df[df['organism_name'] == species].reset_index(drop=True).copy()

	    # Keep only sequences longer than 80% of the median length.
	    length_cutoff = subset['seq_len'].median() * 0.8
	    subset['min_distance'] = length_cutoff
	    subset = subset[subset['seq_len'] > length_cutoff].reset_index(drop=True)

	    subset['two_d'] = subset.apply(create_dna_representation_for_subs, axis=1)
	    profiles = np.array(subset['two_d'].tolist())

	    # Row i holds the summed absolute difference between profile i and every profile.
	    distance_rows = []
	    for idx in tqdm(range(profiles.shape[0])):
	        distance_rows.append(np.abs(profiles[idx:idx + 1, :] - profiles).sum(axis=1))

	    distances = np.array(distance_rows)
	    print(distances)
	    # Overwrite each sequence's distance to itself with a large value.
	    np.fill_diagonal(distances, 10000)
	    threshold = length_cutoff * varience
	    neighbours = [np.where(row < threshold)[0] for row in distances]

	    groups = create_groups_subs(neighbours)

	    subset['group'] = 'No Group'
	    for group_name, member_indices in groups.items():
	        subset.loc[member_indices, 'group'] = group_name

	    # Replace the 1D profile with the 2D walk.
	    subset['two_d'] = subset['seq'].apply(create_dna_representation_ew_subs)

	    return subset


	############################################################# ColorSquare ########################################################
	import math
	import numpy as np
	import matplotlib.pyplot as plt
	from matplotlib.colors import ListedColormap
	import pandas as pd

	def _fill_spiral(matrix, seq_colors, k):
	    """Write ``seq_colors`` into the ``k`` x ``k`` ``matrix`` along a clockwise spiral.

	    The spiral starts at the top-left cell and moves right first. Cells beyond
	    the end of ``seq_colors`` are left untouched, and values beyond the last
	    cell are ignored. ``matrix`` is modified in place; nothing is returned.
	    """
	    def spiral_cells(size):
	        first_col, first_row, last_col, last_row = 0, 0, size - 1, size - 1
	        while first_col <= last_col and first_row <= last_row:
	            for col in range(first_col, last_col + 1):  # top row, left to right
	                yield first_row, col
	            first_row += 1
	            for row in range(first_row, last_row + 1):  # right column, downwards
	                yield row, last_col
	            last_col -= 1
	            for col in range(last_col, first_col - 1, -1):  # bottom row, right to left
	                yield last_row, col
	            last_row -= 1
	            for row in range(last_row, first_row - 1, -1):  # left column, upwards
	                yield row, first_col
	            first_col += 1

	    for (row, col), value in zip(spiral_cells(k), seq_colors):
	        matrix[row][col] = value


	def _generate_color_square(sequence,virus, save=False, count=0, label=None):
	    """Draw one sequence as a spiral-filled colour square on a new 5x5-inch figure.

	    Parameters
	    ----------
	    sequence
	        DNA string; it is lower-cased, and characters other than a/t/c/g/n are
	        dropped (the same filtering ``plot_color_square`` applies) so that
	        other symbols no longer raise KeyError.
	    virus, count
	        Used only in the output file name ``color_square_{virus}_{count}.png``.
	    save
	        When true, the figure is written with ``plt.savefig``.
	    label
	        Optional figure title.

	    Returns nothing; the figure is left as the current pyplot figure.
	    """
	    colors = {'a': 0, 't': 1, 'c': 2, 'g': 3, 'n': 4}
	    seq_colors = [colors[char] for char in sequence.lower() if char in colors]

	    # Smallest square that can hold every kept character.
	    k = math.ceil(math.sqrt(len(seq_colors)))

	    # Unused cells keep the 'n' index, which the colour map draws in white.
	    matrix = np.full((k, k), colors['n'], dtype=int)
	    _fill_spiral(matrix, seq_colors, k)

	    cmap = ListedColormap(['red', 'green', 'yellow', 'blue', 'white'])

	    plt.figure(figsize=(5, 5))
	    plt.imshow(matrix, cmap=cmap, interpolation='nearest')
	    if label:
	        plt.title(label)
	    plt.axis('off')
	    if save:
	        plt.savefig(f'color_square_{virus}_{count}.png', dpi=300, bbox_inches='tight')

	def plot_color_square(df, virus_species):
	    """Draw a grid of colour squares, three per entry of ``virus_species``.

	    Parameters
	    ----------
	    df
	        Indexable of sequences; ``df[i]`` is drawn in grid cell ``i`` (row-major).
	    virus_species
	        One name per grid row, used as the title of each cell in that row.

	    Returns
	    -------
	    The figure created by ``plt.subplots``.
	    """
	    ncols = 3
	    nrows = len(virus_species)
	    fig, axeses = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
	    colors = {'a': 0, 't': 1, 'c': 2, 'g': 3, 'n': 4}

	    for cell in range(ncols * nrows):
	        row, col = divmod(cell, ncols)
	        axes = axeses[row, col]
	        data = df[cell]
	        virus = virus_species[row]

	        # Keep only a/t/c/g/n (case-insensitive) and map them to colour indices.
	        seq_colors = [colors[char] for char in data.lower() if char in colors]

	        # Smallest square that holds every kept character; spare cells stay 'n' (white).
	        k = math.ceil(math.sqrt(len(seq_colors)))
	        matrix = np.full((k, k), colors['n'], dtype=int)
	        _fill_spiral(matrix, seq_colors, k)

	        cmap = ListedColormap(['red', 'green', 'yellow', 'blue', 'white'])
	        axes.imshow(matrix, cmap=cmap, interpolation='nearest')
	        axes.set_title(virus)
	    return fig



	def generate_color_square(sequence,virus, multi=False, save=False, label=None):
	    """Draw colour squares for one sequence or for a collection of sequences.

	    Parameters
	    ----------
	    sequence
	        A single sequence, or an iterable of sequences when ``multi`` is true.
	    virus
	        Name forwarded to ``_generate_color_square`` (used in saved file names).
	    multi
	        Treat ``sequence`` as a collection; item ``i`` is drawn with count ``i``
	        and ``label[i]`` when ``label`` is given.
	    save
	        Forwarded to ``_generate_color_square``.
	    label
	        A title (single mode) or an indexable of titles (multi mode).
	    """
	    if multi:
	        for i, seq in enumerate(sequence):
	            _generate_color_square(seq, virus, save, i, label[i] if label else None)
	    else:
	        # Pass virus explicitly: it was previously omitted, which shifted
	        # ``save`` into the ``virus`` slot and left saving switched off.
	        _generate_color_square(sequence, virus, save, label=label)


	############################################################# FCGR ########################################################

	from typing import Dict, Optional
	from collections import namedtuple

	# These three names are also defined near the top of this file with the same
	# values; this section rebinds them.

	# A point (x, y) in the plane.
	Coord = namedtuple("Coord", "x y")

	# State of a CGR encoding: a step counter N and the current (x, y) position.
	CGRCoords = namedtuple("CGRCoords", "N x y")

	# Corner of the square assigned to each nucleotide.
	DEFAULT_COORDS = {
	    "A": Coord(1, 1),
	    "C": Coord(-1, 1),
	    "G": Coord(-1, -1),
	    "T": Coord(1, -1),
	}

	class CGR:
	    """Chaos Game Representation (CGR) encoder/decoder for DNA strings.

	    The instance holds one piece of mutable state, ``cgr_coords`` (a
	    ``CGRCoords``), which ``forward`` and ``backward`` move one nucleotide at
	    a time.
	    """

	    # NOTE(review): ``chr`` in the annotation is the builtin function, not a
	    # type; ``str`` was presumably intended -- confirm before changing.
	    def __init__(self, coords: Optional[Dict[chr,tuple]]=None):
	        """Use ``coords`` as the nucleotide -> corner table, or ``DEFAULT_COORDS``."""
	        if coords is None:
	            coords = DEFAULT_COORDS
	        self.nucleotide_coords = coords
	        self.cgr_coords = CGRCoords(0,0,0)

	    def nucleotide_by_coords(self,x,y):
	        """Return the first nucleotide whose corner equals ``(x, y)``.

	        Raises IndexError when no nucleotide has that corner.
	        """
	        wanted = Coord(x, y)
	        matches = [
	            nucleotide
	            for nucleotide, corner in self.nucleotide_coords.items()
	            if corner == wanted
	        ]
	        return matches[0]

	    def forward(self, nucleotide: str):
	        """Move halfway from the current point towards the nucleotide's corner."""
	        corner = self.nucleotide_coords.get(nucleotide)
	        here = self.cgr_coords
	        self.cgr_coords = CGRCoords(
	            here.N + 1,
	            (here.x + corner.x) / 2,
	            (here.y + corner.y) / 2,
	        )

	    def backward(self,):
	        """Undo the latest step and return the nucleotide it belonged to.

	        The nucleotide is inferred from the signs of the current coordinates.
	        """
	        corner_x, corner_y = self.coords_current_nucleotide()
	        nucleotide = self.nucleotide_by_coords(corner_x, corner_y)

	        here = self.cgr_coords
	        self.cgr_coords = CGRCoords(
	            here.N - 1,
	            2 * here.x - corner_x,
	            2 * here.y - corner_y,
	        )
	        return nucleotide

	    def coords_current_nucleotide(self,):
	        """Return ``(+/-1, +/-1)``: +1 where the current coordinate is > 0, else -1."""
	        here = self.cgr_coords
	        return (1 if here.x > 0 else -1), (1 if here.y > 0 else -1)

	    def encode(self, sequence: str):
	        """Encode ``sequence`` from a fresh start and return the final ``CGRCoords``."""
	        self.reset_coords()
	        for nucleotide in sequence:
	            self.forward(nucleotide)
	        return self.cgr_coords

	    def reset_coords(self,):
	        """Put the state back to ``CGRCoords(0, 0, 0)``."""
	        self.cgr_coords = CGRCoords(0,0,0)

	    def decode(self, N:int, x:int, y:int)->str:
	        """Rebuild the sequence that ends in state ``(N, x, y)``.

	        Steps backwards until the step counter reaches 0, then returns the
	        collected nucleotides in forward order.
	        """
	        self.cgr_coords = CGRCoords(N,x,y)

	        recovered = []
	        while self.cgr_coords.N>0:
	            recovered.append(self.backward())
	        recovered.reverse()
	        return "".join(recovered)


	from itertools import product
	from collections import defaultdict
	import numpy as np

	class FCGR(CGR):
	    """Frequency matrix CGR.

	    A (2**k x 2**k) 2D representation is created for an n-long sequence.
	    - k is the k-mer length.
	    - 2**k x 2**k = 4**k is the total number of k-mers (sequences of length k).
	    - each pixel holds the number of occurrences of one k-mer.
	    """

	    def __init__(self, k: int,):
	        """Enumerate all k-mers over "ACGT" and precompute each one's pixel."""
	        super().__init__()
	        self.k = k # k-mer representation
	        self.kmers = list("".join(kmer) for kmer in product("ACGT", repeat=self.k))
	        self.kmer2pixel = self.kmer2pixel_position()

	    def __call__(self, sequence: str):
	        """Return the (2**k x 2**k) array of k-mer counts for ``sequence``.

	        Raises KeyError if ``sequence`` contains a counted k-mer that is not
	        made of "ACGT" only (k-mers containing "N" are never counted).
	        """
	        self.count_kmers(sequence)

	        array_size = int(2**self.k)
	        freq_matrix = np.zeros((array_size,array_size))

	        # Pixel positions are 1-based; shift to 0-based array indices.
	        for kmer, freq in self.freq_kmer.items():
	            pos_x, pos_y = self.kmer2pixel[kmer]
	            freq_matrix[int(pos_x)-1,int(pos_y)-1] = freq
	        return freq_matrix

	    def count_kmer(self, kmer):
	        """Add one occurrence of ``kmer`` unless it contains "N"."""
	        if "N" not in kmer:
	            self.freq_kmer[kmer] += 1

	    def count_kmers(self, sequence: str):
	        """Reset ``self.freq_kmer`` and count every length-k window of ``sequence``."""
	        self.freq_kmer = defaultdict(int)
	        # Number of windows; zero or negative yields no iterations.
	        last_j = len(sequence) - self.k + 1
	        for start in range(last_j):
	            self.count_kmer(sequence[start:start + self.k])

	    def kmer_probabilities(self, sequence: str):
	        """Store each counted k-mer's share of the windows in ``self.probabilities``.

	        Uses the counts currently in ``self.freq_kmer``, so ``count_kmers``
	        (or ``__call__``) must have run first.
	        """
	        self.probabilities = defaultdict(float)
	        N=len(sequence)
	        for key, value in self.freq_kmer.items():
	            self.probabilities[key] = float(value) / (N - self.k + 1)

	    def pixel_position(self, kmer: str):
	        """Get the 1-based pixel position in the FCGR matrix for a k-mer."""
	        coords = self.encode(kmer)
	        x, y = coords.x, coords.y

	        # Coordinates from [-1,1]^2 to [1,2**k]^2
	        np_coords = np.array([(x + 1)/2, (y + 1)/2]) # move coordinates from [-1,1]^2 to [0,1]^2
	        # The "*=" and "**" had been lost from the next line, leaving it
	        # syntactically invalid; they are restored here.
	        np_coords *= 2**self.k # rescale coordinates from [0,1]^2 to [0,2**k]^2
	        x,y = np.ceil(np_coords) # round to upper integer

	        # Turn coordinates (cx,cy) into pixel (px,py) position
	        # px = 2**k-cy+1, py = cx
	        return 2**self.k-int(y)+1, int(x)

	    def kmer2pixel_position(self,):
	        """Return ``{kmer: pixel_position(kmer)}`` for every k-mer in ``self.kmers``."""
	        return {kmer: self.pixel_position(kmer) for kmer in self.kmers}


	from tqdm import tqdm
	from pathlib import Path

	import numpy as np


	class GenerateFCGR:
	    """Convenience wrapper that cleans a sequence and converts it with ``FCGR``."""

	    def __init__(self, kmer: int = 5, ):
	        self.kmer = kmer
	        self.fcgr = FCGR(kmer)
	        self.counter = 0 # count number of time a sequence is converted to fcgr

	    def __call__(self, list_fasta,):
	        # NOTE(review): ``from_fasta`` is not defined in this class as shown here;
	        # unless it is supplied elsewhere this call raises AttributeError -- confirm.
	        for fasta in tqdm(list_fasta, desc="Generating FCGR"):
	            self.from_fasta(fasta)

	    def from_seq(self, seq: str):
	        "Get FCGR from a sequence"
	        seq = self.preprocessing(seq)
	        chaos = self.fcgr(seq)
	        self.counter +=1
	        return chaos

	    def reset_counter(self,):
	        """Set the conversion counter back to 0."""
	        self.counter=0

	    @staticmethod
	    def preprocessing(seq):
	        """Upper-case ``seq`` and replace every character other than A/T/C/G with "N".

	        Done in a single pass; the previous version called ``str.replace`` once
	        per offending character, which is quadratic on sequences with many of them.
	        """
	        return "".join(letter if letter in "ATCG" else "N" for letter in seq.upper())

	def plot_fcgr(df, virus_species):
	    """Draw a grid of FCGR images, three per entry of ``virus_species``.

	    Parameters
	    ----------
	    df
	        Indexable of sequences; ``df[i]`` is drawn in grid cell ``i`` (row-major).
	    virus_species
	        One name per grid row, used as the title of each cell in that row.

	    Returns
	    -------
	    The figure created by ``plt.subplots``.
	    """
	    ncols = 3
	    nrows = len(virus_species)
	    fig, axeses = plt.subplots(
	        nrows=nrows,
	        ncols=ncols,
	        squeeze=False,
	    )
	    # Build the converter once: constructing it recomputes the whole k-mer ->
	    # pixel table, and each from_seq call starts from fresh counts anyway.
	    generator = GenerateFCGR()
	    for i in range(ncols * nrows):
	        row, col = divmod(i, ncols)
	        axes = axeses[row, col]
	        data = df[i].upper()
	        chaos = generator.from_seq(seq=data)
	        virus = virus_species[row]
	        axes.imshow(chaos)
	        axes.set_title(virus)
	    return fig

	############################################################# Persistent Homology ########################################################
	import numpy as np
	import persim
	import ripser
	import matplotlib.pyplot as plt

	# One-hot 4-vector for each lower-case nucleotide, in the order a, c, g, t.
	NUCLEOTIDE_MAPPING = {
	    base: np.array([int(slot == position) for slot in range(4)])
	    for position, base in enumerate('acgt')
	}

	def encode_nucleotide_to_vector(nucleotide):
	    """Return the ``NUCLEOTIDE_MAPPING`` vector for ``nucleotide``, or ``None`` if absent."""
	    if nucleotide in NUCLEOTIDE_MAPPING:
	        return NUCLEOTIDE_MAPPING[nucleotide]
	    return None

	def chaos_4d_representation(dna_sequence):
	    """Return the 4D chaos-game point cloud of ``dna_sequence``.

	    The first recognised nucleotide contributes its own vector; every later
	    recognised nucleotide contributes the midpoint between the previous point
	    and its vector. Characters for which ``encode_nucleotide_to_vector``
	    returns ``None`` are skipped, including at the start of the sequence
	    (previously an unrecognised first character seeded the walk with ``None``
	    and the next step failed).

	    Returns
	    -------
	    numpy.ndarray
	        One row per recognised nucleotide; an empty ``(0, 4)`` array when
	        there are none (previously an empty input raised IndexError).
	    """
	    points = []
	    for nucleotide in dna_sequence:
	        vector = encode_nucleotide_to_vector(nucleotide)
	        if vector is None:
	            continue
	        if points:
	            points.append(0.5 * (points[-1] + vector))
	        else:
	            points.append(vector)
	    if not points:
	        return np.empty((0, 4))
	    return np.array(points)

	def persistence_homology(dna_sequence, multi=False, plot=False, sample_rate=7):
	    """Compute persistence diagrams (up to dimension 1) of chaos-game point clouds.

	    Parameters
	    ----------
	    dna_sequence
	        One sequence, or an iterable of sequences when ``multi`` is true.
	    multi
	        Process ``dna_sequence`` as a collection.
	    plot
	        Also draw the dimension-1 diagram(s) with ``persim.plot_diagrams``.
	    sample_rate
	        Only every ``sample_rate``-th point of a cloud is passed to ripser.

	    Returns
	    -------
	    The ``'dgms'`` entry of the ripser result: a single one, or a list with
	    one per sequence when ``multi`` is true.
	    """
	    if multi:
	        # Keep the clouds in a plain list. Sequences of different lengths give
	        # clouds with different row counts, and wrapping such a ragged list in
	        # np.array raises ValueError on NumPy >= 1.24.
	        c4dr_points = [chaos_4d_representation(sequence) for sequence in dna_sequence]
	        dgm_dna = [ripser.ripser(points[::sample_rate], maxdim=1)['dgms'] for points in c4dr_points]
	        if plot:
	            persim.plot_diagrams([dgm[1] for dgm in dgm_dna], labels=[f'sequence {i}' for i in range(len(dna_sequence))])
	    else:
	        c4dr_points = chaos_4d_representation(dna_sequence)
	        dgm_dna = ripser.ripser(c4dr_points[::sample_rate], maxdim=1)['dgms']
	        if plot:
	            persim.plot_diagrams(dgm_dna[1])
	    return dgm_dna

	def plot_diagrams(
	    diagrams,
	    plot_only=None,
	    title=None,
	    xy_range=None,
	    labels=None,
	    colormap="default",
	    size=20,
	    ax_color=np.array([0.0, 0.0, 0.0]),
	    diagonal=True,
	    lifetime=False,
	    legend=True,
	    show=False,
	    ax=None
	):
	    """Plot one or more persistence diagrams on a single axes.

	    Parameters
	    ----------
	    diagrams: ndarray (n_pairs, 2) or list of diagrams
	        A diagram or list of diagrams. A list is drawn on the same axes,
	        one scatter series per diagram.
	    plot_only: list of numeric
	        If given, indices of the diagrams (and labels) to keep.
	    title: string, default is None
	        Axes title, when given.
	    xy_range: list of numeric [xmin, xmax, ymin, ymax]
	        Fixed axis limits, useful for comparing several plots. When omitted
	        the limits are derived from the finite values of the diagrams.
	    labels: string or list of strings
	        Legend label(s). Defaults to H_0, H_1, H_2, ... in diagram order.
	        A single string is reused for every diagram.
	    colormap: string, default is 'default'
	        Matplotlib style name passed to ``plt.style.use``.
	    size: numeric, default is 20
	        Marker size passed to ``scatter``.
	    ax_color: any valid matplotlib color type
	        Colour of the diagonal / horizon line.
	    diagonal: bool, default is True
	        Draw the x = y line.
	    lifetime: bool, default is False
	        Plot (birth, death - birth) instead of (birth, death). Turns the
	        diagonal off and draws a horizontal line at 0 instead.
	    legend: bool, default is True
	        Show the legend (only when exactly ``True``).
	    show: bool, default is False
	        Call ``plt.show()`` after plotting (only when exactly ``True``).
	    ax: matplotlib axes, default is None
	        Axes to draw on. A new figure and axes are created when omitted.

	    Returns
	    -------
	    (fig, ax)
	        The figure and the axes that were drawn on.
	    """
	    # A caller-supplied axes is a single object, so it cannot be unpacked into
	    # (fig, ax) the way the plt.subplots() result can; take its figure instead.
	    if ax is None:
	        fig, ax = plt.subplots()
	    else:
	        fig = ax.figure
	    plt.style.use(colormap)

	    xlabel, ylabel = "Birth", "Death"

	    if not isinstance(diagrams, list):
	        # Downstream code expects a list of diagrams.
	        diagrams = [diagrams]

	    if labels is None:
	        labels = ["$H_{{{}}}$".format(i) for i, _ in enumerate(diagrams)]

	    if plot_only:
	        diagrams = [diagrams[i] for i in plot_only]
	        labels = [labels[i] for i in plot_only]

	    if not isinstance(labels, list):
	        labels = [labels] * len(diagrams)

	    # Work on float32 copies so the caller's arrays are never modified.
	    diagrams = [dgm.astype(np.float32, copy=True) for dgm in diagrams]

	    # Collect every coordinate to find the plotted range.
	    concat_dgms = np.concatenate(diagrams).flatten()
	    has_inf = np.any(np.isinf(concat_dgms))
	    finite_dgms = concat_dgms[np.isfinite(concat_dgms)]

	    if not xy_range:
	        if finite_dgms.size:
	            ax_min, ax_max = np.min(finite_dgms), np.max(finite_dgms)
	        else:
	            # No finite coordinate at all (e.g. every diagram is empty):
	            # fall back to a unit box instead of failing in np.min.
	            ax_min, ax_max = 0.0, 1.0
	        x_r = ax_max - ax_min

	        # Leave a margin around the data. The range is 0 when all finite
	        # values coincide (e.g. a single point); use a fixed margin then so
	        # the axis limits do not collapse.
	        buffer = 1 if x_r == 0 else x_r / 5

	        x_down = ax_min - buffer / 2
	        x_up = ax_max + buffer

	        y_down, y_up = x_down, x_up
	    else:
	        x_down, x_up, y_down, y_up = xy_range

	    yr = y_up - y_down

	    if lifetime:
	        # The diagonal is meaningless in lifetime coordinates.
	        diagonal = False

	        # Shift the y axis so it starts just below zero.
	        y_down = -yr * 0.05
	        y_up = y_down + yr

	        ylabel = "Lifetime"

	        # Convert each diagram to (birth, death - birth).
	        for dgm in diagrams:
	            dgm[:, 1] -= dgm[:, 0]

	        # Horizon line at lifetime 0.
	        ax.plot([x_down, x_up], [0, 0], c=ax_color)

	    if diagonal:
	        ax.plot([x_down, x_up], [x_down, x_up], "--", c=ax_color)

	    if has_inf:
	        # Draw infinite values on a dashed line slightly below the top edge.
	        b_inf = y_down + yr * 0.95
	        ax.plot([x_down, x_up], [b_inf, b_inf], "--", c="k", label=r"$\infty$")

	        for dgm in diagrams:
	            dgm[np.isinf(dgm)] = b_inf

	    for dgm, label in zip(diagrams, labels):
	        ax.scatter(dgm[:, 0], dgm[:, 1], size, label=label, edgecolor="none")

	    ax.set_xlabel(xlabel)
	    ax.set_ylabel(ylabel)

	    ax.set_xlim([x_down, x_up])
	    ax.set_ylim([y_down, y_up])
	    ax.set_aspect('equal', 'box')

	    if title is not None:
	        ax.set_title(title)

	    if legend is True:
	        ax.legend(loc="lower right")

	    if show is True:
	        plt.show()
	    return fig, ax


	def plot_persistence_homology(df, virus_species):
	    """Plot the dimension-1 persistence diagram of every sequence in ``df``.

	    Each sequence is lower-cased and turned into a 4D chaos-game point cloud;
	    every 15th point is passed to ripser. Diagrams are labelled
	    ``"{species}_{index}"`` from ``virus_species``.

	    Returns
	    -------
	    The figure produced by ``plot_diagrams``.
	    """
	    h1_diagrams = []
	    for sequence in df:
	        cloud = chaos_4d_representation(sequence.lower())
	        h1_diagrams.append(ripser.ripser(cloud[::15], maxdim=1)['dgms'][1])

	    labels = [f'{virus_specie}_{i}' for i, virus_specie in enumerate(virus_species)]
	    fig, _ = plot_diagrams(h1_diagrams, labels=labels)
	    return fig

	def compare_persistence_homology(dna_sequence1, dna_sequence2):
	    """Return the sliced Wasserstein distance between the two sequences' dimension-1 diagrams."""
	    h1_first = persistence_homology(dna_sequence1)[1]
	    h1_second = persistence_homology(dna_sequence2)[1]
	    return persim.sliced_wasserstein(h1_first, h1_second)