Seq-TransfoRNA / kba_pipeline /src /annotate_from_mapping.py

uploaded TransfoRNA repo

0b11a42 verified 4 months ago

No virus

36.7 kB

	######################################################################################################
	# annotate sequences based on mapping results
	######################################################################################################
	#%%
	import os
	import logging

	import numpy as np
	import pandas as pd
	from difflib import get_close_matches
	from Levenshtein import distance
	import json

	from joblib import Parallel, delayed
	import multiprocessing


	from utils import (fasta2df, fasta2df_subheader,log_time, reverse_complement)
	from precursor_bins import get_bin_with_max_overlap


	# module-level logger, handed to the `log_time` decorator of the functions in this file
	log = logging.getLogger(__name__)

	# switch off pandas' chained-assignment check for the whole module
	pd.options.mode.chained_assignment = None


	######################################################################################################
	# paths to reference and mapping files
	######################################################################################################

	# suffix selecting the HBDxBase release
	version = '_v4'

	# reference files
	_hbdxbase_dir = '../../references/HBDxBase'
	_mirbase_dir = _hbdxbase_dir + '/miRBase'
	HBDxBase_csv = _hbdxbase_dir + '/HBDxBase_all' + version + '.csv'
	miRBase_mature_path = _mirbase_dir + '/mature.fa'
	mat_miRNA_pos_path = _mirbase_dir + '/hsa_mature_position.txt'

	# mapping results (looked up in the current working directory)
	mapped_file = 'seqsmapped2HBDxBase_combined.txt'
	unmapped_file = 'tmp_seqs3mm2HBDxBase_pseudo__unmapped.fa'
	TE_file = 'tmp_seqsmapped2genome_intersect_TE.txt'
	mapped_genome_file = 'seqsmapped2genome_combined.txt'
	toomanyloci_genome_file = 'tmp_seqs0mm2genome__toomanyalign.fa'
	unmapped_adapter_file = 'tmp_seqs3mm2adapters__unmapped.fa'
	unmapped_genome_file = 'tmp_seqs0mm2genome__unmapped.fa'
	unmapped_bacterial_file = 'tmp_seqs0mm2bacterial__unmapped.fa'
	unmapped_viral_file = 'tmp_seqs0mm2viral__unmapped.fa'


	# output files
	sRNA_anno_file = 'sRNA_anno_from_mapping.csv'
	aggreg_sRNA_anno_file = 'sRNA_anno_aggregated_on_seq.csv'



	#%%
	######################################################################################################
	# specific functions
	######################################################################################################

	@log_time(log)
	def extract_general_info(mapping_file):
	    """Load the HBDxBase mapping results and derive general per-alignment columns.

	    Parameters
	    ----------
	    mapping_file : str
	        Path to a tab-separated file without header holding the columns
	        tmp_seq_id, reference, ref_start, sequence, other_alignments and mm_descriptors.

	    Returns
	    -------
	    pd.DataFrame
	        One row per alignment. In addition to the six input columns it holds
	        'precursor_length', 'precursor_bins' and 'pseudo_class' (looked up in `HBDxBase_csv`)
	        and the derived columns 'mms', 'small_RNA_class_annotation', 'subclass_type',
	        'precursor_name_full', 'precursor_name', 'seq_length', 'ref_end' and 'mitochondrial'.
	    """
	    # load mapping file
	    mapping_df = pd.read_csv(mapping_file, sep='\t', header=None)
	    mapping_df.columns = ['tmp_seq_id','reference','ref_start','sequence','other_alignments','mm_descriptors']
	    # a column without any mismatch descriptor is parsed as float64 (all NaN),
	    # which has no .str accessor -> force object dtype
	    mapping_df['mm_descriptors'] = mapping_df['mm_descriptors'].astype(object)

	    # add precursor length + number of bins that will be used for names
	    HBDxBase_df = pd.read_csv(HBDxBase_csv, index_col=0)
	    # name the index explicitly so that the merge key does not depend on the header of the csv
	    HBDxBase_df = HBDxBase_df[['precursor_length','precursor_bins','pseudo_class']].rename_axis('reference').reset_index()
	    mapping_df = mapping_df.merge(HBDxBase_df, on='reference', how='left')

	    # number of mismatches = number of '>' in the descriptor string
	    mapping_df['mms'] = mapping_df.mm_descriptors.fillna('').str.count('>')
	    mapping_df['mm_descriptors'] = mapping_df.mm_descriptors.str.replace(',', ';', regex=False)

	    # reference headers are '|'-separated: <class>|<precursor name>|<subclass type>
	    reference_fields = mapping_df.reference.str.split('|')
	    mapping_df['small_RNA_class_annotation'] = reference_fields.str[0]
	    mapping_df['subclass_type'] = reference_fields.str[2]
	    mapping_df['precursor_name_full'] = reference_fields.str[1]
	    # the short precursor name is everything in front of the first '__'
	    mapping_df['precursor_name'] = mapping_df.precursor_name_full.str.split('__').str[0]

	    mapping_df['seq_length'] = mapping_df.sequence.str.len()
	    mapping_df['ref_end'] = mapping_df.ref_start + mapping_df.seq_length - 1

	    # references containing '|MT-', '12S' or '16S' are flagged as mitochondrial
	    # (non-capturing pattern: match groups make str.contains emit a UserWarning)
	    mapping_df['mitochondrial'] = np.where(mapping_df.reference.str.contains(r'\|MT-|12S|16S'), 'mito', 'nuclear')

	    return mapping_df


	#%%
	@log_time(log)
	def tRNA_annotation(mapping_df):
	    """Extract tRNA specific annotation from mapping.

	    Parameters
	    ----------
	    mapping_df : pd.DataFrame
	        Alignments with (at least) the columns 'small_RNA_class_annotation', 'subclass_type',
	        'ref_start', 'ref_end', 'sequence', 'seq_length', 'precursor_length' and 'precursor_name'.

	    Returns
	    -------
	    pd.DataFrame
	        The tRNA, leader_tRF and trailer_tRF alignments with adjusted 'subclass_type' and
	        'precursor_name' and the new columns 'tRNA_ref_iso' (only set for tRNA rows)
	        and 'subclass_name'.
	    """
	    # keep only tRNA leader/trailer with right cutting sites (+/- 5nt)
	    # leader: assign as misc-leader-tRF if exceeding defined cutting site range
	    tRF_leader_df = mapping_df[mapping_df['subclass_type'] == 'leader_tRF'].copy()
	    leader_cut_site = tRF_leader_df.ref_start + tRF_leader_df.sequence.str.len()
	    tRF_leader_df['subclass_type'] = np.where(leader_cut_site.between(45, 55, inclusive='both'), 'leader_tRF', 'misc-leader-tRF')

	    # trailer: assign as misc-trailer-tRF if exceeding defined cutting site range
	    tRF_trailer_df = mapping_df[mapping_df['subclass_type'] == 'trailer_tRF'].copy()
	    tRF_trailer_df['subclass_type'] = np.where(tRF_trailer_df.ref_start.between(0, 5, inclusive='both'), 'trailer_tRF', 'misc-trailer-tRF')

	    # define tRF subclasses (leader_tRF and trailer_tRF have been assigned previously)
	    # NOTE: allow more flexibility at ends (similar to miRNA annotation)
	    is_unassigned_tRNA = (mapping_df['small_RNA_class_annotation'] == 'tRNA') & mapping_df['subclass_type'].isna()
	    tRNAs_df = mapping_df[is_unassigned_tRNA].copy()

	    starts_at_5p_end = tRNAs_df.ref_start < 3
	    is_half = tRNAs_df.seq_length >= 30
	    is_fragment = tRNAs_df.seq_length < 30
	    # number of precursor nucleotides behind the 3' end of the alignment
	    nts_to_3p_end = tRNAs_df.precursor_length - (tRNAs_df.ref_end + 1)

	    # later assignments overwrite earlier ones -> order matters
	    tRNAs_df.loc[starts_at_5p_end & is_half, 'subclass_type'] = '5p-tR-half'
	    tRNAs_df.loc[starts_at_5p_end & is_fragment, 'subclass_type'] = '5p-tRF'
	    tRNAs_df.loc[(nts_to_3p_end < 6) & is_half, 'subclass_type'] = '3p-tR-half'
	    # NOTE(review): inclusive='neither' leaves fragments ending exactly 3 nt before the
	    # precursor end without a 3p label - confirm whether inclusive='left' was intended
	    tRNAs_df.loc[nts_to_3p_end.between(3, 6, inclusive='neither') & is_fragment, 'subclass_type'] = '3p-tRF'
	    tRNAs_df.loc[(nts_to_3p_end < 3) & is_fragment, 'subclass_type'] = '3p-CCA-tRF'
	    tRNAs_df.loc[tRNAs_df.subclass_type.isna(), 'subclass_type'] = 'misc-tRF'

	    # add ref_iso flag: reference if the alignment starts at the first precursor position
	    # or ends at the last / fourth-last precursor position
	    alignment_end = tRNAs_df.ref_end + 1
	    is_reference = (
	        (tRNAs_df.ref_start == 0)
	        | (alignment_end == tRNAs_df.precursor_length)
	        | (alignment_end == (tRNAs_df.precursor_length - 3))
	    )
	    tRNAs_df['tRNA_ref_iso'] = np.where(is_reference, 'reftRF', 'isotRF')

	    # concat tRNA, leader & trailer dfs
	    tRNAs_df = pd.concat([tRNAs_df, tRF_leader_df, tRF_trailer_df], axis=0)

	    # adjust precursor name and create tRNA name
	    # ('|' is the regex alternation here; column 0 is the outer capture group)
	    tRNAs_df['precursor_name'] = tRNAs_df.precursor_name.str.extract(r"((tRNA-...-...)|(MT-..)|(tRX-...-...)|(tRNA-i...-...))", expand=True)[0]
	    tRNAs_df['subclass_name'] = tRNAs_df.subclass_type + '__' + tRNAs_df.precursor_name

	    return tRNAs_df

	#%%
	def faustrules_check(row):
	    """Check if isomiRs follow Faustrules (based on Tomasello et al. 2021).

	    A read passes if it starts within +/- 2 nt of the mature miRNA start and every
	    mismatch is tolerated, i.e. it is either an A>G or C>T change in the seed region
	    (positions 0-8) or it lies within the last 3 nt and the read ends in polyA/polyT
	    from that position on.

	    Parameters
	    ----------
	    row : mapping (e.g. pd.Series)
	        Must provide 'ref_start', 'mature_start', 'sequence' and 'mm_descriptors'.
	        'mm_descriptors' is either missing (NaN) or a ';'-separated list of
	        '<position>:<original>><new>' entries.

	    Returns
	    -------
	    bool
	        True if the read follows the rules, False otherwise.
	    """
	    # reads that are not in range +/- 2nt of mature start fail
	    ref_start = row['ref_start']
	    mature_start = row['mature_start']
	    if ref_start < mature_start - 2 or ref_start > mature_start + 2:
	        return False

	    # nothing else to check for reads without mismatches
	    if pd.isna(row['mm_descriptors']):
	        return True

	    seed_region_positions = range(9)
	    tolerated_seed_changes = {('A', 'G'), ('C', 'T')}
	    non_templated_ends = {'A', 'AA', 'AAA', 'T', 'TT', 'TTT'}

	    sequence = row['sequence']

	    for descriptor in row['mm_descriptors'].split(';'):
	        pos, change = descriptor.split(':')
	        pos = int(pos)
	        original, new = change.split('>')

	        tolerated_in_seed = pos in seed_region_positions and (original, new) in tolerated_seed_changes
	        tolerated_at_3p_end = pos >= len(sequence) - 3 and sequence[pos:] in non_templated_ends

	        # every mismatch is judged on its own; a mismatch fulfilling both rules must not
	        # compensate for another, untolerated mismatch
	        if not (tolerated_in_seed or tolerated_at_3p_end):
	            return False

	    return True

	@log_time(log)
	def miRNA_annotation(mapping_df):
	    """Extract miRNA specific annotation from mapping. RaH Faustrules are applied.

	    Parameters
	    ----------
	    mapping_df : pd.DataFrame
	        Alignments as returned by `extract_general_info`.

	    Returns
	    -------
	    tuple
	        (miRNAs_df, nr_missing_alignments_expected): the miRNA alignments (one row per
	        tmp_seq_id/reference pair) with the columns of `mapping_df` plus 'subclass_name',
	        'miRNA_ref_iso', 'miRNA_names' and 'ref_miR_seq', and the number of input rows
	        that are dropped as duplicated tmp_seq_id/reference pairs.
	    """
	    pair_key = ['tmp_seq_id', 'reference']

	    is_miRNA = mapping_df.small_RNA_class_annotation == 'miRNA'
	    miRNAs_df = mapping_df[is_miRNA]

	    # rows beyond the first of each tmp_seq_id/reference pair are removed further down
	    nr_missing_alignments_expected = int(miRNAs_df.duplicated(pair_key, keep='first').sum())

	    # load positions of mature miRNAs within precursor
	    mature_pos_df = pd.read_csv(mat_miRNA_pos_path, sep='\t').drop(columns=['precursor_length'])
	    miRNAs_df = miRNAs_df.merge(mature_pos_df, left_on='precursor_name_full', right_on='name_precursor', how='left')

	    # load mature miRNA sequences from miRBase and subset to human miRNAs
	    all_mature_df = fasta2df_subheader(miRBase_mature_path, 0)
	    human_mature_df = all_mature_df[all_mature_df.index.str.contains('hsa-')]
	    human_mature_df.index = human_mature_df.index.str.replace('hsa-', '')
	    human_mature_df = human_mature_df.reset_index()
	    human_mature_df.columns = ['name_mature', 'ref_miR_seq']
	    # add 'ref_miR_seq'
	    miRNAs_df = miRNAs_df.merge(human_mature_df, on='name_mature', how='left')

	    # for each duplicated tmp_seq_id/reference combi, keep the one lowest lev dist of sequence to ref_miR_seq
	    def lev_dist_to_reference(alignment):
	        return distance(alignment['sequence'], alignment['ref_miR_seq'])

	    miRNAs_df['lev_dist'] = miRNAs_df.apply(lev_dist_to_reference, axis=1)
	    miRNAs_df = miRNAs_df.sort_values(by=['tmp_seq_id', 'lev_dist'], ascending=[True, True])
	    miRNAs_df = miRNAs_df.drop_duplicates(pair_key, keep='first')

	    # add ref_iso flag: reference only if the read covers the mature miRNA exactly without mismatch
	    is_exact_mature = (
	        (miRNAs_df.ref_start == miRNAs_df.mature_start)
	        & (miRNAs_df.ref_end == miRNAs_df.mature_end)
	        & (miRNAs_df.mms == 0)
	    )
	    miRNAs_df['miRNA_ref_iso'] = np.where(is_exact_mature, 'refmiR', 'isomiR')

	    # apply RaH Faustrules; reads failing them become 'misc-miR'
	    miRNAs_df['faustrules_check'] = miRNAs_df.apply(faustrules_check, axis=1)
	    miRNAs_df.loc[~miRNAs_df.faustrules_check, 'miRNA_ref_iso'] = 'misc-miR'

	    # subclass_name: name_mature if the rules are followed, else precursor_name
	    passes_rules = miRNAs_df.faustrules_check
	    miRNAs_df['subclass_name'] = np.where(passes_rules, miRNAs_df.name_mature, miRNAs_df.precursor_name)

	    # miRNA_names (for functional analysis): name_mature, with miR- set to mir- if the rules are not followed
	    lowercase_names = miRNAs_df.name_mature.str.replace('miR-', 'mir-')
	    miRNAs_df['miRNA_names'] = np.where(miRNAs_df.faustrules_check, miRNAs_df.name_mature, lowercase_names)

	    # add subclass (NOTE: in cases where subclass is not part of mature name, use position relative to precursor half to define group )
	    ends_with_5p = miRNAs_df.name_mature.str.endswith('5p')
	    ends_with_3p = miRNAs_df.name_mature.str.endswith('3p')
	    miRNAs_df['subclass_type'] = np.where(ends_with_5p, '5p', np.where(ends_with_3p, '3p', 'tbd'))
	    precursor_half = miRNAs_df.precursor_length/2
	    miRNAs_df.loc[((miRNAs_df.subclass_type == 'tbd') & (miRNAs_df.mature_start < precursor_half)), 'subclass_type'] = '5p'
	    miRNAs_df.loc[((miRNAs_df.subclass_type == 'tbd') & (miRNAs_df.mature_start >= precursor_half)), 'subclass_type'] = '3p'

	    # subset to relevant columns
	    columns_to_keep = list(mapping_df.columns) + ['subclass_name', 'miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq']
	    miRNAs_df = miRNAs_df[columns_to_keep]

	    return miRNAs_df, nr_missing_alignments_expected


	#%%
	######################################################################################################
	# annotation of other sRNA classes
	######################################################################################################
	def get_bin_with_max_overlap_parallel(df):
	    """Apply `get_bin_with_max_overlap` to every row of `df` and return the result.

	    Serves as the per-chunk worker that is handed to `applyParallel`.
	    """
	    binned_df = df.apply(get_bin_with_max_overlap, axis=1)
	    return binned_df

	def applyParallel(df, func, n_chunks=30):
	    """Apply `func` to row-chunks of `df` in parallel and concatenate the results.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Frame that is split row-wise into `n_chunks` (nearly) equally sized chunks.
	    func : callable
	        Called once per chunk; must return a pandas object that can be concatenated.
	    n_chunks : int, default 30
	        Number of chunks `df` is split into.

	    Returns
	    -------
	    pd.DataFrame
	        Concatenation of the per-chunk results in chunk order.
	    """
	    # split the row positions instead of the frame itself: np.array_split on a DataFrame
	    # relies on the deprecated DataFrame.swapaxes; the chunk boundaries are identical
	    chunk_positions = np.array_split(np.arange(len(df)), n_chunks)
	    chunks = [df.iloc[positions] for positions in chunk_positions]
	    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(chunk) for chunk in chunks)
	    return pd.concat(retLst)


	@log_time(log)
	def other_sRNA_annotation_new_binning(mapping_df):
	    """Generate subclass_name for non-tRNA/miRNA sRNAs by precursor-binning.
	    New binning approach: bin size is dynamically determined by the precursor length. Assignments are based on the bin with the highest overlap.

	    Parameters
	    ----------
	    mapping_df : pd.DataFrame
	        Alignments with a 'small_RNA_class_annotation' column.

	    Returns
	    -------
	    pd.DataFrame
	        The rows of `mapping_df` that are neither 'miRNA' nor 'tRNA', processed row-wise by
	        `get_bin_with_max_overlap` (run in parallel via `applyParallel`).
	    """
	    is_miRNA_or_tRNA = mapping_df.small_RNA_class_annotation.isin(['miRNA', 'tRNA'])
	    # copy: columns are added below and must not be written into a slice of mapping_df
	    other_sRNAs_df = mapping_df[~is_miRNA_or_tRNA].copy()

	    # create empty columns; bin start and bin end
	    other_sRNAs_df['bin_start'] = ''
	    other_sRNAs_df['bin_end'] = ''

	    other_sRNAs_df = applyParallel(other_sRNAs_df, get_bin_with_max_overlap_parallel)

	    return other_sRNAs_df


	#%%
	@log_time(log)
	def extract_sRNA_class_specific_info(mapping_df):
	    """Run the tRNA, miRNA and other-sRNA annotation and stack the three results.

	    Parameters
	    ----------
	    mapping_df : pd.DataFrame
	        Alignments as returned by `extract_general_info`.

	    Returns
	    -------
	    pd.DataFrame
	        Row-wise concatenation of the miRNA, tRNA and other-sRNA annotation (in that order).

	    Raises
	    ------
	    AssertionError
	        If the number of annotated rows differs from the number of input rows minus the
	        miRNA rows that are expected to be dropped.
	    """
	    tRNAs_df = tRNA_annotation(mapping_df)
	    miRNAs_df, nr_missing_alignments_expected = miRNA_annotation(mapping_df)
	    other_sRNAs_df = other_sRNA_annotation_new_binning(mapping_df)

	    # add miRNA columns (empty) to the non-miRNA frames
	    miRNA_only_columns = ['miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq']
	    for non_miRNA_df in (tRNAs_df, other_sRNAs_df):
	        non_miRNA_df[miRNA_only_columns] = pd.DataFrame(columns=miRNA_only_columns)

	    # re-concat sRNA class dfs
	    sRNA_anno_df = pd.concat([miRNAs_df, tRNAs_df, other_sRNAs_df], axis=0)

	    # TEST if alignments were lost or duplicated
	    nr_alignments_expected = len(mapping_df) - nr_missing_alignments_expected
	    assert (nr_alignments_expected == len(sRNA_anno_df)), "alignments were lost or duplicated"

	    return sRNA_anno_df

	#%%
	def get_nth_nt(row):
	    """Return the character of row['sequence'] at position row['PTM_position_in_seq'].

	    The position is converted to int and reduced by one before indexing.
	    """
	    sequence = row['sequence']
	    position = int(row['PTM_position_in_seq'])
	    return sequence[position - 1]



	#%%
	@log_time(log)
	def aggregate_info_per_seq(sRNA_anno_df):
	    """Collapse the per-alignment annotation to one row per sequence.

	    In addition to the aggregation of the HBDxBase alignments
	    - the sequences of `unmapped_file` (if the file exists) are appended as 'no_annotation' rows
	    - the number of genomic loci per sequence is merged in as 'num_genomic_loci_maps'
	      (counted from `mapped_genome_file`; 101 for sequences of `toomanyloci_genome_file`,
	      0 for sequences of `unmapped_genome_file`)
	    - 'miRNA_seed' (sequence[1:9]) is added for sequences annotated as miRNA

	    Parameters
	    ----------
	    sRNA_anno_df : pd.DataFrame
	        Per-alignment annotation. NOTE: missing values of its 'subclass_name_bin_pos'
	        column are filled with 'subclass_name' in place.

	    Returns
	    -------
	    pd.DataFrame
	        One row per sequence with 'sequence' as regular column.

	    Raises
	    ------
	    AssertionError
	        If sequences were lost by the aggregation or if a sequence has no entry in
	        'num_genomic_loci_maps'.
	    """
	    def join_sorted_unique(values):
	        return ';'.join(sorted(values.unique()))

	    def join_sorted_unique_as_str(values):
	        return ';'.join(values.astype(str).sort_values(ascending=True).unique())

	    def join_unique_without_nan(values):
	        return ';'.join(values.fillna('').unique())

	    # fillna of 'subclass_name_bin_pos' with 'subclass_name'
	    sRNA_anno_df['subclass_name_bin_pos'] = sRNA_anno_df['subclass_name_bin_pos'].fillna(sRNA_anno_df['subclass_name'])

	    # get aggregated info per seq ('reference' becomes the number of alignments)
	    aggreg_per_seq_df = sRNA_anno_df.groupby(['sequence']).agg({
	        'small_RNA_class_annotation': join_sorted_unique,
	        'pseudo_class': join_sorted_unique_as_str,
	        'subclass_type': join_sorted_unique_as_str,
	        'subclass_name': join_sorted_unique,
	        'subclass_name_bin_pos': join_sorted_unique,
	        'miRNA_names': join_unique_without_nan,
	        'precursor_name_full': join_sorted_unique,
	        'mms': join_sorted_unique_as_str,
	        'reference': lambda x: len(x),
	        'mitochondrial': join_sorted_unique_as_str,
	        'ref_miR_seq': join_unique_without_nan,
	    })
	    # a filled NaN leaves a trailing ';' behind
	    aggreg_per_seq_df['miRNA_names'] = aggreg_per_seq_df.miRNA_names.str.replace(r';$','', regex=True)
	    aggreg_per_seq_df['ref_miR_seq'] = aggreg_per_seq_df.ref_miR_seq.str.replace(r';$','', regex=True)
	    aggreg_per_seq_df['mms'] = aggreg_per_seq_df['mms'].astype(int)

	    # re-add 'miRNA_ref_iso','tRNA_ref_iso' (taken from the first alignment of each sequence)
	    ref_iso_df = (
	        sRNA_anno_df[['sequence','miRNA_ref_iso','tRNA_ref_iso']]
	        .drop_duplicates('sequence')
	        .set_index('sequence')
	    )
	    aggreg_per_seq_df = aggreg_per_seq_df.merge(ref_iso_df, left_index=True, right_index=True, how='left')

	    # TEST if sequences were lost
	    assert (len(aggreg_per_seq_df) == len(sRNA_anno_df.sequence.unique())), "sequences were lost by aggregation"

	    # load unmapped seqs, if the file exists
	    if os.path.exists(unmapped_file):
	        unmapped_df = fasta2df(unmapped_file)
	        unmapped_df = pd.DataFrame(data='no_annotation', index=unmapped_df.sequence, columns=aggreg_per_seq_df.columns)
	        unmapped_df['mms'] = np.nan
	        unmapped_df['reference'] = np.nan
	        unmapped_df['pseudo_class'] = True # set no annotation as pseudo_class

	        # merge mapped and unmapped
	        annotation_df = pd.concat([aggreg_per_seq_df,unmapped_df])
	    else:
	        annotation_df = aggreg_per_seq_df.copy()

	    # load mapping to genome file
	    mapping_genome_df = pd.read_csv(mapped_genome_file, index_col=0, sep='\t', header=None)
	    mapping_genome_df.columns = ['strand','reference','ref_start','sequence','other_alignments','mm_descriptors']

	    # use reverse complement of 'sequence' for 'strand' == '-'
	    mapping_genome_df['sequence'] = np.where(mapping_genome_df.strand == '-', mapping_genome_df.sequence.apply(reverse_complement), mapping_genome_df.sequence)

	    # number of genomic loci = number of alignments per sequence
	    genomic_loci_df = mapping_genome_df.sequence.value_counts().to_frame('num_genomic_loci_maps')

	    # load too many alignments seqs (recorded as 101 loci)
	    if os.path.exists(toomanyloci_genome_file):
	        toomanyloci_genome_df = fasta2df(toomanyloci_genome_file)
	        toomanyloci_genome_df = pd.DataFrame(data=101, index=toomanyloci_genome_df.sequence, columns=genomic_loci_df.columns)
	    else:
	        toomanyloci_genome_df = pd.DataFrame(columns=genomic_loci_df.columns)

	    # load unmapped seqs (recorded as 0 loci)
	    if os.path.exists(unmapped_genome_file):
	        unmapped_genome_df = fasta2df(unmapped_genome_file)
	        unmapped_genome_df = pd.DataFrame(data=0, index=unmapped_genome_df.sequence, columns=genomic_loci_df.columns)
	    else:
	        unmapped_genome_df = pd.DataFrame(columns=genomic_loci_df.columns)

	    # concat toomanyloci, unmapped, and genomic_loci
	    num_genomic_loci_maps_df = pd.concat([genomic_loci_df,toomanyloci_genome_df,unmapped_genome_df])

	    # merge to annotation_df
	    annotation_df = annotation_df.merge(num_genomic_loci_maps_df, left_index=True, right_index=True, how='left')
	    annotation_df = annotation_df.reset_index()

	    # add 'miRNA_seed'
	    annotation_df["miRNA_seed"] = np.where(annotation_df.small_RNA_class_annotation.str.contains('miRNA', na=False), annotation_df.sequence.str[1:9], "")

	    # TEST if nan values in 'num_genomic_loci_maps'
	    assert not annotation_df.num_genomic_loci_maps.isna().any(), "nan values in 'num_genomic_loci_maps'"

	    return annotation_df




	#%%
	@log_time(log)
	def get_five_prime_adapter_info(annotation_df, five_prime_adapter):
	    """Flag sequences that start with a 3' part (suffix) of the 5' adapter.

	    All suffixes of `five_prime_adapter` that are longer than 6 nt are tested as prefix of
	    the sequences; the number of hits per suffix is printed.

	    Parameters
	    ----------
	    annotation_df : pd.DataFrame
	        Annotation with a 'sequence' column.
	    five_prime_adapter : str
	        Sequence of the 5' adapter.

	    Returns
	    -------
	    pd.DataFrame
	        One row per entry of annotation_df.sequence with the columns 'sequence',
	        'five_prime_adapter_length' (length of the matching adapter suffix, 0 if none) and
	        'five_prime_adapter_filter' (True if no adapter suffix was found).
	    """
	    adapter_df = pd.DataFrame(index=annotation_df.sequence)
	    # create the column up-front so that it also exists if the adapter is too short
	    # for any suffix to be tested
	    adapter_df['five_prime_adapter_length'] = np.nan

	    min_length = 6

	    is_prefixed = np.zeros(len(adapter_df), dtype=bool)
	    print("5' adapter affixes:")
	    for offset in range(0, len(five_prime_adapter) - min_length):
	        adapter_suffix = five_prime_adapter[offset:]
	        is_prefixed_by_suffix = adapter_df.index.str.startswith(adapter_suffix)
	        print(f"{adapter_suffix.ljust(30, ' ')}{is_prefixed_by_suffix.sum()}")
	        # NOTE: suffixes get shorter with every iteration, i.e. the length of the
	        # shortest matching suffix is the one that is finally stored
	        adapter_df.loc[is_prefixed_by_suffix, "five_prime_adapter_length"] = len(adapter_suffix)
	        is_prefixed |= is_prefixed_by_suffix

	    print(f"There are {is_prefixed.sum()} prefixed features.")
	    print("\n")

	    adapter_df['five_prime_adapter_length'] = adapter_df['five_prime_adapter_length'].fillna(0).astype('int')
	    adapter_df['five_prime_adapter_filter'] = adapter_df['five_prime_adapter_length'] == 0
	    adapter_df = adapter_df.reset_index()

	    return adapter_df

	#%%
	@log_time(log)
	def reduce_ambiguity(annotation_df: pd.DataFrame) -> pd.DataFrame:
	    """Reduce ambiguity by

	    a) using subclass_name of precursor with shortest genomic context, if all other assigned precursors overlap with its genomic region

	    b) using subclass_name whose bin is at the 5' or 3' end of the precursor

	    If both rules resolve an assignment, the result of a) is used.

	    Parameters
	    ----------
	    annotation_df : pd.DataFrame
	        A DataFrame containing the annotation of the sequences (var)

	    Returns
	    -------
	    pd.DataFrame
	        An improved version of the input DataFrame with reduced ambiguity and the additional
	        boolean column 'ambiguity_resolved' (row order and index of the input are kept)
	    """

	    # extract ambiguous assignments for subclass name
	    is_ambiguous = annotation_df.subclass_name.str.contains(';', na=False)
	    if not is_ambiguous.any():
	        print('No ambigious assignments for subclass name found.')
	        # same output schema as in the ambiguous case
	        return annotation_df.assign(ambiguity_resolved=False)
	    # copies: columns are added to / changed in both parts below
	    ambiguous_matches_df = annotation_df[is_ambiguous].copy()
	    clear_matches_df = annotation_df[~is_ambiguous].copy()

	    # extract required information from HBDxBase (lookups are keyed by precursor_name)
	    HBDxBase_all_df = pd.read_csv(HBDxBase_csv, index_col=0)
	    by_precursor_df = HBDxBase_all_df.set_index('precursor_name')
	    bin_dict = by_precursor_df['precursor_bins'].to_dict()
	    sRNA_class_dict = by_precursor_df['small_RNA_class_annotation'].to_dict()
	    pseudo_class_dict = by_precursor_df['pseudo_class'].to_dict()
	    sc_type_dict = by_precursor_df['subclass_type'].to_dict()
	    genomic_context_bed = HBDxBase_all_df[['chr','start','end','precursor_name','score','strand']].reset_index(drop=True)
	    genomic_context_bed.columns = ['seq_id','start','end','name','score','strand']
	    genomic_context_bed['genomic_length'] = genomic_context_bed.end - genomic_context_bed.start


	    def get_overlaps(genomic_context_bed: pd.DataFrame, name: str = None, complement: bool = False) -> list:
	        """Get genomic overlap of a given precursor name

	        Parameters
	        ----------
	        genomic_context_bed : pd.DataFrame
	            A DataFrame containing genomic locations of precursors in bed format
	            with column names: 'seq_id','start','end','name','score','strand'
	        name : str
	            The name of the precursor to get genomic context for
	        complement : bool
	            If True, return all precursors that do not overlap with the given precursor

	        Returns
	        -------
	        list
	            A list containing the precursors in the genomic (anti-)context of the given precursor
	            (including the precursor itself)
	        """
	        # the first entry with the given name defines the region of interest
	        series_OI = genomic_context_bed[genomic_context_bed['name'] == name]
	        start = series_OI['start'].values[0]
	        end = series_OI['end'].values[0]
	        seq_id = series_OI['seq_id'].values[0]
	        strand = series_OI['strand'].values[0]

	        bed = genomic_context_bed
	        condition = (((bed.start > start) &
	                      (bed.start < end)) |
	                     ((bed.end > start) &
	                      (bed.end < end)) |
	                     ((bed.start < start) &
	                      (bed.end > start)) |
	                     ((bed.start == start) &
	                      (bed.end == end)) |
	                     ((bed.start == start) &
	                      (bed.end > end)) |
	                     ((bed.start < start) &
	                      (bed.end == end)))
	        selected = ~condition if complement else condition
	        # only entries on the same sequence (and strand) are considered
	        selected &= bed.seq_id == seq_id
	        if strand is not None:
	            selected &= bed.strand == strand
	        return bed.loc[selected, 'name'].tolist()


	    def check_genomic_ctx_of_smallest_prec(precursor_name: str) -> str:
	        """Check for a given ambiguous precursor assignment (several names separated by ';')
	        if all assigned precursors overlap with the genomic region
	        of the precursor with the shortest genomic context

	        Parameters
	        ----------
	        precursor_name: str
	            A string containing several precursor names separated by ';'

	        Returns
	        -------
	        str
	            The precursor suggested to be used instead of the multi assignment,
	            or None if the ambiguity could not be resolved
	        """
	        assigned_names = precursor_name.split(';')

	        candidates = genomic_context_bed[genomic_context_bed['name'].isin(assigned_names)]
	        if len(candidates) == 0:
	            return None
	        # get name of smallest genomic region
	        smallest_name = candidates.loc[candidates.genomic_length.idxmin(), 'name']
	        # check if all assigned names are in overlap of smallest genomic region
	        if set(assigned_names).issubset(get_overlaps(genomic_context_bed, smallest_name)):
	            return smallest_name
	        return None

	    def get_subclass_name(subclass_name: str, short_prec_match_new_name: str) -> str:
	        """Get subclass name matching to a precursor name from an ambiguous assignment (several names separated by ';')

	        Parameters
	        ----------
	        subclass_name: str
	            A string containing several subclass names separated by ';'
	        short_prec_match_new_name: str
	            The name of the precursor to be used instead of the multi assignment

	        Returns
	        -------
	        str
	            The subclass name suggested to be used instead of the multi assignment,
	            or None if the ambiguity could not be resolved
	        """
	        # pd.isna covers None as well as NaN
	        if pd.isna(short_prec_match_new_name):
	            return None
	        matches = get_close_matches(short_prec_match_new_name, subclass_name.split(';'), cutoff=0.2)
	        if matches:
	            return matches[0]
	        print(f"Could not find match for {short_prec_match_new_name} in {subclass_name}")
	        # unresolved: returning the multi assignment itself would flag the row as resolved
	        return None


	    def check_end_bins(subclass_name: str) -> str:
	        """Check for a given ambiguous subclass name assignment (several names separated by ';')
	        if ambiguity can be resolved by selecting the subclass name whose bin matches the 3'/5' end of the precursor

	        Parameters
	        ----------
	        subclass_name: str
	            A string containing several subclass names separated by ';'

	        Returns
	        -------
	        str
	            The subclass name suggested to be used instead of the multi assignment,
	            or None if the ambiguity could not be resolved
	        """
	        for name in subclass_name.split(';'):
	            if '_bin-' not in name:
	                continue
	            name_parts = name.split('_bin-')
	            # last bin of the precursor
	            if name_parts[0] in bin_dict and bin_dict[name_parts[0]] == int(name_parts[1]):
	                return name
	            # first bin of the precursor
	            if int(name_parts[1]) == 1:
	                return name
	        return None


	    def adjust_4_resolved_cases(row: pd.Series) -> tuple:
	        """For a resolved ambiguous subclass names return adjusted values of
	        precursor_name_full, small_RNA_class_annotation, pseudo_class, and subclass_type

	        Parameters
	        ----------
	        row: pd.Series
	            A row of the var annotation containing the columns 'subclass_name', 'precursor_name_full',
	            'small_RNA_class_annotation', 'pseudo_class', 'subclass_type', and 'ambiguity_resolved'

	        Returns
	        -------
	        tuple
	            A tuple containing the adjusted values of 'precursor_name_full', 'small_RNA_class_annotation',
	            'pseudo_class', and 'subclass_type' for resolved ambiguous cases and the original values for unresolved cases
	        """
	        if row.ambiguity_resolved:
	            matches_prec = get_close_matches(row.subclass_name, row.precursor_name_full.split(';'), cutoff=0.2)
	            if matches_prec:
	                best_prec = matches_prec[0]
	                return best_prec, sRNA_class_dict[best_prec], pseudo_class_dict[best_prec], sc_type_dict[best_prec]
	        return row.precursor_name_full, row.small_RNA_class_annotation, row.pseudo_class, row.subclass_type


	    # resolve ambiguity by checking genomic context of smallest precursor
	    ambiguous_matches_df['short_prec_match_new_name'] = ambiguous_matches_df.precursor_name_full.apply(check_genomic_ctx_of_smallest_prec)
	    ambiguous_matches_df['short_prec_match_new_name'] = ambiguous_matches_df.apply(lambda x: get_subclass_name(x.subclass_name, x.short_prec_match_new_name), axis=1)
	    ambiguous_matches_df['short_prec_match'] = ambiguous_matches_df['short_prec_match_new_name'].notnull()

	    # resolve ambiguity by checking if bin matches 3'/5' end of precursor
	    ambiguous_matches_df['end_bin_match_new_name'] = ambiguous_matches_df.subclass_name.apply(check_end_bins)
	    ambiguous_matches_df['end_bin_match'] = ambiguous_matches_df['end_bin_match_new_name'].notnull()

	    # check if short_prec_match and end_bin_match are equal in any case
	    test_df = ambiguous_matches_df[ambiguous_matches_df.short_prec_match & ambiguous_matches_df.end_bin_match]
	    if not (test_df.short_prec_match_new_name == test_df.end_bin_match_new_name).all():
	        print('Number of cases where short_prec_match is not matching end_bin_match_new_name:',len(test_df[(test_df.short_prec_match_new_name != test_df.end_bin_match_new_name)]))

	    # replace subclass_name with short_prec_match_new_name or end_bin_match_new_name
	    # NOTE: if short_prec_match and end_bin_match are True, short_prec_match_new_name is used
	    ambiguous_matches_df['subclass_name'] = ambiguous_matches_df['subclass_name'].mask(ambiguous_matches_df.end_bin_match, ambiguous_matches_df.end_bin_match_new_name)
	    ambiguous_matches_df['subclass_name'] = ambiguous_matches_df['subclass_name'].mask(ambiguous_matches_df.short_prec_match, ambiguous_matches_df.short_prec_match_new_name)

	    # generate column 'ambiguity_resolved' which is True if short_prec_match and/or end_bin_match is True
	    ambiguous_matches_df['ambiguity_resolved'] = ambiguous_matches_df.short_prec_match | ambiguous_matches_df.end_bin_match
	    print("Ambiguity resolved?\n",ambiguous_matches_df.ambiguity_resolved.value_counts(normalize=True))

	    # for resolved ambiguous matches, adjust precursor_name_full, small_RNA_class_annotation, pseudo_class, subclass_type
	    ambiguous_matches_df[['precursor_name_full','small_RNA_class_annotation','pseudo_class','subclass_type']] = ambiguous_matches_df.apply(adjust_4_resolved_cases, axis=1, result_type='expand')

	    # drop temporary columns
	    ambiguous_matches_df = ambiguous_matches_df.drop(columns=['short_prec_match_new_name','short_prec_match','end_bin_match_new_name','end_bin_match'])

	    # concat with clear_matches_df and restore the original row order
	    clear_matches_df['ambiguity_resolved'] = False
	    improved_annotation_df = pd.concat([clear_matches_df, ambiguous_matches_df], axis=0)
	    improved_annotation_df = improved_annotation_df.reindex(annotation_df.index)

	    return improved_annotation_df

	#%%
	######################################################################################################
	# HICO (=high confidence) annotation
	######################################################################################################
	@log_time(log)
	def add_hico_annotation(annotation_df, five_prime_adapter):
	    """Add filter columns and the high confidence ('hico') flag to the per-sequence annotation.

	    A sequence is 'hico' if it has no mismatch, a single subclass name, no TE annotation,
	    is flagged neither 'bacterial' nor 'viral' and passes both adapter filters.
	    For sequences annotated as 'miRNA' the flag is additionally restricted to 'refmiR' entries.

	    Parameters
	    ----------
	    annotation_df : pd.DataFrame
	        Per-sequence annotation with 'sequence' as regular column.
	    five_prime_adapter : str
	        Sequence of the 5' adapter, passed on to `get_five_prime_adapter_info`.

	    Returns
	    -------
	    pd.DataFrame
	        The annotation after ambiguity reduction, extended by the filter columns and 'hico'.
	    """

	    # add 'TE_annotation'
	    TE_hits_df = pd.read_csv(TE_file, sep='\t', header=None, names=['sequence','TE_annotation'])
	    annotation_df = annotation_df.merge(TE_hits_df, on='sequence', how='left')

	    # add mapping filters; each one is derived from the sequences that did NOT map:
	    # 'bacterial' / 'viral' are False for unmapped sequences, 'adapter_mapping_filter' is True
	    mapping_filters = (
	        ('bacterial', unmapped_bacterial_file, False),
	        ('viral', unmapped_viral_file, False),
	        ('adapter_mapping_filter', unmapped_adapter_file, True),
	    )
	    for filter_column, unmapped_fasta, value_if_unmapped in mapping_filters:
	        unmapped_seqs_df = fasta2df(unmapped_fasta)
	        is_unmapped = annotation_df.sequence.isin(unmapped_seqs_df.sequence)
	        annotation_df.loc[:, filter_column] = np.where(is_unmapped, value_if_unmapped, not value_if_unmapped)

	    # add filter column 'five_prime_adapter_filter' and column 'five_prime_adapter_length' indicating the length of the prefixed 5' adapter sequence
	    five_prime_adapter_df = get_five_prime_adapter_info(annotation_df, five_prime_adapter)
	    annotation_df = annotation_df.merge(five_prime_adapter_df, on='sequence', how='left')

	    # apply ambiguity reduction
	    annotation_df = reduce_ambiguity(annotation_df)

	    # add 'single_class_annotation' and 'single_name_annotation' (missing values count as not single)
	    has_multiple_classes = annotation_df.small_RNA_class_annotation.str.contains(';',na=True)
	    annotation_df.loc[:,'single_class_annotation'] = np.where(has_multiple_classes, False, True)
	    has_multiple_names = annotation_df.subclass_name.str.contains(';',na=True)
	    annotation_df.loc[:,'single_name_annotation'] = np.where(has_multiple_names, False, True)

	    # add 'hypermapper' for sequences where more than 50 potential mapping references are recorded
	    is_hypermapper = annotation_df.reference > 50
	    hypermapper_label = 'hypermapper_' + annotation_df.reference.fillna(0).astype(int).astype(str)
	    for name_column in ('subclass_name', 'subclass_name_bin_pos', 'precursor_name_full'):
	        annotation_df.loc[is_hypermapper, name_column] = hypermapper_label

	    # turn 'mitochondrial' into a boolean flag
	    is_mito = annotation_df.mitochondrial.str.contains('mito',na=False)
	    annotation_df.loc[:,'mitochondrial'] = np.where(is_mito, True, False)

	    # add 'hico'
	    passes_all_filters = (
	        (annotation_df.mms == 0)
	        & (annotation_df.single_name_annotation == True)
	        & (annotation_df.TE_annotation.isna() == True)
	        & (annotation_df.bacterial == False)
	        & (annotation_df.viral == False)
	        & (annotation_df.adapter_mapping_filter == True)
	        & (annotation_df.five_prime_adapter_filter == True)
	    )
	    annotation_df.loc[:,'hico'] = np.where(passes_all_filters, True, False)
	    ## NOTE: for miRNAs only use hico annotation if part of refmiR set
	    is_miRNA = annotation_df.small_RNA_class_annotation == 'miRNA'
	    annotation_df.loc[is_miRNA,'hico'] = annotation_df.loc[is_miRNA,'hico'] & (annotation_df.miRNA_ref_iso == 'refmiR')

	    single_class_df = annotation_df[annotation_df.single_class_annotation == True]
	    print(single_class_df.groupby('small_RNA_class_annotation').hico.value_counts())

	    return annotation_df




	#%%
	######################################################################################################
	# annotation pipeline
	######################################################################################################
	@log_time(log)
	def main(five_prime_adapter):
	    """Executes 'annotate_from_mapping'.

	    Uses:

	    - HBDxBase_csv
	    - miRBase_mature_path
	    - mat_miRNA_pos_path

	    - mapped_file
	    - unmapped_file
	    - mapped_genome_file
	    - toomanyloci_genome_file
	    - unmapped_genome_file

	    - TE_file
	    - unmapped_adapter_file
	    - unmapped_bacterial_file
	    - unmapped_viral_file
	    - five_prime_adapter

	    Writes (to the current working directory):

	    - sRNA_anno_file
	    - aggreg_sRNA_anno_file
	    - subclass_to_annotation.json

	    Finally all files in the current working directory whose name starts with 'tmp_' are deleted.

	    Parameters
	    ----------
	    five_prime_adapter : str
	        Sequence of the 5' adapter, passed on to `add_hico_annotation`.
	    """
	    # local import: only needed for the clean-up at the end
	    import glob

	    print('-------- extract general information for sequences that mapped to the HBDxBase --------')
	    mapped_info_df = extract_general_info(mapped_file)
	    print("\n")

	    print('-------- extract sRNA class specific information for sequences that mapped to the HBDxBase --------')
	    mapped_sRNA_anno_df = extract_sRNA_class_specific_info(mapped_info_df)

	    print('-------- save to file --------')
	    mapped_sRNA_anno_df.to_csv(sRNA_anno_file)
	    print("\n")

	    print('-------- aggregate information for mapped and unmapped sequences (HBDxBase & human genome) --------')
	    sRNA_anno_per_seq_df = aggregate_info_per_seq(mapped_sRNA_anno_df)
	    print("\n")

	    print('-------- add hico annotation (based on aggregated infos + mapping to viral/bacterial genomes + intersection with TEs) --------')
	    sRNA_anno_per_seq_df = add_hico_annotation(sRNA_anno_per_seq_df, five_prime_adapter)
	    print("\n")

	    print('-------- save to file --------')
	    # set sequence as index again
	    sRNA_anno_per_seq_df.set_index('sequence', inplace=True)
	    sRNA_anno_per_seq_df.to_csv(aggreg_sRNA_anno_file)
	    print("\n")

	    print('-------- generate subclass_to_annotation dict --------')
	    # unique subclass_name -> class pairs; multi assignments (names joined by ';') are skipped
	    result_df = sRNA_anno_per_seq_df[['subclass_name', 'small_RNA_class_annotation']].copy()
	    result_df.reset_index(drop=True, inplace=True)
	    result_df.drop_duplicates(inplace=True)
	    result_df = result_df[~result_df["subclass_name"].str.contains(";")]
	    subclass_to_annotation = dict(zip(result_df["subclass_name"],result_df["small_RNA_class_annotation"]))
	    with open('subclass_to_annotation.json', 'w') as fp:
	        json.dump(subclass_to_annotation, fp)

	    print('-------- delete tmp files --------')
	    # "rm tmp_" only addressed a file literally called 'tmp_'; remove every 'tmp_*' file instead
	    for tmp_path in glob.glob('tmp_*'):
	        if os.path.isfile(tmp_path):
	            os.remove(tmp_path)


	#%%