######################################################################################################
# annotate sequences based on mapping results
######################################################################################################
#%%
import os
import logging

import numpy as np
import pandas as pd
from difflib import get_close_matches
from Levenshtein import distance
import json

from joblib import Parallel, delayed
import multiprocessing


from utils import (fasta2df, fasta2df_subheader,log_time, reverse_complement)
from precursor_bins import get_bin_with_max_overlap


log = logging.getLogger(__name__)

pd.options.mode.chained_assignment = None


######################################################################################################
# paths to reference and mapping files
######################################################################################################

version = '_v4'

HBDxBase_csv = f'../../references/HBDxBase/HBDxBase_all{version}.csv'
miRBase_mature_path = '../../references/HBDxBase/miRBase/mature.fa'
mat_miRNA_pos_path = '../../references/HBDxBase/miRBase/hsa_mature_position.txt'

mapped_file = 'seqsmapped2HBDxBase_combined.txt'
unmapped_file = 'tmp_seqs3mm2HBDxBase_pseudo__unmapped.fa'
TE_file = 'tmp_seqsmapped2genome_intersect_TE.txt'
mapped_genome_file = 'seqsmapped2genome_combined.txt'
toomanyloci_genome_file = 'tmp_seqs0mm2genome__toomanyalign.fa'
unmapped_adapter_file = 'tmp_seqs3mm2adapters__unmapped.fa'
unmapped_genome_file = 'tmp_seqs0mm2genome__unmapped.fa'
unmapped_bacterial_file = 'tmp_seqs0mm2bacterial__unmapped.fa'
unmapped_viral_file = 'tmp_seqs0mm2viral__unmapped.fa'


sRNA_anno_file = 'sRNA_anno_from_mapping.csv'
aggreg_sRNA_anno_file = 'sRNA_anno_aggregated_on_seq.csv'


#%%
######################################################################################################
# specific functions
######################################################################################################

@log_time(log)
def extract_general_info(mapping_file):
    # load mapping file
    mapping_df = pd.read_csv(mapping_file, sep='\t', header=None)
    mapping_df.columns = ['tmp_seq_id','reference','ref_start','sequence','other_alignments','mm_descriptors']

    # add precursor length + number of bins that will be used for names
    HBDxBase_df = pd.read_csv(HBDxBase_csv, index_col=0)
    HBDxBase_df = HBDxBase_df[['precursor_length','precursor_bins','pseudo_class']].reset_index()
    HBDxBase_df.rename(columns={'index': "reference"}, inplace=True)
    mapping_df = mapping_df.merge(HBDxBase_df, left_on='reference', right_on='reference', how='left')

    # extract information
    mapping_df.loc[:,'mms'] = mapping_df.mm_descriptors.fillna('').str.count('>')
    mapping_df.loc[:,'mm_descriptors'] = mapping_df.mm_descriptors.str.replace(',', ';')
    mapping_df.loc[:,'small_RNA_class_annotation'] = mapping_df.reference.str.split('|').str[0]
    mapping_df.loc[:,'subclass_type'] = mapping_df.reference.str.split('|').str[2]
    mapping_df.loc[:,'precursor_name_full'] = mapping_df.reference.str.split('|').str[1].str.split('|').str[0]
    mapping_df.loc[:,'precursor_name'] = mapping_df.precursor_name_full.str.split('__').str[0].str.split('|').str[0]
    mapping_df.loc[:,'seq_length'] = mapping_df.sequence.apply(lambda x: len(x))
    mapping_df.loc[:,'ref_end'] = mapping_df.ref_start +  mapping_df.seq_length - 1
    mapping_df.loc[:,'mitochondrial'] = np.where(mapping_df.reference.str.contains(r'(\|MT-)|(12S)|(16S)'), 'mito', 'nuclear')
    
    return mapping_df


#%%
@log_time(log)
def tRNA_annotation(mapping_df):
    """Extract tRNA specific annotation from mapping.
    """
    # keep only tRNA leader/trailer with right cutting sites (+/- 5nt)
    # leader
    tRF_leader_df = mapping_df[mapping_df['subclass_type'] == 'leader_tRF']
    # assign as misc-leader-tRF if exceeding defined cutting site range
    tRF_leader_df.loc[:,'subclass_type'] = np.where((tRF_leader_df.ref_start + tRF_leader_df.sequence.apply(lambda x: len(x))).between(45, 55, inclusive='both'), 'leader_tRF', 'misc-leader-tRF')

    # trailer
    tRF_trailer_df = mapping_df[mapping_df['subclass_type'] == 'trailer_tRF']
    # assign as misc-trailer-tRF if exceeding defined cutting site range
    tRF_trailer_df.loc[:,'subclass_type'] = np.where(tRF_trailer_df.ref_start.between(0, 5, inclusive='both'), 'trailer_tRF', 'misc-trailer-tRF')

    # define tRF subclasses (leader_tRF and trailer_tRF have been assigned previously)
    # NOTE: allow more flexibility at ends (similar to miRNA annotation)
    tRNAs_df = mapping_df[((mapping_df['small_RNA_class_annotation'] == 'tRNA') & mapping_df['subclass_type'].isna())]
    tRNAs_df.loc[((tRNAs_df.ref_start < 3) & (tRNAs_df.seq_length >= 30)),'subclass_type'] = '5p-tR-half'
    tRNAs_df.loc[((tRNAs_df.ref_start < 3) & (tRNAs_df.seq_length < 30)),'subclass_type'] = '5p-tRF'
    tRNAs_df.loc[(((tRNAs_df.precursor_length - (tRNAs_df.ref_end + 1)) < 6) & (tRNAs_df.seq_length >= 30)),'subclass_type'] = '3p-tR-half'
    tRNAs_df.loc[(((tRNAs_df.precursor_length - (tRNAs_df.ref_end + 1)).between(3,6,inclusive='neither')) & (tRNAs_df.seq_length < 30)),'subclass_type'] = '3p-tRF'
    tRNAs_df.loc[(((tRNAs_df.precursor_length - (tRNAs_df.ref_end + 1)) < 3) & (tRNAs_df.seq_length < 30)),'subclass_type'] = '3p-CCA-tRF' 
    tRNAs_df.loc[tRNAs_df.subclass_type.isna(),'subclass_type'] = 'misc-tRF'
    # add ref_iso flag
    tRNAs_df['tRNA_ref_iso'] = np.where(
        (
            (tRNAs_df.ref_start == 0) 
            | ((tRNAs_df.ref_end + 1) == tRNAs_df.precursor_length) 
            | ((tRNAs_df.ref_end + 1) == (tRNAs_df.precursor_length - 3))
        ), 'reftRF', 'isotRF'
    )
    # concat tRNA, leader & trailer dfs
    tRNAs_df = pd.concat([tRNAs_df, tRF_leader_df, tRF_trailer_df],axis=0)
    # adjust precursor name and create tRNA name
    tRNAs_df['precursor_name'] = tRNAs_df.precursor_name.str.extract(r"((tRNA-...-...)|(MT-..)|(tRX-...-...)|(tRNA-i...-...))", expand=True)[0]
    tRNAs_df['subclass_name'] = tRNAs_df.subclass_type + '__' + tRNAs_df.precursor_name
    
    return tRNAs_df

#%%
def faustrules_check(row):
    """Check if isomiRs follow Faustrules (based on Tomasello et al. 2021).
    """
    
    # mark seqs that are not in range +/- 2nt of mature start
    # check if ref_start.between(miRNAs_df.mature_start-2, miRNAs_df.mature_start+2, inclusive='both')]
    ref_start = row['ref_start']
    mature_start = row['mature_start']

    if ref_start < mature_start - 2 or ref_start > mature_start + 2:
        return False
    
    # mark seqs with mismatch unless A>G or C>T in seed region (= position 0-8) or 3' polyA/polyT (max 3nt)
    if pd.isna(row['mm_descriptors']):
        return True
    
    seed_region_positions = set(range(9))
    non_templated_ends = {'A', 'AA', 'AAA', 'T', 'TT', 'TTT'}
    
    sequence = row['sequence']
    mm_descriptors = row['mm_descriptors'].split(';')
    
    seed_region_mismatches = 0
    three_prime_end_mismatches = 0
    
    for descriptor in mm_descriptors:
        pos, change = descriptor.split(':')
        pos = int(pos)
        original, new = change.split('>')
        
        if pos in seed_region_positions and (original == 'A' and new == 'G' or original == 'C' and new == 'T'):
            seed_region_mismatches += 1
        
        if pos >= len(sequence) - 3 and sequence[pos:] in non_templated_ends:
            three_prime_end_mismatches += 1
    
    total_mismatches = seed_region_mismatches + three_prime_end_mismatches

    return total_mismatches == len(mm_descriptors)

@log_time(log)
def miRNA_annotation(mapping_df):
    """Extract miRNA specific annotation from mapping. RaH Faustrules are applied.
    """
    
    miRNAs_df = mapping_df[mapping_df.small_RNA_class_annotation == 'miRNA']
    
    nr_missing_alignments_expected = len(miRNAs_df.loc[miRNAs_df.duplicated(['tmp_seq_id','reference'], keep='first'),:])
    
    # load positions of mature miRNAs within precursor
    miRNA_pos_df = pd.read_csv(mat_miRNA_pos_path, sep='\t')
    miRNA_pos_df.drop(columns=['precursor_length'], inplace=True)
    miRNAs_df = miRNAs_df.merge(miRNA_pos_df, left_on='precursor_name_full', right_on='name_precursor', how='left')

    # load mature miRNA sequences from miRBase
    miRBase_mature_df = fasta2df_subheader(miRBase_mature_path,0)
    # subset to human miRNAs
    miRBase_mature_df = miRBase_mature_df.loc[miRBase_mature_df.index.str.contains('hsa-'),:]
    miRBase_mature_df.index = miRBase_mature_df.index.str.replace('hsa-','')
    miRBase_mature_df.reset_index(inplace=True)
    miRBase_mature_df.columns = ['name_mature','ref_miR_seq']
    # add 'ref_miR_seq' 
    miRNAs_df = miRNAs_df.merge(miRBase_mature_df, left_on='name_mature', right_on='name_mature', how='left')

    # for each duplicated tmp_seq_id/reference combi, keep the one lowest lev dist of sequence to ref_miR_seq
    miRNAs_df['lev_dist'] = miRNAs_df.apply(lambda x: distance(x['sequence'], x['ref_miR_seq']), axis=1)
    miRNAs_df = miRNAs_df.sort_values(by=['tmp_seq_id','lev_dist'], ascending=[True, True]).drop_duplicates(['tmp_seq_id','reference'], keep='first')

    # add ref_iso flag
    miRNAs_df['miRNA_ref_iso'] = np.where(
        (
            (miRNAs_df.ref_start == miRNAs_df.mature_start) 
            & (miRNAs_df.ref_end == miRNAs_df.mature_end) 
            & (miRNAs_df.mms == 0)
        ), 'refmiR', 'isomiR'
    )

    # apply RaH Faustrules
    miRNAs_df['faustrules_check'] =  miRNAs_df.apply(faustrules_check, axis=1)

    # set miRNA_ref_iso to 'misc-miR' if faustrules_check is False
    miRNAs_df.loc[~miRNAs_df.faustrules_check,'miRNA_ref_iso'] = 'misc-miR'

    # set subclass_name to name_mature if faustrules_check is True, else use precursor_name
    miRNAs_df['subclass_name'] = np.where(miRNAs_df.faustrules_check, miRNAs_df.name_mature, miRNAs_df.precursor_name)

    # store name_mature for functional analysis as miRNA_names, set miR- to mir- if faustrules_check is False
    miRNAs_df['miRNA_names'] = np.where(miRNAs_df.faustrules_check, miRNAs_df.name_mature, miRNAs_df.name_mature.str.replace('miR-', 'mir-'))

    # add subclass (NOTE: in cases where subclass is not part of mature name, use position relative to precursor half to define group )
    miRNAs_df['subclass_type'] = np.where(miRNAs_df.name_mature.str.endswith('5p'), '5p', np.where(miRNAs_df.name_mature.str.endswith('3p'), '3p', 'tbd'))
    miRNAs_df.loc[((miRNAs_df.subclass_type == 'tbd') & (miRNAs_df.mature_start < miRNAs_df.precursor_length/2)), 'subclass_type'] = '5p'
    miRNAs_df.loc[((miRNAs_df.subclass_type == 'tbd') & (miRNAs_df.mature_start >= miRNAs_df.precursor_length/2)), 'subclass_type'] = '3p'
    
    # subset to relevant columns
    miRNAs_df = miRNAs_df[list(mapping_df.columns) + ['subclass_name','miRNA_ref_iso','miRNA_names','ref_miR_seq']]

    return miRNAs_df, nr_missing_alignments_expected


#%%
######################################################################################################
# annotation of other sRNA classes
######################################################################################################
def get_bin_with_max_overlap_parallel(df):
    return df.apply(get_bin_with_max_overlap, axis=1)

def applyParallel(df, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for group in np.array_split(df,30))
    return pd.concat(retLst)


@log_time(log)
def other_sRNA_annotation_new_binning(mapping_df):
    """Generate subclass_name for non-tRNA/miRNA sRNAs by precursor-binning.
    New binning approach: bin size is dynamically determined by the precursor length. Assignments are based on the bin with the highest overlap.
    """

    other_sRNAs_df = mapping_df[~((mapping_df.small_RNA_class_annotation == 'miRNA') | (mapping_df.small_RNA_class_annotation == 'tRNA'))]
    
    #create empty columns; bin start and bin end
    other_sRNAs_df['bin_start'] = ''
    other_sRNAs_df['bin_end'] = ''
    
    other_sRNAs_df = applyParallel(other_sRNAs_df, get_bin_with_max_overlap_parallel)
    
    return other_sRNAs_df


#%%
@log_time(log)
def extract_sRNA_class_specific_info(mapping_df):
    tRNAs_df = tRNA_annotation(mapping_df)
    miRNAs_df, nr_missing_alignments_expected = miRNA_annotation(mapping_df)
    other_sRNAs_df = other_sRNA_annotation_new_binning(mapping_df)
    
    # add miRNA columns
    tRNAs_df[['miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq']] = pd.DataFrame(columns=['miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq'])
    other_sRNAs_df[['miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq']] = pd.DataFrame(columns=['miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq'])
    
    # re-concat sRNA class dfs
    sRNA_anno_df = pd.concat([miRNAs_df, tRNAs_df, other_sRNAs_df],axis=0)

    # TEST if alignments were lost or duplicated
    assert ((len(mapping_df) - nr_missing_alignments_expected) == len(sRNA_anno_df)), "alignments were lost or duplicated" 
    
    return sRNA_anno_df

#%%
def get_nth_nt(row):
    return row['sequence'][int(row['PTM_position_in_seq'])-1]


#%%
@log_time(log)
def aggregate_info_per_seq(sRNA_anno_df):
    # fillna of 'subclass_name_bin_pos' with 'subclass_name'
    sRNA_anno_df['subclass_name_bin_pos'] = sRNA_anno_df['subclass_name_bin_pos'].fillna(sRNA_anno_df['subclass_name'])
    # get aggregated info per seq 
    aggreg_per_seq_df = sRNA_anno_df.groupby(['sequence']).agg({'small_RNA_class_annotation': lambda x: ';'.join(sorted(x.unique())), 'pseudo_class': lambda x: ';'.join(x.astype(str).sort_values(ascending=True).unique()), 'subclass_type': lambda x: ';'.join(x.astype(str).sort_values(ascending=True).unique()), 'subclass_name': lambda x: ';'.join(sorted(x.unique())), 'subclass_name_bin_pos': lambda x: ';'.join(sorted(x.unique())), 'miRNA_names': lambda x: ';'.join(x.fillna('').unique()), 'precursor_name_full': lambda x: ';'.join(sorted(x.unique())), 'mms': lambda x: ';'.join(x.astype(str).sort_values(ascending=True).unique()), 'reference': lambda x: len(x), 'mitochondrial': lambda x: ';'.join(x.astype(str).sort_values(ascending=True).unique()), 'ref_miR_seq': lambda x: ';'.join(x.fillna('').unique())})
    aggreg_per_seq_df['miRNA_names'] = aggreg_per_seq_df.miRNA_names.str.replace(r';$','', regex=True)
    aggreg_per_seq_df['ref_miR_seq'] = aggreg_per_seq_df.ref_miR_seq.str.replace(r';$','', regex=True)
    aggreg_per_seq_df['mms'] = aggreg_per_seq_df['mms'].astype(int)

    # re-add 'miRNA_ref_iso','tRNA_ref_iso'
    refmir_df = sRNA_anno_df[['sequence','miRNA_ref_iso','tRNA_ref_iso']]
    refmir_df.drop_duplicates('sequence', inplace=True)
    refmir_df.set_index('sequence', inplace=True)
    aggreg_per_seq_df = aggreg_per_seq_df.merge(refmir_df, left_index=True, right_index=True, how='left')

    # TEST if sequences were lost
    assert (len(aggreg_per_seq_df) == len(sRNA_anno_df.sequence.unique())), "sequences were lost by aggregation" 

    # load unmapped seqs, if it exits
    if os.path.exists(unmapped_file):
        unmapped_df = fasta2df(unmapped_file)
        unmapped_df = pd.DataFrame(data='no_annotation', index=unmapped_df.sequence, columns=aggreg_per_seq_df.columns)
        unmapped_df['mms'] = np.nan
        unmapped_df['reference'] = np.nan
        unmapped_df['pseudo_class'] = True # set no annotation as pseudo_class

        # merge mapped and unmapped
        annotation_df = pd.concat([aggreg_per_seq_df,unmapped_df])
    else:
        annotation_df = aggreg_per_seq_df.copy()

    # load mapping to genome file
    mapping_genome_df = pd.read_csv(mapped_genome_file, index_col=0, sep='\t', header=None)
    mapping_genome_df.columns = ['strand','reference','ref_start','sequence','other_alignments','mm_descriptors']
    mapping_genome_df = mapping_genome_df[['strand','reference','ref_start','sequence','other_alignments']]

    # use reverse complement of 'sequence' for 'strand' == '-'
    mapping_genome_df.loc[:,'sequence'] = np.where(mapping_genome_df.strand == '-', mapping_genome_df.sequence.apply(lambda x: reverse_complement(x)), mapping_genome_df.sequence)

    # get aggregated info per seq
    aggreg_per_seq__genome_df = mapping_genome_df.groupby('sequence').agg({'reference': lambda x: ';'.join(sorted(x.unique())), 'other_alignments': lambda x: len(x)})
    aggreg_per_seq__genome_df['other_alignments'] = aggreg_per_seq__genome_df['other_alignments'].astype(int)

    # number of genomic loci
    genomic_loci_df = pd.DataFrame(mapping_genome_df.sequence.value_counts())
    genomic_loci_df.columns = ['num_genomic_loci_maps']

    # load too many aligments seqs
    if os.path.exists(toomanyloci_genome_file):
        toomanyloci_genome_df = fasta2df(toomanyloci_genome_file)
        toomanyloci_genome_df = pd.DataFrame(data=101, index=toomanyloci_genome_df.sequence, columns=genomic_loci_df.columns)
    else:
        toomanyloci_genome_df = pd.DataFrame(columns=genomic_loci_df.columns)

    # load unmapped seqs
    if os.path.exists(unmapped_genome_file):
        unmapped_genome_df = fasta2df(unmapped_genome_file)
        unmapped_genome_df = pd.DataFrame(data=0, index=unmapped_genome_df.sequence, columns=genomic_loci_df.columns)
    else:
        unmapped_genome_df = pd.DataFrame(columns=genomic_loci_df.columns)

    # concat toomanyloci, unmapped, and genomic_loci
    num_genomic_loci_maps_df = pd.concat([genomic_loci_df,toomanyloci_genome_df,unmapped_genome_df])

    # merge to annotation_df
    annotation_df = annotation_df.merge(num_genomic_loci_maps_df, left_index=True, right_index=True, how='left')
    annotation_df.reset_index(inplace=True)

    # add 'miRNA_seed'
    annotation_df.loc[:,"miRNA_seed"] = np.where(annotation_df.small_RNA_class_annotation.str.contains('miRNA', na=False), annotation_df.sequence.str[1:9], "")

    # TEST if nan values in 'num_genomic_loci_maps'
    assert (annotation_df.num_genomic_loci_maps.isna().any() == False), "nan values in 'num_genomic_loci_maps'" 

    return annotation_df


#%%
@log_time(log)
def get_five_prime_adapter_info(annotation_df, five_prime_adapter):
    adapter_df = pd.DataFrame(index=annotation_df.sequence)

    min_length = 6

    is_prefixed = None
    print("5' adapter affixes:")
    for l in range(0, len(five_prime_adapter) - min_length):
        is_prefixed_l = adapter_df.index.str.startswith(five_prime_adapter[l:])
        print(f"{five_prime_adapter[l:].ljust(30, ' ')}{is_prefixed_l.sum()}")
        adapter_df.loc[adapter_df.index.str.startswith(five_prime_adapter[l:]), "five_prime_adapter_length"] = len(five_prime_adapter[l:])
        if is_prefixed is None:
            is_prefixed = is_prefixed_l
        else:
            is_prefixed |= is_prefixed_l

    print(f"There are {is_prefixed.sum()} prefixed features.")
    print("\n")

    adapter_df['five_prime_adapter_length'] = adapter_df['five_prime_adapter_length'].fillna(0)
    adapter_df['five_prime_adapter_length'] =  adapter_df['five_prime_adapter_length'].astype('int')
    adapter_df['five_prime_adapter_filter'] = np.where(adapter_df['five_prime_adapter_length'] == 0, True, False)
    adapter_df = adapter_df.reset_index()
    
    return adapter_df

#%%
@log_time(log)
def reduce_ambiguity(annotation_df: pd.DataFrame) -> pd.DataFrame:
    """Reduce ambiguity by 

    a) using subclass_name of precursor with shortest genomic context, if all other assigned precursors overlap with its genomic region
    
    b) using subclass_name whose bin is at the 5' or 3' end of the precursor

    Parameters
    ----------
    annotation_df : pd.DataFrame
        A DataFrame containing the annotation of the sequences (var)

    Returns
    -------
    pd.DataFrame
        An improved version of the input DataFrame with reduced ambiguity
    """

    # extract ambigious assignments for subclass name
    ambigious_matches_df = annotation_df[annotation_df.subclass_name.str.contains(';',na=False)]
    if len(ambigious_matches_df) == 0:
        print('No ambigious assignments for subclass name found.')
        return annotation_df
    clear_matches_df = annotation_df[~annotation_df.subclass_name.str.contains(';',na=False)]

    # extract required information from HBDxBase
    HBDxBase_all_df = pd.read_csv(HBDxBase_csv, index_col=0)
    bin_dict = HBDxBase_all_df[['precursor_name','precursor_bins']].set_index('precursor_name').to_dict()['precursor_bins']
    sRNA_class_dict = HBDxBase_all_df[['precursor_name','small_RNA_class_annotation']].set_index('precursor_name').to_dict()['small_RNA_class_annotation']
    pseudo_class_dict = HBDxBase_all_df[['precursor_name','pseudo_class']].set_index('precursor_name').to_dict()['pseudo_class']
    sc_type_dict = HBDxBase_all_df[['precursor_name','subclass_type']].set_index('precursor_name').to_dict()['subclass_type']
    genomic_context_bed = HBDxBase_all_df[['chr','start','end','precursor_name','score','strand']]
    genomic_context_bed.columns = ['seq_id','start','end','name','score','strand']
    genomic_context_bed.reset_index(drop=True, inplace=True)
    genomic_context_bed['genomic_length'] = genomic_context_bed.end - genomic_context_bed.start


    def get_overlaps(genomic_context_bed: pd.DataFrame, name: str = None, complement: bool = False) -> list:
        """Get genomic overlap of a given precursor name

        Parameters
        ----------
        genomic_context_bed : pd.DataFrame
            A DataFrame containing genomic locations of precursors in bed format
        with column names: 'chr','start','end','precursor_name','score','strand'
        name : str
            The name of the precursor to get genomic context for
        complement : bool
            If True, return all precursors that do not overlap with the given precursor

        Returns
        -------
        list
            A list containing the precursors in the genomic (anti-)context of the given precursor 
            (including the precursor itself)
        """
        series_OI = genomic_context_bed[genomic_context_bed['name'] == name]
        start = series_OI['start'].values[0]
        end = series_OI['end'].values[0]
        seq_id = series_OI['seq_id'].values[0]
        strand = series_OI['strand'].values[0]

        overlap_df = genomic_context_bed.copy()

        condition = (((overlap_df.start > start) &
                        (overlap_df.start < end)) |
                        ((overlap_df.end > start) &
                        (overlap_df.end < end)) |
                        ((overlap_df.start < start) &
                        (overlap_df.end > start)) |
                        ((overlap_df.start == start) &
                        (overlap_df.end == end)) |
                        ((overlap_df.start == start) &
                        (overlap_df.end > end)) |
                        ((overlap_df.start < start) &
                        (overlap_df.end == end)))
        if not complement:
            overlap_df = overlap_df[condition]
        else:
            overlap_df = overlap_df[~condition]
        overlap_df = overlap_df[overlap_df.seq_id == seq_id]
        if strand is not None:
            overlap_df = overlap_df[overlap_df.strand == strand]
        overlap_list = overlap_df['name'].tolist()
        return overlap_list


    def check_genomic_ctx_of_smallest_prec(precursor_name: str) -> str:
        """Check for a given ambigious precursor assignment (several names separated by ';')
        if all assigned precursors overlap with the genomic region
        of the precursor with the shortest genomic context

        Parameters
        ----------
        precursor_name: str
            A string containing several precursor names separated by ';'

        Returns
        -------
        str
            The precursor suggested to be used instead of the multi assignment, 
            or None if the ambiguity could not be resolved
        """
        assigned_names = precursor_name.split(';')

        tmp_genomic_context = genomic_context_bed[genomic_context_bed.name.isin(assigned_names)]
        # get name of smallest genomic region
        if len(tmp_genomic_context) > 0:
            smallest_name = tmp_genomic_context.name[tmp_genomic_context.genomic_length.idxmin()]
            # check if all assigned names are in overlap of smallest genomic region
            if set(assigned_names).issubset(set(get_overlaps(genomic_context_bed,smallest_name))):
                return smallest_name
            else:
                return None
        else:
            return None
        
    def get_subclass_name(subclass_name: str, short_prec_match_new_name: str) -> str:
        """Get subclass name matching to a precursor name from a ambigious assignment (several names separated by ';')

        Parameters
        ----------
        subclass_name: str
            A string containing several subclass names separated by ';'
        short_prec_match_new_name: str
            The name of the precursor to be used instead of the multi assignment

        Returns
        -------
        str
            The subclass name suggested to be used instead of the multi assignment, 
            or None if the ambiguity could not be resolved
        """
        if short_prec_match_new_name is not None:
            matches = get_close_matches(short_prec_match_new_name,subclass_name.split(';'),cutoff=0.2)
            if matches:
                return matches[0]
            else:
                print(f"Could not find match for {short_prec_match_new_name} in {subclass_name}")
                return subclass_name
        else:
            return None


    def check_end_bins(subclass_name: str) -> str:
        """Check for a given ambigious subclass name assignment (several names separated by ';')
        if ambiguity can be resolved by selecting the subclass name whose bin matches the 3'/5' end of the precursor

        Parameters
        ----------
        subclass_name: str
            A string containing several subclass names separated by ';'

        Returns
        -------
        str
            The subclass name suggested to be used instead of the multi assignment, 
            or None if the ambiguity could not be resolved
        """
        for name in subclass_name.split(';'):
            if '_bin-' in name:
                name_parts = name.split('_bin-')
                if name_parts[0] in bin_dict and bin_dict[name_parts[0]] == int(name_parts[1]):
                    return name
                elif int(name_parts[1]) == 1:
                    return name
        return None


    def adjust_4_resolved_cases(row: pd.Series) -> tuple:
        """For a resolved ambiguous subclass names return adjusted values of 
        precursor_name_full, small_RNA_class_annotation, pseudo_class, and subclass_type 

        Parameters
        ----------
        row: pd.Series
            A row of the var annotation containing the columns 'subclass_name', 'precursor_name_full',
            'small_RNA_class_annotation', 'pseudo_class', 'subclass_type', and 'ambiguity_resolved'

        Returns
        -------
        tuple
            A tuple containing the adjusted values of 'precursor_name_full', 'small_RNA_class_annotation', 
            'pseudo_class', and 'subclass_type' for resolved ambiguous cases and the original values for unresolved cases
        """
        if row.ambiguity_resolved:
            matches_prec = get_close_matches(row.subclass_name, row.precursor_name_full.split(';'), cutoff=0.2)
            if matches_prec:
                return matches_prec[0], sRNA_class_dict[matches_prec[0]], pseudo_class_dict[matches_prec[0]], sc_type_dict[matches_prec[0]]
        return row.precursor_name_full, row.small_RNA_class_annotation, row.pseudo_class, row.subclass_type
    
    
    # resolve ambiguity by checking genomic context of smallest precursor
    ambigious_matches_df['short_prec_match_new_name'] = ambigious_matches_df.precursor_name_full.apply(check_genomic_ctx_of_smallest_prec)
    ambigious_matches_df['short_prec_match_new_name'] = ambigious_matches_df.apply(lambda x: get_subclass_name(x.subclass_name, x.short_prec_match_new_name), axis=1)
    ambigious_matches_df['short_prec_match'] = ambigious_matches_df['short_prec_match_new_name'].notnull()

    # resolve ambiguity by checking if bin matches 3'/5' end of precursor
    ambigious_matches_df['end_bin_match_new_name'] = ambigious_matches_df.subclass_name.apply(check_end_bins)
    ambigious_matches_df['end_bin_match'] = ambigious_matches_df['end_bin_match_new_name'].notnull()

    # check if short_prec_match and end_bin_match are equal in any case
    test_df = ambigious_matches_df[((ambigious_matches_df.short_prec_match == True) & (ambigious_matches_df.end_bin_match == True))]
    if not (test_df.short_prec_match_new_name == test_df.end_bin_match_new_name).all():
        print('Number of cases where short_prec_match is not matching end_bin_match_new_name:',len(test_df[(test_df.short_prec_match_new_name != test_df.end_bin_match_new_name)]))

    # replace subclass_name with short_prec_match_new_name or end_bin_match_new_name
    # NOTE: if short_prec_match and end_bin_match are True, short_prec_match_new_name is used
    ambigious_matches_df['subclass_name'] = ambigious_matches_df.apply(lambda x: x.end_bin_match_new_name if x.end_bin_match == True else x.subclass_name, axis=1)
    ambigious_matches_df['subclass_name'] = ambigious_matches_df.apply(lambda x: x.short_prec_match_new_name if x.short_prec_match == True else x.subclass_name, axis=1)

    # generate column 'ambiguity_resolved' which is True if short_prec_match and/or end_bin_match is True
    ambigious_matches_df['ambiguity_resolved'] = ambigious_matches_df.short_prec_match | ambigious_matches_df.end_bin_match
    print("Ambiguity resolved?\n",ambigious_matches_df.ambiguity_resolved.value_counts(normalize=True))

    # for resolved ambiguous matches, adjust precursor_name_full, small_RNA_class_annotation, pseudo_class, subclass_type
    ambigious_matches_df[['precursor_name_full','small_RNA_class_annotation','pseudo_class','subclass_type']] = ambigious_matches_df.apply(adjust_4_resolved_cases, axis=1, result_type='expand')

    # drop temporary columns
    ambigious_matches_df.drop(columns=['short_prec_match_new_name','short_prec_match','end_bin_match_new_name','end_bin_match'], inplace=True)
    
    # concat with clear_matches_df
    clear_matches_df['ambiguity_resolved'] = False
    improved_annotation_df = pd.concat([clear_matches_df, ambigious_matches_df], axis=0)
    improved_annotation_df = improved_annotation_df.reindex(annotation_df.index)

    return improved_annotation_df

#%%
######################################################################################################
# HICO (=high confidence) annotation
######################################################################################################
@log_time(log)
def add_hico_annotation(annotation_df, five_prime_adapter):
    """For miRNAs only use hico annotation if part of miRBase hico set AND refmiR
    """

    # add 'TE_annotation'
    TE_df = pd.read_csv(TE_file, sep='\t', header=None, names=['sequence','TE_annotation'])
    annotation_df = annotation_df.merge(TE_df, left_on='sequence', right_on='sequence', how='left')

    # add 'bacterial' mapping filter
    bacterial_unmapped_df = fasta2df(unmapped_bacterial_file)
    annotation_df.loc[:,'bacterial'] = np.where(annotation_df.sequence.isin(bacterial_unmapped_df.sequence), False, True)

    # add 'viral' mapping filter
    viral_unmapped_df = fasta2df(unmapped_viral_file)
    annotation_df.loc[:,'viral'] = np.where(annotation_df.sequence.isin(viral_unmapped_df.sequence), False, True)

    # add 'adapter_mapping_filter' column 
    adapter_unmapped_df = fasta2df(unmapped_adapter_file)
    annotation_df.loc[:,'adapter_mapping_filter'] = np.where(annotation_df.sequence.isin(adapter_unmapped_df.sequence), True, False)

    # add filter column 'five_prime_adapter_filter' and column 'five_prime_adapter_length' indicating the length of the prefixed 5' adapter sequence
    adapter_df = get_five_prime_adapter_info(annotation_df, five_prime_adapter)
    annotation_df = annotation_df.merge(adapter_df, left_on='sequence', right_on='sequence', how='left')

    # apply ambiguity reduction
    annotation_df = reduce_ambiguity(annotation_df)

    # add 'single_class_annotation'
    annotation_df.loc[:,'single_class_annotation'] = np.where(annotation_df.small_RNA_class_annotation.str.contains(';',na=True), False, True)

    # add 'single_name_annotation'
    annotation_df.loc[:,'single_name_annotation'] = np.where(annotation_df.subclass_name.str.contains(';',na=True), False, True)

    # add 'hypermapper' for sequences where more than 50 potential mapping references are recorded
    annotation_df.loc[annotation_df.reference > 50,'subclass_name'] = 'hypermapper_' + annotation_df.reference.fillna(0).astype(int).astype(str)
    annotation_df.loc[annotation_df.reference > 50,'subclass_name_bin_pos'] = 'hypermapper_' + annotation_df.reference.fillna(0).astype(int).astype(str)
    annotation_df.loc[annotation_df.reference > 50,'precursor_name_full'] = 'hypermapper_' + annotation_df.reference.fillna(0).astype(int).astype(str)

    annotation_df.loc[:,'mitochondrial'] = np.where(annotation_df.mitochondrial.str.contains('mito',na=False), True, False)

    # add 'hico' 
    annotation_df.loc[:,'hico'] = np.where((
        (annotation_df.mms == 0) 
        & (annotation_df.single_name_annotation == True)
        & (annotation_df.TE_annotation.isna() == True)
        & (annotation_df.bacterial == False)
        & (annotation_df.viral == False)
        & (annotation_df.adapter_mapping_filter == True)
        & (annotation_df.five_prime_adapter_filter == True)
    ), True, False)
    ## NOTE: for miRNAs only use hico annotation if part of refmiR set
    annotation_df.loc[annotation_df.small_RNA_class_annotation == 'miRNA','hico'] = annotation_df.loc[annotation_df.small_RNA_class_annotation == 'miRNA','hico'] & (annotation_df.miRNA_ref_iso == 'refmiR')

    print(annotation_df[annotation_df.single_class_annotation == True].groupby('small_RNA_class_annotation').hico.value_counts())

    return annotation_df


#%%
######################################################################################################
# annotation pipeline
######################################################################################################
@log_time(log)
def main(five_prime_adapter):
    """Executes 'annotate_from_mapping'.

    Uses:

    - HBDxBase_csv
    - miRBase_mature_path
    - mat_miRNA_pos_path

    - mapping_file
    - unmapped_file
    - mapped_genome_file 
    - toomanyloci_genome_file 
    - unmapped_genome_file

    - TE_file
    - unmapped_adapter_file
    - unmapped_bacterial_file
    - unmapped_viral_file
    - five_prime_adapter

    """


    print('-------- extract general information for sequences that mapped to the HBDxBase --------')
    mapped_info_df = extract_general_info(mapped_file)
    print("\n")

    print('-------- extract sRNA class specific information for sequences that mapped to the HBDxBase --------')
    mapped_sRNA_anno_df = extract_sRNA_class_specific_info(mapped_info_df)

    print('-------- save to file --------')
    mapped_sRNA_anno_df.to_csv(sRNA_anno_file)
    print("\n")
    
    print('-------- aggregate information for mapped and unmapped sequences (HBDxBase & human genome) --------')
    sRNA_anno_per_seq_df = aggregate_info_per_seq(mapped_sRNA_anno_df)
    print("\n")

    print('-------- add hico annotation (based on aggregated infos + mapping to viral/bacterial genomes + intersection with TEs) --------')
    sRNA_anno_per_seq_df = add_hico_annotation(sRNA_anno_per_seq_df, five_prime_adapter)
    print("\n")

    print('-------- save to file --------')
    # set sequence as index again
    sRNA_anno_per_seq_df.set_index('sequence', inplace=True)
    sRNA_anno_per_seq_df.to_csv(aggreg_sRNA_anno_file)
    print("\n")

    print('-------- generate subclass_to_annotation dict --------')
    result_df = sRNA_anno_per_seq_df[['subclass_name', 'small_RNA_class_annotation']].copy()
    result_df.reset_index(drop=True, inplace=True)
    result_df.drop_duplicates(inplace=True)
    result_df = result_df[~result_df["subclass_name"].str.contains(";")] 
    subclass_to_annotation = dict(zip(result_df["subclass_name"],result_df["small_RNA_class_annotation"]))
    with open('subclass_to_annotation.json', 'w') as fp:
        json.dump(subclass_to_annotation, fp)

    print('-------- delete tmp files --------')
    os.system("rm *tmp_*")


#%%