"""Streamlit page builders for several CRISPR guide-design tools."""

import os
import time

import numpy as np
import pandas as pd
import streamlit as st
from PIL import Image
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode, DataReturnMode

st.set_page_config(layout="wide")

# NOTE(review): the HTML payload passed here is blank in this file -- confirm
# whether a <style> block was intended.
st.markdown(
    """ """,
    unsafe_allow_html=True,
)

# NOTE(review): every markdown/LaTeX body in this module is reproduced exactly
# as found. The text looks as if list line breaks were lost at some point;
# confirm against the rendered app before re-flowing any of it.

# LaTeX snippets rendered through st.markdown by the page functions.
latext1 = r''' $$ f(s_j) = \frac{1}{1+exp(-g(s_j))} \\ g(s_j) = int + \sum_{i} w_{ij} \\ where f(s_j) \epsilon \ [0,1] \\ '''
latext2 = r''' $$ Specificity (g) = \frac{1}{1+\sum_{o\epsilon OffTargets(g)} CFD(o,g)} \\ '''
gene_rank = r''' $$ Score(g) = min[Specificity(g), Specificity(g'), 1.25*CuttingEfficiency(g)] \\ '''
sgran_target_score = r''' $$ score = \sum_{off\_targets}[\frac{log_{10}(dist) + score_{off\_target}}{total\_off\_targets}] - total\_off\_targets - (2) \\ '''
off_target = r''' $$ score_{Off-target} = \sum_{mismatch} 1.2^{pos} - (1)\\ '''


def transform(df, str):
    """Return ``df`` restricted to the columns picked in a multiselect widget.

    Args:
        df: DataFrame whose columns are offered; all of them are preselected.
        str: Label shown on the multiselect. The name shadows the builtin but
            is kept so existing callers keep working.

    Returns:
        ``df[cols]`` for the selected column names.
    """
    all_columns = df.columns.tolist()
    cols = st.multiselect(str, all_columns, all_columns)
    return df[cols]


# st.cache is deprecated in favour of st.cache_data. Use the new decorator when
# the installed Streamlit provides it and fall back to st.cache otherwise; the
# `or` keeps st.cache from being touched on releases that no longer have it.
_cache_data = getattr(st, "cache_data", None) or st.cache


@_cache_data
def convert_df1(df):
    """Return ``df`` as UTF-8 encoded CSV bytes without the index (cached)."""
    return df.to_csv(index=False).encode('utf-8')


def convert_df(df):
    """Return ``df`` as UTF-8 encoded CSV bytes, index column included."""
    return df.to_csv().encode('utf-8')


def BEDICT(method, select_method):
    """Render the BE-DICT page.

    Writes the header, five expanders (Summary, How it works, References,
    Tool Options, Scoring) and the closing notes, then calls ``display_res``.

    Args:
        method: Text appended to the page header (concatenated, so a ``str``).
        select_method: Passed through unchanged to ``display_res``.

    ``caution`` and ``display_res`` are names defined elsewhere in this file.
    """
    st.header("BE-DICT: "+method)

    # --- Summary ---
    summary = st.expander("Summary")
    summary.markdown("BE-DICT predicts base editing outcomes for **4 commonly used base editors (BEs)** by employing an **attention-based deep learning [algorithm](https://www.nature.com/articles/s41467-021-25375-z)** trained on high-throughput target library (of 28,394 target sequences) screens. It **predicts the editing of a target and surrounding bystander nucleotides** for the following ABEs and CBEs.")
    summary.write(
        """ - Adenine Base Editors (ABEs) - Based on Adenine deaminase ecTad7.10 **(ABEmax)** and ecTadA-8e **(ABE8e)** - Cytosine Base Editors (CBEs) - Based on Cytosine deaminase rAPOPEC1 **(CBE4max)** and **Target-AID** """
    )
    summary.markdown("Most base editors convert target bases in a ~5- nucleotide region within the protospacer target sequence and undesired **bystander** editing of additional C or A bases in the editing window are common. All results shown here are based on bystander models for each ABE and CBE.")

    # --- How it works ---
    how_it_works = st.expander("How it works")
    how_it_works.markdown(
        """ BE-DICT utilizes comprehensive base editing data (to train and test) generated in the ABE and CBE target library screens (8,558 for ABEmax; 9,534 for CBE4max; 3,416 for ABE8e; 10,177 for Target-AID) for designing and training a machine learning model capable of predicting base editing outcomes at any given target site. The model has three main blocks: - **Embedding block** that embeds both the nucleotide’s and its correponding position from one-hot encoded representation to a dense vector representation. - **Encoder block** that contains: - A self-attention layer (with multi-head support). - Layer normalization & residual connections. - Feed-forward network. - **Output block** that contains: - A position attention layer - A classifier layer. """
    )
    how_it_works.markdown('BE-DICT predicts base editing efficiencies with high accuracy from previously published experiments as well as 18/16 separate endogenous genomic loci for ABEmax/CBE4max and ABE8e/Target-AID.')
    how_it_works.markdown('Currently, BE-DICT is trained on datasets for ABEmax, CBE4- max, ABE8e, and Target-AID.')

    # --- References ---
    references = st.expander("References")
    references.write("[BE-DICT Web App](http://130.60.24.130/page-set?actionID=5f8c494b8c854d0029ffa9d3)")
    references.write("[BE-DICT Paper](https://www.nature.com/articles/s41467-021-25375-z)")

    # --- Tool options ---
    options = st.expander("Tool Options: All you can do with this tool")
    options.markdown('This tool uses an **attention based deep learning framework** to predict base editing outcomes for **4 popular** base editors: ABEMax, ABE8e, CBE4max and Target-AID.')
    options.markdown(
        """ Input: - 2-column csv: - col1=Inp_seq - col2=seq_id where **Inp_seq is a 20 bp** target sequence and seq_id is an identification. """
    )
    options.markdown(
        """ Output: - 5 column csv file - Columns of interest: - Inp_seq: - Outp_seq: - Pred_score: Higher is better """
    )
    options.markdown("**Batch mode** can be run from [here](http://130.60.24.130/page?actionID=607552549609a200293b663f)")

    # --- Scoring ---
    scoring = st.expander("Scoring")
    scoring.markdown(
        """ Editing efficiency of a base editor depends on: - Consensus sequence preference of the deaminase. - Binding efficiency of the sgRNA to the protospacer. - Undesired ‘bystander’ editing of additional C or A bases in the editing window, among other factors. """
    )
    scoring.markdown("\n")
    scoring.markdown(
        """ BE-DICT uses an **attention-based machine learning [algorithm](https://www.nature.com/articles/s41467-021-25375-z) (an encoder–decoder architecture)** to predict base editing outcomes of commonly used **ABEs** and **CBEs**. - It takes protospacer sequence as input - Assigns a weight (attention-score) to each base within the protospacer - Computes the probability of editing for each target nucleotide as output - The output is a probability score, reflecting the likelihood (between 0 and 1) with which a target base will be edited (C-to-T or A-to-G). """
    )
    scoring.markdown('BE- DICT implements **per-base (probability score whether a single base will be edited)** as well as **bystander (probabilities for all combinations of sequences with target-based and bystander transitions, as well as the probability of observing a wild-type sequence)** model.')

    # --- Notes and results ---
    st.markdown('**Please note that this tool only targets NGG PAM**')
    st.markdown("**Please note:** *Only one of all possible alleles is used to generate this output.*")
    st.markdown("**Also note that:** *All results shown here are based on bystander models for each ABE and CBE.*")
    st.markdown(caution, unsafe_allow_html=True)
    # presumably (method, field separator, id column, selection) -- confirm
    # against display_res.
    display_res(method, ',', 'rs_id', select_method)


def Chopchop(method, select_method):
    """Render the Chopchop page.

    Writes the header, five expanders (Summary, How it works, References,
    Tool Options, Scoring) and the closing notes, then calls ``display_res``
    with a tab separator and ``'snp_id'``.

    Args:
        method: Text appended to the page header (concatenated, so a ``str``).
        select_method: Passed through unchanged to ``display_res``.

    ``caution`` and ``display_res`` are names defined elsewhere in this file.
    """
    st.header("Chopchop: "+method)

    # --- Summary ---
    summary = st.expander("Summary")
    summary.markdown(
        """ ChopChop is a versatile tool that identifies single guide RNA (sgRNA) targets for CRISPR–Cas9 system for: - **DNA** - **RNA** - **Targeted enrichment of loci for long-read sequencing** for over **200 genomes** and **3 transcriptomes**. \n\n It offers a wide range of selection of **CRISPR effectors** (Cas9, Cas9 Nickase, CasX, and Cas13), **Species**, and **Purpose** (knockout, knockdown, activation, repression, enrichment) alongside a variety of **Options** including selection of specific region, PAM sequences, various efficiency measures, primers, prediction of Cas9 repair outcomes etc. """
    )

    # --- How it works ---
    how_it_works = st.expander("How it works")
    how_it_works.markdown(""" CHOPCHOP accepts **input** in one of the following forms: - Gene name - Genomic coordinates - **In batch mode, we used** a text file containing chr:start-end per line for each snp. Ex: chr1:152220450-152220451". - DNA sequence Based on the input provided, chopchop retrieves sequence (corresponding to gene name/coordinates) and scan it for all potential target (and off-target) sites (based on search requirement selected). **Each sgRNA** is then ranked according to: - Number of off-targets in the genome - Number of mismatches lie within the off-targets. - Ranked by: - GC-content - Presence of a guanine (G) at position 20 in the sgRNA target site - Any target sites with the same score are then sorted by their position in the gene (with preference to 5′ positions). **Output:** A tab separated text file - **Columns of interest**: - Target sequence - Efficiency (**higher the better**) Please note that not all options have Efficiency defined [Ref](https://chopchop.cbu.uib.no/instructions) """
    )
    how_it_works.markdown("**Instructions to run Batch mode** can be found [here](https://bitbucket.org/valenlab/chopchop/src/master/)")

    # --- References ---
    references = st.expander("References")
    references.write("[ChopChop Web App](https://chopchop.cbu.uib.no)")
    references.write("[ChopChop Paper](https://academic.oup.com/nar/article/47/W1/W171/5491735)")

    # --- Tool options ---
    options = st.expander("Tool Options: All you can do with this tool")
    options.write(
        """ This tool offers sgRNA design for: - **DNA using:** - CRISPR/Cas9 system for knockout, knockin, activation, repression and nanopore enrichment. - CRISPR/Cpf1 or CasX system for knockout, activation, repression and nanopore enrichment. - CRISPR/Cas9 nickase system for knockout and knockin. - TALEN system for knockout. - **RNA using:** - CRISPR/Cas13 (C2C2) for knockdown. **This tool also offers a variety of PAM sequences and other filtering options.** \n\n **Scoring:** It offers ***2 off-target detection*** methods and ***7 efficiency scores*** to aid users in selecting optimal sgRNAs for their research. """
    )

    # --- Scoring ---
    scoring = st.expander("Scoring")
    scoring.markdown('All CRISPR–Cas applications use an sgRNA to direct the CRISPR effector (Cas9 and its variants) protein to its target. In theory, CRISPR–Cas targeting only requires complementarity between the sgRNA and its DNA target. However, **efficient targeting** follows more complex factors including **1)** position of specific nucleotides in the target sequence, **2)** the accessibility of the target site, and **3)** the sequence of its flanking regions as well as specific design.')
    scoring.markdown(
        """ **Efficiency score:** *Efficiency* column displays normalized score between 0-1. Based on **Cas9 effector (Cas9/Cas9 nickase/CasX/Cpf1)** used, user can select an appropriate scoring algorithm from one of the following options. - **G20**: Prioritizes a guanine at position 20, just upstream of PAM. - Doench et al. 2014 - only for NGG PAM - Doench et al. 2016 - only for NGG PAM (default) - Chari et al. 2015 - only NGG and NNAGAAW PAM's in hg19 and mm10 - Xu et al. 2015 - only for NGG PAM (but can be used for other PAMs) - Moreno-Mateos et al. 2015 - only for NGG PAM. **Ranking:** Target sites are ranked according to: - Efficiency score - Number of off-targets and whether they have mismatches - Existence of self-complementarity regions longer than 3 nt. The number indicates how many regions of self-complementarity are predicted. - GC-content - Location of sgRNA within a gene (5’ (best) -> 3’ (worst)) """
    )
    scoring.markdown(
        """ \n\n Here we describe these scoring algorithms.\n\n **Sequence model for predicting sgRNA efficiency** in CRISPR/Cas9 knockout (and CRISPRi/a) experiments [Xu et](https://genome.cshlp.org/content/25/8/1147): DNA sequence features that contribute to single guide RNA (sgRNA) efficiency in CRISPR-based screens - Predictive (sequence) model: - Select a list of essential genes whose deletion resulted in a growth disadvantage in genome-wide knockout experiments from published data sets. - Efficient sgRNAs: all sgRNAs targeting these essential genes (two fold depletion). - Inefficient sgRNAs: all sgRNAs targeting these essential genes (very low depletion w.r.to positive control) - Computedthe log odds ratio of nucleotide (40 bp sequences (aligned at the PAM) including the 19-bp or 20-bp spacer targets as well as their 3′ and 5′ flanking DNAs.) frequency between DNA sequences targeted by efficient and inefficient sgRNAs. - Feature selection (sgRNA nucleotides): (1) Signs of the odds ratios are concordant; and (2) magnitudes of the odds ratios are above a threshold in all three sgRNA sets, where the threshold was computed from a statistical significance analysis - Elastic-Net model was used to identify dominating nucleotide features: 28 sequence features were identified. """
    )
    scoring.markdown('Cas9 knockout mode: reports prediction of DSB repair outcomes. The model estimates the probability that a given sgRNA will result in a frameshift mutation.')
    # NOTE(review): the "Chari et. al [model](...)" link below has prose where
    # a URL is expected -- supply the real target.
    scoring.markdown(
        """ **CRISPR/Cas13 mRNA Knockdown:** - Input: Default is 28 bp (28 bp sgRNA without PAM and 1bp protospacer flanking site (PFS) - Output: MM0, MM1, MM2 and MM3 columns specify the number of off-target transcripts (with 0, 1, 2 or 3 mismatches, respectively) that your gRNA may bind to, outside of your target gene. **CRISPR/Cas9/Cas9 nickase Knock-in:** An sgRNA activity [model](https://www.nature.com/articles/nmeth.3543): - Sequence: 35 bp (6nt upstream flanking + 20nt sgRNA + 3nt PAM + 6nt downstream flanking) - Total Features: 684 - Mononucleotides Features: 140 - 4 bases (A,C,G,T) × 35 nt (6nt upstream, 20nt sgRNA, 3nt PAM, 6nt downstream) - Dinucleotide Features: 16 dinucleotides × 34 positions (of 35 bp sequence) - Using randomized logistic regression on 684 features with regularization and selected 91 features. **Chari et. al [model](support vector machine model):** A support vector machine model """
    )
    scoring.markdown(
        """ **A deep-learning-based (Convolutional Neural Network - CNN) based DeepCpf1 model for Cas12a/Cpf1 RNA [Kim et. al](https://www.nature.com/articles/nmeth.4104):** An automated feature engineering and prediction algorithm that automatically learn informative representations of target sequences relevant to Cpf1 activity profiles. - Input: 34-bp target sequence - Uses one-hot encoding input layer, convolution layer with 80 filter and rectified linear unit (ReLU) to the convolution outputs and three fully connected layers of 80, 40, and 40 units, chromatin accessibility integration layer and regression output layer. """
    )
    scoring.markdown(
        """ **A predictive model (logistic regression classifier) for sgRNA activity (CRISPR/Cas9 knockout) [Doench](https://www.nature.com/articles/nbt.3026):** Efficiency score of a CRISPR/Cas9 system depends on: - Nucleotide composition of the DNA downstream from the spacer target - Sequence features: 4 nt upstream of the sgRNA target site, the 20 nt of sgRNA complementarity, the PAM and the 3 nt downstream of the sgRNA target sequence. """
    )
    scoring.markdown('Self-complementarity column: Data suggests that self-complementarity within the gRNA or between the gRNA and RNA backbone can inhibit gRNA efficiency (Thyme et al., 2016 ). This option searches for complementarity within the gRNA, and between the gRNA and either a standard backbone (AGGCTAGTCCGT), an extended backbone (AGGCTAGTCCGT, ATGCTGGAA) or a custom backbone. Some users will choose to replace the leading nucleotides of their gRNA with “GG” for T7 transcription. Check this box to search for complementarity with the GG replacement.')
    scoring.markdown(
        """ **Method for determining off-targets in the genome ((selectable from Cas9/CasX/Cpf1 tabs in Options)):** - **Cong et al., 2013:** single-base mismatches up to 11 bp 5' of the PAM completely abolish cleavage by Cas9. However, mutations further upstream of the PAM retain cleavage activity. We have created a uniqueness method that searches for mismatches only in the first 9 bp, since a mismatch further towards the PAM motif is predicted to cause no cleavage. - **Hsu et al., 2013:** mismatches can be tolerated at any position except in the PAM motif. - Default Method: Searches for mismatches only in the 20 bp upstream of the PAM. """
    )

    # --- Notes and results ---
    st.markdown("**Please note that the tool was run for CRISPR/cas9 for NGG (knock-out), CRISPR/cas9-nickase (knock-out) for NGG and NRG (R=A or G), CRISPR/cpf1 for TTN, CRISPR/CasX for TTCN PAM and CRISPR/cas13(c2c2).**")
    st.markdown("**Please note that not all options results in an efficienc score (0 is reported in efficiency column).**")
    st.markdown(caution, unsafe_allow_html=True)
    display_res(method, '\t', 'snp_id', select_method)


def Guidescan2(method, select_method):
    """Render the Guidescan2 page.

    Writes the header, five expanders (Summary, How it works, References,
    Tool Options, Scoring), the closing note plus the ``tips`` and ``caution``
    snippets, then calls ``display_res`` with ``','`` and ``'query'``.

    Args:
        method: Text appended to the page header (concatenated, so a ``str``).
        select_method: Passed through unchanged to ``display_res``.

    ``tips``, ``caution`` and ``display_res`` are names defined elsewhere in
    this file.
    """
    st.header("Guidescan2: "+method)

    # --- Summary ---
    summary = st.expander("Summary")
    summary.markdown("GuideScan2 employes Cas9 (tracrRNA and crRNA) and Cas12a (also known as cpf1, requires only crRNA) for sgRNA **(single- and paired-gRNA)** design (coding and noncoding genomic regions) for 8 organisms. It enables construction of high-specificity gRNA databases with reduced off-target effects.")
    summary.markdown("CRISPR-Cas9 targets a 20-nucleotide spacer sequence at the end of the gRNA that is complementary to a DNA protospacer sequence followed immediately at the 3’ end by a PAM of the form NGG (more efficient targeting) or NAG (less efficient); here N stands for a ‘wildcard’, i.e. can match any nucleotide. Other natural and engineered CRISPR-Cas systems can **vary in PAM sequence, PAM position with respect to the protospacer sequence, and requirements on the level of similarity between gRNA and the target.**")
    summary.markdown("Given a genomic region, the task of gRNA design is to find gRNAs that can target anywhere in that region. Many potential gRNAs can target at multiple locations in the genome with varying efficiency. Typically a gRNA is designed to target a particular location with **perfect complementarity** with all other targets of this gRNA are being **off-targets**. **Goal** of gRNA design is typically to **maximize gRNA efficiency at the primary target site while minimizing off-targeting.**")
    summary.markdown("Variants and extensions of the gRNA design task include: paired gRNA design to select two gRNAs targeting flanking sites of a genomic region of interest; saturation experiment design to exhaustively select all gRNAs expected to target a selected region of interest; and library design to select a small number of the most effective gRNAs for each of hundreds or thousands of regions of interest.")

    # --- How it works ---
    how_it_works = st.expander("How it works")
    # The heading was written as "**Algorithm*:*" (unbalanced emphasis
    # markers); it is corrected to "**Algorithm:**" so it renders bold.
    how_it_works.write(
        """ **Algorithm:** - A single gRNA is evaluated against a genome B (Burrows-Wheeler Transform compressed genome and index). - All occurences of the sgRNA (A spacer sequence g and PAM set P, for all g′ in a Hamming distance (depth-first search using rank-queries on the forward and reverse complement strands of the BWT of the genome) ball of radius k centered at g) in B are identified. - These occurrences are validated against the PAM set P, **pruning** any occurrences that are not followed by a PAM in this set. - This set of validated occurrences forms the **set of targets for this gRNA**. - gRNAs that have multiple perfect occurrences (indistinguishable intended target) are **filtered out**. - **gRNA with a single perfect occurrence**, considered to be its primary target, is then included in the database. - All other **targets that contain mismatches are considered off-targets.** - GuideScan allows: target sequences, PAM, PAM position relative to the gRNA binding sequence, and gRNA length. **Potential off-targets:** - Uses a retrieval tree (trie, preprocess the targetable space in the genome, i.e. **all 20-mers followed by primary and secondary PAMs**) data structure to efficiently and precisely enumerates all targetable sequences (with a specific number of mismatches) present in a given genome. """
    )

    # --- References ---
    references = st.expander("References")
    references.write("[GuideScan2 Web App](https://guidescan.com)")
    references.write("[GuideScan2 Paper](https://www.biorxiv.org/content/10.1101/2022.05.02.490368v1)")

    # --- Tool options ---
    options = st.expander("Tool Options: All you can do with this tool")
    options.write(
        """ This tool offers sgRNA design for: - **CRISPR/Cas9** - **CRISPR/Cpf1** - **Please note that this tool work best for genomic intervals >30bp.** **Input:** - Line delimited Genomic intervals (or DNA sequence) as a text file in the webapp **[here](https://guidescan.com/)** in the following format (of genomic range 30bp, 40bp etc): - Line1: chr10:11676698-11676728 - Line2: chr1:152220435-152220465 - and so on **Output:** - A csv file containing all gRNAs within the genomic regions provided in the input file **Columns of interest (at most 6 gRNAa are reported from all possible)**: - gRNA-Seq and Target-Seq - PAM - Number of off-targets - Cutting efficiency (**Higher the better**) - Specificity (**Higher the better**) [Ref](https://www.biorxiv.org/content/10.1101/2022.05.02.490368v1.full.pdf) - Rank: - Uses a score that balances maximizing the gRNA specificity and cutting efficiency. """
    )
    options.markdown(gene_rank)

    # --- Scoring ---
    scoring = st.expander("Scoring")
    scoring.markdown(
        """ **Efficiency (Please see Scoring and Quality Matrices of the README section of this app):** Rule Set 2 [DOENCH 2016](https://www.nature.com/articles/nbt.3437) - sgRNAs were filtered out with cutting efficiency less than 0.25 or specificity less than 0.20 - Selected six gRNAs for each gene. For genes with more than six sgRNAs, Ranked Genes and (selected six genes) - Ranked gRNAs for each gene using a simple score that balances maximizing the gRNA specificity and cutting efficiency. - Nucleotide at the 5’ end of gRNA (called g) is replaced with a G (called g') for better efficiency - Ranking gRNAs for each gene is defined as **sgRNA Specificity and Rank score:** [DOENCH 2016](https://www.nature.com/articles/nbt.3437) """
    )
    scoring.markdown(latext2)
    scoring.markdown('sgRNA Rank Score')
    scoring.markdown(gene_rank)

    # --- Notes and results ---
    st.markdown("**Please note that the software was run with Cas9 (NGG PAM) and cpf1 (TTG PAM) option with all other options left as default.**")
    st.markdown(tips, unsafe_allow_html=True)
    st.markdown(caution, unsafe_allow_html=True)
    display_res(method, ',', 'query', select_method)


def Pnbdesigner(method, select_method):
    """Render the PnB Designer page.

    Writes the header, five expanders (Summary, How it works, References,
    Tool Options, Scoring) and the closing notes, then calls ``display_res``
    with ``','`` and ``'query'``. The "How it works" expander also shows the
    image file ``pe1.png``, opened through a relative path.

    Args:
        method: Text appended to the page header (concatenated, so a ``str``).
        select_method: Passed through unchanged to ``display_res``.

    ``caution`` and ``display_res`` are names defined elsewhere in this file.
    """
    st.header("PnB Designer: "+method)

    # --- Summary ---
    summary = st.expander("Summary")
    summary.markdown(
        """ Single base editors (BEs) employe cytidine-deaminase (Cytosine BE, CBEs: C/G -> T/A converters) or Adenine-deaminase (Adenine BE, ABEs: A/T -> G/C converters) **can only introduce 4 edits** via sgRNA. Prime Editors **(PEs)**, employing Cas9 nickase fused to an engineered reverse transcriptase template **(RTT, a wild-type Moloney Murine Leukemia Virus (M-MLV) for PE1 and mutagenised M-MLV in PE2 systems for enhanced DNA-RNA affinity, enzyme processivity, and thermostability)** via a gRNA called prime editing guide RNA **(pegRNA)**, on the other hand **can do all 12 edits.** PEs use pegRNA consisting of a 20 nt guide sequence, a primer binding site **(PBS)** and an RTT. The guide directs the Cas enzyme to a target site, the PBS hybridizes to the opposite strand to prime the reverse transcriptase, and the RTT integrates the desired genomic alteration. **Optimized** PE2, called (by employing additional sgRNA to nick the unedited strand so that cell's natural repair system copies the information in the edited strand to the complementary strand, permanently installing the edit) PE3 and PE3b with reduced off-targets are used. **PnB Designer** allows design of pegRNAs for PEs and guide RNAs for CBE and the most recent ABEs such as ABEmax and ABE8e. PnB Designer makes it easy to design targeting guide RNAs for single or multiple targets on a variant or reference genome from organisms (and non-model organisms or synthetic constructs) spanning multiple kingdoms. **PnB Designer enables design of pegRNAs for all known disease causing mutations available in ClinVar** **Nicking guides** for the PE3 and PE3b systems are designed and filtered to provide a suitable selection of gRNAs. For PE3, only nicking guides 40–100 nt up/downstream of the initial nick are considered. For PE3b, only PAM sequences on the complementary strand that partially overlap with the PE2 PAM or protospacer sequence are displayed. """
    )

    # --- How it works ---
    how_it_works = st.expander("How it works")
    how_it_works.markdown(
        """ **Prime editing steps (PE1):** - Generation of a single-stranded break (SSB) in the non-target strand via Cas9 H840A nickase - DNA/RNA hybridization between the primer binding site (PBS) in the pegRNA and the 3′-region of the nicked strand - Reverse transcriptase (RT) mediated reverse transcription of the nicked strand according to the RT template sequence, which generates a 3'-flap containing the edit - Incorporation of the 3'-flap sequence in the DNA following ligation - The edited DNA strand displaces the unedited 5’ flap and the resulting heteroduplex is resolved by the cell’s mismatch repair (MMR) system **PE3 and PE3b:** - PE3 uses the PE2 Cas9 nickase-pentamutant (five mutations in RT enzyme (D200N/L603W/T330P/T306K/W313F) to increase activity, enhance binding between the template and PBS, increase processivity, and improve thermostability) RT fusion enzyme and pegRNA plus an additional simple sgRNA, which directs the Cas9 nickase to nick the unedited strand at a nearby site. The newly edited strand is then favored as the template for repair during heteroduplex resolution. The process of double nicking, however, increases indel formation slightly. Designing the sgRNA with a spacer that only binds the edited strand, as in the PE3b system, guides nicking of the unedited strand only after the edit has occurred. """
    )
    # Relative path: resolved against the process working directory.
    image = Image.open('pe1.png')
    how_it_works.image(image, caption='Prime Editing. https://www.addgene.org/crispr/prime-edit/')
    how_it_works.markdown(
        """ **Design strategy for pegRNAs:** - PnB Designer scans the sense and antisense strands to find all possible 5′-NGG-3′ protospacer adjacent motif (PAM) sites around the edit position, beginning+6 nt to the 3′ end of the desired edit and then scanning 100 nt in the 5′ direction, giving the user the option to choose also very distant PAMs. - All possible NGG PAMs are stored and evaluated in respect to their distance from the edit position and the input RTT length. - A pegRNA is considered a possible candidate if the edit is fully covered by the RTT. - PnB Designer then stores the protospacer, PBS, and RTT sequences. - Nicking guides for the PE3 and PE3b systems are designed and filtered to provide a suitable selection of gRNAs. - For PE3, only nicking guides 40–100 nt up/downstream of the initial nick are considered. **Design strategy for BEs gRNAs:** **Input:** - Sequence: - Upstream and downstream sequence with desired edit such as (one of A>G, T>C, C>T, G>A) - Genomic coordinates: Retrieves the genomic sequence from the selected reference genome and converts the specific variant sequence to include the SNV. - The resulting sequence is searched for **PAM (5′-NGG-3′ (SpCas9), 5′-NGA-3′ (SpCas9-VRQR), 5′-NGCG-3′ (SpCas9-NG), 5′-NNGRRT-3′ (SaCas9), 5′-NNNRRT-3′ (SaCas9-KKH), SpG 5′-NGN-3′, SpRY 5′-NRN-3′ and 5′-NYN-3′)** sites in the right distance to the SNV given the described editing windows of the Cas9-ABE and -CBE variants. - With ABEs, C → T genomic variants can be reverted by A → G conversion on the antisense strand to achieve the intended edit on the sense strand. - With CBEs, C → T and G → A conversions are possible. All previously described PAM variants with their respective editing window are tested against the edit. - Editing windows are defined based on the experimental data. - ABEmax was implemented with an editing window from base 5–7, with base 1 being the most distal from the PAM site. - For BE3 (R33A/K34A), the strong sequence preference for a 5′ T next to the edit has been included as well """
    )

    # --- References ---
    references = st.expander("References")
    references.write("[PnB Designer Web App](https://fgcz-shiny.uzh.ch/PnBDesigner/)")
    references.write("[PnB Designer Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-021-04034-6)")

    # --- Tool options ---
    options = st.expander("Tool Options: All you can do with this tool")
    options.write(
        """ This tool can be run in **two modes**: - **Base editing mode:** - Does not allow A>T or G>C, dels, or insertions. - Only **180**/414 variants could be targeted. - **Columns of interest**: Protospacer, PAM and Base Editor (the system for producing the base edit). - **There is no score**. - **Prime editing mode:** - Requires two guides: detailed in two files. - pegRNA oligos for cloning. - **Score: Higer is better**. - Nicking guides: the corresponding nicking guides **Input** - Multiple lines provided in as a csv file in the webapp **[here](https://fgcz-shiny.uzh.ch/PnBDesigner/)** in the following format: - Prime editing: - varinat, chromosome num, genomic location, Edit, gene orientation, PBS, RTT - Ex: rs7412, 19, 44908822, insA, +, 13, 13 - Base editing - varinat, chromosome num, genomic location, SNO, gene orientation, PBS, RTT - Ex: rs7412, 19, 44908822, C>T, + **Output:** A csv file - Base editing - 20 nt protospacer sequence (targeting the variant sequence) - Edit position - PAM site - Suggested base editor that can target the variant - Score: No score is provided - Prime editing - pegRNA (protospacer and extension seq for both strands) - Edit position - PAM and PAM strand - Score (Higher the better) - PBS and RTT length - Nicking guides (PE3 or PE3b system) shown by selecting Nicking_guides_PE3_PE3B from the side bar. """
    )

    # --- Scoring ---
    scoring = st.expander("Scoring")
    scoring.markdown(
        """ **Prime Editing:** Primer binding site (PBS) and reverse transcriptase templeta (RTT) length are important parameters for successful pegRNA design. In PnB Designer, PBS and RTT lengths are by default set at suggested values of 13 nt. The pegRNA Score follows a [penalty system](https://www.nature.com/articles/s41586-019-1711-4) with **larger negative numbers** are indicative of worse pegRNA designs. **pegRNA Score (Sum of all penalties):** - Penalty system - C as the first base in the 3′ extension: penalty score = − 28 - T (thymine) > 4 nucleotides in the 3′ extension of the pegRNA are strongly penalized with a score = − 50 - number of homologous bases after the intended edit < 5, penalty score = − 6 - larger negative numbers are indi- cating worse pegRNA designs """
    )

    # --- Notes and results ---
    st.markdown("**Please note that the tool was run in Base editing and Prime editing modes. Corresponding nicking guides are also reported here.**")
    st.markdown(caution, unsafe_allow_html=True)
    display_res(method, ',', 'query', select_method)


# NOTE(review): the SNPcrispr definition starts on the next line and continues
# past this block; it is carried over exactly as found.
#SNP_CRISPR def SNPcrispr(method,select_method): st.header("SNP_CRISPR: "+method) #st.markdown("**Summary**") expander = st.expander("Summary") expander.markdown( """ **SNP-CRISPR** - Facilitate identification of sgRNAs in **non-reference genomes**, **across varying genetic backgrounds** or for specific targeting of **SNP-containing alleles** (for example, disease relevant mutations). - It computes **efficiency and specificity** scores for sgRNA designs targeting **both** the **variant** and the **reference**. 
- Can be used to design sgRNAs based on public variant data sets or user-identified variants - Design sgRNAs for NGG and NAG PAM sequences """ ) expander1 = st.expander("How it works") expander1.write( """ **Design:** - SNP-CRISPR validates the input reference sequences and **warn if the submitted reference sequences does not match**, which might reflect a different version of the genome assembly being used in the user input vs. SNP-CRISPR and re-constructs the template sequence, swapping the reference nucleotide with the variant nucleotide for SNPs, while inserting or deleting the corresponding fragment for indel type variants. - Computes potential variant-targeting sgRNAs based on **availability of PAM (NGG or NAG) sequences** in the neighboring region. - sgRNA designs that contain **four or more consecutive thymine residues**, which can result in termination of RNA transcription by RNA polymerase III, **are filtered out**. - Computes efficiency [Housden et al. 2015](https://pubmed.ncbi.nlm.nih.gov/26350902/) and specificity (based on BLAST results against the reference genome) scores. **For identification of the best variant-specific sgRNAs, following information are provided.** - Iinformation on both sgRNAs targeting specific variants and sgRNAs targeting the reference sequence in the same region. - The efficiency score and an off-target score - Positions of relevant SNPs or indels in the sgRNA are included. """ ) #expander1.markdown("**To facilitate identification of the best variant-specific sgRNAs, we provide information about both sgRNAs targeting specific variants and sgRNAs targeting the reference sequence in the same region. 
The efficiency score and an off-target score are provided, and the positions of relevant SNPs or indels in the sgRNA are included so that users can select the most suitable sgRNA or filter out less optimal ones.**") expander2 = st.expander("References") expander2.write("[SNP_CRISPR Web App](https://www.flyrnai.org/tools/snp_crispr/web/)") expander2.write("[SNP_CRISPR Paper](https://academic.oup.com/g3journal/article/10/2/489/6026318)") expander3 = st.expander("Tool Options: All you can do with this tool") #expander3.markdown(tips,unsafe_allow_html=True) expander3.write( """ This tool can design guides for: - **NGG.** - **NAG.** - **Target multiple variants within the same guide.** - Public variant data sets or user-identified variants. **Input:** Multiple lines provided in as a (6 columns) csv file uploaded to the webapp **[here](https://www.flyrnai.org/tools/snp_crispr/web/)** in the following format: - varinat, chromosome, position, strand, reference, variant - Ex: rs7412, 19, 44908822, C, +, T **Output:** A csv file - **Columns of interest**: - Housden Efficiency Score [Ref](https://www.ncbi.nlm.nih.gov/pubmed/26350902) (Range from 1.47-12.32 **(higher is better, > 5 recommended))** - Off Target Score (Range from 0-5441.73 (lower is better, < 1 recommended)) """ ) expander4 = st.expander("Scoring") expander4.markdown( """ **Efficiency score:** - Used 75 sgRNAs to target a single sequence cloned into a luciferase reporter. - 3 replicates using sgRNAs with 0, 1, 2, or ≥ 3 mismatches (black bars) or in the absence of sgRNA - A probability matrix (of size 4(bp)x20(position in sgRNA)) is dynamically computed reflecting a cumulative P value for high efficiency of each nucleotide from position 1 to 20, with higher values representing higher efficiency. **Off-target (Specificity) score:** - Potential off-target loci are evaluated by performing a BLAST search of each (sgRNA) design against the species reference genome. 
- 3 base pairs mismatch (on the 5' end of sgRNA) as a cutoff to detect an off-target - An off-target score is assigned based on both the number of hits found in the BLAST results and the number of mismatched nucleotides per off-target hit. """ ) st.markdown(caution,unsafe_allow_html=True) st.markdown("**Please note that the software was run for NAG and NGG PAM sequences only.**") display_res(method,',','Gene',select_method) #ECRISP def ecrisp(select_method): st.header("E-CRISP") expander = st.expander("Summary") #st.markdown("**Summary**") expander.markdown("E-CRISP is used to design gRNA sequences **(supports 12 organisms)** and can also reevaluate CRISPR constructs for on- or off-target sites and targeted genomic loci. It identifies target sequences complementary to the gRNA ending in a 3ʹ protospacer-adjacent motif (PAM), **N(G or A)G** and uses a fast indexing approach to find binding sites and a binary interval tree for rapid annotation of putative gRNA target sites.") expander.markdown("**Off-target** effects and target-site homology are evaluated using Bowtie2 aligner. Designs are **shown** in the output if the number of **off-targets does not exceed a user-specified threshold**. **More than one** design targeting a desired locus are **ranked** according to on-target specificity and number of off-targets.") expander1 = st.expander("How it works") expander1.markdown( """ - E-CRISP identifies target sequences ending with a PAM motif 5′-NGG/NAG-3′ and uses them to propose guide RNAs. - It uses a fast indexing approach to locate binding sites and the alignment program Bowtie 2 to identify off-target effects. - Designed sgRNAs are assessesed based on genomic context (e.g. exons, transcripts, CpG islands) and ranked according to target specificity and efficiency. 
""" ) expander1.write( """ **Input:** - Multiple lines provided in the Input fasta sequence edit box in the webapp **[here](http://www.e-crisp.org/E-CRISP/index.html)** in the following format - Line1: rs12726330 - Line2: CGGGACATGGAAGAGGTCTGGACCAGGGTACTGGGAAGGCGCTCGGAGGA - Line3: rs76763715 - Line4: CCAGCCGACCACATGGTACAGGAGGTTCTAGGGTAAGGACAAAGGCAAAG - and so on **Output:** - A tab separated .tab file - **Columns of interest**: - sgRNA Length - Efficiency Score (E Score, **Higher the better**) [Ref](https://www.nature.com/articles/nbt.3026) - Specificity Score (S Score, **Higher the better** (max = 100)) - Doench and Xu Score - Nucleotide sequence (A, C, G, T) compositions in % """ ) expander2 = st.expander("References") expander2.write("[E-CRISP Web App](http://www.e-crisp.org/E-CRISP/)") expander2.write("[E-CRISP Paper](https://www.nature.com/articles/nbt.3026)") expander3 = st.expander("Tool Options: All you can do with this tool") expander3.write( """ This tool offers single or paired sgRNA and: - **Options for PAM:** - **Relaxed** - **Medium:** - **Strict** - **Options for Design:** - knockdown. - knockin. - N/C terminal tagging. - CRISPRi. - CRISPRa. - **Other filtering options.** - gRNA length, allowed % of G, C, A and T, 3' and 5' flanking sequence length, off-targets evaluation etc """ ) expander4 = st.expander("Scoring") expander4.markdown( """ E-CRISP utilises its own **SAE (Specificity, Annotation, Efficacy) score** to determine the quality of each sgRNA in addition to Rule Set 1 [Doench et al](https://www.nature.com/articles/nbt.3026) and [Xu et al](https://genome.cshlp.org/content/25/8/1147). Please see Scoring and Quality Matrices in README tab of this app for details. - Specificity Score (S-score): - Start with 100. - For every off-target, substract (20-mismatches)/iteration. 
- Annotation Score (A-score): - Start with zero - For every hit exon add 5/exon count - For every hit CpG Island subtract 1 - For every start codon hit add 1 - For every stop codon hit add 1 - For every CDS hit add 5/CDS count - For every gene hit add 1 - Efficacy Score (E-score): - Add 1 if last 6 bp have a CG content higher then 70 % - Subtract 1 if the entire sequence has GC content > 80 % - Add 1 if sequence is preceded by a G - Add 1 if there are GG in front of the target sequence (opposite the PAM) - Add micro-homology score (is higher when sequence tends to give out of frame deletions) """ ) #expander4.markdown('on-target and off-target predictions, it utilises its own ‘SAE (Specificity, Annotation, Efficacy) Score’ to determine the quality of each gRNA, while Rule Set 1 ( predictive model for sgRNA activity by training a logistic regression classifier to discriminate the highest-activity) [Doench](https://www.nature.com/articles/nbt.3026) and Spacer Scoring for CRISPR (identified sequence features that contribute to sgRNA efficiency by calculating log odds ratio of nucleotide frequency between DNA sequences targeted by efficient and inefficient sgRNAs) [Xu](https://genome.cshlp.org/content/25/8/1147) are also included in its results.') #expander4.markdown('**Doench Score:** sgRNA score. A guide necessarily only has a subset of all the features, indicated via one-hot encoding as binary variables. Let the model weights for the features i for a particular guide sj be wij, the intercept int. Then the sgRNA score f (sj) is given via logistic regression as:') # latext = r''' # $$ # f(s_j) = \frac{1}{1+exp(-g(s_j))} \\ # g(s_j) = int + \sum_{i} w_{ij} \\ # where f(s_j) \epsilon \ [0,1] \\ # ''' # expander4.markdown(latext1) # expander4.markdown( # """ # Here, features used for prediction are: # - Individual nucleotides and all pairs of adjacent nucleotides indexed by position in the 30 mer target site. # - Count of Gs and Cs in the 20 nt of the sgRNA . 
# - Two GC-count features for deviations below ten and above ten. # """ # ) st.markdown(tips,unsafe_allow_html=True) st.markdown("**Please note that the result reported here are for PAM=NGG**") st.markdown(caution,unsafe_allow_html=True) #st.header(select_method) fnm=cwd+select_method+'/'+select_method+'_NGG'+'.csv' data = pd.read_csv(fnm, sep=',') #get snp data #data_snp = data[data['Name'].str.contains(variant_spl[0])] if len(variant_spl) > 1: #variant_spl has two components #data_snp = data[data['rs_id'].str.contains(variant_spl[1])] data_snp = data[data['Name'].str.contains(variant_spl[1])] data_snp['Name']=variant_spl[0]+':'+data_snp['Name'] else: #data_snp = data[data['rs_id'].str.contains(variant_spl[0])] data_snp = data[data['Name'].str.contains(variant_spl[0])] data_snp['Name']='NaN'+':'+data_snp['Name'] data_snp.reset_index(drop=True, inplace=True) data_snp.reset_index(drop=True, inplace=True) if data_snp.shape[0]>0: df = transform(data_snp,'Please Select columns to save whole table') #fname = st.text_input('Please input file name to save Table', 'temp') #fname = st_keyup("Please input file name to save Table", value='temp') csv = convert_df(df) st.download_button( label="Download Table as CSV file", data=csv, file_name=select_method+'_'+variant_spl[0]+'.csv',#fname+'.csv', mime='text/csv', ) #st.table(data_snp) if len(variant_spl) > 1: f""" **Results for SNP: {variant_spl[1]} on GENE: {variant_spl[0]}** """ else: f""" **Results for SNP: {variant_spl[0]} on GENE: NAN** """ #AgGrid(data_snp) st.markdown(table_edit,unsafe_allow_html=True) gb = GridOptionsBuilder.from_dataframe(data_snp) gb.configure_pagination(enabled=False)#,paginationAutoPageSize=False)#True) #Add pagination gb.configure_default_column(enablePivot=True, enableValue=True, enableRowGroup=True) gb.configure_selection(selection_mode="multiple", use_checkbox=True) gb.configure_side_bar() gridOptions = gb.build() grid_response = AgGrid( data_snp, height=200, gridOptions=gridOptions, 
enable_enterprise_modules=True, update_mode=GridUpdateMode.MODEL_CHANGED, data_return_mode=DataReturnMode.FILTERED_AND_SORTED, fit_columns_on_grid_load=False, header_checkbox_selection_filtered_only=True, use_checkbox=True, width='100%' ) #data = grid_response['data'] selected = grid_response['selected_rows'] if selected: st.write('Selected rows') dfs = pd.DataFrame(selected) st.dataframe(dfs[dfs.columns[1:dfs.shape[1]]]) dfs1 = transform(dfs[dfs.columns[1:dfs.shape[1]]],'Please select columns to save selected Table') #csv = convert_df1(dfs[dfs.columns[1:dfs.shape[1]]]) csv = convert_df1(dfs1) st.download_button( label="Download data as CSV", data=csv, file_name=select_method+'_'+variant_spl[0]+'.csv', mime='text/csv', ) def display_res(method,sep,rsid,select_method): #if method == 'bystander_ABE8e_mean': #st.header(select_method+' with: '+method+' option') fnm=cwd+select_method+'/'+select_method+'_'+method+'.csv' #data = pd.read_csv(fnm, sep=',') data = pd.read_csv(fnm, sep=sep) #get snp data if len(variant_spl) > 1: #variant_spl has two components #data_snp = data[data['rs_id'].str.contains(variant_spl[1])] data_snp = data[data[rsid].str.contains(variant_spl[1])] data_snp[rsid]=variant_spl[0]+':'+data_snp[rsid] else: #data_snp = data[data['rs_id'].str.contains(variant_spl[0])] data_snp = data[data[rsid].str.contains(variant_spl[0])] data_snp[rsid]='NaN'+':'+data_snp[rsid] data_snp.reset_index(drop=True, inplace=True) if data_snp.shape[0]>0: df = transform(data_snp,'Please Select columns to save whole table') #fname = st_keyup("Please input file name to save Table", value='temp') #st.text_input('Please input file name to save Table', 'temp', live=True) csv = convert_df(df) st.download_button( label="Download Table as CSV file", data=csv, #file_name=fname+'.csv', file_name=method+'_'+variant_spl[0]+'.csv', mime='text/csv', ) if len(variant_spl) > 1: f""" **Results for SNP: {variant_spl[1]} on GENE: {variant_spl[0]}** """ else: f""" **Results for SNP: 
{variant_spl[0]} on GENE: NAN** """ #AgGrid(data_snp) st.markdown(table_edit,unsafe_allow_html=True) gb = GridOptionsBuilder.from_dataframe(data_snp) gb.configure_pagination(enabled=False)#,paginationAutoPageSize=False)#True) #Add pagination gb.configure_default_column(enablePivot=True, enableValue=True, enableRowGroup=True) gb.configure_selection(selection_mode="multiple", use_checkbox=True) gb.configure_side_bar() gridOptions = gb.build() grid_response = AgGrid( data_snp, height=200, gridOptions=gridOptions, enable_enterprise_modules=True, update_mode=GridUpdateMode.MODEL_CHANGED, data_return_mode=DataReturnMode.FILTERED_AND_SORTED, fit_columns_on_grid_load=False, header_checkbox_selection_filtered_only=True, use_checkbox=True, width='100%' ) #data = grid_response['data'] selected = grid_response['selected_rows'] if selected: st.write('Selected rows') dfs = pd.DataFrame(selected) st.dataframe(dfs[dfs.columns[1:dfs.shape[1]]]) dfs1 = transform(dfs[dfs.columns[1:dfs.shape[1]]],'Please select columns to save selected Table') #csv = convert_df1(dfs[dfs.columns[1:dfs.shape[1]]]) csv = convert_df1(dfs1) st.download_button( label="Download data as CSV", data=csv, file_name=method+'_'+variant_spl[0]+'.csv', mime='text/csv', ) cwd=os.getcwd()+'/'+'data/' #get snps list snps = pd.read_csv("SNPS.csv") variants=snps['GENE:SNP'].unique() variants_s=sorted(variants,key=len) caution = '

Please note that not (necessarily) all variants are targetted.

' tips = '

Important Tool Tips:

' table_edit = '

About Table: Please note that table can be sorted based on by clicking on any column and Multiple rows can be selected (by clicking check box in first column) to save only those rows.

' st.title('Single Base Editors') st.markdown('**Please select an option from the sidebar**') st.sidebar.image("logo-card-white.png", use_column_width=True) #ReadMe = st.sidebar.radio('ReadME',value=True) #Calc = st.sidebar.radio('Selection Menu') Calc = st.sidebar.radio( "", ('ReadME', 'Tools Selection Menu')) #if Calc: #st.sidebar.title("Selection Menu") if Calc == 'ReadME': #st.markdown("[Introduction](#Introduction)") #st.markdown("[How do base editors work](#How-do-base-editors-work)") expander = st.expander("How to use this app") #st.header('How to use this app') expander.markdown('Please note that all tools were run using Human Genome **(hg38)**. Each tool requires a **specific input format** (described for each tool selected from the sidebar when **Tools Selection Menu is enabled**) and **outputs results** in different formats **(with different columns based on method selected as described under each tool)**. Some of these tools also allow selection of various **endonucleases and related options**; their **results are provided as radio controls** in the sidebar of this app under each tool.') expander.markdown('**Requirements:** 1) Python 3.7 or higher and 2) streamlit 1.13') expander.markdown('To start this app, **unzip** the base_editor_app.zip in a folder of your choice') expander.markdown('Open shell terminal and **cd to base_editor_app folder**') expander.markdown('Type: **streamlit run baserditorsV3.py**, It will launch baseeditor app in the default browser') expander.markdown('**By default** README radio button is enabled to describe general information about the App and How to use it.') expander.markdown("- Please enable **Tools Selection Menu** radio control in the sidebar **to enable variant, tool and endonuclease options**") expander.markdown("- Select Desired Variant from the dropdown list") expander.markdown("- Select a Tool") expander.markdown("- Select one of the options **(if available)**") expander1 = st.expander('Introduction')
expander1.markdown( """**TLDR** This app **reviews** popular single base quality estimators for a **[list](https://drive.google.com/file/d/1Sxb-Cc-epbs6vujQaX9wa5acqus0RW3q/view?usp=sharing) of rsIDs** per disease of interest based on CARD’s cross-NDD efforts. We filtered our candidate list of **base edit predictors** for those that are at least **semi-automated and reproducible** (no copy and pasting IDs or sequences one at a time). """ ) expander1.markdown('Clustered Regularly Interspaced Short Palindromic Repeat CRISPR/CRISPR-associated (Cas) systems, such as **Cas9 (type II endonuclease which recognises the 5"'"-NGG-3"'" PAM)** and **Cas12a (type V endonuclease which recognises the 5"'"-TTTV-3"'" PAM)** (also called Cpf1), are the primary tools used for genome editing. CRISPR/Cas9 based gene editing uses sequence-specific nucleases (Cas9 etc) and a sgRNA for precise gene knock-out/in whereas catalytically inactive Cas9 (dCas9) provides gene expression regulation via activation/inhibition (CRISPRa/i) and Cas9 nickase (nCas9) + sgRNA, by incorporating deaminases, enables single base editing. Finally nCas9 + prime editing gRNA (pegRNA) enables editing of all 12 possible base edits') expander1.markdown('**A CRISPR/Cas9 system** requires a custom single guide RNA (sgRNA) that contains a crRNA (a 20 nt sequence homologous to the region of interest that directs Cas9 (or dCas9 or Cas9 nickase) nuclease to the region of interest) and a Cas9 nuclease-recruiting sequence (tracrRNA). 
An ideal gRNA should maximize on-target activity **(cleavage efficiency)** while also minimizing potential off-target effects **(specificity)**.') expander1.markdown('**Current sgRNA design tools** (including HDR based and deaminase based) fall under three major categories:') expander1.markdown('- **Alignment-based:** candidate gRNAs are aligned and retrieved from the given genome by locating PAM') expander1.markdown('- **Hypothesis-driven/Rule-based:** sgRNA activity is predicted according to empirically derived, handcrafted rules (GC content, sequence preference etc)') expander1.markdown('- **Learning-based:** sgRNAs are scored by models trained on datasets of CRISPR experiments.') expander1.markdown('**Two categories of DNA base editors (BEs) are:**') expander1.markdown('- Cytosine base editors **(CBEs: C/G -> T/A converters)** and') expander1.markdown('- Adenine base editors **(ABEs: A/T -> G/C converters)** as shown in Figure below') image = Image.open('CBE_ABE.webp') expander1.image(image, caption='Cytosine and Adenine base editors. Figure from: https://www.nature.com/articles/s41573-020-0084-6') expander1.markdown('**Prime Editors (PEs)**') expander1.markdown('While base editors can only introduce 4 edits, PEs on the other hand can do all 12 edits using usual Cas9 (and its variants) and a gRNA called prime editing guide RNA (**pegRNA**). PEs use pegRNA consisting of a 20 nt guide sequence, a primer binding site (PBS) and a reverse transcriptase template (RTT). 
The guide directs the Cas enzyme to a target site, the PBS hybridizes to the opposite strand to prime the reverse transcriptase, and the RTT integrates the desired genomic alteration.') expander1.markdown('In this app we also tested a **prime editor** and an **RNA editor for gene knockdown** for these targets.') expander2 = st.expander('How the CRISPR-Cas9 (and base editing) system works') expander2.markdown('**CRISPR-Cas9** system consists of **two** key components (accomplishing three steps: Recognition, Cleavage, and Repair):') expander2.markdown("- **Recognition:** A single guide RNA (sgRNA which is composed of target-specific CRISPR RNA (crRNA) and an auxiliary trans-activating crRNA (tracrRNA) joined by linker loop) targeting Cas9 to a specific DNA locus") #expander2.markdown('- **Recognition:** A guide RNA (gRNA) that consists of a small piece of pre-designed RNA sequence (usually 20 bases complimentary to the target DNA sequence in the genome) and **guides** Cas9 to the right part of the genome.') expander2.markdown('- **Cleavage, and Repair**: A Cas9 enzyme (has six domains, REC I (responsible for binding guide RNA), REC II, Bridge Helix, PAM Interacting (confers PAM specificity and is responsible for initiating binding to target DNA), HNH and RuvC (each cut single-stranded DNA after 3rd base upstream of PAM)) that acts as a pair of ‘molecular scissors’ that **cut** the two strands of DNA at a specific location in the genome **so that bits of DNA can then be added or removed** using either non-homologous end joining **(NHEJ)** or homology-directed repair **(HDR)**.') expander2.markdown('While the CRISPR-Cas9 system is efficient at knocking out genes, it is very inefficient at introducing single base changes and often introduces random insertions and deletions **(indels)** during double-stranded breaks (DSB) repair. 
**Base editors** on the other hand drive specific, accurate, and permanent single nucleotide changes without introducing double-stranded DNA breaks.') expander2.markdown("**Base editing requires three elements:**") expander2.markdown("- A single guide RNA (sgRNA) for ABEs and CBEs or pegRNA for PEs")# which is composed of target-specific CRISPR RNA (crRNA) and an auxiliary trans-activating crRNA (trcrRNA) joined by linker loop) targeting Cas9 to a specific DNA locus") expander2.markdown("- A Cas nickase (Cas9 with mutation in RuvC nuclease domain, which enables it to nick but not cleave DNA) or Cas fused to a deaminase that makes the edit.") expander2.markdown("- A target base for editing within the editing window specified by the Cas9 protein") #expander2.markdown('The Cas9 protein has six domains, REC I (responsible for binding guide RNA), REC II, Bridge Helix, PAM Interacting (**confers PAM specificity and is responsible for initiating binding to target DNA**), HNH and RuvC (**each cut single-stranded DNA after 3rd base upstream of PAM**). Cas9 and its variants are highly specific to various PAM sequences and have two endonuclease domains: the n-terminal RuvC-like nuclease domain and the HNH-like nuclease domain near the center of the protein') expander2.markdown('A whole range of **CBEs** and **ABEs** have been developed. Various CBEs ranging from **simple** deactivated Cas9 (dCas9)+cytidine deaminase+uracil DNA glycosylase inhibitor (UGI) to improved single mutated Cas9 (nCas9)+cytidine deaminase+uracil DNA glycosylase inhibitor (UGI) called BE3 systems and its variants such as Target-AID editors were developed. 
4th generation BEs (called BE4, such as BE4max etc which focus on improving editors delivery to the nucleus) further minimize undesired base conversions that can happen with BE3.') expander2.markdown('Similar to CBEs, **Adenine base editors (ABEs)** such as ABEmax, ABE4max, ABE8e and ABE8s were also developed.') #st.markdown('Similar to CBEs, adenine base editor such as ABEmax, ABE4max, ABE8e and ABE8s were also developed.') #st.markdown("**Key parameters for a good BE are:**") #st.markdown("- Editing efficiency: 4th generation base editiors **BE4max and ABE4max [2](https://www.nature.com/articles/nbt.4172), ABE8s [3](https://www.nature.com/articles/s41587-020-0491-6) and Target-AID (dual base) [4](https://www.nature.com/articles/s41587-020-0535-y)**") #st.markdown("- Editing efficiency") #st.markdown("- Minimal off-target effects") expander3 = st.expander("Scoring and Quality Matrices") expander3.markdown( """ An ideal CRISPR/Cas9 (and its variants including base and prime editors) system should employ sgRNA (and pegRNA) that **maximize** on-target activity **(efficiency)** and **minimize** potential off-target effects **(specificity)**. Balancing these two requirements can be a challenging. Various sgRNA design tools **assist** in the selection of the best target sites available by **excluding undesirable targets based on predicted low efficiency or specificity**, saving resources and time in the experiment. **Efficiency (On-Target Activity):** Ideally, CRISPR/Cas9 protein scans the PAM sequence, and sgRNA (spacer sequence complementary to the targeted DNA sequence) recognises target loci and activates endonuclease activity to cleave specific sites. **sgRNA Cleavage efficiency**, however, varies greatly among different target sites and/or cell lines and **depends on several factors** including sgRNA sequence (sequence composition, nucleotide position, GC content), genetic and epigenetic features etc. 
Most sgRNA design tools employ a **combination of features** for efficiency scoring. Broadly, these tools fall into two categories. - **Hypothesis-driven (rule-based) tools:** These tools employ **simple** sgRNA sequence metrics such as: GC content and position dependent specific sequence motifs. **Enhanced** sgRNA based metrics such as sequence, structural, and chromatin characteristics are also adapted in various tools. - **Machine-learning (and Deep-learning) tools:** Cleavage efficiency of sgRNA is a complex interplay of factors such as target sequence, cellular environment and experimental conditions. Therefore, simple rule-based system may not be adequate for choosing target sites and designing CRISPR gRNAs. Machine learning (deep-learning) tools employ different hand-curated (auto-detected) **feature sets** including sgRNA sequence, PAM, and/or adjacent nucleotides and window size on the target sequence to design sgRNA. """ ) expander3.markdown( """ **Efficiency models used in tools discussed in this App** **An sgRNA activity predictive model (An L1-regularized linear support vector machine - SVM) [Doench et. al 2014](https://www.nature.com/articles/nbt.3026):** ***Rule set 1*** Based on sequence features of 1,841 sgRNAs (targeting all possible target sites in six endogenous mouse and three endogenous human genes), a predictive model of sgRNA activity was developed, as described below, to improve sgRNA design for gene editing and genetic screens. Model was trained using a total of 586 sequence and GC features as explained below. - Ranking sgRNAs from 0 (worst) - 1(best). 
- Total Features: 586 - Sequence: 30bp sequence(4bp upstream + 20bp sgRNA protospacer + 3bp PAM + 3bp downstream) - Single nucleotide Features: 120 - 4 bases (A,C,G,T) × 30 bp sequence - Dinucleotide Features: 464 - 16 dinucleotides × 29 positions (of 30 bp sequence) - 2 GC-count features in the 20 nt of the sgRNA - Deviations of GC-count below and above ten nucleotide - sgRNA score: - Assuming w_{ij} are model weights for the features i for a particular guide s_{j} and the intercept int. Then the sgRNA score f(s_{j}) is given via logistic regression as: """ ) expander3.markdown(latext1) expander3.markdown("After validation, final trained model using all available data used only 72 of the 586 features, including both GC-count features.") expander3.markdown( """ **Linear Predictive (sequence) model** (CRISPR/Cas9 knockout and CRISPRi/a) [Xu et al 2015](https://genome.cshlp.org/content/25/8/1147): Models for predicting sgRNA efficiency for CRISPR knockout and CRISPRi/a based on systematic assessment of the effect of sequence (sgRNA protospacer, DNA target and downstream of DNA target) context based on six published data sets. - Identification of efficient and inefficient sgRNAs in published data sets - Essential genes: Genes whose deletion resulted in a growth disadvantage in genome-wide knockout experiments. - Efficient sgRNAs: all sgRNAs targeting these essential genes (two fold depletion). - Inefficient sgRNAs: all sgRNAs targeting these essential genes (very low depletion w.r.to positive control) - Computed log odds ratio of (40 bp sequences (aligned at the PAM) including the 19-bp or 20-bp spacer targets as well as their 3′ and 5′ flanking DNAs) **nucleotide frequency between DNA sequences targeted by efficient and inefficient sgRNAs**. - Feature selection (sgRNA nucleotides): - Signs of the odds ratios are concordant - Magnitudes of the odds ratios are above a threshold (computed from statistical significance analysis) in all three sgRNA sets. 
- Elastic-Net model was used to identify dominating nucleotide features: 28 sequence features were identified. """ ) expander3.markdown( """ **An sgRNA activity model [Moreno et. al 2015](https://www.nature.com/articles/nmeth.3543) (CRISPR/Cas9/Cas9 nickase Knock-in):** - Sequence: 35 bp (6nt upstream flanking + 20nt sgRNA + 3nt PAM + 6nt downstream flanking) - Total Features: 684 - Mononucleotides Features: 140 - 4 bases (A,C,G,T) × 35 nt (6nt upstream, 20nt sgRNA, 3nt PAM, 6nt downstream) - Dinucleotide Features: 16 dinucleotides × 34 positions (of 35 bp sequence) - Using randomized logistic regression on 684 features with regularization and **selected 91 features**. """ ) expander3.markdown( """ **An sgRNA activity predictive model [Doench et. al 2016](https://www.nature.com/articles/nbt.3437): Rule set 2** **Using sgRNA design [Rule set 1](https://www.nature.com/articles/nbt.3026)** to create human and mouse genome-wide libraries, performed positive and negative selection screens to derive additional rules for improved sgRNA design. Also developed a metric to **predict off-target sites**. - **Features used (using one-hot encoding).** - Position Specific features: 80 order 1 and 320 order 2 - Position-independent features: 4 order 1 and 16 order 2 - **GC Count Features:** - Number of Gs and Cs in the 20 mer - Number of Gs and Cs >10 - **PAM (NGGN) Features:** - 16 features for the two nucleotides in the N and N positions - **Thermodynamic features:** Melting temperatures of the DNA version of the RNA guide sequence of the - Entire 30-mer target site plus context) - 5 nucleotides immediately proximal to the PAM - 8 nucleotides adjacent to that (away from the PAM) - 5 nucleotides in turn adjacent to the 8 mer (again, away from the PAM) **A deep-learning-based (Convolutional Neural Network - CNN) based DeepCpf1 model for Cas12a/Cpf1 RNA [Kim et. 
al 2017](https://www.nature.com/articles/nmeth.4104):** An automated feature engineering and prediction algorithm (based on data from 16,292 (experiment A) and 2,963 (experiment B) target sequences and 20-nt guide sequences) that automatically learn informative representations of target sequences relevant to Cpf1 activity profiles. - Input: 34-bp target sequence - One-hot encoding input layer: Representing each nucleotide A, C, G, and T as row number then 4-by-34 dimensional binary matrix represents the whole 34-bp target sequence. - Convolution layer: 80 filters of length 5 and rectified linear unit (ReLU) to the convolution outputs. - The pooling layer: Computes the average in each of the non-overlapping windows of size 2, providing invariance to local shifts and reducing the number of parameters. - Three fully connected layers of 80, 40, and 40 units, chromatin accessibility integration layer and regression output layer. - The chromatin accessibility integration layer: It incorporates the sequence representations with the chromatin accessibility information of the target sequence """ ) #expander3.markdown('Model scores f(s_{j}) will fall into the range [0,1], and higher values predict higher activity') expander3.markdown('**Specificity (Off-Target Activity)**') expander3.markdown('Cas9 nucleases often cleaves unintended genomic sites due to sgRNAs recognising DNA sequences with a few mismatches (off-target cleavage). Two main methods are used to predict cutting Specificity of sgRNA:') expander3.markdown('- **Alignment-based methods:** sgRNAs are aligned to a given genome (often using traditional alignment tools such as Bowtie or BWA) and off-target sequences and sites are returned. 
This method is mainly used to find out all potential off-targets ***in-silico***') expander3.markdown( """ - **Scoring-based methods:** sgRNAs are **further ranked** using identified off-targets from alignment process: - **Hypothesis-driven:** off-targets are scored based on the contribution of specific genome context factors (features such as those influencing the nonspecific binding of sgRNAs) **(MIT score)** and cutting frequency determination **(CFD score)** as sgRNA can also bind genome loci with non-canonical PAMs such as NAG, NCG and NGA. - **Learning-based:** sgRNAs are scored and predicted from a training model that considers the different features (including PAM type, nucleotide composition, GC content, chromatin structure, DNA methylation, RNA secondary structure, etc) affecting specificity. - **An algorithm to predict off-target loci for specificity analyses [Hsu. et. al 2013](https://www.nature.com/articles/nbt.2647):** - Generally, specificity of SpCas9-mediated DNA cleavage is sequence- and locus-dependent and governed by the quantity, position and identity of mismatching bases. - Design guidelines to minimize off-target cleavage: - Potential ‘off-target’ genomic sequences: should follow these four constraints: - should not be followed by a PAM with either 5′-NGG or 5′-NAG sequences - Their global sequence similarity to the target sequence should be minimized, and guide sequences with genomic off-target loci that have fewer than three mismatches should be avoided - At least two mismatches should lie within the PAM-proximal region of the off-target site. - A maximal number of mismatches should be consecutive or spaced less than four bases apart. **Off-target scoring.** - **Off-target mismatch [Stemmer et. al 2015](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0124633) score [Doench et. 
al 2016](https://www.nature.com/articles/nbt.3437):** a value of 0 indicates no predicted off-target activity whereas a value of 1 indicates a perfect match - Off-target search - The search of off-target sites is carried out using Bowtie - For each off-target site of any sgRNA a score is computed that indicates the likelihood of a stable sgRNA/DNA heteroduplex and is quantified as in Eq. 1: - sgRNA target site score: The list of sgRNA target sites is ranked according to the number of predicted off-target sites and their potential deleterious effects on the respective off-target gene. The ranking is based on a single score that combines the number of off-target sites, the distribution of their mismatches and the distance to the closest annotated exon (Eq 2). - """ ) expander3.markdown(off_target) expander3.markdown("where pos is the position of each mismatch counted from the 5' end") expander3.markdown(sgran_target_score) #expander3.markdown('- An important feature of the SpCas9 system is the PAM, which is a CRISPR-dependent and conserved DNA sequence motif adjacent to the target site, and is used by bacteria to distinguish between self and non-self DNA (24). Therefore, target recognition requires both base pairing to the gRNA sequence and the presence of the PAM (i.e. 5′-NGG-3′) adjacent to the targeted sequence (1). When the Cas9 binds with PAM and the target site pairs with the gRNA, a double-strand break (DSB) is caused between positions 17 and 18 of the 20-nt gRNA sequence (Figure 3) (1). Following the break, random insertions or deletions (indels) can be generated via the non-homologous end-joining (NHEJ) pathway, which is error-prone (gene knockout). 
Alternatively, a desired modification can be introduced through homology-directed repair (HDR) when provided with a DNA template (gene knock-in) (30,31).') #expander3.markdown('- The efficiency of DNA cleavage depends not only on the intrinsic nuclease activity, but also on target site accessibility and the affinity of DNA binding domain(s) (e.g. gRNA) to the target sequence. However, there is a lack of understanding on the exact behavior of the engineered Cas9 nuclease in living cells, especially regarding the dynamics of its interaction with DNA, and the cell cycle-dependent cleavage activity. Due to the limited biological knowledge, prediction of nuclease target accessibility and cleavage rates in living cells remains difficult. Therefore, experimental validation of target-site selection is necessary. Computational approaches can analyze and extract knowledge from large-scale CRISPR screens. Thus, they can help identify gRNA features modulating Cas9 activity as well as make plausible hypotheses regarding its mechanism of action.') #expander3.markdown('- gRNA sequence features') #expander3.markdown('- Protospacer adjacent motif') #expander3.markdown('- gRNA sequence motifs') #expander3.markdown('- Overall nucleotide usage, Position-specific nucleotide composition and Structural features') #expander3.markdown('TOOLS FOR GUIDE EFFICIENCY PREDICTION: ') expander4 = st.expander("CRISPR-Cas9 and Base editor tools reviewed") #st.header('CRISPR-Cas9 and Base editor tools reviewed') expander4.markdown( """ We have reviewed a total of ***6 tools*** in the public domain which are **at least semi-automated and reproducible** (no copy and pasting IDs or sequences one at a time). These tools offer a wide range of options ranging from **HDR** based edits to improved **single base editors** to precise base editing such as **Prime editing**. 
Furthermore, many of these tools offer a **variety of PAM sequences, expanding the number of available target sites for base editing.** - [BE-DICT](http://130.60.24.130/page-set?actionID=5f8c494b8c854d0029ffa9d3) - An attention-based deep learning algorithm for base editing outcome prediction [Paper](https://www.nature.com/articles/s41467-021-25375-z). - Options: ABE8e, ABEmax, BE4max, Target-AID. - [ChopChop](https://chopchop.cbu.uib.no) - This tool offers various endonucleases (Cas9, nCas9, Cpf1 (also known as Cas12a; **only contains crRNA**), CasX (generates a staggered double-stranded break) and the **Cas13 (also known as C2c2) RNA editor**) and PAM options. Results for the following options are reported in this app: - Cas13a, CasX_TTCN, Cpf1_TTN, NGG (Cas9), Nickase_NGG, Nickase_NRG. - [E-CRISP](http://www.e-crisp.org/E-CRISP/) - This tool offers relaxed, medium and strict options for the PAM sequence. - [GuideScan2](https://guidescan.com) - This tool offers Cas9 and Cpf1 endonucleases with various options to filter out results. Results based on 30, 40 and 50bp (SNP location = (n/2)bp) input sequence range (for Cas9 and Cpf1) are reported in this app: - 30bp_cpf1, 30bp_NGG, 40bp_cpf1, 40bp_NGG, 50bp_cpf1, 50bp_NGG - [PnB Designer](https://fgcz-shiny.uzh.ch/PnBDesigner/) - This tool allows base editing as well as **prime editing**. 
Results reported in this app are based on: - Base_editing_guides, Nicking_guides, pegRNA_oligos - [SNP_CRISPR](https://www.flyrnai.org/tools/snp_crispr/web/) - This tool offers guides for NGG and NAG PAM sequences and are reporoted in this app: - NGG, NAG **For more details on each tool, Please select select it from the sidebar menu under Tools Selection Menu** """ ) else: #if Calc == 'Selection Menu': #ReadMe = st.sidebar.checkbox('ReadME',value=False) select_variant = st.sidebar.selectbox( "Please select variant", variants_s ) variant_spl=select_variant.split() st.sidebar.write('Please select A method') BE_DICT = st.sidebar.checkbox('BE-DICT',key=1) if BE_DICT: select_method_BEDICT='BE-DICT' method_bedict = st.sidebar.radio( "Please select an option", ('bystander_ABE8e_mean', 'bystander_ABEmax_mean_5','bystander_BE4max_mean','bystander_Target-AID_mean')) ChopChop = st.sidebar.checkbox('ChopChop',key=2) if ChopChop: select_method_ChopChop='ChopChop' method_chopchop = st.sidebar.radio( "Please select an option", ('Cas13a', 'CasX_TTCN','Cpf1_TTN','CRISPR-CAS9_NGG', 'Nickase_NGG','Nickase_NRG')) ECRISP = st.sidebar.checkbox('E-CRISP',key=3) if ECRISP: ecrisp('E-CRISP') #select_method_ChopChop='ChopChop' #method_chopchop = st.sidebar.radio( # "Please select an option", # ('Cas13a', 'CasX_TTCN','Cpf1_TTN','CRISPR-CAS9_NGG', 'Nickase_NGG','Nickase_NRG')) GuideScan2 = st.sidebar.checkbox('GuideScan2',key=4) if GuideScan2: select_method_GuideScan2='GuideScan2' method_GuideScan2 = st.sidebar.radio( "Please select an option", ('30bp_cpf1', '30bp_NGG','40bp_cpf1', '40bp_NGG','50bp_cpf1', '50bp_NGG')) PnBDesigner = st.sidebar.checkbox('PnB Designer',key=5) if PnBDesigner: select_method_PnBDesigner='PnB Designer' method_PnBDesigner = st.sidebar.radio( "Please select an option", ('Base_editing_guides', 'Nicking_guides_PE3_PE3b','pegRNA_oligos')) SNPCRISPR = st.sidebar.checkbox('SNP_CRISPR',key=6) if SNPCRISPR: select_method_SNPCRISPR='SNP_CRISPR' method_SNPCRISPR = 
st.sidebar.radio( "Please select an option", ('NAG', 'NGG')) if BE_DICT and select_method_BEDICT == "BE-DICT": BEDICT(method_bedict,select_method_BEDICT) if ChopChop and select_method_ChopChop == "ChopChop": Chopchop(method_chopchop,select_method_ChopChop) if GuideScan2 and select_method_GuideScan2 == "GuideScan2": Guidescan2(method_GuideScan2,select_method_GuideScan2) if PnBDesigner and select_method_PnBDesigner=='PnB Designer': Pnbdesigner(method_PnBDesigner,select_method_PnBDesigner) if SNPCRISPR and select_method_SNPCRISPR=='SNP_CRISPR': SNPcrispr(method_SNPCRISPR,select_method_SNPCRISPR) st.sidebar.image("DataTecnica_White.png", use_column_width=True)