import streamlit as st
import numpy as np
import torch
from torch import nn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Page title
st.title('Advanced Deep Learning for NLP')

# Word Embeddings: Word2Vec, GloVe, FastText
st.header('1. Word Embeddings')
st.subheader('Definition:')
st.write("""
Word embeddings are dense vector representations of words, where words with similar meanings have similar vector representations. They allow machine learning models to work with text data more efficiently.

- **Word2Vec (Skip-gram, CBOW)**: A model that learns word representations based on the context of words.
    - **Skip-gram**: Predicts surrounding words from a given word.
    - **CBOW (Continuous Bag of Words)**: Predicts the target word from surrounding context words.
- **GloVe (Global Vectors for Word Representation)**: A count-based model that learns word vectors by leveraging the global word-word co-occurrence statistics in a corpus.
- **FastText**: Similar to Word2Vec but also takes subword information into account, helping with out-of-vocabulary words.
""")

# Example: Word2Vec
st.subheader('Word2Vec Example:')
sentence = st.text_area("Enter a sentence to visualize Word2Vec embeddings", "NLP is amazing and very useful.")

if st.button('Visualize Word2Vec'):
    # Simulate Word2Vec output by showing random vectors in place of trained embeddings
    words = sentence.split()
    embeddings = {word: np.random.rand(1, 50) for word in words}  # Fake embeddings for demonstration
    st.write("Word2Vec Embeddings (Random Example):")
    for word, emb in embeddings.items():
        st.write(f"{word}: {emb.flatten()[:5]}...")  # Display the first 5 values of each fake embedding

# Sequence Models: RNNs, LSTMs, GRUs
st.header('2. Sequence Models')
st.subheader('Definition:')
st.write("""
Sequence models are used to process sequential data (like sentences), where the order of the data matters. They are key in NLP tasks like translation, summarization, and sentiment analysis.

- **RNNs (Recurrent Neural Networks)**: RNNs process sequences by using a loop to pass information from one step to the next. They are simple but struggle with long-term dependencies.
- **LSTMs (Long Short-Term Memory)**: LSTMs are a type of RNN designed to handle long-range dependencies by using gating mechanisms to control the flow of information.
- **GRUs (Gated Recurrent Units)**: GRUs are similar to LSTMs but with a simplified architecture that still helps capture long-range dependencies effectively.
""")

# Example: RNN with PyTorch (simple architecture for demonstration)
st.subheader('RNN Example (PyTorch):')
if st.button('Show RNN Model Architecture'):
    # Basic RNN model using PyTorch
    class SimpleRNN(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(SimpleRNN, self).__init__()
            self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            out, _ = self.rnn(x)
            out = self.fc(out[:, -1, :])  # Use the last output of the sequence
            return out

    rnn_model = SimpleRNN(input_size=10, hidden_size=20, output_size=1)
    st.write("RNN Architecture:")
    st.write(rnn_model)
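# NOTE: The section above describes LSTMs and GRUs but only demonstrates a plain RNN.
# The sketch below mirrors the RNN example with an LSTM layer so the gated variant can be
# inspected side by side. It reuses the `torch.nn` and `streamlit` imports from the top of
# this script; the SimpleLSTM class name, button label, and hyperparameters
# (input_size=10, hidden_size=20) are illustrative choices, not values from the original app.
st.subheader('LSTM Example (PyTorch):')
if st.button('Show LSTM Model Architecture'):
    # LSTM variant of SimpleRNN: nn.LSTM returns (output, (hidden_state, cell_state))
    class SimpleLSTM(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super().__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            out, (h_n, c_n) = self.lstm(x)
            out = self.fc(out[:, -1, :])  # Use the last time step, as in the RNN example
            return out

    lstm_model = SimpleLSTM(input_size=10, hidden_size=20, output_size=1)
    st.write("LSTM Architecture:")
    st.write(lstm_model)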
# Attention Mechanisms: Self-attention, Seq2Seq, Transformer
st.header('3. Attention Mechanisms')
st.subheader('Definition:')
st.write("""
Attention mechanisms allow models to focus on specific parts of an input sequence when making predictions, which improves their performance on tasks that require understanding long-range dependencies.

- **Self-attention**: A mechanism where a model looks at different parts of the sequence (or the entire sequence) to compute a weighted representation for each element.
- **Seq2Seq Models**: Sequence-to-sequence models use an encoder-decoder architecture, where the encoder processes the input sequence and the decoder generates the output sequence.
- **Transformer**: The Transformer model relies on self-attention to capture relationships between all words in a sentence, making it very efficient for NLP tasks like translation, summarization, and language generation.
""")

# Example: Transformer architecture (simplified explanation)
st.subheader('Transformer Example (Simplified):')
if st.button('Show Transformer Architecture'):
    # MultiHeadAttention takes separate query and value inputs, so it cannot sit inside a
    # plain Sequential model; the Keras functional API is used instead.
    inputs = keras.Input(shape=(None, 512))  # Variable-length sequences of 512-dim embeddings
    attention_output = layers.MultiHeadAttention(num_heads=8, key_dim=512)(inputs, inputs)  # Self-attention: query = value = inputs
    pooled = layers.GlobalAveragePooling1D()(attention_output)
    hidden = layers.Dense(256, activation="relu")(pooled)
    outputs = layers.Dense(1)(hidden)
    transformer_model = keras.Model(inputs, outputs)

    st.write("Transformer Architecture (Simplified):")
    st.write(transformer_model)

# Explanation of key attention components
st.header('4. Attention Components')

st.subheader('Self-attention:')
st.write("""
In self-attention, each word in a sequence attends to all other words and assigns a weight (importance) to each. This mechanism allows models to capture long-range dependencies between words in a sentence.
""")

st.subheader('Seq2Seq:')
st.write("""
Seq2Seq models are typically used for tasks like translation, where the model encodes an input sequence (like a sentence) and decodes it to produce an output sequence.
""")

st.subheader('Transformer:')
st.write("""
The Transformer model revolutionized NLP by using self-attention in both the encoder and decoder. It processes all tokens in parallel, making it highly efficient and suitable for large datasets.
""")
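# NOTE: Section 4 above describes self-attention in prose only. The sketch below computes
# scaled dot-product attention, softmax(QK^T / sqrt(d_k)) V, on small random matrices using
# the NumPy import from the top of this script. The matrix sizes (4 tokens, d_k = 8), the
# random seed, and the button label are illustrative choices, not part of the original app.
st.subheader('Scaled Dot-Product Attention Demo:')
if st.button('Compute Attention Weights'):
    rng = np.random.default_rng(0)
    num_tokens, d_k = 4, 8
    Q = rng.standard_normal((num_tokens, d_k))  # Queries
    K = rng.standard_normal((num_tokens, d_k))  # Keys
    V = rng.standard_normal((num_tokens, d_k))  # Values

    scores = Q @ K.T / np.sqrt(d_k)  # Similarity of each query to each key, scaled by sqrt(d_k)
    exp_scores = np.exp(scores - scores.max(axis=-1, keepdims=True))  # Numerically stable softmax
    weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True)     # Each row sums to 1
    output = weights @ V  # Weighted sum of value vectors

    st.write("Attention weights (one row per token, rows sum to 1):")
    st.write(np.round(weights, 3))
    st.write("Attended output (one vector per token):")
    st.write(np.round(output, 3))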