import streamlit as st
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import hf_hub_download
# ------------------------------------------------
# Page configuration
# ------------------------------------------------
st.set_page_config(
page_title="English → Hindi Seq2Seq Translator",
layout="centered",
page_icon="🧠",
)
st.title("🧠 English → Hindi Translation (Seq2Seq + Self-Attention)")
st.markdown("A demonstration of **LSTM Encoder–Decoder** with **Multi-Head Self-Attention** for translation tasks.")
# ------------------------------------------------
# About Section
# ------------------------------------------------
with st.expander("ℹ️ About This Project"):
st.markdown("""
This demo showcases a **Seq2Seq translation model enhanced with self-attention**.
It demonstrates how attention helps the decoder focus on relevant parts of the input during translation.

**Key Highlights:**
- Encoder–Decoder with LSTMs
- Integrated Multi-Head Self- and Cross-Attention
- Two model sizes: **12M** and **42M** parameters
- Designed for educational visualization and experimentation
""")
# ------------------------------------------------
# Purpose Section
# ------------------------------------------------
with st.expander("🎯 Purpose"):
st.markdown("""
This Space is designed for:
- Demonstration and educational purposes
- Understanding **Seq2Seq + Attention mechanisms**
- Translating English sentences to Hindi
- Exploring how encoder outputs can serve as **context embeddings** for downstream NLP tasks
""")
# ------------------------------------------------
# Load models and tokenizers
# ------------------------------------------------
@st.cache_resource
def load_model_and_tokenizer(model_file, tokenizer_file):
model_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=model_file)
tokenizer_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=tokenizer_file)
model = load_model(model_path)
with open(tokenizer_path, "rb") as f:
tokenizer = pickle.load(f)
return model, tokenizer['english'], tokenizer['hindi']
model_12M, tokenizer_en_12, tokenizer_hi_12 = load_model_and_tokenizer(
"seq2seq-lstm-multiheadattention-12.3.keras", "seq2seq-tokenizers-12.3M.pkl"
)
model_42M, tokenizer_en_42, tokenizer_hi_42 = load_model_and_tokenizer(
"seq2seq-lstm-multiheadattention-42.keras", "seq2seq-tokenizers-42M.pkl"
)
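# Both checkpoints are pulled from the Hugging Face Hub on first run; @st.cache_resource
# keeps the loaded Keras models and tokenizers in memory, so script reruns triggered by
# widget interactions do not download or deserialize them again.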
# ------------------------------------------------
# Load dataset
# ------------------------------------------------
@st.cache_data
def load_data():
return pd.read_csv("translation.csv").head(5000)
data = load_data()
# ------------------------------------------------
# Model selection
# ------------------------------------------------
st.subheader("⚙️ Select Model Size")
model_choice = st.radio("Choose a model:", ["12M parameters", "42M parameters"], index=0, horizontal=True)
if model_choice == "12M parameters":
model = model_12M
tokenizer_en = tokenizer_en_12
tokenizer_hi = tokenizer_hi_12
max_seq_len = 40
else:
model = model_42M
tokenizer_en = tokenizer_en_42
tokenizer_hi = tokenizer_hi_42
max_seq_len = 50
word2idx_en = tokenizer_en.word_index
word2idx_hi = tokenizer_hi.word_index
idx2word_hi = tokenizer_hi.index_word
max_vocab_en = len(word2idx_en) + 1
# ------------------------------------------------
# Safe layer extraction helper
# ------------------------------------------------
def get_layer_safe(model, possible_names):
for name in possible_names:
try:
return model.get_layer(name)
except ValueError:
continue
raise ValueError(f"No matching layer found among {possible_names}")
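# Layer names such as "embedding" vs "embedding_1" are auto-generated by Keras and can
# differ between the two saved checkpoints, so get_layer_safe() tries each candidate
# name in turn instead of hard-coding a single one.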
# ------------------------------------------------
# Build encoder & decoder inference
# ------------------------------------------------
@st.cache_resource
def build_inference_models(model):
encoder_input = model.input[0]
emb_layer = get_layer_safe(model, ['embedding', 'embedding_0'])
norm_layer = get_layer_safe(model, ['layer_normalization', 'layer_normalization_0'])
drop_layer = get_layer_safe(model, ['dropout', 'dropout_1'])
lstm_layer = get_layer_safe(model, ['bidirectional'])
enc_emb = drop_layer(norm_layer(emb_layer(encoder_input), training=False))
enc_out, fh, fc, bh, bc = lstm_layer(enc_emb)
state_h = Concatenate()([fh, bh])
state_c = Concatenate()([fc, bc])
encoder_model = Model(encoder_input, [enc_out, state_h, state_c])
# Decoder
decoder_input = Input(shape=(1,))
decoder_lstm = get_layer_safe(model, ['lstm_1', 'lstm'])
decoder_emb = get_layer_safe(model, ['embedding_1', 'embedding_2'])
decoder_norm = get_layer_safe(model, ['layer_normalization_1', 'layer_normalization_2'])
decoder_drop = get_layer_safe(model, ['dropout_2', 'dropout_1'])
decoder_dense = get_layer_safe(model, ['dense'])
attention_layer = get_layer_safe(model, ['multi_head_attention'])
decoder_state_input_h = Input(shape=(decoder_lstm.units,))
decoder_state_input_c = Input(shape=(decoder_lstm.units,))
encoder_outputs_input = Input(shape=(None, decoder_lstm.units))
dec_emb = decoder_drop(decoder_norm(decoder_emb(decoder_input), training=False))
dec_out, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])
context = attention_layer(query=dec_out, key=encoder_outputs_input, value=encoder_outputs_input)
dec_combined = Concatenate(axis=-1)([context, dec_out])
dec_final = decoder_dense(dec_combined)
decoder_model = Model(
[decoder_input, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
[dec_final, dec_h, dec_c]
)
return encoder_model, decoder_model
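# The training model is rewired into two inference models:
#   * encoder_model: input sentence -> per-token encoder outputs, plus the forward and
#     backward LSTM states concatenated into a single (h, c) pair for the decoder.
#   * decoder_model: one decoding step -> takes the previous token, the current (h, c)
#     states and the encoder outputs, applies cross-attention (decoder output as query,
#     encoder outputs as key/value) and returns next-token probabilities plus new states.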
encoder_model, decoder_model = build_inference_models(model)
# ------------------------------------------------
# Helper functions
# ------------------------------------------------
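# preprocess_input_sentence(): lower-cases the words, maps each to its tokenizer index
# (unknown words fall back to the <OOV> index), clamps out-of-range indices, and
# post-pads the sequence to max_seq_len.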
def preprocess_input_sentence(sentence):
oov_idx = word2idx_en.get('<OOV>', 1)
seq = [word2idx_en.get(w.lower(), oov_idx) for w in sentence.split()]
seq = [idx if idx < max_vocab_en else oov_idx for idx in seq]
return pad_sequences([seq], maxlen=max_seq_len, padding='post')
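# decode_sequence(): greedy decoding. The encoder is run once, then the decoder is fed
# its own previous prediction one token at a time, starting from <start> and stopping
# at <end> or after max_seq_len steps; at each step the argmax token is kept.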
def decode_sequence(input_seq):
start_token = word2idx_hi['<start>']
end_token = word2idx_hi['<end>']
enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
target_seq = np.array([[start_token]])
decoded_sentence = []
for _ in range(max_seq_len):
output_tokens, h, c = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
sampled_idx = np.argmax(output_tokens[0, 0, :])
if sampled_idx == end_token:
break
if sampled_idx > 0:
decoded_sentence.append(idx2word_hi.get(sampled_idx, ''))
target_seq[0, 0] = sampled_idx
return " ".join(decoded_sentence)
# ------------------------------------------------
# Input selection with session_state caching
# ------------------------------------------------
st.subheader("📝 Try a Sample Translation")
# Initialize selected text in session_state
if "selected_text" not in st.session_state:
st.session_state.selected_text = data["english"].iloc[0]
if "translation" not in st.session_state:
st.session_state.translation = ""
# Dropdown for English sentence selection
selected_text = st.selectbox(
"Select an English sentence:",
data["english"].tolist(),
index=data["english"].tolist().index(st.session_state.selected_text)
)
# Update session_state if selection changed
if selected_text != st.session_state.selected_text:
st.session_state.selected_text = selected_text
st.session_state.translation = "" # Reset translation for new selection
original_hindi = data.loc[data["english"] == selected_text, "hindi"].values[0]
st.write("**Original English:**", selected_text)
st.write("**Reference Hindi:**", original_hindi)
# Translate button
if st.button("🚀 Translate"):
with st.spinner("Generating translation..."):
preprocessed = preprocess_input_sentence(selected_text)
translation = decode_sequence(preprocessed)
st.session_state.translation = translation
# Show cached translation if exists
if st.session_state.translation:
st.success(f"✅ **Predicted Hindi Translation:** {st.session_state.translation}")
# ------------------------------------------------
# Learning Header
# ------------------------------------------------
st.subheader(" Understanding the Model")
# ------------------------------------------------
# Self Attention Section
# ------------------------------------------------
with st.expander("🔹 Self-Attention Mechanism"):
st.markdown("""
Self-Attention is a mechanism where each token in a sequence attends to **other tokens in the same sequence** to capture dependencies.

**Key points:**
- Helps the model focus on relevant words within the same sentence.
- Computes attention scores between all pairs of positions in the input.
- Often implemented as **Multi-Head Self-Attention** to capture different types of relationships simultaneously.

**Example:**
In the sentence *"The cat sat on the mat"*, self-attention allows the model to understand that *"cat"* is related to *"sat"* and *"mat"*.
""")
# ------------------------------------------------
# Cross Attention Section
# ------------------------------------------------
with st.expander("🔹 Cross-Attention Mechanism"):
st.markdown("""
Cross-Attention is used in encoder–decoder architectures where the **decoder attends to encoder outputs**.

**Key points:**
- Decoder queries encoder outputs to focus on relevant parts of the input sentence.
- Crucial for translation, summarization, or any sequence-to-sequence task.

**Example:**
Translating *"I am hungry"* to Hindi: when generating the Hindi word *"भूखा"*, cross-attention helps the decoder focus on *"hungry"* in the English input.
""")
# ------------------------------------------------
# Multi-Head Attention Section
# ------------------------------------------------
with st.expander("🔹 Multi-Head Attention"):
st.markdown("""
Multi-Head Attention is an extension of the attention mechanism that allows the model to **capture information from different representation subspaces simultaneously**.

**Key Points:**
- Instead of using a single attention function, we use **multiple attention heads**.
- Each head learns to focus on **different parts or relationships** of the input.
- The outputs from all heads are **concatenated and linearly projected** to form the final context vector.
- Improves the model’s ability to understand complex dependencies in sequences.

**Example:**
In translating *"The cat sat on the mat"*:
- Head 1 may focus on subject–verb relations (*cat ↔ sat*).
- Head 2 may focus on verb–object relations (*sat ↔ mat*).
- Head 3 may focus on positional or syntactic patterns.
- Combining all heads gives a richer context for the decoder.

**In a Seq2Seq model like this one:**
Multi-Head Attention can be used as:
- **Self-Attention** in encoder/decoder layers
- **Cross-Attention** between encoder outputs and decoder hidden states
""")
# ------------------------------------------------
# Seq2Seq Task Explanation Section
# ------------------------------------------------
with st.expander("🔹 Sequence-to-Sequence (Seq2Seq) Task"):
st.markdown("""
Seq2Seq models map an **input sequence** to an **output sequence**, often with **different lengths**.

**Examples:**
- Machine Translation: English → Hindi
- Text Summarization
- Chatbots / Dialogue Systems

**Characteristics:**
- Handles variable-length input and output sequences.
- Uses an encoder to process the input and a decoder to generate the output.
- Can integrate attention mechanisms to improve alignment between input and output tokens.
""")
# ------------------------------------------------
# Seq2Seq Task: Fixed-Length vs Variable-Length Section
# ------------------------------------------------
with st.expander("🔹 Fixed-Length vs Variable-Length Tasks"):
st.markdown("""
**Fixed-Length Tasks:**
- The output has a **fixed, predetermined size** (and need not be a sequence at all).
- Example: Time series forecasting with a fixed number of steps, classification tasks.

**Variable-Length Tasks:**
- Input and output sequences can **differ in length**, and the output length is not known in advance.
- Example: Machine translation, summarization, speech recognition.
- Seq2Seq models are designed to handle this flexibility.
""")
# ------------------------------------------------
# Mathematics Expanders (Advanced / Optional)
# ------------------------------------------------
st.subheader("🧮 Mathematics Behind the Model")
with st.expander("🔹 Self-Attention Equations", expanded=False):
st.markdown(r"""
The attention function is computed as:

$$
\text{Attention}(Q,K,V) = \text{softmax}\left(\frac{Q K^T}{\sqrt{d_k}}\right) V
$$

Where:

- $Q$ = Query matrix
- $K$ = Key matrix
- $V$ = Value matrix
- $d_k$ = Dimension of key vectors

This allows the model to compute a weighted sum of values based on relevance.
""")
with st.expander("🔹 Multi-Head Attention Equations", expanded=False):
st.markdown(r"""
Multi-Head Attention combines multiple self-attention heads:

$$
\text{MultiHead}(Q,K,V) = \text{Concat}(\text{head}_1, \dots, \text{head}_h)\, W^O
$$

Each head:

$$
\text{head}_i = \text{Attention}(Q W_i^Q,\; K W_i^K,\; V W_i^V)
$$

Where $W_i^Q, W_i^K, W_i^V, W^O$ are learnable projection matrices.
""")
with st.expander("🔹 Cross-Attention / Encoder-Decoder Attention", expanded=False):
st.markdown(r"""
Cross-Attention computes attention using decoder queries and encoder outputs:

$$
\text{Context}_t = \text{Attention}(Q_t, K_{enc}, V_{enc})
$$

- $Q_t$ = decoder hidden state at timestep $t$
- $K_{enc}, V_{enc}$ = encoder outputs
""")
with st.expander("🔹 Seq2Seq Decoder Step", expanded=False):
st.markdown(r"""
At each decoder timestep:

$$
s_t, c_t = \text{LSTM}(y_{t-1}, s_{t-1}, c_{t-1})
$$

$$
\text{Output}_t = \text{Dense}(\text{Concat}(s_t, \text{Context}_t))
$$
""")
# ------------------------------------------------
# Show model architecture
# ------------------------------------------------
st.markdown("---")
show_arch = st.checkbox("🧩 Show Model Architecture")
if show_arch:
layer_info = []
for i, layer in enumerate(model.layers):
try:
out_shape = layer.output_shape
except (AttributeError, ValueError):
try:
out_shape = layer.output.shape
except (AttributeError, ValueError):
out_shape = "N/A"
layer_info.append([i, layer.name, str(out_shape)])
df_layers = pd.DataFrame(layer_info, columns=["Index", "Layer Name", "Output Shape"])
st.subheader("Model Architecture Overview")
st.dataframe(df_layers, width='stretch')
# ------------------------------------------------
# Footer
# ------------------------------------------------
st.markdown("---")
st.subheader("🔗 Resources")
st.markdown("""
- 🧩 **Model Repository:** [Daksh0505/Seq2Seq-LSTM-MultiHeadAttention](https://huggingface.co/Daksh0505/Seq2Seq-LSTM-MultiHeadAttention)
- 📘 **Dataset:** English–Hindi Parallel Corpus (IIT Bombay)
- 🧠 **Framework:** TensorFlow / Keras
""")
st.caption("© 2025 Daksh Bhardwaj | For educational and research purposes.")