import streamlit as st


# Function to display the Home Page
def show_home_page():
    st.title("Natural Language Processing (NLP)")
    st.markdown(
        """
### Welcome to the NLP Guide

Natural Language Processing (NLP) is a branch of artificial intelligence that
focuses on the interaction between computers and humans through natural
language. It enables machines to read, understand, and respond to human
language in a way that is both meaningful and useful.

NLP powers a wide range of applications, such as chatbots, translation tools,
sentiment analysis, and search engines.

Use the menu in the sidebar to explore each topic in detail.
"""
    )


# Function to display specific topic pages
def show_page(page):
    if page == "NLP Terminologies":
        st.title("NLP Terminologies")
        st.markdown(
            """
### NLP Terminologies (Detailed Explanation)

- **Tokenization**: Breaking text into smaller units such as words or sentences.
- **Stop Words**: Commonly used words (e.g., "the", "is") that are often removed during preprocessing.
- **Stemming**: Reducing words to their root forms by stripping affixes (e.g., "running" -> "run").
- **Lemmatization**: Converting words to their dictionary base forms using vocabulary and morphology (e.g., "mice" -> "mouse").
- **Corpus**: A large collection of text used for NLP training and analysis.
- **Vocabulary**: The set of all unique words in a corpus.
- **n-grams**: Contiguous sequences of n words or characters from text.
- **POS Tagging**: Assigning parts of speech (noun, verb, etc.) to words.
- **NER (Named Entity Recognition)**: Identifying names, places, organizations, etc.
- **Parsing**: Analyzing the grammatical structure of text.
"""
        )
    elif page == "One-Hot Vectorization":
        st.title("One-Hot Vectorization")
        st.markdown(
            """
### One-Hot Vectorization

A simple representation in which each word in the vocabulary is represented as
a binary vector.

#### How It Works:
- Each unique word in the corpus is assigned an index.
- The vector for a word is all zeros except for a 1 at the index corresponding to that word.
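
The steps above can be sketched in a few lines of plain Python (`one_hot` is an illustrative helper, not a production encoder):

```python
def one_hot(word, vocabulary):
    # One position per vocabulary word; a 1 marks the word's own index.
    vector = [0] * len(vocabulary)
    vector[vocabulary.index(word)] = 1
    return vector

vocabulary = ["cat", "dog", "bird"]
print(one_hot("dog", vocabulary))  # [0, 1, 0]
```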

#### Example:
Vocabulary: ["cat", "dog", "bird"]
- "cat" -> [1, 0, 0]
- "dog" -> [0, 1, 0]
- "bird" -> [0, 0, 1]

#### Advantages:
- Simple to implement and interpret.

#### Limitations:
- High dimensionality for large vocabularies.
- Does not capture semantic relationships (e.g., "cat" and "kitten" appear unrelated).

#### Applications:
- Small datasets and cases where computational simplicity is prioritized.
"""
        )
    elif page == "Bag of Words":
        st.title("Bag of Words (BoW)")
        st.markdown(
            """
### Bag of Words (BoW)

Bag of Words represents text as word-frequency counts, ignoring word order.

#### How It Works:
1. Create a vocabulary of all unique words in the text.
2. Count the frequency of each word in a document.

#### Example:
Given two sentences:
- "I love NLP."
- "I love programming."

Vocabulary: ["I", "love", "NLP", "programming"]
- Sentence 1: [1, 1, 1, 0]
- Sentence 2: [1, 1, 0, 1]

#### Advantages:
- Simple to implement.

#### Limitations:
- High dimensionality for large vocabularies.
- Ignores word order and semantic meaning.
- Sensitive to noise and very frequent terms.

#### Applications:
- Text classification and clustering.
"""
        )
    elif page == "TF-IDF Vectorizer":
        st.title("TF-IDF Vectorizer")
        # Raw string: keeps LaTeX backslashes (e.g., the \t in \text) from
        # being read as Python escape characters.
        st.markdown(
            r"""
### TF-IDF Vectorizer

Term Frequency-Inverse Document Frequency (TF-IDF) is a statistical measure
that evaluates how important a word is to a document relative to a collection
of documents (corpus).

#### Formula:

$$\text{TF-IDF} = \text{TF} \times \text{IDF}$$

- **Term Frequency (TF)**: The number of times a term appears in a document divided by the total number of terms in the document.
- **Inverse Document Frequency (IDF)**: The logarithm of the total number of documents divided by the number of documents containing the term.

#### Advantages:
- Reduces the weight of common words.
- Highlights distinctive, informative words.

#### Example:
For the corpus:
- Doc1: "NLP is amazing."
- Doc2: "NLP is fun and amazing."
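
Computing the formula directly makes the behaviour concrete (a minimal sketch; `tf_idf` is an illustrative helper, and practical implementations such as scikit-learn's `TfidfVectorizer` add smoothing and normalization):

```python
import math

def tf_idf(term, doc, corpus):
    # TF: share of the document's tokens that are this term.
    tf = doc.count(term) / len(doc)
    # IDF: log of (number of documents / documents containing the term).
    df = sum(1 for d in corpus if term in d)
    idf = math.log(len(corpus) / df)
    return tf * idf

corpus = [["nlp", "is", "amazing"], ["nlp", "is", "fun", "and", "amazing"]]
print(tf_idf("fun", corpus[1], corpus))  # positive: "fun" is distinctive
print(tf_idf("is", corpus[1], corpus))   # 0.0: "is" appears in every document
```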

TF-IDF gives more weight to words like "fun" and "amazing" than to words that
appear in every document, such as "is".

#### Applications:
- Search engines, information retrieval, and document classification.
"""
        )
    elif page == "Word2Vec":
        st.title("Word2Vec")
        st.markdown(
            """
### Word2Vec

Word2Vec is a neural-network-based technique for learning dense vector
representations of words that capture their semantic relationships.

#### Key Concepts:
- **CBOW (Continuous Bag of Words)**: Predicts a target word from its surrounding context.
- **Skip-gram**: Predicts the surrounding context from a target word.

#### Advantages:
- Captures semantic meaning (e.g., "king" - "man" + "woman" ≈ "queen").
- Efficient to train on large datasets.

#### Limitations:
- Requires significant computational resources.
- Cannot represent words unseen during training.

#### Applications:
- Text classification, sentiment analysis, and recommendation systems.
"""
        )
    elif page == "FastText":
        st.title("FastText")
        st.markdown(
            """
### FastText

FastText is an extension of Word2Vec that represents each word as a
combination of character n-grams.

#### Advantages:
- Handles rare and out-of-vocabulary words.
- Captures subword information (e.g., prefixes and suffixes).

#### Example:
The word "playing" might be represented by n-grams like "pla", "lay", "ayi", "ing".

#### Applications:
- Multilingual text processing.
- Handling noisy and incomplete data.

#### Limitations:
- Higher computational cost than Word2Vec.
"""
        )
    elif page == "Tokenization":
        st.title("Tokenization")
        st.markdown(
            """
### Tokenization

Tokenization is the process of breaking text into smaller units (tokens) such
as words, phrases, or sentences.

#### Types of Tokenization:
- **Word Tokenization**: Splits text into words.
- **Sentence Tokenization**: Splits text into sentences.

#### Libraries for Tokenization:
- NLTK, spaCy, and Hugging Face Transformers.

#### Example:
Sentence: "NLP is exciting."
- Word tokens: ["NLP", "is", "exciting", "."]

#### Applications:
- Preprocessing text for machine learning models.
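
The word-token example above can be reproduced with a small regex-based tokenizer (a minimal sketch; `tokenize_words` is an illustrative helper, and library tokenizers in NLTK or spaCy handle far more edge cases):

```python
import re

def tokenize_words(text):
    # Words are runs of letters; listed punctuation marks become their own tokens.
    return re.findall("[A-Za-z]+|[.,!?;]", text)

print(tokenize_words("NLP is exciting."))  # ['NLP', 'is', 'exciting', '.']
```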

#### Challenges:
- Handling complex text such as abbreviations, contractions, and multilingual data.
"""
        )
    elif page == "Stop Words":
        st.title("Stop Words")
        st.markdown(
            """
### Stop Words

Stop words are commonly used words in a language that are often removed during
text preprocessing.

#### Examples of Stop Words:
- English: "is", "the", "and", "in".
- Spanish: "es", "el", "y", "en".

#### Why Remove Stop Words?
- To reduce noise and dimensionality in text data.

#### Applications:
- Sentiment analysis, text classification, and search engines.

#### Challenges:
- Some stop words carry context-specific importance (e.g., "not" in sentiment analysis).
"""
        )


# Sidebar navigation
st.sidebar.title("NLP Topics")
menu_options = [
    "Home",
    "NLP Terminologies",
    "One-Hot Vectorization",
    "Bag of Words",
    "TF-IDF Vectorizer",
    "Word2Vec",
    "FastText",
    "Tokenization",
    "Stop Words",
]
selected_page = st.sidebar.radio("Select a topic", menu_options)

# Display the selected page
if selected_page == "Home":
    show_home_page()
else:
    show_page(selected_page)