Spaces:

pranayreddy316
/

Zero_to_Hero_in_Machine_Learning

Build error

App Files Files Community

pranayreddy316 committed on Apr 7

Commit

43af36a

verified ·

1 Parent(s): 888390e

Upload The NLP Basic_Terminologies.py

Browse files

Files changed (1) hide show

pages/The NLP Basic_Terminologies.py +119 -0

pages/The NLP Basic_Terminologies.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import streamlit as st
+# Streamlit App Title and Introduction
+st.title("Basic Terminology in NLP")
+st.write(
+    """
+    Before diving deep into the concepts of NLP, it's crucial to understand the basic terminologies frequently used in this domain.
+    These terms lay the foundation for exploring more advanced NLP topics.
+    """
+)
+# Section: Key Terminologies in NLP
+st.header("1. Key Terminologies in NLP")
+st.write(
+    """
+    - **Corpus**: A collection of text documents.
+      Example: {d1, d2, d3, ...}
+    - **Document**: A single unit of text (e.g., a sentence, paragraph, or article).
+    - **Paragraph**: A collection of sentences.
+    - **Sentence**: A collection of words forming a meaningful expression.
+    - **Word**: A collection of characters.
+    - **Character**: A basic unit like an alphabet, number, or special symbol.
+    """
+)
+# Section: Tokenization
+st.header("2. Tokenization")
+st.write(
+    """
+    Tokenization is the process of splitting text into smaller units, called tokens.
+    Types of Tokenization:
+    - **Sentence Tokenization**: Splitting text into sentences.
+      Example: "I love ice-cream. I love chocolate." → ["I love ice-cream", "I love chocolate"]
+    - **Word Tokenization**: Splitting sentences into words.
+      Example: "I love biryani" → ["I", "love", "biryani"]
+    - **Character Tokenization**: Splitting words into characters.
+      Example: "Love" → ["L", "o", "v", "e"]
+    """
+)
+if st.button("Try Tokenization Example"):
+    text = "Streamlit makes NLP visualization interactive."
+    st.write(f"Original Text: {text}")
+    st.write(f"Word Tokens: {text.split()}")
+# Section: Stop Words
+st.header("3. Stop Words")
+st.write(
+    """
+    Stop words are commonly used words in a language that are ignored during text processing as they contribute little to the overall meaning.
+    Example:
+    - Sentence: "In Hyderabad, we can eat famous biryani."
+    - Stop words: ["in", "we", "can"]
+    """
+)
+if st.button("View Processed Text without Stop Words"):
+    text = "In Hyderabad, we can eat famous biryani."
+    stop_words = ["in", "we", "can"]
+    filtered_text = " ".join([word for word in text.split() if word.lower() not in stop_words])
+    st.write(f"Processed Text: {filtered_text}")
+# Section: Vectorization
+st.header("4. Vectorization")
+st.write(
+    """
+    Vectorization converts text data into numerical formats for machine learning models, enabling text processing and analysis.
+    Types of Vectorization:
+    - **One-Hot Encoding**: Represents each word as a binary vector.
+    - **Bag of Words (BoW)**: Represents text based on word frequencies.
+    - **TF-IDF (Term Frequency-Inverse Document Frequency)**: Adjusts word frequency by importance.
+    - **Word2Vec**: Embeds words in a vector space using deep learning.
+    - **GloVe**: Uses global co-occurrence statistics for embedding.
+    - **FastText**: Similar to Word2Vec but includes subword information.
+    """
+)
+# Section: Stemming
+st.header("5. Stemming")
+st.write(
+    """
+    Stemming reduces words to their base or root form by chopping off prefixes or suffixes. It is a rule-based heuristic process
+    and can produce words that may not be valid in the language.
+    Example:
+    - Original Words: "running", "runner", "runs"
+    - Stemmed Form: "run"
+    """
+)
+if st.button("Try Stemming Example"):
+    words = ["running", "runner", "runs"]
+    stemmed_words = [word[:-3] if word.endswith("ing") else word[:-2] if word.endswith("er") else word for word in words]
+    st.write("Original Words:", words)
+    st.write("Stemmed Words:", stemmed_words)
+# Section: Lemmatization
+st.header("6. Lemmatization")
+st.write(
+    """
+    Lemmatization reduces words to their dictionary or base form, called a lemma, while considering the context of the word in a sentence.
+    Example:
+    - Original Words: "studying", "better", "carrying"
+    - Lemmatized Form: "study", "good", "carry"
+    Lemmatization is more accurate than stemming but computationally more intensive as it requires a language dictionary.
+    """
+)
+if st.button("Try Lemmatization Example"):
+    words = ["studying", "better", "carrying"]
+    lemmatized_words = ["study" if word == "studying" else "good" if word == "better" else "carry" if word == "carrying" else word for word in words]
+    st.write("Original Words:", words)
+    st.write("Lemmatized Words:", lemmatized_words)