# The_NLP_Steps.py — uploaded by pranayreddy316 (commit 5905397, verified)
import streamlit as st
import pandas as pd
import re
def main_page():
    """Landing page: lists the three NLP-specific project steps.

    Navigation: each button stores a page key in ``st.session_state.page``
    and forces an immediate rerun so the router at the bottom of the module
    renders the selected page on the same click. Without the rerun, the
    router has already dispatched this script run, so the page would only
    change on the *next* interaction.
    """
    st.title("Important Steps in NLP Project")
    # Recap of the two steps covered elsewhere in the app.
    st.write("""
In our **ZERO TO HERO IN ML** app, we have already learned about the first two steps of an NLP project:
1. **Problem Statement**
2. **Data Collection**
On this page, we will explore the next three main steps specific to an NLP project. These steps are essential for processing and understanding text data.
""")
    st.header("Three Main Steps in an NLP Project")

    # Step 1: Simple EDA of Text
    st.subheader("1. Simple EDA of Text")
    st.write("""
**Exploratory Data Analysis (EDA)** helps you understand the structure and quality of the text data.
Some key actions in EDA for text include:
- Checking for missing values
- Examining data distribution
- Identifying patterns like URLs, mentions (@, #), and numeric data
- Understanding the case format and punctuation
- Spotting special characters, HTML/XML tags, and emojis
""")
    if st.button("Know More About Simple EDA"):
        st.session_state.page = "simple_eda_app"
        # Fix: rerun now so the router picks up the new page on this click.
        st.rerun()
    st.markdown("---")

    # Step 2: Pre-Processing of Text
    st.subheader("2. Pre-Processing of Text")
    st.write("""
**Pre-processing** prepares the raw text data for analysis by:
- Converting text to lowercase (Case Normalization)
- Removing special characters, punctuation, and numbers
- Eliminating stopwords (e.g., "the", "and", "in")
- Expanding contractions (e.g., "can't" to "cannot")
- Handling URLs, emails, mentions, and hashtags
- Using Stemming or Lemmatization to reduce words to their base forms
- Converting emojis into textual descriptions or removing them
""")
    if st.button("Know More About Pre-Processor"):
        st.session_state.page = "pre_processing"
        # Fix: rerun now so the router picks up the new page on this click.
        st.rerun()
    st.markdown("---")

    # Step 3: Feature Engineering of Text (informational only — no subpage)
    st.subheader("3. Feature Engineering of Text")
    st.write("""
**Feature Engineering** involves extracting meaningful features from text data, such as:
- **Bag of Words (BoW)**: Converting text to word counts
- **TF-IDF (Term Frequency-Inverse Document Frequency)**: Highlighting important terms
- **Word Embeddings**: Representing words in numerical vector format (e.g., Word2Vec, GloVe, FastText)
- **N-grams**: Generating word sequences for richer context
- **Custom Features**: Length of the text, sentiment scores, and more
""")
    st.markdown("---")
    st.markdown("""
**Note:** These three steps are explained in the context of NLP projects that primarily deal with **text data**.
- Do not confuse these steps with the general roadmap for a machine learning project, as they are tailored for NLP-specific tasks.
""")
# Define the main EDA function
def _eda_regex_step(data, header, snippet, column, pattern, message):
    """Render one regex-based EDA step.

    Shows *header* and the illustrative *snippet*, adds a boolean *column*
    to *data* flagging each review that matches *pattern*, then displays
    *message* and the augmented dataframe.
    """
    st.header(header)
    st.code(snippet, language="python")
    data[column] = data["Review"].apply(lambda x: bool(re.search(pattern, x)))
    st.write(message)
    st.dataframe(data)


def simple_eda_app():
    """Interactive walkthrough of simple EDA checks for text data.

    Builds a small fixed sample dataset, lets the user pick one EDA step
    from a selectbox, and runs/illustrates that step on the sample.
    """
    st.title("Simple EDA for Text Data in NLP")
    st.write("""
This application demonstrates various steps involved in Simple EDA (Exploratory Data Analysis) for text data.
These steps help assess the quality and structure of the collected text data, which is crucial for successful preprocessing and NLP projects.
""")
    # Sample rows chosen so each detection step below has at least one hit
    # (emoji, email, HTML tag, mention, number, URL, punctuation, date).
    data = pd.DataFrame({
        "Review": [
            "I ❤️ programming with Python",
            "Contact us at support@python.org",
            "Debugging <i>errors</i> is tedious",
            "@John loves Python",
            "AI has grown exponentially in 2023",
            "Visit https://www.github.com/",
            "Coding is fun!",
            "Learning AI is exciting",
            "Learn AI in 12/05/2023"
        ]
    })
    st.write("Below is the sample dataset we will use:")
    st.dataframe(data)

    selected_option = st.selectbox(
        "Choose a step to explore:",
        [
            "Introduction to Simple EDA",
            "Check Case Format",
            "Detect HTML/XML Tags",
            "Detect Mentions (@, #)",
            "Detect Numeric Data",
            "Detect URLs",
            "Detect Punctuation & Special Characters",
            "Detect Emojis (Code Only)",
            "Detect Dates and Times",
            "Detect Emails"
        ]
    )

    # Option name -> (header, displayed snippet, result column, regex, caption)
    # for every step that follows the same "show code, run one regex, show
    # result" shape; the remaining options are handled individually below.
    regex_steps = {
        "Detect HTML/XML Tags": (
            "Step 2: Detect HTML/XML Tags",
            r"""
data["Contains Tags"] = data["Review"].apply(lambda x: bool(re.search(r"<.*?>", x)))
""",
            "Contains Tags",
            r"<.*?>",
            "Rows with HTML/XML tags detected:",
        ),
        "Detect Mentions (@, #)": (
            "Step 3: Detect Mentions (@, #)",
            r"""
data["Contains Mentions"] = data["Review"].apply(lambda x: bool(re.search(r"\B[@#]\S+", x)))
""",
            "Contains Mentions",
            r"\B[@#]\S+",
            "Rows with mentions identified:",
        ),
        "Detect Numeric Data": (
            "Step 4: Detect Numeric Data",
            r"""
data["Contains Numeric"] = data["Review"].apply(lambda x: bool(re.search(r"\d+", x)))
""",
            "Contains Numeric",
            r"\d+",
            "Rows containing numeric data:",
        ),
        "Detect URLs": (
            "Step 5: Detect URLs",
            r"""
data["Contains URL"] = data["Review"].apply(lambda x: bool(re.search(r"https?://\S+", x)))
""",
            "Contains URL",
            r"https?://\S+",
            "Rows containing URLs:",
        ),
        "Detect Punctuation & Special Characters": (
            "Step 6: Detect Punctuation & Special Characters",
            r"""
data["Contains Punctuation"] = data["Review"].apply(
    lambda x: bool(re.search(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', x))
)
""",
            "Contains Punctuation",
            r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]',
            "Rows with punctuation or special characters identified:",
        ),
        "Detect Dates and Times": (
            "Step 8: Detect Dates and Times",
            r"""
data["Contains Date/Time"] = data["Review"].apply(
    lambda x: bool(re.search(r"\d{1,2}/\d{1,2}/\d{4}|\d{4}/\d{1,2}/\d{1,2}", x))
)
""",
            "Contains Date/Time",
            r"\d{1,2}/\d{1,2}/\d{4}|\d{4}/\d{1,2}/\d{1,2}",
            "Rows with date and time information detected:",
        ),
        "Detect Emails": (
            "Step 9: Detect Emails",
            r"""
data["Contains Email"] = data["Review"].apply(lambda x: bool(re.search(r"\S+@\S+", x)))
""",
            "Contains Email",
            r"\S+@\S+",
            "Rows containing emails:",
        ),
    }

    if selected_option == "Introduction to Simple EDA":
        st.header("Introduction to Simple EDA for Text Data")
        st.write("""
Exploratory Data Analysis (EDA) for text data helps examine, visualize, and summarize unstructured datasets.
These analyses reveal patterns, outliers, and inconsistencies to ensure better preprocessing and model accuracy.
""")
    elif selected_option == "Check Case Format":
        # Not regex-based, so handled outside the dispatch table.
        st.header("Step 1: Check Case Format")
        st.code("""
data["Case Format"] = data["Review"].apply(
    lambda x: "Lower/Upper" if x.islower() or x.isupper() else "Mixed"
)
""", language="python")
        data["Case Format"] = data["Review"].apply(
            lambda x: "Lower/Upper" if x.islower() or x.isupper() else "Mixed"
        )
        st.write("Identified case formats in the dataset:")
        st.dataframe(data)
    elif selected_option == "Detect Emojis (Code Only)":
        st.header("Step 7: Detect Emojis (Code Only)")
        st.write("""
Here is the code for detecting emojis in text data using Python:
""")
        # Display-only: the third-party `emoji` package may not be installed,
        # so this snippet is shown but deliberately not executed.
        st.code("""
import emoji
data["Contains Emojis"] = data["Review"].apply(lambda x: bool(emoji.emoji_count(x)))
""", language="python")
        st.write("Emojis add meaning and emotion to text. Handle them based on your project needs.")
    elif selected_option in regex_steps:
        _eda_regex_step(data, *regex_steps[selected_option])
def preprocessing():
    """Step-by-step text preprocessing demo on a tiny review dataset.

    Each step displays the code being taught, applies it to ``data``, and
    shows the intermediate dataframe so the effect of every transformation
    is visible.
    """
    st.title("Text Preprocessing in NLP")
    st.write("""
Preprocessing in Natural Language Processing (NLP) transforms raw, unstructured text data
into a clean format suitable for modeling. The following steps help standardize the data,
remove unwanted elements, and extract meaningful information.
""")
    # Example rows exercising emoji, hashtag, mention, and URL handling.
    data = pd.DataFrame({
        "Review": [
            "I love Hyderabad Biryani!",
            "I hate other places Biryani.",
            "I like the Cooking process! 😊",
            "Follow us on #Instagram @foodies. http://example.com"
        ]
    })
    st.subheader("Original Data:")
    st.dataframe(data)

    # Step-1: Case Normalization
    st.subheader("Step 1: Case Normalization")
    st.write("Convert all text to lowercase to ensure consistency.")
    st.code("""
data['Review'] = data['Review'].str.lower()
""")
    data["Review"] = data["Review"].str.lower()
    st.write("Updated Data (Lowercase Text):")
    st.dataframe(data)
    st.markdown("---")

    # Step-2: Removing Noise (HTML tags, URLs, emails, mentions/hashtags)
    st.subheader("Step 2: Removing Noise")
    st.write("Remove unwanted special characters, HTML/XML tags, URLs, email addresses, mentions, and hashtags.")
    st.code(r"""
# Removing HTML tags
data['Review'] = data['Review'].apply(lambda x: re.sub('<.*?>', ' ', x))
# Removing URLs
data['Review'] = data['Review'].apply(lambda x: re.sub(r'https?://\S+', ' ', x))
# Removing Emails
data['Review'] = data['Review'].apply(lambda x: re.sub(r'\S+@\S+', ' ', x))
# Removing Mentions and Hashtags
data['Review'] = data['Review'].apply(lambda x: re.sub(r'\B[@#]\S+', ' ', x))
""")
    data["Review"] = data["Review"].apply(lambda x: re.sub('<.*?>', ' ', x))
    # Fix: raw string — '\S' in a plain literal is an invalid escape sequence.
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'https?://\S+', ' ', x))
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'\S+@\S+', ' ', x))
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'\B[@#]\S+', ' ', x))
    st.write("Updated Data (After Noise Removal):")
    st.dataframe(data)
    st.markdown("---")

    # Step-3: Emoji Handling
    st.subheader("Step 3: Emoji Handling")
    st.write("Convert emojis to descriptive text or remove them.")
    # Fix: the displayed snippet now matches the code actually executed below
    # (the original displayed emoji.demojize but ran a non-ASCII regex).
    st.code(r"""
# Replace emojis (any non-ASCII run) with the placeholder 'EMOJI'
data['Review'] = data['Review'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', 'EMOJI', x))
""")
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'[^\x00-\x7F]+', 'EMOJI', x))
    st.write("Updated Data (After Emoji Handling):")
    st.dataframe(data)
    st.markdown("---")

    # Step-4: Removing Stopwords (tiny inline list instead of NLTK)
    st.subheader("Step 4: Removing Stopwords")
    st.write("Remove common words like 'and', 'is', which don't add value.")
    st.code("""
stopwords = ["and", "the", "is", "in", "to", "for", "on"]
data['Review'] = data['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
""")
    stopwords = ["and", "the", "is", "in", "to", "for", "on"]  # example list only
    data["Review"] = data["Review"].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
    st.write("Updated Data (After Stopwords Removal):")
    st.dataframe(data)
    st.markdown("---")

    # Step-5: Removing Punctuation and Digits
    st.subheader("Step 5: Removing Punctuation and Digits")
    st.write("Remove punctuation marks and digits if not meaningful.")
    st.code(r"""
# Removing Punctuation
data['Review'] = data['Review'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
# Removing Digits
data['Review'] = data['Review'].apply(lambda x: re.sub(r'\d+', '', x))
""")
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'\d+', '', x))
    st.write("Updated Data (After Removing Punctuation and Digits):")
    st.dataframe(data)
    st.markdown("---")

    # Step-6: Fixing Contractions
    # NOTE(review): by this point lowercasing (step 1) and punctuation removal
    # (step 5) have already mangled words like "can't", so these dictionary
    # keys can never match this sample; in a real pipeline expand contractions
    # *before* removing punctuation. Kept as-is to preserve the demo's output.
    st.subheader("Step 6: Fixing Contractions")
    st.write("Expand contractions like 'can't' to 'cannot'.")
    st.code("""
contractions_dict = {"can't": "cannot", "won't": "will not", "I'm": "I am", "you're": "you are"}
data['Review'] = data['Review'].apply(lambda x: ' '.join([contractions_dict.get(word, word) for word in x.split()]))
""")
    contractions_dict = {"can't": "cannot", "won't": "will not", "I'm": "I am", "you're": "you are"}
    data["Review"] = data["Review"].apply(lambda x: ' '.join([contractions_dict.get(word, word) for word in x.split()]))
    st.write("Updated Data (After Fixing Contractions):")
    st.dataframe(data)
    st.markdown("---")

    # Step-7: Handling Dates and Times
    st.subheader("Step 7: Handling Dates and Times")
    st.write("Standardize dates and times into a uniform format.")
    st.code(r"""
# Example: Replacing date-like patterns with 'DATE'
data['Review'] = data['Review'].apply(lambda x: re.sub(r'\b\d{1,2}\/\d{1,2}\/\d{4}\b', 'DATE', x))
""")
    data["Review"] = data["Review"].apply(lambda x: re.sub(r'\b\d{1,2}\/\d{1,2}\/\d{4}\b', 'DATE', x))
    st.write("Updated Data (After Handling Dates and Times):")
    st.dataframe(data)
    st.markdown("---")

    st.subheader("Final Cleaned Data:")
    st.dataframe(data)
# --- Page routing -----------------------------------------------------------
# Default to the landing page on the first script run of a session.
if 'page' not in st.session_state:
    st.session_state.page = 'main'

# Map each session-state page key to its renderer; an unknown key (same as
# the original if/elif chain) simply renders nothing.
_PAGES = {
    'main': main_page,
    'simple_eda_app': simple_eda_app,
    'pre_processing': preprocessing,
}
_render = _PAGES.get(st.session_state.page)
if _render is not None:
    _render()