# Spaces: Sleeping -- page-status text captured along with this source; not part of the program.
import string
import unicodedata

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Page title
st.title('Traditional NLP Techniques')

# Text Preprocessing
st.header('1. Text Preprocessing')
st.subheader('Definition:')
st.write("""
Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
and handling special characters like emojis.
""")

# Code points that carry no meaning once the emoji they decorate is gone:
# zero-width joiner, emoji variation selector, and the five skin-tone modifiers.
_EMOJI_MODIFIERS = frozenset("\u200d\ufe0f") | {
    chr(code_point) for code_point in range(0x1F3FB, 0x1F400)
}


def _strip_emojis(text):
    """Return *text* with emoji characters removed and everything else intact.

    A character is dropped when its Unicode general category is ``So``
    (Symbol, other -- the category of pictographic emoji, which also covers
    a few non-emoji symbols such as the copyright sign) or when it is one of
    the emoji-sequence modifiers above. Letters, digits, whitespace and
    punctuation are preserved; the previous ``isalnum()``/``isspace()``
    filter removed punctuation as a side effect.
    """
    return "".join(
        char
        for char in text
        if char not in _EMOJI_MODIFIERS and unicodedata.category(char) != "So"
    )


# Interactive example for preprocessing
# NOTE(review): the 'π' in the default text looks like a mis-encoded emoji --
# confirm against the original file before changing the literal.
text_input = st.text_area("Enter text to preprocess", "I love NLP! π This is amazing.")

# Punctuation removal (ASCII punctuation only, as defined by string.punctuation)
if st.button('Remove Punctuation'):
    processed_text = text_input.translate(str.maketrans('', '', string.punctuation))
    st.write(f"Text without punctuation: {processed_text}")

# Convert to lowercase
if st.button('Convert to Lowercase'):
    lowercase_text = text_input.lower()
    st.write(f"Text in lowercase: {lowercase_text}")

# Remove emojis while leaving ordinary text and punctuation untouched
if st.button('Remove Emojis'):
    processed_text_no_emoji = _strip_emojis(text_input)
    st.write(f"Text without emojis: {processed_text_no_emoji}")
# Text Vectorization
st.header('2. Text Vectorization')
st.subheader('Definition:')
st.write("""
Text vectorization converts text into numerical form so that machine learning models can process it.
Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
""")

# Let the user pick which vectorizer the demo should run.
vectorization_choice = st.selectbox('Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))

# Tiny fixed corpus used for the demonstration.
sample_text = ["I love programming.", "NLP is fun.", "Streamlit makes things easy!"]

if st.button('Apply Vectorization'):
    # Map each menu entry to the scikit-learn class that implements it; the
    # menu label doubles as the heading of the rendered matrix.
    vectorizer_classes = {
        'Bag of Words': CountVectorizer,
        'TF-IDF': TfidfVectorizer,
    }
    vectorizer_class = vectorizer_classes.get(vectorization_choice)
    if vectorizer_class is not None:
        vectorizer = vectorizer_class()
        X = vectorizer.fit_transform(sample_text)
        st.write(f"{vectorization_choice} representation:\n{X.toarray()}")
        st.write(f"Feature names: {vectorizer.get_feature_names_out()}")
# Basic Machine Learning
st.header('3. Basic Machine Learning')
st.subheader('Definition:')
st.write("""
Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
are commonly used for text classification tasks.
""")


# Streamlit re-executes the whole script on every widget interaction. The
# dataset load and the TF-IDF fit are expensive, so they are cached and run
# only once per server process. ``cache_resource`` hands back the same objects
# on every rerun (no copy), which is fine because nothing below mutates them.
@st.cache_resource
def _load_newsgroups():
    """Return the training subset of the 20 Newsgroups dataset."""
    return fetch_20newsgroups(subset='train')


@st.cache_resource
def _prepare_classification_data():
    """Split the dataset 70/30 and build TF-IDF features for both parts.

    Returns:
        A tuple ``(docs_train, docs_test, labels_train, labels_test,
        tfidf, train_matrix, test_matrix)`` where ``tfidf`` is the vectorizer
        fitted on the training documents only.
    """
    dataset = _load_newsgroups()
    # A fixed seed keeps the split -- and therefore the reported accuracy --
    # reproducible across reruns and restarts.
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(docs_train)
    test_matrix = tfidf.transform(docs_test)
    return docs_train, docs_test, labels_train, labels_test, tfidf, train_matrix, test_matrix


# Load dataset (module-level name kept: the topic-modeling section reads it)
newsgroups = _load_newsgroups()

model_choice = st.selectbox('Choose machine learning model for text classification:',
                            ('Naive Bayes', 'Logistic Regression', 'SVM'))

# Vectorization for classification
(X_train, X_test, y_train, y_test,
 vectorizer, X_train_vec, X_test_vec) = _prepare_classification_data()

# Train model based on choice
if st.button('Train Model'):
    # One factory per menu entry; each call builds a fresh, unfitted estimator.
    model_factories = {
        'Naive Bayes': MultinomialNB,
        'Logistic Regression': lambda: LogisticRegression(max_iter=1000),
        'SVM': SVC,
    }
    model = model_factories[model_choice]()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
# Topic Modeling
st.header('4. Topic Modeling')
st.subheader('Definition:')
st.write("""
Topic modeling is a technique used to identify the underlying topics in a collection of text data.
Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
""")

topic_model_choice = st.selectbox('Choose topic modeling technique:', ('LDA', 'NMF'))

# Demo settings: how many topics to extract and how many words to show for each.
N_TOPICS = 3
N_TOP_WORDS = 10

# Apply LDA or NMF for topic modeling
if st.button('Run Topic Modeling'):
    if topic_model_choice == 'LDA':
        # LDA is a generative model over term counts, so it is fitted on a
        # raw count matrix rather than on TF-IDF weights.
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)
    else:
        # The selectbox offers only 'LDA' and 'NMF'; NMF factorizes the
        # TF-IDF matrix.
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        model = NMF(n_components=N_TOPICS, random_state=42)

    # English stop words are filtered out above; otherwise the top words of
    # every topic are dominated by function words ("the", "of", "to", ...).
    X = vectorizer.fit_transform(newsgroups.data)
    model.fit(X)
    feature_names = vectorizer.get_feature_names_out()

    # Display top words for each topic
    for topic_idx, topic in enumerate(model.components_):
        st.write(f"Topic {topic_idx + 1}:")
        # argsort is ascending: reverse it and keep the highest-weighted words.
        top_words_idx = topic.argsort()[::-1][:N_TOP_WORDS]
        st.write(", ".join(feature_names[i] for i in top_words_idx))