"""Streamlit app: scrape Wikipedia pages, extract TF-IDF keywords, and
visualize word frequencies and document clusters."""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from keras.preprocessing.text import Tokenizer
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Set up the Streamlit app
st.set_page_config(page_title='Keyword Extraction and Clustering')


def load_wiki_data(pages):
    """Fetch the plain text of each Wikipedia page title in *pages*.

    Parameters
    ----------
    pages : iterable of str
        Wikipedia page titles (URL path segments, e.g. 'Data_science').

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched page, single 'text' column.
        Pages that fail to download are skipped with a Streamlit warning
        instead of crashing the whole app.
    """
    data = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        try:
            # Timeout so a hung request cannot freeze the Streamlit run;
            # raise_for_status turns HTTP errors (404, 5xx) into exceptions
            # instead of silently scraping an error page.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            st.warning(f'Could not load {url}: {exc}')
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        data.append(soup.get_text())
    return pd.DataFrame({'text': data})


def plot_word_frequency(text):
    """Render a Plotly bar chart of word counts across the documents in *text*.

    Bars are colored by a simple word-shape category (uppercase / titlecase /
    lowercase / other).
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    word_counts = tokenizer.word_counts
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    # Categorize words by type and assign a color per category.
    # NOTE(review): Keras Tokenizer lowercases input by default, so the
    # 'uppercase'/'titlecase' branches appear unreachable — confirm whether
    # `Tokenizer(lower=False)` was intended.
    word_types = {}
    for word in words:
        if word.isalpha():
            if word.isupper():
                word_types[word] = 'uppercase'
            elif word.istitle():
                word_types[word] = 'titlecase'
            else:
                word_types[word] = 'lowercase'
        else:
            word_types[word] = 'other'
    colors = {'uppercase': 'red', 'titlecase': 'green',
              'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[word_types[word]] for word in words]

    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)


def plot_keyword_clusters(keywords, clusters):
    """Scatter-plot 2-D keyword coordinates colored by cluster label.

    Parameters
    ----------
    keywords : ndarray of shape (n, 2)
        2-D coordinates, one row per point.
    clusters : array-like of length n
        Cluster label for each point (used as the color value).
    """
    fig, ax = plt.subplots()
    ax.scatter(keywords[:, 0], keywords[:, 1], c=clusters)
    st.pyplot(fig)


# --- Main Streamlit app ---
pages = ['Python_(programming_language)', 'Data_science', 'Machine_learning']
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(pages)
    st.write('Data loaded')
else:
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')
st.write(df)

text = df['text'].tolist()
if text:
    # Keyword extraction: TF-IDF over the page texts.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)
    #feature_names = vectorizer.get_feature_names()

    # Cluster the documents. Clamp n_clusters to the number of documents:
    # the original hard-coded 3 crashes KMeans whenever fewer than three
    # pages load successfully (n_clusters must be <= n_samples).
    n_clusters = min(3, len(text))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    # First two TF-IDF dimensions of each cluster center serve as the
    # 2-D plotting coordinates.
    # NOTE(review): `keywords` has one row per CLUSTER while `labels_` has
    # one entry per DOCUMENT; the lengths only match when every document
    # forms its own cluster (as with the default 3 pages) — verify intent.
    keywords = kmeans.cluster_centers_[:, :2]

    # Plot word frequency and keyword clusters
    plot_word_frequency(text)
    plot_keyword_clusters(keywords, kmeans.labels_)