# Interactive Log Analytics — Streamlit app (n-gram keyword extraction)
# Third-party dependencies: UI (streamlit), tabular data (pandas),
# word-cloud rendering (wordcloud + matplotlib), interactive charts (plotly).
import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
# Standard library: tokenization, frequency counting, list flattening.
import re
from collections import Counter
from itertools import chain

# Set page configuration
# NOTE(review): Streamlit expects set_page_config to be the first st.* call
# in a script run — keep it ahead of any other Streamlit statement.
st.set_page_config(page_title="Advanced Log Analytics", page_icon="π")

# Title of the app
st.title("Interactive Log Analytics with N-gram Keyword Extraction")
def read_log_file(file):
    """Decode an uploaded log file to text.

    Reads the file-like object's raw bytes and decodes them as UTF-8,
    substituting U+FFFD for any invalid byte sequences instead of raising.
    On any read/decode failure the error is shown in the UI and the
    Streamlit script run is halted via st.stop().
    """
    try:
        content = file.read().decode('utf-8', errors='replace')
    except Exception as err:
        st.error(f'Error reading file: {str(err)}')
        st.stop()
    else:
        return content
# Common English filler words excluded before n-gram extraction so results
# surface meaningful log terms. frozenset: immutable, O(1) membership.
STOPWORDS = frozenset({
    'the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on',
    'this', 'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was',
    'an', 'be', 'will', 'or', 'but', 'not',
})

# Compiled once at import time: extract_ngrams runs per log line, so
# hoisting the pattern avoids a per-call cache lookup in `re`.
_WORD_RE = re.compile(r'\b\w+\b')


def extract_ngrams(text, n=2):
    """Manually extract n-grams from a piece of text (e.g. one log line).

    Tokens are lowercased word-character runs; stopwords are dropped
    *before* n-grams are formed, so an n-gram may bridge a removed
    stopword ("error in disk" -> "error disk").

    Args:
        text: Arbitrary input string.
        n: Words per n-gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        List of space-joined n-gram strings; empty when fewer than n
        tokens survive stopword filtering.
    """
    tokens = [tok for tok in _WORD_RE.findall(text.lower())
              if tok not in STOPWORDS]
    # zip over n progressively-shifted views yields each consecutive
    # window of n tokens.
    return [' '.join(gram) for gram in zip(*(tokens[i:] for i in range(n)))]
# File uploader for log files (.txt / .log); all analysis below runs only
# once a file has been chosen.
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])
if uploaded_file is not None:
    # Read the log contents (UTF-8, invalid bytes replaced — see read_log_file)
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters
    # Preprocess log data (simple split by lines)
    log_lines = log_data.splitlines()
    # Slider to select number of n-grams (1 for unigrams, 2 for bigrams, etc.)
    n_value = st.slider("Select number of words in N-grams", min_value=1, max_value=5, value=2)
    # Extract n-grams from each log line based on slider value; the per-line
    # grouping is kept (not flattened yet) so the heatmap and the log filter
    # below can map each n-gram back to its originating line.
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]
    # Flatten the list of n-grams and count their frequencies
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)
    # Convert to DataFrame for viewing and sort by frequency
    ngram_df = pd.DataFrame(ngram_frequencies.items(), columns=['N-gram', 'Frequency'])
    # Dropdown to select top or bottom N n-grams for visualization
    top_bottom_choice = st.selectbox("Select visualization type", ["Top N", "Bottom N"])
    # Slider to select how many N-grams to display
    num_ngrams_to_display = st.slider("Select number of N-grams to display", min_value=1, max_value=50, value=20)
    if top_bottom_choice == "Top N":
        # Most frequent n-grams first
        selected_ngrams_df = ngram_df.sort_values(by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        # Least frequent n-grams first
        selected_ngrams_df = ngram_df.sort_values(by='Frequency').head(num_ngrams_to_display)
    # Display selected N-grams with frequencies
    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)
    # Generate and display a word cloud of selected N-grams
    # NOTE(review): generate_from_frequencies raises ValueError on an empty
    # frequency dict (e.g. an empty or all-stopword log) — confirm whether a
    # guard / friendly message is wanted here.
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
        {row['N-gram']: row['Frequency'] for _, row in selected_ngrams_df.iterrows()}
    )
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(plt)
    # Prepare data for heatmap: one (n-gram, line-number) row per occurrence
    heatmap_data = pd.DataFrame([(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
                                columns=['N-gram', 'LineNumber'])
    # Keep only the n-grams selected above
    heatmap_data = heatmap_data[heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    # Rows = line numbers, columns = n-grams, cells = occurrence counts
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'], columns=heatmap_data['N-gram'])
    # Create a heatmap using Plotly (matrix transposed so n-grams run along
    # the y-axis and line numbers along the x-axis)
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)
    # Dropdown to filter logs based on selected N-grams
    filter_ngram_choice = st.selectbox("Filter logs by N-gram", options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        # A line matches when the chosen n-gram was extracted from it
        # (zip keeps log_lines and their per-line n-gram lists aligned)
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs) if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))
# Sidebar information (static "About" panel; rendered on every script run)
st.sidebar.title("About")
st.sidebar.info("""
This tool uses manual extraction of n-grams from log files to identify trends, anomalies,
and potential issues through interactive visualization and analysis.
""")