Spaces:
Sleeping
Sleeping
File size: 5,259 Bytes
46155e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain
# Set page configuration.
# NOTE: st.set_page_config must be the first Streamlit command executed.
st.set_page_config(page_title="Advanced Log Analytics", page_icon="🔍")

# Title of the app shown at the top of the main page.
st.title("Interactive Log Analytics with N-gram Keyword Extraction")
# Function to read log files with error handling for invalid UTF-8 sequences
def read_log_file(file):
    """Return the uploaded file's contents decoded as UTF-8.

    Invalid byte sequences are replaced with U+FFFD rather than raising.
    On any read/decode failure, show a Streamlit error and halt the script.
    """
    try:
        raw_bytes = file.read()
        text = raw_bytes.decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()  # raises, halting the Streamlit script run
    return text
# Function to manually extract n-grams from log lines

# Common English filler words excluded before forming n-grams.
# (Set literal instead of set([...]) — same contents, idiomatic form.)
STOPWORDS = {
    'the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on', 'this',
    'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was', 'an', 'be', 'will',
    'or', 'but', 'not',
}


def extract_ngrams(text, n=2):
    """Extract word n-grams from *text* after lowercasing and stop-word removal.

    Args:
        text: Input string (typically one log line).
        n: Words per n-gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        List of space-joined n-gram strings, in order of appearance.
        Empty when fewer than *n* non-stop-word tokens remain.
    """
    # Tokenize on word boundaries; lowercase so counting is case-insensitive.
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Drop stop words first so filler words don't dominate the frequencies.
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # Sliding window of width n built from n offset slices zipped together.
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]
# File uploader for log files (plain-text .txt / .log only).
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])

if uploaded_file is not None:
    # Read the log contents (UTF-8, invalid bytes replaced; see read_log_file).
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters only

    # Preprocess log data (simple split by lines).
    log_lines = log_data.splitlines()

    # Slider to select number of words per n-gram (1 = unigrams, 2 = bigrams, ...).
    n_value = st.slider("Select number of words in N-grams", min_value=1, max_value=5, value=2)

    # Extract n-grams from each log line based on slider value.
    # One sub-list per line, so line positions survive for the heatmap/filter below.
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]

    # Flatten the per-line lists of n-grams and count global frequencies.
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)

    # Convert to DataFrame for viewing and sorting by frequency.
    ngram_df = pd.DataFrame(ngram_frequencies.items(), columns=['N-gram', 'Frequency'])

    # Dropdown to choose most-frequent ("Top N") or least-frequent ("Bottom N") n-grams.
    top_bottom_choice = st.selectbox("Select visualization type", ["Top N", "Bottom N"])

    # Slider to select how many n-grams to display.
    num_ngrams_to_display = st.slider("Select number of N-grams to display", min_value=1, max_value=50, value=20)

    if top_bottom_choice == "Top N":
        # Highest frequencies first.
        selected_ngrams_df = ngram_df.sort_values(by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        # Lowest frequencies first ("Bottom N").
        selected_ngrams_df = ngram_df.sort_values(by='Frequency').head(num_ngrams_to_display)

    # Display selected n-grams with frequencies.
    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)

    # Generate and display a word cloud sized by the selected n-grams' frequencies.
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
        {row['N-gram']: row['Frequency'] for _, row in selected_ngrams_df.iterrows()}
    )
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(plt)

    # Prepare data for heatmap: one row per (n-gram, line-number) occurrence,
    # restricted to the currently selected n-grams.
    heatmap_data = pd.DataFrame([(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
                                columns=['N-gram', 'LineNumber'])
    heatmap_data = heatmap_data[heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    # Cross-tab: rows = line numbers, columns = n-grams, values = occurrence counts.
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'], columns=heatmap_data['N-gram'])

    # Create a heatmap using Plotly (matrix transposed: x = line numbers, y = n-grams).
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)

    # Dropdown to filter raw log lines by a selected n-gram ('None' disables the filter).
    filter_ngram_choice = st.selectbox("Filter logs by N-gram", options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        # Keep only lines whose extracted n-grams include the chosen one.
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs) if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))
# Sidebar information: static "About" panel describing the tool.
st.sidebar.title("About")
st.sidebar.info("""
This tool uses manual extraction of n-grams from log files to identify trends, anomalies,
and potential issues through interactive visualization and analysis.
""")
|