Spaces:
Sleeping
Sleeping
File size: 5,259 Bytes
46155e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain
# Set page configuration.
# NOTE: st.set_page_config must be the first Streamlit command executed.
st.set_page_config(page_title="Advanced Log Analytics", page_icon="🔍")

# Title of the app shown at the top of the main page.
st.title("Interactive Log Analytics with N-gram Keyword Extraction")
# Function to read log files with error handling for invalid UTF-8 sequences
def read_log_file(file):
    """Return the uploaded file's contents decoded as UTF-8.

    Invalid byte sequences are replaced with U+FFFD rather than raising.
    On any read/decode failure, show a Streamlit error and halt the script.
    """
    try:
        raw_bytes = file.read()
        text = raw_bytes.decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()  # raises, halting the Streamlit script run
    return text
# Function to manually extract n-grams from log lines

# Common English filler words excluded before forming n-grams.
# (Set literal instead of set([...]) — same contents, idiomatic form.)
STOPWORDS = {
    'the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on', 'this',
    'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was', 'an', 'be', 'will',
    'or', 'but', 'not',
}


def extract_ngrams(text, n=2):
    """Extract word n-grams from *text* after lowercasing and stop-word removal.

    Args:
        text: Input string (typically one log line).
        n: Words per n-gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        List of space-joined n-gram strings, in order of appearance.
        Empty when fewer than *n* non-stop-word tokens remain.
    """
    # Tokenize on word boundaries; lowercase so counting is case-insensitive.
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Drop stop words first so filler words don't dominate the frequencies.
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # Sliding window of width n built from n offset slices zipped together.
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]
# File uploader for log files (plain-text .txt / .log only).
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])

if uploaded_file is not None:
    # Read the log contents (UTF-8, invalid bytes replaced; see read_log_file).
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters only

    # Preprocess log data (simple split by lines).
    log_lines = log_data.splitlines()

    # Slider to select number of words per n-gram (1 = unigrams, 2 = bigrams, ...).
    n_value = st.slider("Select number of words in N-grams", min_value=1, max_value=5, value=2)

    # Extract n-grams from each log line based on slider value.
    # One sub-list per line, so line positions survive for the heatmap/filter below.
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]

    # Flatten the per-line lists of n-grams and count global frequencies.
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)

    # Convert to DataFrame for viewing and sorting by frequency.
    ngram_df = pd.DataFrame(ngram_frequencies.items(), columns=['N-gram', 'Frequency'])

    # Dropdown to choose most-frequent ("Top N") or least-frequent ("Bottom N") n-grams.
    top_bottom_choice = st.selectbox("Select visualization type", ["Top N", "Bottom N"])

    # Slider to select how many n-grams to display.
    num_ngrams_to_display = st.slider("Select number of N-grams to display", min_value=1, max_value=50, value=20)

    if top_bottom_choice == "Top N":
        # Highest frequencies first.
        selected_ngrams_df = ngram_df.sort_values(by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        # Lowest frequencies first ("Bottom N").
        selected_ngrams_df = ngram_df.sort_values(by='Frequency').head(num_ngrams_to_display)

    # Display selected n-grams with frequencies.
    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)

    # Generate and display a word cloud sized by the selected n-grams' frequencies.
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
        {row['N-gram']: row['Frequency'] for _, row in selected_ngrams_df.iterrows()}
    )
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(plt)

    # Prepare data for heatmap: one row per (n-gram, line-number) occurrence,
    # restricted to the currently selected n-grams.
    heatmap_data = pd.DataFrame([(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
                                columns=['N-gram', 'LineNumber'])
    heatmap_data = heatmap_data[heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    # Cross-tab: rows = line numbers, columns = n-grams, values = occurrence counts.
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'], columns=heatmap_data['N-gram'])

    # Create a heatmap using Plotly (matrix transposed: x = line numbers, y = n-grams).
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)

    # Dropdown to filter raw log lines by a selected n-gram ('None' disables the filter).
    filter_ngram_choice = st.selectbox("Filter logs by N-gram", options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        # Keep only lines whose extracted n-grams include the chosen one.
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs) if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))
# Sidebar information: static "About" panel describing the tool.
st.sidebar.title("About")
st.sidebar.info("""
This tool uses manual extraction of n-grams from log files to identify trends, anomalies,
and potential issues through interactive visualization and analysis.
""")
|