# xyncz's picture
# Upload 10 files
# 6f30b20 verified
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# Column groups offered by the selectors on the page.
_CATEGORICAL_COLUMNS = ('CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO')
_NUMERICAL_COLUMN_GROUPS = (
    ('URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH', 'APP_PACKETS', 'DNS_QUERY_TIMES'),
    ('TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES'),
    ('SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES'),
)

# Size (in inches) shared by every matplotlib figure on the page.
_FIGSIZE = (15, 5)

# **Explanation directly taken from the website:**
_DATASET_EXPLANATION = '''
- **URL**: It is the anonymous identification of the URL analyzed in the study.
- **URL_LENGTH**: It is the number of characters in the URL.
- **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as, “/”, “%”, “#”, “&”, “. “, “=”.
- **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
- **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
- **CONTENT_LENGTH**: It represents the content size of the HTTP header.
- **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
- **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
- **WHOIS_REGDATE**: Whois provides the server registration date, so, this variable has date values with format DD/MM/YYY HH:MM
- **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
- **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
- **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
- **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
- **APP_BYTES**: This is the number of bytes transferred.
- **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
- **REMOTE_APP_PACKETS**: Packets received from the server.
- **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
- **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
- **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
'''


def _show_figure(fig):
    """Render *fig* on the Streamlit page, then close it.

    Streamlit re-runs the script on every widget interaction; without the
    explicit close each run would leave its figures registered in pyplot.
    """
    st.pyplot(fig)
    plt.close(fig)


def _plot_histogram_selector(df, columns):
    """Show a selectbox over *columns* and a 30-bin histogram (with KDE) of the pick."""
    column = st.selectbox('Select Column:', columns)
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.histplot(df[column], bins=30, kde=True, ax=ax)
    _show_figure(fig)


def app():
    """Render the EDA page for the malicious/benign website dataset.

    Reads 'web.jpg' and 'dataset.csv' via relative paths (resolved against
    the current working directory), shows the raw dataframe and a textual
    description of its columns, then draws:

    * a pie chart of a user-selected categorical column,
    * three histograms, each for a user-selected numerical column,
    * a box plot of URL_LENGTH grouped by Type,
    * a scatter plot of URL_LENGTH vs CONTENT_LENGTH coloured by Type.

    Returns:
        None. All output is written to the Streamlit page.
    """
    # Title and subheader
    st.title('Malicious or Benign Website Detection')
    st.subheader('EDA for Malicious or Benign Website Detection')

    # Header image; the context manager releases the file handle once rendered.
    with Image.open('web.jpg') as image:
        st.image(image, caption = 'Malicious or Benign Website')

    # Horizontal rule
    st.markdown('----')

    # Load the pandas dataframe and show it
    df = pd.read_csv('dataset.csv')
    st.dataframe(df)

    # Dataset explanation
    st.write('#### Dataset Explanation')
    st.write(_DATASET_EXPLANATION)

    # Pie chart of the selected categorical column
    st.write('#### Plot Categorical Columns using Pie Chart')
    option_cat = st.selectbox('Select Column:', _CATEGORICAL_COLUMNS)
    counts = df[option_cat].value_counts()  # computed once, reused for sizes and labels
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=180)
    _show_figure(fig)

    # TODO: line plot of WHOIS_REGDATE / WHOIS_UPDATED_DATE split by Type
    # (an earlier attempt was disabled).

    # Histograms: one selector + plot per group of numerical columns
    st.write('#### Plot Numerical Columns')
    for columns in _NUMERICAL_COLUMN_GROUPS:
        _plot_histogram_selector(df, columns)

    # Box plot of URL_LENGTH per Type, coloured by Type
    st.write('#### Plot Type Column Count with Boxplot')
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.boxplot(x='Type', y='URL_LENGTH', data=df, hue='Type', ax=ax)
    _show_figure(fig)

    # Sort DataFrame by 'Type' before drawing the scatter plot
    df = df.sort_values('Type')

    # NOTE: the heading says "Plotly", but the chart below is drawn with
    # seaborn/matplotlib; the heading text is kept unchanged.
    st.write('#### Plotly Plot - URL_LENGTH vs CONTENT_LENGTH')
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.scatterplot(x='URL_LENGTH', y='CONTENT_LENGTH', data=df, hue='Type', ax=ax)
    _show_figure(fig)
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    app()