|
import streamlit as st |
|
from sentence_transformers import SentenceTransformer, util |
|
import pandas as pd |
|
import numpy as np |
|
from ast import literal_eval |
|
|
|
|
|
# Name of the local directory that holds the embedder checkpoint; the same
# name is also used elsewhere in this script to locate files inside it.
model_choice = "Embedder-typosquat-detect-Canine"


@st.cache_resource
def load_model() -> SentenceTransformer:
    """Load the domain embedder from the local ``model_choice`` directory.

    Decorated with ``st.cache_resource`` so that Streamlit reuses the loaded
    model rather than building it again each time the script is re-executed.

    Returns:
        The ``SentenceTransformer`` instance loaded from ``./<model_choice>``.
    """
    model_dir = f"./{model_choice}"
    return SentenceTransformer(model_dir)
|
|
|
# Page heading followed by a short explanation of what the demo does.
st.title("Search for the target of typosquat domains with our Domain Embedder")

# The description is kept as one sentence per literal and joined without a
# separator; each sentence carries its own trailing space where needed.
_intro_sentences = (
    "This streamlit demonstrates how you can use our domain embedder to find the targets of typosquatted domains. ",
    "Each domain is represented as an vector embedding that can be stored in a vector store for efficient retrieval. ",
    "The domains you can search for in this application are the top 4k most popular domains, like `google.com`. ",
    "You can use the domain embedder to create a vector store specifically for the websites **you want to monitor**. ",
    "This can include the services your company uses like Office365, or the websites of your company that may ",
    "become spear phishing targets.",
)
st.markdown("".join(_intro_sentences))
|
|
|
# Embedder used to encode the user's query (cached by load_model).
model = load_model()


@st.cache_data
def load_corpus(model_dir: str):
    """Read the pre-computed domain embeddings that ship with the model.

    Streamlit re-executes the whole script on every widget interaction, so
    parsing the CSV and running ``literal_eval`` over every row at module
    level would repeat that work on each rerun. Caching keyed on
    ``model_dir`` does the parsing once per model directory.

    Args:
        model_dir: Directory name (relative to the working directory) that
            contains ``domains_embs.csv``.

    Returns:
        A ``(domains_df, corpus_domains, corpus_embeddings)`` tuple:
        the DataFrame with its ``embedding`` column parsed from its string
        form, the ``domain`` column as a list, and the embeddings stacked
        into a single ``float32`` array with one row per CSV row.
    """
    frame = pd.read_csv(f"./{model_dir}/domains_embs.csv")
    # Embeddings are stored in the CSV as Python-literal strings;
    # literal_eval parses them without executing arbitrary code.
    frame["embedding"] = frame["embedding"].apply(literal_eval)
    domains = frame["domain"].to_list()
    embeddings = np.stack(frame["embedding"].values).astype(np.float32)
    return frame, domains, embeddings


# Same module-level names as before, now served from Streamlit's data cache.
domains_df, corpus_domains, corpus_embeddings = load_corpus(model_choice)
|
|
|
# --- Search form -----------------------------------------------------------
st.header("Enter a potential typosquatted domain and select the number of top results to retrieve. ")
domain = st.text_input("Potential Typosquatted Domain")
top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)

if st.button("Search for Legitimate Domains"):
    # Pasted input often carries stray spaces or newlines. They are not part
    # of the domain, and a whitespace-only value should be treated as empty
    # rather than embedded and searched.
    query = domain.strip()
    if query:
        # Embed the query and look up its nearest neighbours in the corpus.
        query_emb = model.encode(query).astype(np.float32)
        # semantic_search returns one hit list per query; a single query was
        # passed, so take the first list.
        hits = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
        ids = [hit["corpus_id"] for hit in hits]
        scores = [hit["score"] for hit in hits]

        # corpus_id is a row position in corpus_embeddings, so select rows by
        # position (iloc) rather than by index label (loc). With the default
        # index produced by read_csv the result is the same.
        res_df = domains_df[["domain"]].iloc[ids].copy()
        res_df["score"] = scores

        st.write("Mined Domains:")
        st.dataframe(res_df)
    else:
        st.warning("Please enter a domain to perform the search.")
|
|