"""Streamlit app: retrieve likely legitimate domains for a typosquatted one.

The user's domain is embedded with a local SentenceTransformer model and
compared against precomputed embeddings of known domains via semantic search.
"""

import streamlit as st
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
from ast import literal_eval

# Location of the embedding model and of the precomputed corpus.
model_name = "./Embedder-Typosquat"
DOMAINS_CSV = 'domains_embs.csv'


@st.cache_resource
def _load_model(name):
    """Load the SentenceTransformer once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching the model would be re-read from disk each time.
    """
    return SentenceTransformer(name)


@st.cache_data
def _load_corpus(csv_path):
    """Read the domain/embedding CSV and build the float32 embedding matrix.

    The CSV is expected to have a ``domain`` column and an ``embedding``
    column whose cells are Python-literal sequences of numbers (they are
    parsed with ``ast.literal_eval``, which only accepts literals).

    Returns:
        A ``(domains_df, corpus_embeddings)`` tuple, where row ``i`` of
        ``corpus_embeddings`` is the embedding of row ``i`` of ``domains_df``.
    """
    df = pd.read_csv(csv_path)
    df["embedding"] = df["embedding"].apply(literal_eval)
    # float32 so the corpus dtype matches the query embedding below.
    embeddings = np.stack(df["embedding"].values).astype(np.float32)
    return df, embeddings


# Load the model, the domains and their embeddings (cached across reruns).
model = _load_model(model_name)
domains_df, corpus_embeddings = _load_corpus(DOMAINS_CSV)
corpus_domains = domains_df["domain"].to_list()

# Streamlit App
st.title("Mining Potential Legitimate Domains from a Typosquatted Domain")
st.write("Enter a potential typosquatted domain and select the number of top results to retrieve.")

# User Inputs
domain = st.text_input("Potential Typosquatted Domain")
top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)

# Button to trigger search
if st.button("Search for Legitimate Domains"):
    # Strip so that whitespace-only input is treated as empty instead of
    # being embedded and searched.
    query = domain.strip()
    if query:
        # Perform Semantic Search
        query_emb = model.encode(query).astype(np.float32)
        semantic_res = util.semantic_search(
            query_emb, corpus_embeddings, top_k=int(top_k)
        )[0]
        ids = [r['corpus_id'] for r in semantic_res]
        scores = [r['score'] for r in semantic_res]

        # corpus_id is a row position in corpus_embeddings, so select rows
        # positionally (.iloc) rather than by index label (.loc).
        res_df = domains_df[['domain']].iloc[ids].copy()
        res_df['score'] = scores

        # Display the result DataFrame
        st.write("Mined Domains:")
        st.dataframe(res_df)
    else:
        st.warning("Please enter a domain to perform the search.")