Spaces:

Anvilogic
/

T5-Typosquat-Detect

Sleeping

File size: 2,263 Bytes

64aa5df
 
 
 
 
 
 
 
 
 
 
 
 
 
480c8da
64aa5df
 
 
4e4710d
64aa5df
 
dd3d098
 
 
 
 
64aa5df
 
4e4710d
 
dd3d098
 
64aa5df
4e4710d
 
64aa5df
 
4e4710d
 
64aa5df
 
4e4710d
64aa5df
 
4e4710d
dd3d098
 
64aa5df
4e4710d

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch

@st.cache_resource
def load_model():
    """Build the typosquat-detection model and its tokenizer.

    The base FLAN-T5 checkpoint is loaded, the fine-tuned PEFT adapter stored
    next to the app is applied on top of it, and the adapter is then merged
    into the base model via ``merge_and_unload()``. The result is switched to
    evaluation mode before being returned.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the returned objects instead of reloading them.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    base_checkpoint = "google/flan-t5-large"
    # Local directory holding the fine-tuned adapter; adjust if it lives elsewhere.
    adapter_dir = "./Flan-T5-Typosquat-detect"

    tok = AutoTokenizer.from_pretrained(base_checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

    # Attach the adapter, then merge it into the base model.
    merged_model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()
    merged_model.eval()

    return merged_model, tok
# Inference device: the tokenized prompt is moved here before generation.
device = "cpu"

# Obtain the model and tokenizer from the loader defined above.
model, tokenizer = load_model()

# Page header: title, background on the approach, then usage instructions.
st.title("Fine tuned FLAN-T5 Typosquatting Detection")

# The introduction is assembled from fragments purely to keep source lines short;
# the joined text is rendered as a single markdown paragraph.
intro_markdown = "".join(
    [
        "This streamlit demonstrates our fine tuned model for typosquatting detection.  We found that using ",
        "SLMs or LLMs and prompt engineering for this task could not achieve the same accuracy as our [cross encoder](https://huggingface.co/Anvilogic/CE-typosquat-detect). ",
        "We found that by fine tuning a FLAN-T5 model, we could get the same accuracy as our cross encoder model.  ",
        "Using an SLM like Flan allows you to output the response (here `true` or `false`) directly into another LM.  ",
    ]
)
st.markdown(intro_markdown)

st.write("Enter a potential typosquatted domain and a target domain to check if one is a variant of the other.")

# Instruction placed in front of the two domains. The model receives one prompt
# of the form "<prefix> <candidate domain> <legitimate domain>".
prompt_prefix = "Is the first domain a typosquat of the second:"

# Surrounding whitespace is stripped so that (a) a field containing only spaces
# is treated as empty and triggers the warning below, and (b) stray spaces from
# copy/paste do not end up in the prompt or in the displayed result.
potential_typosquat = st.text_input(
    "Potential Typosquatted Domain", value="tiktok-tikto-tibyd-yjdj.com"
).strip()
target_domain = st.text_input("Legitimate Domain", value="tiktok.com").strip()

full_prompt = f"{prompt_prefix} {potential_typosquat} {target_domain}"

if st.button("Check Typosquatting"):
    if potential_typosquat and target_domain:
        # Tokenize the prompt and move the ids to the inference device.
        input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)

        # Generation is capped at 20 new tokens.
        outputs = model.generate(input_ids, max_new_tokens=20)

        # Turn the first (only) generated sequence back into text, dropping
        # special tokens such as padding / end-of-sequence markers.
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Show the model's answer next to the question that was asked.
        st.markdown(f"Is {potential_typosquat} a typosquat of {target_domain}? **{prediction}**")
    else:
        # At least one field is empty (or was whitespace only).
        st.warning("Please enter both domains to perform the check.")