File size: 2,263 Bytes
64aa5df
 
 
 
 
 
 
 
 
 
 
 
 
 
480c8da
64aa5df
 
 
4e4710d
64aa5df
 
dd3d098
 
 
 
 
64aa5df
 
4e4710d
 
dd3d098
 
64aa5df
4e4710d
 
64aa5df
 
4e4710d
 
64aa5df
 
4e4710d
64aa5df
 
4e4710d
dd3d098
 
64aa5df
4e4710d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch

@st.cache_resource
def load_model(
    model_id: str = "google/flan-t5-large",
    adapter_path: str = "./Flan-T5-Typosquat-detect",
):
    """Load the base seq2seq model, apply the fine-tuned adapter, and merge it.

    The function is wrapped in ``st.cache_resource`` so the result is reused
    by Streamlit instead of being rebuilt on every call with the same
    arguments.

    Args:
        model_id: Identifier of the base model and tokenizer passed to
            ``from_pretrained``. Defaults to the FLAN-T5 large checkpoint.
        adapter_path: Location of the saved PEFT adapter. Adjust to your
            saved adapter path.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is the result of
        ``merge_and_unload()`` on the adapter-wrapped base model and has been
        switched to evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Base weights first, then the adapter on top of them.
    base_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    adapted_model = PeftModel.from_pretrained(base_model, adapter_path)

    # Merge the adapter into the base model and drop the PEFT wrapper so the
    # caller gets a plain seq2seq model back.
    model = adapted_model.merge_and_unload()
    model.eval()

    return model, tokenizer
# Device the tokenized prompt is moved to before generation.
device = "cpu"
model, tokenizer = load_model()

st.title("Fine tuned FLAN-T5 Typosquatting Detection")
st.markdown("This streamlit demonstrates our fine tuned model for typosquatting detection.  We found that using "
            "SLMs or LLMs and prompt engineering for this task could not achieve the same accuracy as our [cross encoder](https://huggingface.co/Anvilogic/CE-typosquat-detect). "
            "We found that by fine tuning a FLAN-T5 model, we could get the same accuracy as our cross encoder model.  "
            "Using an SLM like Flan allows you to output the response (here `true` or `false`) directly into another LM.  ")
st.write("Enter a potential typosquatted domain and a target domain to check if one is a variant of the other.")

# Instruction prepended to the two domains to form the model prompt.
prompt_prefix = "Is the first domain a typosquat of the second:"

# Strip surrounding whitespace up front: a whitespace-only entry must count as
# empty (and trigger the warning below), and stray spaces from copy/paste
# should not end up inside the prompt.
potential_typosquat = st.text_input("Potential Typosquatted Domain", value="tiktok-tikto-tibyd-yjdj.com").strip()
target_domain = st.text_input("Legitimate Domain", value="tiktok.com").strip()

# Prompt layout: "<prefix> <candidate domain> <legitimate domain>".
full_prompt = f"{prompt_prefix} {potential_typosquat} {target_domain}"

if st.button("Check Typosquatting"):
    if potential_typosquat and target_domain:
        # Encode the prompt and generate the model's answer.
        input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
        outputs = model.generate(input_ids, max_new_tokens=20)

        # Decode the first (only) generated sequence, dropping special tokens.
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Display the result.
        st.markdown(f"Is {potential_typosquat} a typosquat of {target_domain}? **{prediction}**")

    else:
        st.warning("Please enter both domains to perform the check.")