Spaces:
Sleeping
Sleeping
File size: 2,263 Bytes
64aa5df 480c8da 64aa5df 4e4710d 64aa5df dd3d098 64aa5df 4e4710d dd3d098 64aa5df 4e4710d 64aa5df 4e4710d 64aa5df 4e4710d 64aa5df 4e4710d dd3d098 64aa5df 4e4710d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch
@st.cache_resource
def load_model():
    """Load the FLAN-T5 base model with the typosquat adapter merged in.

    The function is wrapped in ``st.cache_resource``, so Streamlit reuses
    the returned objects instead of calling it again on every script rerun.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is the result of
        ``merge_and_unload()`` on the PEFT-wrapped base model and has been
        switched to evaluation mode.
    """
    base_checkpoint = "google/flan-t5-large"
    adapter_dir = "./Flan-T5-Typosquat-detect"  # Adjust to your saved adapter path

    # The tokenizer comes from the base checkpoint, not from the adapter folder.
    base_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

    # Attach the adapter, then merge it into the base model and drop the
    # PEFT wrapper so callers receive the merged model directly.
    merged_model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()
    merged_model.eval()

    return merged_model, base_tokenizer
# Inference device. The model is moved here explicitly so that the input
# tensors (see `.to(device)` below) and the model weights always share a
# device, even if this value is changed from 'cpu'.
device = 'cpu'
model, tokenizer = load_model()
model.to(device)

# --- Page header and description -------------------------------------------
st.title("Fine tuned FLAN-T5 Typosquatting Detection")
st.markdown("This streamlit demonstrates our fine tuned model for typosquatting detection. We found that using "
            "SLMs or LLMs and prompt engineering for this task could not achieve the same accuracy as our [cross encoder](https://huggingface.co/Anvilogic/CE-typosquat-detect). "
            "We found that by fine tuning a FLAN-T5 model, we could get the same accuracy as our cross encoder model. "
            "Using an SLM like Flan allows you to output the response (here `true` or `false`) directly into another LM. ")
st.write("Enter a potential typosquatted domain and a target domain to check if one is a variant of the other.")

# --- User inputs -----------------------------------------------------------
prompt_prefix = "Is the first domain a typosquat of the second:"
# Strip surrounding whitespace: a whitespace-only entry must count as empty,
# and stray spaces/newlines from copy-paste should not leak into the prompt.
potential_typosquat = st.text_input("Potential Typosquatted Domain", value="tiktok-tikto-tibyd-yjdj.com").strip()
target_domain = st.text_input("Legitimate Domain", value="tiktok.com").strip()
# Prompt layout: "<prefix> <candidate domain> <legitimate domain>".
full_prompt = f"{prompt_prefix} {potential_typosquat} {target_domain}"

# --- Inference on demand ---------------------------------------------------
if st.button("Check Typosquatting"):
    if potential_typosquat and target_domain:
        # Encode and generate response
        input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
        outputs = model.generate(input_ids, max_new_tokens=20)
        # Decode the response
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Display the result
        st.markdown(f"Is {potential_typosquat} a typosquat of {target_domain}? **{prediction}**")
    else:
        st.warning("Please enter both domains to perform the check.")