ACMCMC committed on
Commit
8fdcd57
1 Parent(s): 18f39d5
Files changed (4) hide show
  1. .gitignore +3 -0
  2. README.md +1 -3
  3. app.py +201 -0
  4. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ flagged
3
+ .venv
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Homoglyphs Alarm
3
- emoji: 🏃
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
@@ -9,5 +9,3 @@ app_file: app.py
9
  pinned: false
10
  short_description: Test alarm for homoglyph-based attacks
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Homoglyphs Alarm
3
+ emoji: 🚨
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
 
9
  pinned: false
10
  short_description: Test alarm for homoglyph-based attacks
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+
4
+ # import homoglyphs
5
+ import confusable_homoglyphs.confusables
6
+ from unidecode import unidecode
7
+
8
+
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+
11
+
12
+ model = AutoModelForCausalLM.from_pretrained("gpt2")
13
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
14
+
15
+
16
+ from evaluate import load
17
+
18
+ perplexity_metric = load("perplexity", module_type="metric")
19
+
20
+
21
+ # homoglyphs_processor = homoglyphs.Homoglyphs(
22
+ # ascii_strategy=homoglyphs.STRATEGY_LOAD, strategy=homoglyphs.STRATEGY_LOAD
23
+ # )
24
+
25
+
26
+ def calculate_perplexity(text_logits: torch.Tensor):
27
+ # The logits are not normalized, so we need to normalize them, e.g. by doing a log softmax
28
+ text_logits = torch.nn.functional.log_softmax(text_logits, dim=-1)
29
+ # Calculate the perplexity of the text
30
+ sequence_negative_log_likelihoods = -text_logits
31
+ average_negative_log_likelihood = sequence_negative_log_likelihoods.mean()
32
+ perplexity = torch.exp(average_negative_log_likelihood)
33
+ return perplexity.item()
34
+
35
+
36
+ # Function to calculate burstiness using an LLM
37
+ def process_homoglyphed_text(homoglyphed_text, unhomoglyphed_text):
38
+ # # Tokenize the texts
39
+ # unhomoglyphed_text_tokens = tokenizer(unhomoglyphed_text, return_tensors="pt")[
40
+ # "input_ids"
41
+ # ]
42
+ # homoglyphed_text_tokens = tokenizer(homoglyphed_text, return_tensors="pt")[
43
+ # "input_ids"
44
+ # ]
45
+ # # Calculate the logits for the texts
46
+ # with torch.no_grad():
47
+ # unhomoglyphed_text_logits = model(unhomoglyphed_text_tokens).logits
48
+ # homoglyphed_text_logits = model(homoglyphed_text_tokens).logits
49
+ # # Calculate the perplexity for both texts
50
+ # unhomoglyphed_text_perplexity = calculate_perplexity(unhomoglyphed_text_logits)
51
+ # homoglyphed_text_perplexity = calculate_perplexity(homoglyphed_text_logits)
52
+ unhomoglyphed_text_perplexity, homoglyphed_text_perplexity = (
53
+ perplexity_metric.compute(
54
+ predictions=[homoglyphed_text, unhomoglyphed_text], model_id="gpt2"
55
+ )["perplexities"]
56
+ )
57
+
58
+ print(
59
+ f"Unhomoglyphed text perplexity: {unhomoglyphed_text_perplexity}, homoglyphed text perplexity: {homoglyphed_text_perplexity}"
60
+ )
61
+
62
+ # If the version without homoglyphs is more than 1.5 of the perplexity of the version with homoglyphs, trigger the alarm
63
+ difference_ratio = unhomoglyphed_text_perplexity / homoglyphed_text_perplexity
64
+ print(f"Difference ratio: {difference_ratio}")
65
+ alarm_triggered = difference_ratio > 1.5
66
+
67
+ # Return the burstiness for both texts
68
+ return alarm_triggered, difference_ratio
69
+
70
+
71
+ def unhomoglyphize_text(homoglyphed_text):
72
+ confusables = confusable_homoglyphs.confusables.is_confusable(
73
+ homoglyphed_text, greedy=True, preferred_aliases=["latin"]
74
+ )
75
+ print(f"Confusables: {confusables}")
76
+ unhomoglyphed_text = homoglyphed_text
77
+ # Returns something like:
78
+ """
79
+ [{'character': 'ρ',
80
+ 'alias': 'GREEK',
81
+ 'homoglyphs': [{'c': 'p', 'n': 'LATIN SMALL LETTER P'}]},
82
+ {'character': 'τ',
83
+ 'alias': 'GREEK',
84
+ 'homoglyphs': [{'c': 'ᴛ', 'n': 'LATIN LETTER SMALL CAPITAL T'}]}]
85
+ """
86
+ for confusable in confusables:
87
+ # Check if the character is in ASCII
88
+ if ord(confusable["character"]) < 128:
89
+ continue
90
+ homoglyph = confusable["homoglyphs"][0]
91
+ unhomoglyphed_text = unhomoglyphed_text.replace(
92
+ confusable["character"], homoglyph["c"]
93
+ )
94
+ # Finally, remove any diacritics (this is not done by the homoglyphs library)
95
+ unhomoglyphed_text = unidecode(unhomoglyphed_text)
96
+ return unhomoglyphed_text
97
+
98
+
99
+ def process_user_text(user_text):
100
+ # If the user text doesn't contain homoglyphs, don't trigger the alarm
101
+ if not bool(
102
+ confusable_homoglyphs.confusables.is_confusable(
103
+ user_text, preferred_aliases=["latin"]
104
+ )
105
+ ):
106
+ return False, 0.0, "# ✅ All good"
107
+
108
+ unhomoglyphed_text = unhomoglyphize_text(user_text)
109
+
110
+ print(f"Unhomoglyphed text: {unhomoglyphed_text}")
111
+
112
+ alarm_triggered, difference_ratio = process_homoglyphed_text(
113
+ homoglyphed_text=user_text, unhomoglyphed_text=unhomoglyphed_text
114
+ )
115
+ return (
116
+ True,
117
+ difference_ratio,
118
+ "# 🚨 Alarm triggered" if alarm_triggered else "# ✅ All good",
119
+ )
120
+
121
+
122
# App-wide look & feel: Soft theme with a fuchsia/cyan palette, square
# corners, and IBM Plex as the primary typeface family.
_plex_sans = gr.themes.GoogleFont("IBM Plex Sans")
_plex_mono = gr.themes.GoogleFont("IBM Plex Mono")

theme = gr.themes.Soft(
    primary_hue="fuchsia",
    secondary_hue="cyan",
    neutral_hue="gray",
    radius_size="none",
    font=[_plex_sans, "ui-sans-serif", "system-ui", "sans-serif"],
    font_mono=[_plex_mono, "ui-monospace", "Consolas", "monospace"],
)
140
+
141
+
142
# Gradio UI: one textbox in; a danger checkbox, the perplexity ratio, and a
# markdown verdict out. The examples pair clean texts with homoglyphed
# versions of the same texts (the homoglyph characters in them are
# intentional and must not be "fixed").
demo = gr.Interface(
    theme=theme,
    fn=process_user_text,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text"),
    ],
    outputs=[
        # A checkbox: is dangerous or not
        gr.Checkbox(label="Is dangerous"),
        # The number of the difference ratio
        gr.Number(label="Difference ratio"),
        # Just an emoji: alarm triggered or not
        gr.Markdown(label="Alarm triggered", show_label=False),
    ],
    title="Homoglyphs Alarm 🚨",
    # Typo fixed below: "probablility" -> "probability".
    description="""Calculates the probability that a given text has been the target of a homoglyph-based attack.

It calculates the perplexity of the text according to GPT-2 and compares it to the perplexity of the text with homoglyphs replaced by their ASCII equivalents.

Example texts adapted from:
- https://arxiv.org/abs/2401.12070 (also in a version adapted using https://huggingface.co/spaces/acmc/SilverSpeak with 5% replacement)
- https://huggingface.co/google/gemma-2-2b (also in a version adapted using https://huggingface.co/spaces/acmc/SilverSpeak with 5% replacement)
- https://www.persee.fr/doc/rbph_0035-0818_2012_num_90_3_8269
- https://arxiv.org/abs/2411.14257
- https://www.busuu.com/en/spanish/conditional

Written by: [Aldan Creo](https://acmc-website.web.app/intro)
""",
    allow_flagging="never",
    examples=[
        [
            "Dr. Capy Cosmos, a capybara unlike any other, astounded the scientific community with his groundbreaking research in astrophysics. With his keen sense of observation and unparalleled ability to interpret cosmic data, he uncovered new insights into the mysteries of black holes and the origins of the universe. As he peered through telescopes with his large, round eyes, fellow researchers often remarked that it seemed as if the stars themselves whispered their secrets directly to him. Dr. Cosmos not only became a beacon of inspiration to aspiring scientists but also proved that intellect and innovation can be found in the most unexpected of creatures."
        ],
        [
            "Dr. Capу Cosmos, a caрybаra unlіkе any other, astounded the scientific community with hіs groundbreakіng reѕearcһ in astrophysics. With hiѕ keen sense of observation and unparаlleled ability to interpret cosmic dаta, he uncovеred new іnsightѕ into tһe myѕteries of black holes аnd the origins of the universe. Aѕ he peered through telescopes with his large, round eyes, fellow reѕearchers often remarked that it seemed as if the stars themѕelves whiѕpered theіr secrets directlу to him. Dr. Cosmos not only became a beacon of inspіration to aspiring scientіsts but also proved thаt intellect and іnnovation can bе found in the most unexpecteԁ οf сreatures."
        ],
        [
            "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone."
        ],
        [
            "Gemma iѕ a family of lightweіght, ѕtatе-οf-the-art open models from Google, built from the same research аnd technolοgy uѕed to сreate tһe Gemini models. Theу are text-to-text, decoder-only lаrge lаnguage models, available in English, with οpen weightѕ for both рre-trainеd vаrіants аnd instruction-tuned variantѕ. Gemma models are well-suited for a vаrietу of text generation tasks, including question answering, summarization, and rеaѕoning. Their relatively small size makes it possible to dеploy them іn environments witһ limited resourceѕ such as a laptop, desktop or your own cloud infraѕtructure, democratizing acceѕs to state of the art AΙ models and һelping foster іnnovation for everyone."
        ],
        [
            "We run the model on the set of prompts containing known and unknown entities. Inspired by Meng et al. (2022a); Geva et al. (2023); Nanda et al. (2023) we use the residual stream of the final token of the entity, 𝒙 known and 𝒙 unknown. In each layer (l), we compute the activations of each latent in the SAE, i.e. al,j⁢(𝒙lknown) and al,j⁢(𝒙lunknown). For each latent, we obtain the fraction of the time that it is active (i.e. has a value greater than zero) on known and unknown entities respectively: fl,jknown=∑iNknown𝟙⁢[al,j⁢(𝒙l,iknown)>0]Nknown,fl,junknown=∑iNunknown𝟙⁢[al,j⁢(𝒙l,iunknown)>0]Nunknown,(6) where Nknown and Nunknown are the total number of prompts in each subset. Then, we take the difference, obtaining the latent separation scores sl,jknown=fl,jknown−fl,junknown and sl,junknown=fl,junknown−fl,jknown, for detecting known and unknown entities respectively."
        ],
        [
            "The national/ official name of the country, the people and the language are respectively Eλλάδα, Έλληνας, ελληνικά ([ eláδa, élinas, eliniká]), derived from Ancient Greek Ἑλλάς, Ἕλλην, ἑλληνικός ([ hellás, héllen, hellenikós]) ‘Greece, Greek (noun), Greek (adj.)’, which are also to be found in most European languages as Hellas, hellenic, hellénique etc.; Hellenic Republic is the official name of the country in the European Union. The etymology of these words is uncertain. They first occur in the Iliad of Homer (2.683-4) as a designation of a small area in Thessaly, the homeland of Achilles, and its people. (3) Also in Homer, it is possible to find the compound πανέλληνες ([ panhellenes]) denoting all Greeks (from adjective pan ‘all’ + noun hellen), and it is again uncertain under what historical circumstances this local appellation spread to the totality of the Greek nation, although various theories have been proposed (see Babiniotis 2002)."
        ],
        [
            "To form the conditional tense in Spanish, you need to use the infinitive form of the verb and add the corresponding endings for each subject pronoun. Regardless of the verb type (-ar, -er, or -ir), the endings remain the same. In singular-plural order, 1st-3rd, the terminations are: -ía, -ías, -ía, -íamos, -ían, -ían. For example: Hablaría (I would speak) Comerías (You would eat) Escribiría (He/She/You would write) Haríamos (We would do/make) Beberían (You all would drink) Leerían (They/You all would read)."
        ],
        [
            "To form the cοnԁіtіonal tense in Spanish, yοu need tο uѕе the infinitive fοrm of the verb and add the corresponding endіngs for eaсһ subject рronοun. Regardless of the verb type (-ar, -er, or -ir), the endings remain the same. In singular-plural order, 1st-3rd, thе terminations are: -ía, -ías, -ía, -íamos, -ían, -ían. For example: Hablaríа (I woulԁ speak) Comerías (You would еаt) Eѕcribiría (He/Shе/You would write) Haríаmos (We would do/make) Bеberían (Yοu all would ԁrink) Leerían (They/You all would read)."
        ],
    ],
)

# Launch the Gradio app
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ torch==2.5.0
3
+ transformers==4.46.0
4
+ homoglyphs==2.0.4
5
+ confusable-homoglyphs==3.3.1
6
+ evaluate==0.4.3
7
+ Unidecode==1.3.8