hannahcyberey committed
Commit 40a29d6 · verified · 1 Parent(s): c64bf5f

Change to local inference
activations/candidate_vectors.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed63186d01ddaf6df8835818144185b5fb05d1c9a4683fce9517a921472353b3
+size 804046

activations/deepseek-1.5b-candidate_vectors.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a4f3701085a9090e78fc402aaaef5adbf23f0b49c932f82eb4fc107d191aac0
+size 345294

activations/deepseek-1.5b-offsets.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d84cb880bee5feb83248b476d8d0f3f87dca74bc8ae53807f3ab2a9bdb959920
+size 345244

activations/offsets.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6cb10bd9014f9cd2470d37f56f491abd5f72bd162543a7569f40cd385f127c3
+size 803996
app.py CHANGED
@@ -1,35 +1,25 @@
-import os, json
+import threading
 import logging
 from pathlib import Path
-import asyncio
-import aiohttp
+from typing import Dict
+import spaces
 import pandas as pd
 import gradio as gr
 from gradio_toggle import Toggle
+from transformers import TextIteratorStreamer
+from model import load_model
 from scheduler import load_scheduler
 from schemas import UserRequest, SteeringOutput, CONFIG
 
-
-MAX_RETRIES = 10
-MAX_RETRY_WAIT_TIME = 75
-MIN_RETRY_WAIT_TIME = 5
-ENDPOINT_ALIVE = False
-
-HF_TOKEN = os.getenv('HF_TOKEN')
-API_URL = "https://a6k5m81qw14hkvhz.us-east-1.aws.endpoints.huggingface.cloud"
-headers = {
-    "Accept" : "application/json",
-    "Authorization": f"Bearer {HF_TOKEN}",
-    "Content-Type": "application/json"
-}
-
 logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s:%(message)s')
 logger = logging.getLogger(__name__)
 
 model_name = "DeepSeek-R1-Distill-Qwen-7B"
 examples = pd.read_csv("assets/examples.csv")
+
 instances = {}
 scheduler = load_scheduler()
+model = load_model()
 
 
 HEAD = """
@@ -198,8 +188,6 @@ def initialize_instance(request: gr.Request):
 
 
 def cleanup_instance(request: gr.Request):
-    global ENDPOINT_ALIVE
-
     session_id = request.session_hash
 
     if session_id in instances:
@@ -209,51 +197,48 @@ def cleanup_instance(request: gr.Request):
 
         del instances[session_id]
 
-    if len(instances) == 0:
-        ENDPOINT_ALIVE = False
-
     logger.info("Number of connections: %d", len(instances))
 
 
-async def initialize_endpoint():
-    alive = False
-    session = aiohttp.ClientSession()
-    async with session.get(f"{API_URL}/health", headers=headers) as resp:
-        resp_text = await resp.text()
-        if resp.status == 200:
-            alive = True
-        else:
-            logger.error("API Error Code: %d, Message: %s", resp.status, resp_text)
-
-    await session.close()
-    return alive
-
-
-async def get_endpoint_state():
-    global ENDPOINT_ALIVE
-    n = 0
-    sleep_time = MAX_RETRY_WAIT_TIME
-
-    while n < MAX_RETRIES:
-        n += 1
-
-        if not ENDPOINT_ALIVE:
-            logger.info("Initializing inference endpoint")
-            yield "Initializing"
-            ENDPOINT_ALIVE = await initialize_endpoint()
-
-        if ENDPOINT_ALIVE:
-            logger.info("Inference endpoint is ready")
-            gr.Info("Inference endpoint is ready")
-            yield "Ready"
-            break
-
-        gr.Warning("Initializing inference endpoint\n(This may take 2~3 minutes)", duration=sleep_time)
-        await asyncio.sleep(sleep_time)
-        sleep_time = max(sleep_time * 0.8, MIN_RETRY_WAIT_TIME)
-
-    if n == MAX_RETRIES:
-        yield "Server Error"
+@spaces.GPU(duration=90)
+def generate(prompt: str, steering: bool, coeff: float, generation_config: Dict[str, float], layer: int, k: float):
+    formatted_prompt = model.apply_chat_template(prompt)
+    inputs = model.tokenize(formatted_prompt)
+
+    streamer = TextIteratorStreamer(model.tokenizer, timeout=10, skip_prompt=True, skip_special_tokens=True)
+
+    if steering:
+        thread = threading.Thread(
+            target=model.steer_generation,
+            args=(inputs, streamer, k, layer, coeff, generation_config)
+        )
+    else:
+        thread = threading.Thread(
+            target=model.run_generation,
+            args=(inputs, streamer, generation_config)
+        )
+
+    thread.start()
+
+    generated_text = "<think>"
+    for new_text in streamer:
+        generated_text += new_text
+        yield generated_text
+
+
+def generate_output(
+    session_id: str, prompt: str, steering: bool, coeff: float,
+    max_new_tokens: int, top_p: float, temperature: float, layer: int, vec_scaling: float
+):
+    req = UserRequest(
+        session_id=session_id, prompt=prompt, steering=steering, coeff=coeff,
+        max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, vec_scale=vec_scaling, layer=layer
+    )
+
+    logger.info("User request: %s", req)
+    instances[session_id].append(req)
+
+    yield from generate(prompt, steering, coeff, req.generation_config(), layer, req.k)
 
 
 async def post_process(session_id, output):
@@ -266,62 +251,11 @@ async def post_process(session_id, output):
             answer = None
         else:
            answer = p[-1]
-    else:
-        answer = None
-        reasoning = output
 
-    steering_output = SteeringOutput(**req.model_dump(), reasoning=reasoning, answer=answer)
-    instances[session_id].append(steering_output)
-
-
-class Generator:
-    def __init__(self):
-        self.stop_events = {}
-
-    async def stop(self, session_id):
-        self.stop_events[session_id] = True
-        logger.info("Stopping generation")
-
-    async def generate(
-        self, session_id: str, prompt: str, steering: bool, coeff: float,
-        max_new_tokens: int, top_p: float, temperature: float, layer: int, vec_scaling: float
-    ):
-        req = UserRequest(
-            session_id=session_id, prompt=prompt, steering=steering, coeff=coeff,
-            max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, vec_scale=vec_scaling, layer=layer
-        )
-
-        instances[session_id].append(req)
-
-        data = req.get_api_format()
-        logger.info("User Request: %s", data)
-
-        generated_text = ""
-        self.stop_events[session_id] = False
-
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.post(f"{API_URL}/generate", headers=headers, json=data) as resp:
-                    if resp.status == 200:
-                        generated_text += "<think>"
-
-                        async for chunk, _ in resp.content.iter_chunks():
-                            if self.stop_events[session_id]:
-                                break
-
-                            generated_text += chunk.decode()
-                            yield generated_text
-                    else:
-                        logger.error("API Error Ccode: %d, Error Message: %s", resp.status, resp.text())
-                        raise gr.Error("API Server Error")
-
-        except:
-            logger.info("Client session error")
-
-        if generated_text != "":
-            await post_process(session_id, generated_text)
-
-        del self.stop_events[session_id]
+    steering_output = SteeringOutput(**req.model_dump(), reasoning=reasoning, answer=answer)
+    instances[session_id].append(steering_output)
+
+    return gr.update(interactive=True), gr.update(interactive=True)
 
 
 async def output_feedback(session_id, feedback):
@@ -339,31 +273,13 @@ async def output_feedback(session_id, feedback):
         logger.debug("Feedback submission error")
 
 
-async def show_feedback_buttons(upvote_btn, downvote_btn):
-    return gr.update(interactive=True), gr.update(interactive=True)
-
-
 gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
 theme = gr.themes.Base(primary_hue="emerald", text_size=gr.themes.sizes.text_lg).set()
-generator = Generator()
 
 with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS, js=JS) as demo:
     session_id = gr.State()
-    endpoint_state = gr.State(get_endpoint_state)
-
     gr.HTML(HTML)
-
-    @gr.render(inputs=endpoint_state, triggers=[endpoint_state.change])
-    def render_state(endpoint_state):
-        if endpoint_state == "Ready":
-            color = "green"
-        elif endpoint_state == "Server Error":
-            color = "red"
-        else:
-            color = "orange"
-
-        if endpoint_state != None:
-            gr.Markdown(f'🤖 {model_name} | Inference Endpoint State: <span style="color:{color}; font-weight: bold;">{endpoint_state}</span>', elem_id="model-state")
+    gr.Markdown(f'🤖 {model_name}')
 
     with gr.Row(elem_id="main-components"):
         with gr.Column(scale=1):
@@ -382,7 +298,6 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
 
         with gr.Row():
             clear_btn = gr.ClearButton()
-            stop_btn = gr.Button("Stop")
            generate_btn = gr.Button("Generate", variant="primary")
 
         with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -408,25 +323,19 @@ with gr.Blocks(title="LLM Censorship Steering", theme=theme, head=HEAD, css=CSS,
            gr.Examples(examples=examples[examples["type"] == "harmful"].prompt.tolist(), inputs=input_text, label="Harmful")
 
 
-    @gr.on(triggers=[clear_btn.click, stop_btn.click], outputs=[upvote_btn, downvote_btn])
-    def clear_feedback_buttons():
+    @gr.on(triggers=[clear_btn.click], outputs=[upvote_btn, downvote_btn])
+    def clear():
         return gr.update(interactive=False), gr.update(interactive=False)
-
-    @gr.on(triggers=[generate_btn.click], outputs=[upvote_btn, downvote_btn])
-    def show_feedback_buttons():
-        return gr.update(interactive=True), gr.update(interactive=True)
-
-
-    submission = generate_btn.click(
-        generator.generate, inputs=[session_id, input_text, steer_toggle, coeff, max_new_tokens, top_p, temperature, layer, vec_scaling], outputs=output
-    )
 
     clear_btn.add([input_text, output])
-    stop_btn.click(generator.stop, inputs=session_id, queue=False)
+    generate_btn.click(
+        generate_output, inputs=[session_id, input_text, steer_toggle, coeff, max_new_tokens, top_p, temperature, layer, vec_scaling], outputs=output
+    ).success(
+        post_process, inputs=[session_id, output], outputs=[upvote_btn, downvote_btn]
+    )
 
     upvote_btn.click(output_feedback, inputs=[session_id, upvote_btn])
     downvote_btn.click(output_feedback, inputs=[session_id, downvote_btn])
-
    layer.change(fn=lambda x: 1, inputs=vec_scaling, outputs=vec_scaling)
 
    demo.load(initialize_instance, outputs=session_id)
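
The switch to local inference hinges on two pieces visible in this diff: the @spaces.GPU(duration=90) decorator, which requests a GPU for the duration of each call on a ZeroGPU Space, and transformers' TextIteratorStreamer, which lets the blocking generate() call run on a worker thread while the handler iterates over decoded text. A minimal, self-contained sketch of that threaded-streaming pattern, using gpt2 as a stand-in model (an illustration only, not the model this Space serves):

    import threading
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    inputs = tokenizer("Hello, world", return_tensors="pt")

    # generate() blocks until completion, so it runs on a worker thread and
    # pushes decoded fragments into the streamer as tokens are produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(**inputs, max_new_tokens=20, streamer=streamer),
    )
    thread.start()

    # The consumer side mirrors generate() in app.py: accumulate fragments and
    # yield the running text so Gradio can re-render the output box.
    generated = ""
    for fragment in streamer:
        generated += fragment
        print(generated)
    thread.join()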
model.py ADDED
@@ -0,0 +1,110 @@
+import os, warnings
+from operator import attrgetter
+from typing import List, Dict
+
+import torch
+import torch.nn.functional as F
+from torchtyping import TensorType
+from transformers import TextIteratorStreamer
+from transformers import AutoTokenizer, BatchEncoding
+import nnsight
+from nnsight import LanguageModel
+from nnsight.intervention import Envoy
+
+warnings.filterwarnings("ignore")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# nnsight with multi-threading: https://github.com/ndif-team/nnsight/issues/280
+nnsight.CONFIG.APP.GLOBAL_TRACING = False
+
+config = {
+    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "steering_vec": "activations/candidate_vectors.pt",
+    "offset": "activations/offsets.pt",
+}
+
+def detect_module_attrs(model: LanguageModel) -> str:
+    if "model" in model._modules and "layers" in model.model._modules:
+        return "model.layers"
+    elif "transformers" in model._modules and "h" in model.transformers._modules:
+        return "transformers.h"
+    else:
+        raise Exception("Failed to detect module attributes.")
+
+
+class ModelBase:
+    def __init__(
+        self, model_name: str,
+        steering_vecs: TensorType, offsets: TensorType,
+        tokenizer: AutoTokenizer = None, block_module_attr=None
+    ):
+        if tokenizer is None:
+            self.tokenizer = self._load_tokenizer(model_name)
+        else:
+            self.tokenizer = tokenizer
+        self.model = self._load_model(model_name, self.tokenizer)
+
+        self.device = self.model.device
+        self.hidden_size = self.model.config.hidden_size
+        if block_module_attr is None:
+            self.block_modules = self.get_module(detect_module_attrs(self.model))
+        else:
+            self.block_modules = self.get_module(block_module_attr)
+
+        self.steering_vecs = F.normalize(steering_vecs, dim=-1)
+        self.steering_vecs, self.offsets = self.set_dtype(self.steering_vecs, offsets)
+
+    def _load_model(self, model_name: str, tokenizer: AutoTokenizer) -> LanguageModel:
+        return LanguageModel(model_name, tokenizer=tokenizer, dispatch=True, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16)
+
+    def _load_tokenizer(self, model_name) -> AutoTokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer.padding_side = "left"
+        if not tokenizer.pad_token:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+            tokenizer.pad_token = tokenizer.eos_token
+
+        tokenizer.chat_template = tokenizer.chat_template.replace("<|Assistant|><think>\\n", "<|Assistant|><think>")
+        return tokenizer
+
+    def tokenize(self, prompt: str) -> BatchEncoding:
+        return self.tokenizer(prompt, padding=True, truncation=False, return_tensors="pt")
+
+    def get_module(self, attr: str) -> Envoy:
+        return attrgetter(attr)(self.model)
+
+    def set_dtype(self, *vars):
+        if len(vars) == 1:
+            return vars[0].to(self.model.dtype)
+        else:
+            return (var.to(self.model.dtype) for var in vars)
+
+    def apply_chat_template(self, instruction: str) -> List[str]:
+        messages = [{"role": "user", "content": instruction}]
+        return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    def run_generation(self, inputs, streamer: TextIteratorStreamer, generation_config: Dict):
+        inputs = inputs.to(self.device)
+        _ = self.model._model.generate(**inputs, do_sample=True, streamer=streamer, **generation_config)
+
+    def steer_generation(
+        self, inputs, streamer: TextIteratorStreamer, k: float,
+        layer: int, coeff: float, generation_config: Dict
+    ):
+        layer_block = self.block_modules[layer]
+        unit_vec = self.steering_vecs[layer]
+        offset = self.offsets[layer]
+
+        with self.model.generate(inputs, do_sample=True, streamer=streamer, **generation_config):
+            with self.block_modules.all():
+                acts = layer_block.output[0].clone()
+                proj = (acts - offset) @ unit_vec.unsqueeze(-1) * unit_vec
+                layer_block.output[0][:] = acts - proj + coeff * k * unit_vec
+
+
+def load_model() -> ModelBase:
+    steering_vecs = torch.load(config['steering_vec'], weights_only=True)
+    offsets = torch.load(config['offset'], weights_only=True)
+    model = ModelBase(config['model_name'], steering_vecs=steering_vecs, offsets=offsets)
+    return model
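
The hook inside steer_generation() is directional ablation plus a fixed shift: for each token's residual-stream activation it projects out the component along the unit steering vector (measured relative to a per-layer offset), then adds the vector back scaled by coeff * k, so every steered activation ends up with the same coordinate along that direction. A standalone sketch of the arithmetic with dummy shapes and values (none of these tensors come from the repo's actual checkpoints):

    import torch
    import torch.nn.functional as F

    hidden = torch.randn(1, 5, 8)                    # (batch, seq, hidden_size) activations
    unit_vec = F.normalize(torch.randn(8), dim=-1)   # unit-norm steering direction
    offset = torch.randn(8) * 0.1                    # per-layer offset
    coeff, k = 1.0, 2.5

    # Component of (hidden - offset) along unit_vec, broadcast back to hidden_size.
    proj = ((hidden - offset) @ unit_vec.unsqueeze(-1)) * unit_vec
    # Ablate that component, then shift along the direction by coeff * k.
    steered = hidden - proj + coeff * k * unit_vec

    # Every position now sits at the same coordinate along the steering direction.
    assert torch.allclose((steered - offset) @ unit_vec, torch.full((1, 5), coeff * k), atol=1e-4)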
requirements.txt CHANGED
@@ -2,3 +2,11 @@ aiohttp==3.11.16
 pandas==2.2.2
 pyarrow==19.0.1
 gradio_toggle==2.0.2
+transformers==4.50.0
+accelerate==1.6.0
+nnsight==0.4.3
+triton==3.1.0
+torchtyping==0.1.5
+tiktoken==0.8.0
+transformers_stream_generator==0.0.5
+zstandard==0.23.0
scheduler.py CHANGED
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 
 def load_scheduler():
     return ParquetScheduler(
-        repo_id="hannahcyberey/Censorship-Steering-Logs", every=10,
+        repo_id="hannahcyberey/Censorship-Steering-Logs", every=60,
         private=True,
         squash_history=False,
         schema={
schemas.py CHANGED
@@ -32,18 +32,11 @@ class UserRequest(BaseModel):
         else:
             self.k = self.vec_scale * vector_scaling[self.layer]["k_neg"]
 
-    def get_api_format(self):
+    def generation_config(self):
         return {
-            "prompt": self.prompt,
-            "steering": self.steering,
-            "coeff": self.coeff,
-            "k": self.k,
-            "layer": self.layer,
-            "generation_config": {
-                "max_new_tokens": self.max_new_tokens,
-                "top_p": self.top_p,
-                "temperature": self.temperature
-            }
+            "max_new_tokens": self.max_new_tokens,
+            "top_p": self.top_p,
+            "temperature": self.temperature
         }
 
 