# HF Space page residue (author: Nanny7, space: "encoder space", commit 9b75985)
"""
Hugging Face Space — CodeT5-large encoder for the pseudoscore-x backend.
Exposes a Gradio API at /encode that:
- tokenises text (with the same <criterion> / <score> special tokens
the notebook used)
- runs the FROZEN encoder forward pass
- returns last_hidden_state (float16, base64-encoded), the attention
mask, and the cleaned subword tokens used for signal extraction
Designed for the FREE CPU tier on HF Spaces. The encoder weights load
once at Space startup; subsequent requests are just forward passes.
Call from Python:
from gradio_client import Client
client = Client("YOUR_USERNAME/pseudoscorex-encoder")
out = client.predict("hello world", api_name="/encode")
"""
import base64
import os
import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Model id and sequence length are overridable via environment variables so the
# Space can be repointed without a code change.
ENCODER_NAME = os.getenv("ENCODER_NAME", "Salesforce/codet5-large")
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
# ── Boot: load tokenizer + frozen encoder once ──────────────────────────────
print(f"[boot] Loading tokenizer: {ENCODER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)
# Register the same special tokens the training notebook used; must happen
# before resize_token_embeddings below so the new ids get embedding rows.
tokenizer.add_tokens(["<criterion>", "<score>"], special_tokens=True)
print(f"[boot] Loading encoder: {ENCODER_NAME}")
full_model = AutoModelForSeq2SeqLM.from_pretrained(ENCODER_NAME)
encoder = full_model.encoder
# Resize the embedding table on the encoder stack for the two added tokens.
encoder.resize_token_embeddings(len(tokenizer))
encoder.eval()
# Freeze all weights: this Space only ever runs forward passes.
for p in encoder.parameters():
    p.requires_grad = False
del full_model  # decoder unused; drop it to shrink the CPU-tier footprint
print("[boot] Encoder ready.")
# NOTE(review): SPECIAL_TOKENS is never read — _decode_clean_tokens builds its
# own set from tokenizer.all_special_tokens. The "" entry looks like a subword
# marker lost to an encoding mishap; confirm against model/signals.py.
SPECIAL_TOKENS = {"", "<s>", "</s>", "<pad>", "<criterion>", "<score>"}
# Subword markers stripped from decoded tokens, written as escapes so an
# encoding round-trip cannot corrupt them again:
#   \u2581 "▁" — SentencePiece word-boundary marker
#   \u0120 "Ġ" — byte-level BPE space marker (CodeT5's RoBERTa-style tokenizer)
_SP_MARKER = "\u2581"
_BPE_MARKER = "\u0120"


def _decode_clean_tokens(text: str) -> list[str]:
    """Mirrors model/signals.py::decode_clean_tokens on the server.

    Tokenises *text* (truncated to MAX_LENGTH), drops special tokens and
    marker-only pieces, and strips subword markers from the rest, returning
    the cleaned subword strings used downstream for signal extraction.
    """
    ids = tokenizer(text, max_length=MAX_LENGTH, truncation=True)["input_ids"]
    toks = tokenizer.convert_ids_to_tokens(ids)
    special = set(tokenizer.all_special_tokens)
    clean: list[str] = []
    for t in toks:
        if t in special or t.strip() in ("", _SP_MARKER):
            continue
        cleaned = t.replace(_SP_MARKER, "").replace(_BPE_MARKER, "").strip()
        if cleaned:
            clean.append(cleaned)
    return clean
@torch.no_grad()
def encode(text: str):
    """
    Run the frozen encoder over *text* and package the result for transport.

    Returns a JSON-serialisable dict:
      {
        "hidden_b64": <base64 string of float16 array>,
        "shape": [seq_len, hidden_dim],
        "attention_mask": [int, ...],  # length = seq_len
        "clean_tokens": [str, ...],    # for signal extraction
      }

    Raises gr.Error when *text* is missing or blank.
    """
    if not (isinstance(text, str) and text.strip()):
        raise gr.Error("text must be a non-empty string")

    batch = tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # Forward pass only — gradients are disabled by the decorator.
    states = encoder(**batch).last_hidden_state  # (1, seq_len, hidden_dim)
    vecs = states.squeeze(0).cpu().numpy().astype(np.float16)
    payload = base64.b64encode(vecs.tobytes()).decode("ascii")
    mask = batch["attention_mask"].squeeze(0).cpu().tolist()

    return {
        "hidden_b64": payload,
        "shape": list(vecs.shape),
        "attention_mask": mask,
        "clean_tokens": _decode_clean_tokens(text),
    }
# ── Gradio UI + API ─────────────────────────────────────────────────────────
with gr.Blocks(title="pseudoscore-x encoder") as demo:
    gr.Markdown(
        "# pseudoscore-x encoder\n"
        "CodeT5-large encoder with `<criterion>` and `<score>` special tokens.\n"
        "Use the **/encode** API endpoint from your backend."
    )
    text_box = gr.Textbox(label="Text", lines=4, placeholder="Paste text to encode…")
    result_view = gr.JSON(label="Encoded output")
    run_button = gr.Button("Encode")
    # api_name="encode" exposes this handler as the /encode endpoint.
    run_button.click(fn=encode, inputs=text_box, outputs=result_view, api_name="encode")

if __name__ == "__main__":
    demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)