# FastAPI streaming chat service (Hugging Face Spaces deployment).
# NOTE(review): the "Spaces: Sleeping" status banner from the scraped page
# was residue, not code, and has been converted to this comment header.
import os
import torch
# Cache locations must be exported BEFORE importing transformers: the library
# reads them at import time.  TRANSFORMERS_CACHE is the legacy alias kept for
# older transformers versions; HF_HOME covers current ones.
os.environ["HF_HOME"] = "/tmp/hf"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModel, TextIteratorStreamer
from fastapi.responses import StreamingResponse
import threading
app = FastAPI()
# Small (1.3B) chat model; trust_remote_code is required because the repo
# ships custom model/tokenizer code.  Loading happens at import time, so the
# first start downloads weights into /tmp/hf.
model_id = "GEB-AGI/geb-1.3b"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
class ChatRequest(BaseModel):
    """Request body for the chat endpoint."""

    # Plain-text user message to forward to the model.
    message: str
# FIX: the handler was defined but never registered as a route, making the
# endpoint unreachable; expose it at POST /chat.
@app.post("/chat")
async def chat_stream(request: ChatRequest):
    """Stream a model-generated reply to ``request.message`` as plain text.

    Generation runs on a background thread via ``model.generate``; tokens are
    relayed to the client through ``TextIteratorStreamer`` as they appear.

    :param request: parsed JSON body carrying the user's message.
    :returns: ``StreamingResponse`` (``text/plain``) yielding text chunks.
    """
    prompt = f"Responde en español de forma clara y breve como un asistente IA.\nUsuario: {request.message}\nIA:"
    # 1. Tokenize to tokens directly (no padding, no encode()).
    tokens = tokenizer.tokenize(prompt)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # 2. Add the model's special tokens manually.
    input_ids = tokenizer.build_inputs_with_special_tokens(token_ids)
    input_ids = torch.tensor([input_ids])
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=48,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        streamer=streamer,
        # Fall back to EOS for padding; None when the tokenizer defines no EOS.
        pad_token_id=getattr(tokenizer, "eos_token_id", None),
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    # Daemonize so an abandoned generation cannot block interpreter shutdown.
    thread.daemon = True
    thread.start()

    # FIX: the original ``async def`` generator iterated the BLOCKING streamer
    # queue without awaiting, stalling the event loop between tokens.  A plain
    # sync generator is driven by Starlette from its threadpool instead, so
    # the loop stays responsive while tokens trickle in.
    def event_generator():
        for new_text in streamer:
            yield new_text

    return StreamingResponse(event_generator(), media_type="text/plain")