from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from huggingface_hub import InferenceClient
import uvicorn
from typing import Generator
import json  # Make sure this line is at the top of the file

app = FastAPI()
# Initialize the InferenceClient with your model
client = InferenceClient("meta-llama/Llama-2-7b-chat")

class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.8
    max_new_tokens: int = 9000
    top_p: float = 0.15
    repetition_penalty: float = 1.0

def format_prompt(message, history):
    # Simple structure: alternating lines of dialogue, with no special tokens
    # unless the model documentation specifies them.
    conversation = ""
    for user_prompt, bot_response in history:
        conversation += f"User: {user_prompt}\nBot: {bot_response}\n"
    conversation += f"User: {message}"
    return conversation
# Note: format_prompt only needs changes if the target model expects a different prompt template.
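# For illustration, a history of [("Hi", "Hello!")] and the message "How are you?"
# produce the following prompt:
#   User: Hi
#   Bot: Hello!
#   User: How are you?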

def generate_stream(item: Item) -> Generator[bytes, None, None]:
    formatted_prompt = format_prompt(f"{item.system_prompt}, {item.prompt}", item.history)
    generate_kwargs = {
        "temperature": item.temperature,
        "max_new_tokens": item.max_new_tokens,
        "top_p": item.top_p,
        "repetition_penalty": item.repetition_penalty,
        "do_sample": True,
        "seed": 42,  # Adjust or omit the seed as needed
    }
    # Stream the response from the InferenceClient token by token.
    for response in client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True):
        # With details=True, each streamed item carries the token text; generated_text
        # is only set on the final item, which marks the generation as complete.
        chunk = {
            "text": response.token.text,
            "complete": response.generated_text is not None,
        }
        yield json.dumps(chunk).encode("utf-8") + b"\n"

@app.post("/generate/")
async def generate_text(item: Item):
    return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
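
# Example client (a minimal sketch, not part of this Space; it assumes the server is
# running locally on port 8000): the endpoint streams newline-delimited JSON, so a
# caller can read the response line by line.
#
#   import requests, json
#   payload = {"prompt": "Hello!", "history": [], "system_prompt": "You are a helpful assistant."}
#   with requests.post("http://localhost:8000/generate/", json=payload, stream=True) as resp:
#       for line in resp.iter_lines():
#           if line:
#               print(json.loads(line)["text"], end="", flush=True)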