# alpaca-lora/app.py
from typing import Optional

from fastapi import FastAPI
from pydantic import BaseModel
from peft import PeftModel
# Note: recent transformers releases use the LlamaTokenizer / LlamaForCausalLM
# casing; the old LLaMATokenizer spelling only existed in early pre-release forks.
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

app = FastAPI()

# Load the base LLaMA-7B weights in 8-bit (requires bitsandbytes; device_map="auto"
# requires accelerate), then apply the Alpaca-LoRA adapter on top.
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b")
class InputPrompt(BaseModel):
    instruction: str
    input: Optional[str] = None


class OutputResponse(BaseModel):
    response: str
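# For illustration, a hypothetical request body for /evaluate could look like:
#   {"instruction": "Translate to French.", "input": "Good morning."}
# and the endpoint returns JSON of the form:
#   {"response": "<generated text>"}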
@app.post("/evaluate")
def evaluate(input_prompt: InputPrompt):
    # Sampling settings: fairly high temperature with nucleus sampling.
    generation_config = GenerationConfig(
        temperature=0.9,
        top_p=0.75,
        num_beams=1,
        do_sample=True,
    )
    prompt = generate_prompt(input_prompt.instruction, input_prompt.input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    # Only one sequence is generated; decode it and return the text that
    # follows the "### Response:" marker in the prompt template.
    output = tokenizer.decode(generation_output.sequences[0])
    return OutputResponse(response=output.split("### Response:")[1].strip())
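# Design note: the string split above relies on the prompt template containing
# a "### Response:" marker. An alternative sketch (using the same tokenizer and
# generation output as above, nothing new assumed) slices off the prompt tokens
# before decoding, so the marker never appears in the decoded text:
#   new_tokens = generation_output.sequences[0][input_ids.shape[1]:]
#   text = tokenizer.decode(new_tokens, skip_special_tokens=True)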
def generate_prompt(instruction, input=None):
    """Render the Alpaca-style prompt template, with or without an input field."""
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
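# For illustration, generate_prompt("Name a color") (hypothetical argument)
# renders to:
#   Below is an instruction that describes a task. Write a response that
#   appropriately completes the request.
#
#   ### Instruction:
#   Name a color
#
#   ### Response: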
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
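# Example client call once the server is running: a minimal sketch assuming
# the `requests` package is installed; the port matches the uvicorn config
# above, and the instruction text is purely illustrative.
#   import requests
#   r = requests.post(
#       "http://localhost:7860/evaluate",
#       json={"instruction": "Write a haiku about autumn."},
#   )
#   print(r.json()["response"])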