MrOvkill committed on
Commit ccb82da
1 Parent(s): 83a5aeb

Update handler.py

Files changed (1)
  1. handler.py +55 -18
handler.py CHANGED
@@ -1,23 +1,60 @@
- from flask import Flask, request, jsonify
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch

- app = Flask(__name__)

- # Loading
- tokenizer = AutoTokenizer.from_pretrained("MrOvkill/Phi-3-Instruct-Bloated")
- model = AutoModelForCausalLM.from_pretrained("MrOvkill/Phi-3-Instruct-Bloated")

- @app.route('/predict', methods=['POST'])
- def predict():
-     data = request.json
-     prompt = data["prompt"]
-     kwargs = data.get('kwargs', {})
-     inputs = tokenizer(prompt, return_tensors="pt")
-     with torch.no_grad():
-         outputs = model.generate(**inputs, **kwargs)
-     response = {"response": tokenizer.decode(outputs[0], skip_special_tokens=True)}
-     return jsonify(response)

- if __name__ == '__main__':
-     app.run()
+ from typing import Dict, Any
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer

+ MAX_TOKENS = 8192  # hard ceiling on generated tokens, regardless of max_length

+ class EndpointHandler():
+     def __init__(self, path: str = ""):
+         cfg = {
+             "repo": "MrOvkill/Phi-3-Instruct-Bloated",
+         }
+         self.model = AutoModelForCausalLM.from_pretrained(cfg["repo"])
+         self.tokenizer = AutoTokenizer.from_pretrained(cfg["repo"])
+         # Use the GPU when one is available.
+         self.model.to("cuda" if torch.cuda.is_available() else "cpu")

+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         inputs = data.pop("inputs", "")
+         # Accept "temperature" or the shorthand "temp".
+         temperature = data.pop("temperature", None)
+         if not temperature:
+             temperature = data.pop("temp", 0.33)
+         if temperature > 1 or temperature <= 0:
+             return {"status": "error", "reason": "invalid temperature ( 0.01 - 1.00 )"}
+         top_p = data.pop("top-p", 0.85)
+         if top_p > 1 or top_p <= 0:
+             return {"status": "error", "reason": "invalid top-p ( 0.01 - 1.00 )"}
+         top_k = data.pop("top-k", 42)
+         if top_k > 99 or top_k < 1:
+             return {"status": "error", "reason": "invalid top-k ( 1 - 99 )"}
+         system_prompt = data.pop("system-prompt", "You are a helpful assistant.")
+         # Default to the Phi-3 chat template; callers may pass a custom template
+         # containing {system_prompt} and {prompt} placeholders.
+         fmat = data.pop("format", "<|system|>\n{system_prompt} <|end|>\n<|user|>\n{prompt} <|end|>\n<|assistant|>")
+         try:
+             fmat = fmat.format(system_prompt=system_prompt, prompt=inputs)
+         except Exception:
+             return {"status": "error", "reason": "invalid format"}
+         max_length = data.pop("max_length", 1024)
+         try:
+             max_length = int(max_length)
+         except Exception:
+             return {"status": "error", "reason": "max_length must be an integer"}

+         # Tokenize the formatted prompt and sample a completion.
+         tokens = self.tokenizer(fmat, return_tensors="pt").to(self.model.device)
+         with torch.no_grad():
+             output = self.model.generate(
+                 **tokens,
+                 do_sample=True,
+                 temperature=temperature,
+                 top_p=top_p,
+                 top_k=top_k,
+                 max_new_tokens=min(max_length, MAX_TOKENS),
+             )
+         return {"generated_text": self.tokenizer.decode(output[0], skip_special_tokens=True)}
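
For reference, a minimal smoke test of the new handler might look like the sketch below. It is illustrative only and not part of the commit: it assumes the file above is importable as handler.py, and the payload keys simply mirror the ones __call__ pops from data.

# smoke_test.py - hypothetical local check (file and values are illustrative)
from handler import EndpointHandler

handler = EndpointHandler()

# Keys match the ones __call__ pops from `data`.
payload = {
    "inputs": "Explain top-p versus top-k sampling in one paragraph.",
    "temperature": 0.33,
    "top-p": 0.85,
    "top-k": 42,
    "system-prompt": "You are a helpful assistant.",
    "max_length": 256,
}

result = handler(payload)
print(result)  # {"generated_text": ...} on success, {"status": "error", ...} on bad input

If deployed as a custom Inference Endpoints handler, the same keys should work as the raw JSON request body, since a custom handler receives the parsed body as data.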