Update app.py
app.py CHANGED
@@ -71,20 +71,20 @@ stop_generation = False
 def generate_tokens(model, generator):
     global stop_generation
     app.logger.info('generate_tokens started')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with lock:
+        try:
+            for token in generator:
+                if token == model.token_eos() or stop_generation:
+                    stop_generation = False
+                    app.logger.info('Abort generating')
+                    yield b'' # End of chunk
+                    break
+
+                token_str = model.detokenize([token])#.decode("utf-8", errors="ignore")
+                yield token_str
+        except Exception as e:
+            app.logger.info('generator exception')
+            yield b'' # End of chunk

 @app.route('/stop_generation', methods=['GET'])
 def handler_stop_generation():
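The new generate_tokens body is the standard Flask streaming pattern: the route returns a Response wrapping a generator, and each yield becomes one chunk on the wire. Below is a minimal, self-contained sketch of that pattern under the assumptions this diff implies; lock and stop_generation are module-level objects app.py is presumed to define, and FakeModel is a hypothetical stand-in for the llama_cpp.Llama instance so the sketch runs on its own.

import threading

from flask import Flask, Response

app = Flask(__name__)
lock = threading.Lock()          # assumed module-level lock, as used by the hunk above
stop_generation = False


class FakeModel:
    # Hypothetical stand-in for llama_cpp.Llama; only the two methods
    # the generator touches are mocked.
    def token_eos(self):
        return -1

    def detokenize(self, tokens):
        return b'tok '           # the real detokenize() also returns raw bytes


def generate_tokens(model, generator):
    global stop_generation
    with lock:                   # serialize generations over the single model
        try:
            for token in generator:
                if token == model.token_eos() or stop_generation:
                    stop_generation = False
                    yield b''    # empty chunk signals end of stream
                    break
                yield model.detokenize([token])
        except Exception:
            yield b''


@app.route('/demo')
def demo():
    model = FakeModel()
    tokens = iter(range(5))      # stands in for a real token generator
    return Response(generate_tokens(model, tokens),
                    content_type='text/plain', direct_passthrough=True)

Yielding bytes rather than str matters here: detokenize() returns raw UTF-8 bytes, and a multi-byte character can be split across tokens, which is presumably why the .decode(..., errors="ignore") call is left commented out in the diff.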
@@ -133,7 +133,7 @@ def generate_search_request():
         logits_all=True,
         #n_threads=12,
         verbose=True,
-        n_gpu_layers=
+        n_gpu_layers=30,
         n_gqa=8 #must be set for 70b models
     )

@@ -183,7 +183,7 @@ def generate_response():
         logits_all=True,
         #n_threads=12,
         verbose=True,
-        n_gpu_layers=
+        n_gpu_layers=30,
         n_gqa=8 #must be set for 70b models
     )

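The two hunks above make the same change in generate_search_request() and generate_response(): both construct a llama-cpp-python Llama instance, and both now offload 30 layers to the GPU. As a point of reference, here is a sketch of how such a constructor call plausibly looks in full; the model path and every keyword not visible in the diff are assumptions.

from llama_cpp import Llama  # llama-cpp-python

model = Llama(
    model_path='models/llama-2-70b.ggmlv3.q4_0.bin',  # hypothetical path
    logits_all=True,      # keep logits for all tokens, not just the last one
    verbose=True,
    n_gpu_layers=30,      # offload 30 transformer layers to the GPU
    n_gqa=8,              # grouped-query attention factor for 70B Llama 2
)

n_gqa was required by GGML-era builds of llama-cpp-python (around 0.1.77) to load 70B Llama 2 models, as the in-code comment says; newer GGUF model files carry this information in their metadata, so later versions no longer need the flag.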
@@ -239,4 +239,4 @@ def generate_response():
     return Response(generate_tokens(model, generator), content_type='text/plain', status=200, direct_passthrough=True)

 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=False, threaded=
+    app.run(host="0.0.0.0", port=7860, debug=False, threaded=False)
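threaded=False makes Flask's built-in server handle one request at a time, which, together with the lock inside generate_tokens, keeps two requests from driving the single Llama instance concurrently. One side effect is that a /stop_generation call can presumably only be served once the current streaming request has finished, so the stop flag matters most when something more concurrent (a threaded WSGI server, a second worker) sits in front of the app. A hypothetical client for the streaming endpoint is sketched below; the route path and JSON payload are assumptions, since only /stop_generation is visible in this diff.

import requests  # pip install requests

# Stream a completion chunk by chunk; '/generate' and the payload are guesses.
with requests.post('http://localhost:7860/generate',
                   json={'prompt': 'Hello'}, stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=None):
        print(chunk.decode('utf-8', errors='ignore'), end='', flush=True)

# From another process, ask the server to abort a generation in flight;
# this route is shown in the diff above.
requests.get('http://localhost:7860/stop_generation')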