from flask import Flask, request, Response
import logging
from llama_cpp import Llama
import threading
from huggingface_hub import snapshot_download, Repository
import huggingface_hub
import gc
import os.path
from datetime import datetime
import xml.etree.ElementTree as ET

SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно отвечаешь на запросы пользователя, используя русский язык."

# Special token ids used by the Saiga chat format.
SYSTEM_TOKEN = 1788
USER_TOKEN = 1404
BOT_TOKEN = 9225
LINEBREAK_TOKEN = 13

ROLE_TOKENS = {
    "user": USER_TOKEN,
    "bot": BOT_TOKEN,
    "system": SYSTEM_TOKEN
}

CONTEXT_SIZE = 4001
ENABLE_GPU = True
GPU_LAYERS = 70

# Lock object that serializes access to the model: only one generation runs at a time.
lock = threading.Lock()

app = Flask(__name__)

# Configure Flask logging
app.logger.setLevel(logging.DEBUG)  # Set the desired logging level

# Initialize the model when the application starts
#model_path = "../models/model-q4_K.gguf"  # Replace with the actual model path
#model_name = "model/ggml-model-q4_K.gguf"
#repo_name = "IlyaGusev/saiga2_13b_gguf"
#model_name = "model-q4_K.gguf"
repo_name = "TheBloke/Llama-2-70B-Chat-GGUF"
model_name = "llama-2-70b-chat.Q4_K_M.gguf"
#repo_name = "IlyaGusev/saiga2_7b_gguf"
#model_name = "model-q4_K.gguf"

local_dir = '.'

if os.path.isdir('/data'):
    app.logger.info('Persistent storage enabled')

model = None

model_path = snapshot_download(repo_id=repo_name, allow_patterns=model_name) + '/' + model_name

app.logger.info('Model path: ' + model_path)

DATASET_REPO_URL = "https://huggingface.co/datasets/muryshev/saiga-chat"
DATA_FILENAME = "llama-2-70b-q4-k-m.xml"
DATA_FILE = os.path.join("dataset", DATA_FILENAME)

HF_TOKEN = os.environ.get("HF_TOKEN")
app.logger.info("hfh: " + huggingface_hub.__version__)

repo = Repository(
    local_dir="dataset", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
)


def log(req: str = '', resp: str = ''):
    # Append the request/response pair to the XML log file and push it to the dataset repo.
    if req or resp:
        element = ET.Element("row", {"time": str(datetime.now())})
        req_element = ET.SubElement(element, "request")
        req_element.text = req
        resp_element = ET.SubElement(element, "response")
        resp_element.text = resp

        with open(DATA_FILE, "ab+") as xml_file:
            xml_file.write(ET.tostring(element, encoding="utf-8"))

        commit_url = repo.push_to_hub()
        app.logger.info(commit_url)


def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
    # (Re)create the global Llama instance, freeing the previous one first.
    global model

    if model is not None:
        del model
        gc.collect()

    if enable_gpu:
        model = Llama(
            model_path=model_path,
            n_ctx=context_size,
            n_parts=1,
            #n_batch=100,
            logits_all=True,
            #n_threads=12,
            verbose=True,
            n_gpu_layers=gpu_layer_number,
            n_gqa=8  # must be set for 70b models
        )
        return model
    else:
        model = Llama(
            model_path=model_path,
            n_ctx=context_size,
            n_parts=1,
            #n_batch=100,
            logits_all=True,
            #n_threads=12,
            verbose=True,
            n_gqa=8  # must be set for 70b models
        )
        return model


init_model(CONTEXT_SIZE, ENABLE_GPU, GPU_LAYERS)


def get_message_tokens(model, role, content):
    # Wrap the tokenized content with the role marker, a linebreak, and EOS,
    # following the Saiga chat token layout.
    message_tokens = model.tokenize(content.encode("utf-8"))
    message_tokens.insert(1, ROLE_TOKENS[role])
    message_tokens.insert(2, LINEBREAK_TOKEN)
    message_tokens.append(model.token_eos())
    return message_tokens


def get_system_tokens(model):
    system_message = {
        "role": "system",
        "content": SYSTEM_PROMPT
    }
    return get_message_tokens(model, **system_message)


def get_system_tokens_for_preprompt(model, preprompt):
    system_message = {
        "role": "system",
        "content": preprompt
    }
    return get_message_tokens(model, **system_message)


#app.logger.info('Evaluating system tokens start')
#system_tokens = get_system_tokens(model)
#model.eval(system_tokens)
#app.logger.info('Evaluating system tokens end')

stop_generation = False
def generate_tokens(model, generator):
    # Stream detokenized output until EOS or an external stop request.
    global stop_generation
    app.logger.info('generate_tokens started')
    with lock:
        try:
            for token in generator:
                if token == model.token_eos() or stop_generation:
                    stop_generation = False
                    app.logger.info('End generating')
                    yield b''  # End of chunk
                    break

                token_str = model.detokenize([token])  #.decode("utf-8", errors="ignore")
                yield token_str
        except Exception as e:
            app.logger.info('generator exception')
            app.logger.info(e)
            yield b''  # End of chunk


@app.route('/change_context_size', methods=['GET'])
def handler_change_context_size():
    global stop_generation, model
    stop_generation = True

    new_size = int(request.args.get('size', CONTEXT_SIZE))
    init_model(new_size, ENABLE_GPU, GPU_LAYERS)

    return Response('Size changed', content_type='text/plain')


@app.route('/stop_generation', methods=['GET'])
def handler_stop_generation():
    global stop_generation
    stop_generation = True
    return Response('Stopped', content_type='text/plain')


@app.route('/', methods=['GET', 'PUT', 'DELETE', 'PATCH'])
def generate_unknown_response():
    app.logger.info('unknown method: ' + request.method)

    try:
        request_payload = request.get_json()
        app.logger.info('payload: ' + str(request_payload))
    except Exception as e:
        app.logger.info('payload empty')

    return Response('What do you want?', content_type='text/plain')


response_tokens = bytearray()


def generate_and_log_tokens(user_request, model, generator):
    #global response_tokens
    for token in generate_tokens(model, generator):
        #if token == b'':  # or (max_new_tokens is not None and i >= max_new_tokens):
        #    log(user_request, response_tokens)  #.decode("utf-8", errors="ignore"))
        #    response_tokens = bytearray()
        #    break
        #response_tokens.extend(token)
        yield token


@app.route('/', methods=['POST'])
def generate_response():
    global stop_generation

    raw_content = request.data

    tokens = model.tokenize(raw_content)
    generator = model.generate(
        tokens[:CONTEXT_SIZE]
    )
    app.logger.info('Generator created')

    # Use Response to stream tokens
    return Response(generate_and_log_tokens(raw_content, model, generator),
                    content_type='text/plain', status=200, direct_passthrough=True)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=False, threaded=False)
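
# Example usage (a sketch, assuming the server runs locally on port 7860 as
# configured above; the prompt text and curl flags are illustrative only and
# not part of this service):
#
#   # Stream a completion for a raw prompt sent as the request body:
#   curl -X POST --no-buffer --data 'Привет! Как дела?' http://localhost:7860/
#
#   # Abort the generation currently in progress:
#   curl http://localhost:7860/stop_generation
#
#   # Reinitialize the model with a different context size:
#   curl 'http://localhost:7860/change_context_size?size=2048'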