# Importing libraries from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration from quart import Quart, request from llama_cpp import Llama import psutil # Initing things app = Quart(__name__) # Quart app llm = Llama(model_path="./model.bin") # LLaMa model llama_model_name = "TheBloke/Llama-2-13B-chat-GGUF" translator_tokenizer = M2M100Tokenizer.from_pretrained( # tokenizer for translator "facebook/m2m100_1.2B", cache_dir="translator/" ) translator_model = M2M100ForConditionalGeneration.from_pretrained( # translator model "facebook/m2m100_1.2B", cache_dir="translator/" ) translator_model.eval() # Preparing things to work translator_tokenizer.src_lang = "en" # Loading prompt with open('system.prompt', 'r', encoding='utf-8') as f: prompt = f.read() # Defining @app.post("/request") async def echo(): try: data = await request.get_json() maxTokens = data.get("max_tokens", 64) if isinstance(data.get("system_prompt"), str): userPrompt = data.get("system_prompt") + "\n\nUser: " + data['request'] + "\nAssistant: " else: userPrompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: " except: return {"error": "Not enough data", "output": "Oops! Error occured! 
If you're a developer, using this API, check 'error' key."}, 400 try: output = llm(userPrompt, max_tokens=maxTokens, stop=["User:", "\n"], echo=False) text = output["choices"][0]["text"] # i allowed only certain languages: # russian (ru), ukranian (uk), chinese (zh) if isinstance(data.get("target_lang"), str) and data.get("target_lang").lower() in ["ru", "uk", "zh"]: encoded_input = translator_tokenizer(output, return_tensors="pt") generated_tokens = translator_model.generate( **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(data.get("target_lang")) ) translated_text = translator_tokenizer.batch_decode( generated_tokens, skip_special_tokens=True )[0] return {"output": text, "translated_output": translated_text} return {"output": text} except Exception as e: print(e) return {"error": str(e), "output": "Oops! Internal server error. Check the logs. If you're a developer, using this API, check 'error' key."}, 500 @app.get("/") async def get(): return '''
`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"`
in the Dockerfile to `CMAKE_ARGS="-DLLAMA_CUBLAS=on"`
. Also you can try `DLLAMA_CLBLAST`
, `DLLAMA_METAL`
or `DLLAMA_METAL`
.`run-docker.sh`
for ya. To stop the container, run `docker ps`
, find the name of the container, and run `docker stop _dockerContainerName_`