--- license: mit --- Codellama 34b base model fine-tuned on the text chunk from the OpenAssistant-Guanaco dataset instead of Q&A pair, so it struggles to determine the end of the answer. recommend using a stop string like "### Human:" to prevent the model from talking to itself. Prompt template: ``` ### Human: {prompt} ### Assistant: ``` Run the model via text-generation-inference One GPU: ``` sudo docker run --gpus all --shm-size 1g -p 5000:80 -v $PWD/models:/data ghcr.io/huggingface/text-generation-inference:latest --max-total-tokens 4096 --quantize gptq --model-id mzbac/CodeLlama-34b-guanaco-gptq ``` Two GPUs: ``` docker run --gpus all --shm-size 1g -p 5000:80 -v $PWD/models:/data ghcr.io/huggingface/text-generation-inference:latest --max-total-tokens 4096 --max-input-length 4000 --max-batch-prefill-tokens 4096 --quantize gptq --num-shard 2 --model-id mzbac/CodeLlama-34b-guanaco-gptq ``` Query the mode via curl ``` curl 127.0.0.1:8001/generate \ -X POST \ -d '{"inputs":"### Human: 给我准备一个去日本旅行的计划\n### Assistant:", "parameters":{"max_new_tokens":2048, "stop": [ "### Human:" ]}}' \ -H 'Content-Type: application/json' ``` or From typescript: ``` npm install axios readline ``` ``` import axios from 'axios'; import readline from 'readline'; const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); const makeRequest = async (conversation: Array<{ role: string, content: string }>) => { const url = "http://127.0.0.1:5000/generate"; const conversationString = conversation.map( (entry) => `### ${entry.role}: ${entry.content}` ).join('\n'); const conversationWithPrompt = `${conversationString}\n### Assistant:`; const payload = { inputs: conversationWithPrompt, parameters: { temperature: 0.7, max_new_tokens: 2000, repetition_penalty: 1.03, top_k: 45, top_p: 0.95, typical_p: 0.95, stop: ['### Human'] } }; const headers = { 'Content-Type': 'application/json' }; try { const response = await axios.post(url, payload, { headers }); return response.data; } catch (error) { return `Failed to make request, status code: ${error.response?.status}`; } }; const main = async () => { console.log("Welcome to the CLI tool. Type 'exit' to quit."); const conversation: Array<{ role: string, content: string }> = []; const question = (prompt: string) => { return new Promise(resolve => { rl.question(prompt, (answer) => { resolve(answer); }); }); }; while (true) { const humanInput = await question("### Human: "); if (humanInput.toLowerCase() === 'exit') { console.log("Exiting..."); rl.close(); break; } conversation.push({ role: 'Human', content: humanInput }); console.log("### Assistant:"); const assistantOutput = await makeRequest(conversation); const assistantText = assistantOutput.generated_text || 'Failed to get a valid response.'; console.log(assistantText); conversation.push({ role: 'Assistant', content: assistantText }); } }; main(); ```