
CodeLlama 34B base model fine-tuned on the raw text chunks from the OpenAssistant-Guanaco dataset rather than on Q&A pairs, so it struggles to determine where an answer ends. It is recommended to use a stop string such as "### Human:" to prevent the model from talking to itself.

Prompt template:

### Human: {prompt}
### Assistant:
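As a reference for how the template and the stop string fit together, here is a minimal TypeScript sketch (the helper names are illustrative; the full client further below does the equivalent when calling the server):

// Wrap a user message in the Guanaco-style prompt format.
const buildPrompt = (userMessage: string): string =>
    `### Human: ${userMessage}\n### Assistant:`;

// Fallback: if the backend did not apply the stop sequence, truncate the
// generated text at the first occurrence of the stop string.
const trimAtStop = (generated: string, stop = '### Human:'): string => {
    const idx = generated.indexOf(stop);
    return idx === -1 ? generated : generated.slice(0, idx).trimEnd();
};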

Run the model via text-generation-inference

One GPU:

sudo docker run --gpus all --shm-size 1g -p 5000:80 -v $PWD/models:/data ghcr.io/huggingface/text-generation-inference:latest --max-total-tokens 4096 --quantize gptq --model-id mzbac/CodeLlama-34b-guanaco-gptq

Two GPUs:

docker run --gpus all --shm-size 1g -p 5000:80 -v $PWD/models:/data ghcr.io/huggingface/text-generation-inference:latest --max-total-tokens 4096 --max-input-length 4000 --max-batch-prefill-tokens 4096 --quantize gptq --num-shard 2 --model-id mzbac/CodeLlama-34b-guanaco-gptq
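Loading and dequantizing the weights can take a while, so it helps to wait for the server before querying it. A minimal sketch in TypeScript, assuming the host port mapping above (5000) and the /health route that text-generation-inference exposes; requires Node 18+ for the built-in fetch:

// Poll the server's health route until the model shards have loaded.
const waitUntilReady = async (baseUrl = 'http://127.0.0.1:5000'): Promise<void> => {
    for (let attempt = 0; attempt < 60; attempt++) {
        const res = await fetch(`${baseUrl}/health`).catch(() => undefined);
        if (res?.ok) return; // 200 once the model is ready to serve
        await new Promise((resolve) => setTimeout(resolve, 5000));
    }
    throw new Error('text-generation-inference did not become ready in time');
};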

Query the model via curl

curl 127.0.0.1:5000/generate \
    -X POST \
    -d '{"inputs":"### Human: Plan a trip to Japan for me\n### Assistant:", "parameters":{"max_new_tokens":2048, "stop": [
      "### Human:"
    ]}}' \
    -H 'Content-Type: application/json'
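The server responds with a JSON body whose generated_text field contains the completion; the TypeScript client below reads that field directly.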

Or from TypeScript:

npm install axios

import axios from 'axios';
import readline from 'readline';

const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
});

// Flatten the conversation into the "### Human: / ### Assistant:" format and
// send it to the text-generation-inference /generate endpoint.
const makeRequest = async (conversation: Array<{ role: string, content: string }>) => {
    const url = "http://127.0.0.1:5000/generate";
    const conversationString = conversation.map(
        (entry) => `### ${entry.role}: ${entry.content}`
    ).join('\n');

    const conversationWithPrompt = `${conversationString}\n### Assistant:`;

    const payload = {
        inputs: conversationWithPrompt,
        parameters: {
            temperature: 0.7,
            max_new_tokens: 2000,
            repetition_penalty: 1.03,
            top_k: 45,
            top_p: 0.95,
            typical_p: 0.95,
            stop: ['### Human']
        }
    };
    const headers = { 'Content-Type': 'application/json' };
    try {
        const response = await axios.post(url, payload, { headers });
        return response.data;
    } catch (error) {
        // Axios errors carry the HTTP status on error.response; anything else has no status.
        const status = axios.isAxiosError(error) ? error.response?.status : undefined;
        return `Failed to make request, status code: ${status}`;
    }
};

const main = async () => {
    console.log("Welcome to the CLI tool. Type 'exit' to quit.");

    const conversation: Array<{ role: string, content: string }> = [];

    const question = (prompt: string) => {
        return new Promise<string>(resolve => {
            rl.question(prompt, (answer) => {
                resolve(answer);
            });
        });
    };

    while (true) {
        const humanInput = await question("### Human: ");

        if (humanInput.toLowerCase() === 'exit') {
            console.log("Exiting...");
            rl.close();
            break;
        }

        conversation.push({ role: 'Human', content: humanInput });

        console.log("### Assistant:");
        const assistantOutput = await makeRequest(conversation);

        const assistantText = assistantOutput.generated_text || 'Failed to get a valid response.';

        console.log(assistantText);
        conversation.push({ role: 'Assistant', content: assistantText });
    }
};

main();
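To try it out, save the script under any name you like (for example chat.ts) and run it with ts-node, e.g. npx ts-node chat.ts; type exit at the prompt to quit.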