
Uploaded model

  • Developed by: AlberBshara
  • License: apache-2.0
  • Finetuned from model: llama-3-8b-Instruct-bnb-4bit

You can use this model for question-answering (QA) tasks. Its context window is 8K tokens.
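
Both the prompt and the generated answer share that 8K window, so once the packages below are installed it can be useful to check roughly how many tokens your context and question consume before calling the model. A minimal sketch (it assumes 8K means 8192 tokens and that a Hugging Face access token is available in the HUGGING_FACE_API_TOKEN environment variable):

import os
from transformers import AutoTokenizer

# Load only the tokenizer to estimate prompt length.
tokenizer = AutoTokenizer.from_pretrained(
    "AlberBshara/scholara_QA",
    token=os.environ.get("HUGGING_FACE_API_TOKEN"),
)

context = "Your retrieved passage goes here ..."
question = "Your question goes here?"
prompt = f"Context: [{context}]\nQuestion: [{question}]"
n_tokens = len(tokenizer(prompt)["input_ids"])
print(f"Prompt uses {n_tokens} of the 8192-token context window.")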

How to Use it:

Install Unsloth, Xformers (Flash Attention), and the other required packages:

%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install triton

import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth.chat_templates import get_chat_template
from typing import Tuple, Dict, Any
import torch

class LLM:
    def __init__(self, load_in_4bit: bool = True,
                 load_cpu_mem_usage: bool = True,
                 hf_model_path: str = "AlberBshara/scholara_QA"):
        """
        Args:
            load_in_4bit (bool): Use 4-bit quantization. Defaults to True.
            load_cpu_mem_usage (bool): Reduce CPU memory usage. Defaults to True.
            hf_model_path (str): The path of your model on HuggingFace-Hub like "your-user-name/model-name".
        """
        assert torch.cuda.is_available(), "CUDA is not available. An NVIDIA GPU is required."
        # Read your Hugging Face access token from the environment.
        hf_auth_token = os.environ.get("HUGGING_FACE_API_TOKEN")
        # Specify the quantization config
        self._bnb_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)

        # Load model directly with quantization config
        self._model = AutoModelForCausalLM.from_pretrained(
            hf_model_path,
            low_cpu_mem_usage=load_cpu_mem_usage,
            quantization_config=self._bnb_config,
            token=hf_auth_token
        )

        # Load the tokenizer
        self._tokenizer = AutoTokenizer.from_pretrained(
            hf_model_path,
            token=hf_auth_token
        )
        self._tokenizer = get_chat_template(
            self._tokenizer,
            chat_template="llama-3",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}, # ShareGPT style
        )

        self._hf_model_path = hf_model_path
        self._EOS_TOKEN_ID = self._tokenizer.eos_token_id

        self._prompt = lambda context, question: f"""
        Answer the following question using the given context.
        Context: [{context}]
        Question: [{question}]
        """

    def invoke(self, context: str, question: str) -> Tuple:
        if not question.strip():
            raise ValueError("question cannot be empty or None")

        if not context.strip():
            raise ValueError("context cannot be empty or None")

        inputs = self._prompt(context, question)

        messages = [{"from": "human", "value": inputs}]
        inputs = self._tokenizer.apply_chat_template(
              messages,
              tokenize=True,
              add_generation_prompt=True, # Must add for generation
              return_tensors="pt",
        ).to("cuda")

        # Increase the max_new_tokens to allow more detailed responses
        output_ids = self._model.generate(inputs, max_new_tokens=2048, pad_token_id=self._EOS_TOKEN_ID)
        output_ids = output_ids.tolist()[0] if output_ids.size(0) == 1 else output_ids.tolist()

        output_text = self._tokenizer.decode(output_ids, skip_special_tokens=True)

        # free GPU Mem.
        del inputs
        torch.cuda.empty_cache()

        return output_text, output_ids, None

    def extract_answer(self, response: str) -> str:
        # The decoded Llama-3 chat output echoes the prompt followed by an
        # "assistant" header; the answer is everything after that marker.
        start_with: str = ".assistant"
        start_index = response.find(start_with)

        # If the word is found, extract the substring from that point onward
        if start_index != -1:
            # Move start_index to the end of the word
            start_index += len(start_with)
            return response[start_index:]
        else:
            return response

    def get_metadata(self) -> Dict[str, Any]:
        return {
            "class_name": self.__class__.__name__,
            "init_params": {
                "load_in_4bit": True,
                "load_cpu_mem_usage": True,
                "hf_model_path": "AlberBshara/scholara_QA",
                "hf_auth_token": "--%$%--"
            },
            "methods": ["invoke", "extract_answer"]
        }


test_llm = LLM()
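
For example, a single QA call with a hypothetical context/question pair looks like this:

context = (
    "The mitochondrion is an organelle found in most eukaryotic cells. "
    "It produces most of the cell's supply of ATP, which the cell uses as chemical energy."
)
question = "What does the mitochondrion produce for the cell?"

response_text, response_ids, _ = test_llm.invoke(context, question)
print(test_llm.extract_answer(response_text))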