import json
import os
from pprint import pprint
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
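
# Expose only the first GPU and point at the fine-tuned PEFT adapter on the Hugging Face Hub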
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
PEFT_MODEL = 'deedax/falcon-7b-personal-assistant'

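# Read the adapter config to find its base model, and set up 4-bit NF4 quantization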
config = PeftConfig.from_pretrained(PEFT_MODEL)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16,
)

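# Load the quantized base model and let device_map spread the weights across available devices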
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict = True,
    quantization_config = bnb_config,
    device_map = 'auto',
    trust_remote_code = True,
)

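# Use the EOS token for padding, since the base tokenizer does not define a pad token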
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

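# Attach the trained adapter weights from the Hub on top of the quantized base model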
model = PeftModel.from_pretrained(model, PEFT_MODEL)
model.config.use_cache = False

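# Prompts are moved to this device; model weights are already placed by device_map = 'auto'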
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

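# Keep answers short and focused: cap new tokens and use a low temperature / top-p
# (sampling parameters only take effect when do_sample is enabled in the generation config)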
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.1
generation_config.top_p = 0.3
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

def generate_response(question: str) -> str:
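    '''Ask the fine-tuned model an interview question and return only the candidate's reply.'''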
    prompt = f'''
    Below is a conversation between an interviewer and a candidate, You are Dahiru Ibrahim, the candidate. 
    Your contact details are as follows
    github:https://github.com/Daheer
    youtube:https://www.youtube.com/@deedaxinc
    linkedin:https://linkedin.com/in/daheer-deedax
    huggingface:https://huggingface.co/deedax
    email:suhayrid6@gmail.com
    phone:+2348147116750
    Provide very SHORT, CONCISE, DIRECT and ACCURATE answers to the interview questions. 
    You do not respond as 'Interviewer' or pretend to be 'Interviewer'. You only respond ONCE as Candidate.
    Interviewer: {question}
    Candidate:
    '''.strip()
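    # Tokenize the prompt and generate a completion without tracking gradients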
    encoding = tokenizer(prompt, return_tensors = 'pt').to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids = encoding.input_ids,
            attention_mask = encoding.attention_mask,
            generation_config = generation_config,
        )

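        # Decode prompt + completion, then keep only the text after the 'Candidate:' marker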
        response = tokenizer.decode(outputs[0], skip_special_tokens = True)

        assistant_start = 'Candidate:'
        response_start = response.find(assistant_start)
        return response[response_start + len(assistant_start):].strip() 

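# Minimal Streamlit chat UI around generate_response()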
import streamlit as st
import random

st.title("💬 Deedax Chat (Falcon-7B-Instruct)")
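
# Seed the conversation with a greeting on first load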
if "messages" not in st.session_state:
    st.session_state["messages"] = [{"role": "assistant", "content": "Ask me anything about Dahiru!"}]

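# Replay the conversation history stored in session state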
for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])

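# On a new message: echo it, generate a reply, and store both turns in session state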
if prompt := st.chat_input():

    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)
    msg = {"role": "assistant", "content": generate_response(prompt)}
    st.session_state.messages.append(msg)
    st.chat_message("assistant").write(msg['content'])