# deedax-chat / app.py
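#
# Streamlit chat app: serves Falcon-7B-Instruct loaded in 4-bit (bitsandbytes)
# with a LoRA adapter (PEFT) fine-tuned to answer interview questions as
# Dahiru Ibrahim.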

import os

import streamlit as st
import torch
from peft import PeftConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # expose only the first GPU

PEFT_MODEL = 'deedax/falcon-7b-personal-assistant'
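
# The adapter's config records which base model the LoRA was trained on.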
config = PeftConfig.from_pretrained(PEFT_MODEL)
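
# NF4 4-bit quantization with double quantization shrinks the 7B weights
# enough to fit on a single GPU.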
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)
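
# Falcon ships custom modeling code on the Hub, hence trust_remote_code=True.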
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token  # Falcon defines no pad token; reuse EOS

# Apply the LoRA adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, PEFT_MODEL)
model.config.use_cache = False  # left disabled, as in the original fine-tuning setup
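
# Note: Streamlit reruns this whole script on every interaction. In a deployed
# app, the loading above would typically be wrapped in st.cache_resource so
# the model is loaded only once.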
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
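
# Low temperature and top_p keep answers short and near-deterministic, in line
# with the prompt's instruction to be concise.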
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.1
generation_config.top_p = 0.3
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

def generate_response(question: str) -> str:
    """Build the interview prompt, generate, and return only the candidate's reply."""
    prompt = f'''
Below is a conversation between an interviewer and a candidate. You are Dahiru Ibrahim, the candidate.
Your contact details are as follows:
github: https://github.com/Daheer
youtube: https://www.youtube.com/@deedaxinc
linkedin: https://linkedin.com/in/daheer-deedax
huggingface: https://huggingface.co/deedax
email: suhayrid6@gmail.com
phone: +2348147116750
Provide very SHORT, CONCISE, DIRECT and ACCURATE answers to the interview questions.
You do not respond as 'Interviewer' or pretend to be 'Interviewer'. You only respond ONCE as Candidate.
Interviewer: {question}
Candidate:
'''.strip()
    encoding = tokenizer(prompt, return_tensors='pt').to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )
    # The decoded output echoes the prompt; keep only the text after 'Candidate:'.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_start = 'Candidate:'
    response_start = response.find(assistant_start)
    return response[response_start + len(assistant_start):].strip()

# --- Streamlit chat UI ---
st.title("💬 Deedax Chat (Falcon-7B-Instruct)")
if "messages" not in st.session_state:
st.session_state["messages"] = [{"role": "assistant", "content": "Ask me anything about Dahiru!"}]
for msg in st.session_state.messages:
st.chat_message(msg["role"]).write(msg["content"])
if prompt := st.chat_input():
st.session_state.messages = []
st.session_state.messages.append({"role": "user", "content": prompt})
st.chat_message("user").write(prompt)
msg = {'role': 'message', 'content': str(generate_response(prompt))}
st.session_state.messages.append(msg)
st.chat_message("assistant").write(msg['content'])