import os
import time
from threading import Thread

import gradio as gr
import spaces
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig

# Hugging Face token used to download the model
token = os.environ["HF_TOKEN"]

# Sentence-embedding model used to encode incoming user queries
ST = SentenceTransformer("BM-K/KoSimCSE-roberta-multitask")

# Wikipedia dataset whose "embedded" revision ships a precomputed "embeddings" column
dataset = load_dataset("not-lain/wikipedia", revision="embedded")

data = dataset["train"]
# Build a FAISS index over the precomputed embeddings for nearest-neighbour search
data = data.add_faiss_index("embeddings")
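
# Note (assumption): retrieval is only meaningful if the query encoder above matches the
# model that produced the dataset's "embeddings" column (the resources listed in
# DESCRIPTION credit mxbai-embed-large-v1). If the models or embedding dimensions differ,
# get_nearest_examples may fail or return irrelevant passages.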

model_id = "mintaeng/small_fut_final"

# Load the model with 4-bit NF4 quantization so it fits in GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    token=token,
)
# Stop generation at either the regular EOS token or the Llama-3 end-of-turn token
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# System prompt (Korean): answer only from the provided context, do not invent content
# that is not in the context, and reply in Korean.
SYS_PROMPT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
제시하는 context에서만 대답하고 context에 없는 내용은 생성하지마.
Answer in Korean. 한국어로 대답하세요."""


def search(query: str, k: int = 3):
    """Embed a new query and return the k most similar documents from the FAISS index."""
    embedded_query = ST.encode(query)
    scores, retrieved_examples = data.get_nearest_examples(
        "embeddings", embedded_query,
        k=k
    )
    return scores, retrieved_examples
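
# Hypothetical usage sketch: get_nearest_examples returns a scores array plus a dict of
# dataset columns, each a list of length k, so for a query string `user_question`:
#   scores, docs = search(user_question, k=3)
#   docs["text"][0]  # text of the closest passage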


def format_prompt(prompt, retrieved_documents, k):
    """Combine the user question and the k retrieved documents into a single prompt."""
    PROMPT = f"Question:{prompt}\nContext:"
    for idx in range(k):
        PROMPT += f"{retrieved_documents['text'][idx]}\n"
    return PROMPT
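
# The resulting user message has the shape:
#   Question:<user question>
#   Context:<passage 1>
#   <passage 2>
#   ...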


@spaces.GPU(duration=150)
def talk(prompt, history):
    k = 1  # number of documents to retrieve
    scores, retrieved_documents = search(prompt, k)
    formatted_prompt = format_prompt(prompt, retrieved_documents, k)
    formatted_prompt = formatted_prompt[:2000]  # truncate long contexts to avoid OOM
    messages = [{"role": "system", "content": SYS_PROMPT}, {"role": "user", "content": formatted_prompt}]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    # Stream tokens back to the UI as they are generated
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        temperature=0.75,
        eos_token_id=terminators,
    )
    # Run generation in a background thread so partial outputs can be yielded immediately
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print(outputs)
        yield "".join(outputs)


TITLE = "# RAG"

DESCRIPTION = """
A RAG pipeline with a chatbot feature.

Resources used to build this project:

* embedding model : https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
* dataset : https://huggingface.co/datasets/not-lain/wikipedia
* faiss docs :
* chatbot :
* Full documentation : https://huggingface.co/blog/not-lain/rag-chatbot-using-llama3
"""


demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        likeable=True,
        layout="bubble",
        bubble_full_width=False,
    ),
    theme="soft",
    examples=[["풋살 경기 규칙이 뭐야?"]],  # "What are the rules of a futsal match?"
    title=TITLE,
    description=DESCRIPTION,
)
demo.launch(debug=True)