|
import gradio as gr |
|
from huggingface_hub import InferenceClient |
|
from huggingface_hub import login |
|
import re |
|
import pandas as pd |
|
from langchain.schema import Document |
|
from langchain.text_splitter import TokenTextSplitter |
|
from transformers import AutoTokenizer |
|
import copy |
|
from langchain_community.retrievers import BM25Retriever |
|
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint |
|
""" |
|
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference |
|
""" |
|
# Hosted chat model used both for rewriting questions and for answering them.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# Champion table; the 'Story' column is forced to str so that every row can be
# used as document text further down.
df1 = pd.read_csv("./data/champions_data_lol.csv").astype({"Story": str})
|
|
|
# Patterns are compiled once at import time; the function runs for every chunk
# and every query.
_BM25_PUNCTUATION = re.compile(r'([.,!?()"\'])')
_BM25_WHITESPACE = re.compile(r'\s+')
_BM25_ELLIPSIS_MARK = "_ELLIPSIS_"


def preprocess_for_bm25(text):
    """Normalize *text* into lowercase, whitespace-separated tokens for BM25.

    Each of the characters ``. , ! ? ( ) " '`` is surrounded by spaces so it
    becomes a token of its own, while a literal ``...`` is kept together as a
    single token. Runs of whitespace are collapsed to one space, the ends are
    trimmed, and the result is lowercased.
    """
    # Shield "..." behind a punctuation-free marker so the padding step below
    # does not split it into three separate dots.
    shielded = text.replace("...", f" {_BM25_ELLIPSIS_MARK} ")
    padded = _BM25_PUNCTUATION.sub(r' \1 ', shielded)
    restored = padded.replace(_BM25_ELLIPSIS_MARK, "...")
    return _BM25_WHITESPACE.sub(' ', restored).strip().lower()
|
|
|
"""Pre-processing""" |
|
|
|
# One Document per champion row: the story is the text to be retrieved, and the
# champion name and role travel along as metadata.
documents = [
    Document(
        page_content=row['Story'],
        metadata={
            'champion_name': row['Champion'],
            'role': row['Role'],
        },
    )
    for _, row in df1.iterrows()
]
|
|
|
"""Chunking""" |
|
|
|
|
|
# The chat model's tokenizer is used to measure chunk sizes in tokens.
EMBEDDING_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
tokenizer_name = EMBEDDING_MODEL_NAME

text_splitter = TokenTextSplitter.from_huggingface_tokenizer(
    tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
    chunk_size=150,
    chunk_overlap=15,
)

# `chunks` keeps the original text that is shown to the model; `chunks_bm25`
# is an independent copy whose text is normalized for lexical matching.
chunks = text_splitter.split_documents(documents)
chunks_bm25 = copy.deepcopy(chunks)

# Give both lists the same positional "index" so a BM25 hit can be mapped back
# to its un-normalized counterpart in `chunks`.
for position, (original_chunk, bm25_chunk) in enumerate(zip(chunks, chunks_bm25)):
    original_chunk.metadata["index"] = position
    bm25_chunk.metadata["index"] = position
    bm25_chunk.page_content = preprocess_for_bm25(bm25_chunk.page_content)

# Retriever: BM25 over the normalized chunks, returning the top 4 hits.
bm25_retriever = BM25Retriever.from_documents(chunks_bm25, k=4)
|
|
|
def retriever(query):
    """Return the original chunks that best match *query* according to BM25.

    The query is normalized with ``preprocess_for_bm25`` before being sent to
    ``bm25_retriever``. Each hit is then swapped for the entry of ``chunks``
    with the same ``metadata['index']``, so callers receive the unmodified
    text rather than the normalized copy that was indexed.
    """
    hits = bm25_retriever.invoke(preprocess_for_bm25(query))
    return [chunks[hit.metadata['index']] for hit in hits]
|
|
|
"""Chain""" |
|
|
|
|
|
|
|
|
|
from langchain_community.llms.huggingface_hub import HuggingFaceHub |
|
|
|
|
|
|
|
|
|
|
|
"""llm = HuggingFaceHub( |
|
repo_id="HuggingFaceH4/zephyr-7b-beta", |
|
#repo_id="google-bert/bert-base-uncased", |
|
model_kwargs={ |
|
"temperature": 0.1, |
|
"max_length": 5, |
|
"return_full_text": False |
|
} |
|
""" |
|
|
|
|
|
|
|
def ra(user_question):
    """Ask the chat model to fix grammar and clarity issues in a question.

    Args:
        user_question: The question exactly as typed by the user.

    Returns:
        The model's rewritten question. If the model returns no choices or
        only empty/whitespace text, ``user_question`` is returned unchanged so
        callers always receive a usable query.
    """
    messages_q = [
        {"role": "system", "content": "You are familiar with League of Legends lore. You help correct grammar and clarity without giving additional explanations."},
        {"role": "user", "content": f"Fix any grammar or clarity issues in the following question. Only return the corrected question itself.\n\n{user_question}"},
    ]
    print(messages_q)

    res = client.chat_completion(
        messages_q,
        max_tokens=30,
        stream=False,
        temperature=0.1,
        # Generation is cut at the first "(" so that a parenthesized remark
        # after the rewritten question is not included.
        stop=['('],
    )

    choices = res["choices"]
    if not choices:
        return user_question

    rewritten = choices[0]["message"]["content"]
    # Generation can end immediately (e.g. on the stop sequence), leaving
    # nothing to search with; fall back to the user's own wording.
    if not rewritten or not rewritten.strip():
        return user_question
    return rewritten
|
|
|
|
|
|
|
"""-------------------------------------------------------------------""" |
|
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens=200,
    temperature=0.1,
    top_p=None,
):
    """Stream an answer to a League of Legends lore question.

    The question is first rewritten by ``ra``, then used to fetch context
    chunks with ``retriever``; the context and question are sent to the chat
    model together with the earlier conversation turns.

    Args:
        message: The user's latest message.
        history: Earlier ``(user, assistant)`` turns of the conversation.
        max_tokens: Maximum number of tokens to generate. Defaults to the
            previously hard-coded 200.
        temperature: Sampling temperature. Defaults to the previously
            hard-coded 0.1.
        top_p: Optional nucleus-sampling value; ``None`` leaves it unset.

    Yields:
        The answer accumulated so far, once per streamed chunk.

    The three sampling parameters have defaults because
    ``gr.ChatInterface(respond)`` is created without additional inputs and so
    calls this function with ``message`` and ``history`` only.
    """
    new_query = ra(message) or ""
    print("old: ", new_query)

    # `ra` stops generation at "("; drop one if it was left at the end.
    if new_query.endswith("("):
        new_query = new_query[:-1]
    # An empty rewrite would give the retriever nothing to match on.
    if not new_query.strip():
        new_query = message
    print("new: ", new_query)

    # NOTE(review): this detailed prompt is only printed; the request below
    # sends the shorter system prompt. Confirm which one is intended.
    system_message = """
You are an expert in League of Legends (LoL) lore. You will only answer questions related to the champions and their stories within the game.=
Instructions:
1. Only use the context provided below to answer the question. Reference the context directly for accuracy.
2. If the question is outside the scope of League of Legends lore, respond: "Please ask something related to League of Legends lore."
3. If the provided context does not provide a clear answer, respond: "I'm unsure based on the provided context."

"""
    print(system_message)

    messages = [{"role": "system", "content": "You are an expert in League of Legends (LoL) lore. You will only answer questions related to the champions and their stories within the game."}]

    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    # Retrieve once and reuse the same prompt for the request and the log.
    prompt = "Context: " + str(retriever(new_query)) + "\n\nQuestion: " + new_query + "\n\nAnswer: "
    messages.append({"role": "user", "content": prompt})
    print(prompt)

    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Skip chunks that carry no choices at all.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # A chunk may carry no text (content is None or empty); appending
        # None to a str would raise TypeError.
        if token:
            response += token
        yield response
|
|
|
|
|
""" |
|
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface |
|
""" |
|
# Introductory text rendered above the chat widget.
_INTRO_MARKDOWN = """
# League of Legends Lore Chatbot
Welcome to the **LoL Lore Chatbot**! 🏆
Here, you can ask questions about League of Legends champions and their stories.

**Example Question:**
*Why does Kayn have different forms?*
"""

# Page layout: the introduction followed by a chat interface driven by `respond`.
with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    chat = gr.ChatInterface(respond)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|