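"""Gradio demo: a lightweight Q&A bot over Subspace Network documentation.

The script embeds the "subject" column of the gvozdev/subspace-info-v2 dataset
with a sentence-transformers MiniLM model, indexes the embeddings with FAISS,
and answers a question by returning the "details" of the closest matching entry.
"""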
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import os

# Work around SSL certificate errors when downloading from the Hub
# (an empty CURL_CA_BUNDLE effectively disables certificate verification in requests)
os.environ["CURL_CA_BUNDLE"] = ""

# Load dataset
issues_dataset = load_dataset("gvozdev/subspace-info-v2", split="train")

# Load tokenizer and model
model_ckpt = "sentence-transformers/all-MiniLM-L12-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)

# Text concatenation is not used here: embedding only the "subject" column
# returns better results than embedding subject and details together.
# def concatenate_text(examples):
#     return {
#         "text": examples["subject"]
#         + " \n "
#         + examples["details"]
#     }
# If concatenation is re-enabled, apply it with:
# issues_dataset = issues_dataset.map(concatenate_text)

# For larger models, embedding can be sped up by switching the device to "cuda" (GPU)
device = torch.device("cpu")
model.to(device)


# CLS pooling on model’s outputs: collect the last hidden state for the special [CLS] token
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]


# Tokenize a list of documents, place the tensors on the CPU/GPU, feed them to the model,
# and apply CLS pooling to the outputs
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Quick sanity check that the embedding function works
# embedding = get_embeddings(issues_dataset["details"][0])
# print(embedding.shape)


# Apply get_embeddings() to each row with Dataset.map() and store the result in a new "embeddings" column.
# The embeddings are converted to NumPy arrays, since Datasets requires that format for FAISS indexing.
embeddings_dataset = issues_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["subject"]).detach().cpu().numpy()[0]}
)

# Create a FAISS index
embeddings_dataset.add_faiss_index(column="embeddings")


# Return the "details" of the dataset entry whose subject is semantically closest to the question
def answer_question(question):
    # Get an embedding for the question
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    # Find the nearest neighbor in our dataset
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=1
    )

    samples_df = pd.DataFrame.from_dict(samples)

    # Sorting by score is only needed when k > 1
    # samples_df["scores"] = scores
    # samples_df.sort_values("scores", ascending=False, inplace=True)

    return samples_df["details"].values[0]


# Gradio interface
title = "Subspace Docs bot"
description = '<p style="text-align: center;">This is a bot trained on Subspace Network documentation ' \
              'to answer the most common questions about the project</p>'


# Gradio chat callback: answer the message and append the (message, response) pair to the history
def chat(message, history):
    history = history or []
    response = answer_question(message)
    history.append((message, response))
    return history, history


# Build the interface: a text box plus hidden state in, a chatbot component plus state out
iface = gr.Interface(
    chat,
    ["text", "state"],
    ["chatbot", "state"],
    allow_flagging="never",
    title=title,
    description=description,
    theme="Monochrome",
    examples=["What is Subspace Network?", "Do you have a token?", "System requirements"]
)

iface.launch(share=False)