File size: 1,625 Bytes
b352653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch

# Checkpoint name shared by the tokenizer and the encoder model.
model_ckpt = "BAAI/bge-large-en-v1.5"

# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model.to(device)

def cls_pooling(model_output):
    """Return the hidden state at position 0 of the second axis.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing (presumably laid out as
            batch x sequence x hidden -- confirm against the model used).

    Returns:
        ``last_hidden_state[:, 0]``, i.e. one vector per item along the
        first axis.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

def get_embeddings(text_list):
    """Embed a batch of texts with the module-level tokenizer and model.

    Args:
        text_list: Texts to embed. They are tokenized together as one batch
            with padding and truncation enabled.

    Returns:
        The output of ``cls_pooling`` applied to the model output: a tensor
        located on ``device`` that carries no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output tensor to the device the model lives on.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disable autograd so no computation graph is built or
    # retained for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)


# Load the dataset from the Hub and keep only its "train" split.
embeddings_doc_dataset = load_dataset("fashxp/pimcore-docs-embeddings-gpe")["train"]
# Build a FAISS index over the "embeddings" column for nearest-neighbour lookup.
embeddings_doc_dataset.add_faiss_index(column="embeddings")

import pandas as pd

def find_in_docs(question):
    """Return the indexed documents nearest to *question* as formatted text.

    Args:
        question: Query text; it is embedded with ``get_embeddings`` and
            looked up in the FAISS index on the "embeddings" column.

    Returns:
        A string with one entry per hit (up to 10), each showing the heading,
        score and URL followed by a line of 50 ``=`` characters. Entries are
        ordered closest match first. Empty string when there are no hits.
    """
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    scores, samples = embeddings_doc_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=10
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    # The index is created with the default metric (L2 distance), so a
    # smaller score means a closer match: sort ascending to list the best
    # hit first.
    samples_df.sort_values("scores", ascending=True, inplace=True)

    separator = "=" * 50
    return "".join(
        f"HEADING: {row.heading}\n"
        f"SCORE: {row.scores}\n"
        f"URL: {row.url}\n"
        f"{separator}\n\n"
        for _, row in samples_df.iterrows()
    )



import gradio as gr

# Text-in / text-out UI around the documentation search function.
demo = gr.Interface(
    fn=find_in_docs,
    inputs="text",
    outputs="text",
)

# share=True asks Gradio to also create a public share link.
demo.launch(share=True)