Upload 8 files
Browse files
- .gitattributes +35 -35
- app.py +40 -0
- app2.py +16 -0
- doc_faiss_search.py +49 -0
- doc_faiss_train.py +105 -0
- faiss_test.py +50 -0
- faiss_train.py +99 -0
- requirements.txt +5 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
All 35 Git LFS rules were removed and re-added with identical visible content (a whitespace/line-ending-only change):
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,40 @@
import csv
import xml.etree.ElementTree as ET
import glob, os

rootFolder = "c:/317"

file = open(rootFolder + "/result.csv", "w", encoding="utf-8")
file.write("prompt,text,rejected_text\n")

def parseXML(xmlFile):
    # Turn the file name into a prompt, e.g.
    # "Using_WinRT_Viewer.Search_Panel.xml" -> "WinRT Viewer Search Panel".
    prompt = xmlFile.replace("Using_", "").replace(".xml", "").replace(".", " ").replace("_", " ")
    text = ""

    try:
        tree = ET.parse(rootFolder + "/" + xmlFile)
        root = tree.getroot()

        # Concatenate every <text> element in the document.
        for item in root.findall(".//text"):
            text += item.text or ""

        # Skip documents that contain Cyrillic text (crude check for "а").
        if text.find("а") == -1:
            file.write(prompt + "," + text.replace(",", " ") + "\n")

    except Exception as error:
        print(error)

os.chdir(rootFolder)
for xmlFile in glob.glob("*.xml"):
    print(xmlFile)
    parseXML(xmlFile)

## parseXML('Using_WinRT_Viewer.Search_Panel.xml')

file.close()
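Since app.py strips commas out of the text to keep its hand-assembled CSV parseable, and the csv import above is currently unused, a minimal sketch of the same write path with csv.writer (same columns, quoting handled automatically) could preserve them:

import csv

with open(rootFolder + "/result.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["prompt", "text", "rejected_text"])
    # csv.writer quotes fields as needed, so commas inside `text` survive.
    writer.writerow([prompt, text, ""])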
app2.py
ADDED
@@ -0,0 +1,16 @@
import torch

device = torch.device("cuda")

tenz = torch.tensor([1., 2.], device=device)
# Note: tensors move between devices with Tensor.to(device); there is no toDevice method.

print(torch.cuda.is_available())

from datasets import Dataset

dataset = Dataset.from_dict({"a": [0, 1, 2]})
# Duplicating every row needs batched mode: with batched=True, batch["a"] is a
# list, so `batch["a"] * 2` repeats it, and remove_columns avoids a length
# mismatch between the new 6-row "b" column and the original 3-row "a".
dataset_with_duplicates = dataset.map(
    lambda batch: {"b": batch["a"] * 2}, remove_columns=["a"], batched=True
)
print(dataset_with_duplicates.shape)
print(len(dataset_with_duplicates))
print(dataset_with_duplicates[:])
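The script assumes CUDA is present before it ever checks torch.cuda.is_available(); a common guard (not in the original) picks the device from that check:

import torch

# Fall back to CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tenz = torch.tensor([1., 2.], device=device)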
doc_faiss_search.py
ADDED
@@ -0,0 +1,49 @@
from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

model_ckpt = "nomic-ai/nomic-embed-text-v1.5"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)

device = torch.device("cpu")
model.to(device)

def cls_pooling(model_output):
    # Use the hidden state of the first ([CLS]) token as the sentence embedding.
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Load the dataset and the FAISS index produced by doc_faiss_train.py.
embeddings_dataset = Dataset.load_from_disk("dataset/embeddings")
embeddings_dataset.load_faiss_index("embeddings", "index/embeddings")

question = "Download license key"
question_embedding = get_embeddings([question]).cpu().detach().numpy()

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=True, inplace=True)

for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.text}")
    print(f"SCORE: {row.scores}")
    print(f"PROMPT: {row.prompt}")
    print("=" * 50)
    print()
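A caveat: these scripts use CLS pooling, while the nomic-embed-text-v1.5 model card, as far as I recall, recommends task prefixes plus mean pooling with L2 normalization. A sketch of that variant (the mean_pooling helper is ours; the prefix strings come from the model card):

import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, ignoring padding positions.
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

def get_query_embedding(question):
    # Queries get the "search_query: " prefix; indexed documents would use
    # "search_document: " when the index is built.
    encoded = tokenizer(["search_query: " + question], padding=True,
                        truncation=True, return_tensors="pt")
    encoded = {k: v.to(device) for k, v in encoded.items()}
    output = model(**encoded)
    return F.normalize(mean_pooling(output, encoded["attention_mask"]), p=2, dim=1)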
doc_faiss_train.py
ADDED
@@ -0,0 +1,105 @@
from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os

rootFolder = "c:/317"
file = open(rootFolder + "/result.csv", "w", encoding="utf-8")

def parseXML(xmlFile):
    # Turn the file name into a prompt, e.g.
    # "Using_WinRT_Viewer.Search_Panel.xml" -> "WinRT Viewer Search Panel".
    prompt = xmlFile.replace("Using_", "").replace(".xml", "").replace(".", " ").replace("_", " ")
    text = ""

    try:
        tree = ET.parse(rootFolder + "/" + xmlFile)
        root = tree.getroot()

        # Concatenate every <text> element in the document.
        for item in root.findall(".//text"):
            text += (item.text or "") + " "

        # Cap each document at 500 characters.
        if len(text) > 500:
            text = text[:500]

        # Skip documents that contain Cyrillic text (crude check for "а").
        if text.find("а") == -1:
            file.write(text + "\n")
            return {"text": text, "prompt": prompt}
        else:
            return None

    except Exception as error:
        print(error)


def generator():
    for xmlFile in glob.glob("*.xml", root_dir=rootFolder):
        print(xmlFile)
        data = parseXML(xmlFile)
        if data is not None:
            yield data


ds = Dataset.from_generator(generator)

file.close()

##########################################################

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
#model_ckpt = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
#model_ckpt = "sentence-transformers/msmarco-bert-base-dot-v5"
model_ckpt = "nomic-ai/nomic-embed-text-v1.5"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)

device = torch.device("cuda")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Embed every document and store the vector in an "embeddings" column.
embeddings_dataset = ds.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

embeddings_dataset.save_to_disk("dataset/embeddings")

embeddings_dataset = Dataset.load_from_disk("dataset/embeddings")

# The FAISS index is persisted separately from the dataset
# (save_to_disk does not serialize attached indexes).
embeddings_dataset.add_faiss_index(column="embeddings")

embeddings_dataset.save_faiss_index("embeddings", "index/embeddings")

question = "Download license key"

question_embedding = get_embeddings([question]).cpu().detach().numpy()

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=True, inplace=True)

for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.text}")
    print(f"SCORE: {row.scores}")
    print(f"PROMPT: {row.prompt}")
    print("=" * 50)
    print()
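Mapping one row at a time leaves the GPU underused; get_embeddings already accepts a list, so a batched variant of the map above (batch size 32 is an arbitrary choice) is a natural sketch:

embeddings_dataset = ds.map(
    lambda batch: {
        # (batch_size, hidden_dim) array; one row per document in the batch.
        "embeddings": get_embeddings(batch["text"]).detach().cpu().numpy()
    },
    batched=True,
    batch_size=32,
)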
faiss_test.py
ADDED
@@ -0,0 +1,50 @@
from datasets import load_dataset, load_from_disk, Dataset
import os
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import faiss

########################

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Expects the embeddings dataset produced by faiss_train.py with the same
# checkpoint: query vectors and index vectors must come from the same model.
embeddings_dataset = load_from_disk("dataset/embeddings")

embeddings_dataset.add_faiss_index(column="embeddings")

question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()
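faiss_test.py rebuilds its index on every run; datasets can persist and reload it instead, as the doc_* scripts do (the index path below is a hypothetical example):

# Build once, persist, then reload in later runs:
embeddings_dataset.add_faiss_index(column="embeddings")
embeddings_dataset.save_faiss_index("embeddings", "index/github")
# On later runs, skip add_faiss_index and just reload:
embeddings_dataset.load_faiss_index("embeddings", "index/github")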
faiss_train.py
ADDED
@@ -0,0 +1,99 @@
from datasets import load_dataset, load_from_disk, Dataset
import os
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

datasetPath = "dataset/github.ds"

# Cache the GitHub issues dataset locally on first run.
if os.path.exists(datasetPath):
    issues_dataset = load_from_disk(datasetPath)
else:
    issues_dataset = load_dataset("lewtun/github-issues", split="train")
    issues_dataset.save_to_disk(datasetPath)

# Keep only real issues (not pull requests) that have at least one comment.
issues_dataset = issues_dataset.filter(
    lambda x: not x["is_pull_request"] and len(x["comments"]) > 0
)

columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)

# Explode the comment lists so each row holds a single comment.
issues_dataset.set_format("pandas")
df = issues_dataset[:]

comments_df = df.explode("comments", ignore_index=True)

comments_dataset = Dataset.from_pandas(comments_df)

comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

# Drop very short comments.
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)

def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }

comments_dataset = comments_dataset.map(concatenate_text)

########################

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Smoke-test the pipeline on a single document.
embedding = get_embeddings(comments_dataset["text"][0])

embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

embeddings_dataset.add_faiss_index(column="embeddings")

# embeddings_dataset.save_to_disk("dataset/embeddings")

question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()


print(issues_dataset)
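For reference, the same nearest-neighbour lookup can be sketched against faiss directly, assuming an exact inner-product index to match the dot-score embedding model used above:

import faiss
import numpy as np

# One row per document, float32 as faiss requires.
vectors = np.asarray(embeddings_dataset["embeddings"], dtype="float32")
index = faiss.IndexFlatIP(vectors.shape[1])  # exact inner-product search
index.add(vectors)
scores, ids = index.search(question_embedding.astype("float32"), 5)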
requirements.txt
ADDED
@@ -0,0 +1,5 @@
datasets
transformers
torch
pandas
faiss-cpu  # required for add_faiss_index; swap in faiss-gpu on CUDA machines