# Hugging Face Space (status: Running)
import pandas as pd
import pinecone
import torch
from datasets import load_dataset
from sentence_transformers import InputExample, SentenceTransformer, losses, util
from torch.utils.data import DataLoader
# Build and persist the search corpus: the issue titles from the
# "gopikrsmscs/torch-issues" dataset together with their sentence embeddings.
embedder = SentenceTransformer('all-mpnet-base-v2')

dataset = load_dataset("gopikrsmscs/torch-issues")
columns = dataset['train'].column_names
columns_to_keep = ["Title", "Body"]
# Drop every column that is not explicitly kept. A plain difference (rather
# than a symmetric difference) never asks to remove a kept column that is
# absent from the dataset, and a list matches the documented argument type
# of remove_columns while preserving the dataset's column order.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
dataset = dataset.remove_columns(columns_to_remove)
dataset_dict = dataset['train']

# Only the titles are embedded; "Body" stays in the dataset but is not used
# by this script.
examples = [row['Title'] for row in dataset_dict]

# NOTE: despite the ".txt" suffix this file is written by torch.save (a
# pickle-based format), so it has to be read back with torch.load rather
# than as plain text. The path is kept unchanged for existing readers.
file_paths = 'saved_corpus.txt'
torch.save(examples, file_paths)

# Encode all titles in one call; the result is a tensor whose rows follow
# the order of `examples`.
corpus_embeddings = embedder.encode(examples, convert_to_tensor=True)
file_path = 'saved_corpus.pt'
# Save the tensor to the file
torch.save(corpus_embeddings, file_path)