Michael-Geis
reorganized
7cc8002
import os
import pandas as pd
from storage import query_to_df
from cleaning import TextCleaner
from sentence_transformers import SentenceTransformer
def main(library_name, query, max_results, model_name):
    """Build an on-disk library of metadata and sentence embeddings for a query.

    A new directory ``./data/libraries/<library_name>`` is created and two
    Feather files are written into it:

    * ``metadata.feather``   -- the DataFrame returned by ``query_to_df``.
    * ``embeddings.feather`` -- the output of ``SentenceTransformer.encode``
      on the cleaned text, wrapped in a DataFrame.

    Args:
        library_name: Name of the sub-directory to create under
            ``./data/libraries``.
        query: Passed unchanged to ``query_to_df`` as ``query``.
        max_results: Passed unchanged to ``query_to_df`` as ``max_results``.
        model_name: Passed to ``SentenceTransformer`` as
            ``model_name_or_path``.

    Raises:
        FileExistsError: If the library directory already exists, so an
            existing library is never silently overwritten.
    """
    path_to_library = os.path.join("./data/libraries", library_name)
    ## Use makedirs rather than mkdir so that a missing "./data/libraries"
    ## parent is created as well. exist_ok is left at its default (False),
    ## so an already-existing library still raises FileExistsError.
    os.makedirs(path_to_library)

    ## Generate metadata from query and save
    metadata = query_to_df(query=query, max_results=max_results)
    metadata.to_feather(os.path.join(path_to_library, "metadata.feather"))

    ## Process and clean the data
    sentences = TextCleaner().transform(metadata)

    ## Generate and save embeddings
    model = SentenceTransformer(model_name_or_path=model_name)
    embeddings = model.encode(sentences=sentences, show_progress_bar=True)

    embeddings_df = pd.DataFrame(embeddings)
    ## pd.DataFrame assigns integer column labels by default; convert them to
    ## strings before writing, since the Feather writer expects string names.
    embeddings_df.columns = [str(col_name) for col_name in embeddings_df.columns]
    embeddings_df.to_feather(os.path.join(path_to_library, "embeddings.feather"))