# Hugging Face Space: JPBianchi/vectorsearch (status: Running)
# File size: 7,069 Bytes

from modal import App, Volume, Image, Mount

from typing import List, Dict, Tuple, Union, Callable
# from preprocessing import FileIO

# assets = modal.Mount.from_local_dir(
#     "./data",
#     # condition=lambda pth: not ".venv" in pth,
#     remote_path="./data",
# )

# Modal application under which the functions of this file are registered.
app = App("vector-search-project")

# Container image: Debian slim plus the pinned embedding / finetuning packages.
vector_search = (
    Image.debian_slim()
    .pip_install(
        "sentence_transformers==2.2.2",
        "llama_index==0.9.6.post1",
        "angle_emb==0.1.5",
    )
)

# Named Modal volume used to store models.
# It must be created manually beforehand with the CLI:
#   modal volume create vector-search-volume
vol = Volume.from_name("vector-search-volume")


@app.function(image=vector_search,
              gpu="A100",
              timeout=600,
              volumes={"/root/models": vol}
              # secrets are available in the environment with os.environ["SECRET_NAME"]
              # secret=modal.Secret.from_name("my-huggingface-secret")
              )
def encode_content_splits(content_splits,
                          model=None,  # path or name of model
                          **kwargs
                          ):
    """Embed text chunks and pair every chunk with its embedding.

    Args:
        content_splits: list of episodes, each episode being a list of text
            chunks. An episode given as a bare string is treated as a
            one-chunk episode.
        model: either a model name / path (``str``) or an already
            instantiated model object exposing ``encode``. For strings, any
            trailing ``/`` is dropped and only the last path component is
            kept; that name is looked up in ``/root/models`` and, when it is
            not found there, handed to ``SentenceTransformer`` as-is.
        **kwargs: forwarded to ``model.encode`` in case the encode method
            has extra arguments. The key ``save`` is never forwarded.

    Returns:
        A list with one entry per episode; each entry is a list of
        ``(chunk, embedding)`` tuples. For the finetuned UAE-Large-V1-300
        model a nested input is flattened first, so every chunk becomes its
        own one-element entry.

    Raises:
        ValueError: if ``model`` is ``None`` or resolves to an empty name.
    """
    import os
    import time

    from sentence_transformers import SentenceTransformer

    models_dir = '/root/models'
    models_list = os.listdir(models_dir)
    print("Models:", models_list)

    # 'save' is a control flag of this function, not an encode() argument:
    # always strip it so it can never leak into model.encode(**kwargs).
    save_name = kwargs.pop('save', None)

    if model is None:
        # Fail early with a clear message instead of crashing later on
        # `None.encode(...)`.
        raise ValueError("A model name, model path or model object must be provided")

    if isinstance(model, str):
        # NOTE(review): keeping only the last path component also drops the
        # organisation prefix of Hub ids such as 'org/name' - confirm that
        # this is intended for the HuggingFace fallback below.
        model_name = model.rstrip('/').split('/')[-1]
        if not model_name:
            raise ValueError(f"Invalid model name: {model!r}")

        if model_name not in models_list:
            print(f"Model {model_name} not found in Modal volume, loading from HuggingFace")
            model = SentenceTransformer(model_name)

        elif "UAE-Large-V1-300" in model_name:
            print("Loading finetuned UAE-Large-V1-300 model from Modal Volume")

            from angle_emb import AnglE
            model = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1',
                                          pretrained_model_path=os.path.join(models_dir, model_name),
                                          pooling_strategy='cls').cuda()
            kwargs['to_numpy'] = True

            # this model doesn't accept list of lists
            if content_splits and isinstance(content_splits[0], list):
                content_splits = [chunk for episode in content_splits for chunk in episode]

        else:
            print(f"Loading model {model_name} from Modal volume")
            model = SentenceTransformer(os.path.join(models_dir, model_name))

    else:
        print("Using model provided as argument")
        if isinstance(save_name, str):
            fname = os.path.join(models_dir, save_name.rstrip('/').split('/')[-1])
            # Persisting the model is disabled (the `model.save(fname)` call
            # was commented out), so do not claim that anything was written.
            print(f"Model saving is disabled, nothing written to {fname}")

    print("Starting encoding")
    start = time.perf_counter()

    emb = []
    for episode in content_splits:
        # A bare string must be wrapped: zipping a string with its embedding
        # would otherwise pair single characters with vectors.
        chunks = [episode] if isinstance(episode, str) else episode
        emb.append(list(zip(chunks, model.encode(chunks, **kwargs))))

    end = time.perf_counter() - start
    print(f"GPU processing lasted {end:.2f} seconds")
    print("Encoding finished")

    return emb


@app.function(image=vector_search, gpu="A100", timeout=240,
              mounts=[Mount.from_local_dir("./data",
                                           remote_path="/root/data",
                                           condition=lambda pth: ".json" in pth)],
              volumes={"/root/models": vol}
)
def finetune(training_path='./data/training_data_300.json',
             valid_path='./data/validation_data_100.json',
             model_id=None,
             ignore_existing=False):
    """Finetune a sentence-transformers model and return it as a zip archive.

    The finetuned model is written to
    ``/root/models/finetuned-<model name>-<number of training queries>``,
    the volume is committed so the model persists, and the output folder is
    zipped in memory.

    Args:
        training_path: JSON file loaded with
            ``EmbeddingQAFinetuneDataset.from_json`` as training set.
        valid_path: JSON file loaded the same way as validation set.
        model_id: id of the base model to finetune; a trailing ``/`` is
            ignored and the last path component names the output folder.
        ignore_existing: currently unused.

    Returns:
        ``None`` when no model id is given, an ``io.BytesIO`` positioned at
        offset 0 holding the zipped model folder on success, or the string
        ``"Finetuning failed"`` when finetuning, committing or zipping
        raised an exception.
    """
    import glob
    import io
    import os
    import time
    import traceback
    import zipfile

    print("Data:", os.listdir('/root/data'))
    print("Models:", os.listdir('/root/models'))

    if isinstance(model_id, str):
        model_id = model_id.rstrip("/")
    if not model_id:
        print("No model ID provided")
        return None

    from llama_index.finetuning import EmbeddingQAFinetuneDataset

    training_set = EmbeddingQAFinetuneDataset.from_json(training_path)
    valid_set = EmbeddingQAFinetuneDataset.from_json(valid_path)
    print("Datasets loaded")

    num_training_examples = len(training_set.queries)
    print(f"Training examples: {num_training_examples}")

    from llama_index.finetuning import SentenceTransformersFinetuneEngine

    print(f"Model Name is {model_id}")
    # Last path component: works for 'org/name' ids as well as for ids
    # without any '/' (index 1 raised IndexError for those).
    model_ext = model_id.split('/')[-1]

    ft_model_name = f'finetuned-{model_ext}-{num_training_examples}'
    model_outpath = os.path.join("/root/models", ft_model_name)

    print(f'Model ID: {model_id}')
    print(f'Model Outpath: {model_outpath}')

    finetune_engine = SentenceTransformersFinetuneEngine(
        training_set,
        batch_size=32,
        model_id=model_id,
        model_output_path=model_outpath,
        val_dataset=valid_set,
        epochs=10
    )
    try:
        start = time.perf_counter()
        finetune_engine.finetune()
        end = time.perf_counter() - start
        print(f"GPU processing lasted {end:.2f} seconds")

        print(os.listdir('/root/models'))
        # Persist changes, i.e. the finetuned model. The commit must go
        # through the Volume handle mounted at /root/models; `app` has no
        # `volume` attribute, so the previous call failed after training.
        vol.commit()

        # TODO SHARE THE MODEL ON HUGGINGFACE
        # https://huggingface.co/docs/transformers/v4.15.0/model_sharing

        # Zip the contents of the output folder into an in-memory buffer.
        bytes_buffer = io.BytesIO()
        with zipfile.ZipFile(bytes_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file_path in glob.glob(model_outpath + "/**", recursive=True):
                print(f"Processed file {file_path}")
                zip_file.write(file_path, os.path.relpath(file_path, start=model_outpath))

        # Move the pointer to the start of the buffer so the caller can
        # read or write out the archive directly.
        bytes_buffer.seek(0)
        return bytes_buffer
    except Exception:
        # Keep the best-effort return value, but surface the cause in the
        # logs instead of swallowing it silently.
        traceback.print_exc()
        return "Finetuning failed"
    
    
@app.local_entrypoint()
def test_method(content_splits=None, model="all-MiniLM-L6-v2"):
    """Smoke test: encode a tiny input remotely and return the result.

    Args:
        content_splits: list of episodes (lists of text chunks) to encode;
            defaults to ``[["a"]]``.
        model: model name or path forwarded to ``encode_content_splits``.
            Without a model the remote function has nothing to encode with.

    Returns:
        Whatever ``encode_content_splits`` returns for the given input.
    """
    if content_splits is None:
        # Built per call rather than as a shared mutable default argument.
        content_splits = [["a"]]
    output = encode_content_splits.remote(content_splits, model=model)
    return output
  
# deploy it with
# modal token set --token-id ak-xxxxxx --token-secret as-xxxxx # given when we create a new token
# modal deploy podcast/1/backend.py
# View Deployment: https://modal.com/apps/jpbianchi/falcon_hackaton-project <<< use this project name