File size: 4,625 Bytes
30ffb9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import json
import pandas as pd
from typing import List, Union, Dict
from loguru import logger
import pandas as pd
import pathlib


# Helper classes that support data preprocessing
class FileIO:
    '''
    Convenience class for saving and loading data in parquet and 
    json formats to/from disk.

    None of the methods read or write instance state, so a single
    instance can be reused for any number of files.
    '''

    def save_as_parquet(self, 
                        file_path: str, 
                        data: Union[List[dict], pd.DataFrame], 
                        overwrite: bool=False) -> None:
        '''
        Saves data to disk as a parquet file.  The index is not written.

        Args:
        -----
        file_path : str
            Output path to save file. If it does not end in ".parquet", any
            existing extension is replaced by ".parquet" (or ".parquet" is
            appended when there is none).
        data : Union[List[dict], pd.DataFrame]
            Data to save as parquet file. If data is a list of dicts, it will be
            converted to a DataFrame before saving.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.

        Raises:
        -------
        FileExistsError
            If the final output path already exists and overwrite is False.
        '''
        if isinstance(data, list):
            data = self._convert_toDataFrame(data)
        # Compare against the dotted suffix so that a name such as
        # "myparquet" is not mistaken for a file that already has the extension.
        if not file_path.endswith('.parquet'):
            file_path = self._rename_file_extension(file_path, 'parquet')
        self._check_file_path(file_path, overwrite=overwrite)
        data.to_parquet(file_path, index=False)
        logger.info(f'DataFrame saved as parquet file here: {file_path}')

    def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
        '''
        Builds a DataFrame from a list of dicts (one row per dict).
        '''
        return pd.DataFrame(data)

    def _rename_file_extension(self, file_path: str, extension: str) -> str:
        '''
        Returns file_path with its extension replaced by the given one.

        Args:
        -----
        file_path : str
            Path whose extension should be swapped. Only the extension of
            the last path component is affected.
        extension : str
            New extension, without the leading dot.
        '''
        prefix = os.path.splitext(file_path)[0]
        return f'{prefix}.{extension}'

    def _check_file_path(self, file_path: str, overwrite: bool) -> None:
        '''
        Prepares file_path for writing.

        - If the path exists and overwrite is False, raises FileExistsError.
        - If the path exists and overwrite is True, removes the existing file.
        - If the path does not exist, creates any missing parent directories.
        '''
        if os.path.exists(file_path):
            if not overwrite:
                raise FileExistsError(f'File by name {file_path} already exists, try using another file name or set overwrite to True.')
            os.remove(file_path)
            return
        # Take the parent from the path structure itself. Stripping the file
        # name by string replacement would also mangle any directory whose
        # name happens to contain the file name.
        pathlib.Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    def load_parquet(self, file_path: str, verbose: bool=True) -> List[dict]:
        '''
        Loads parquet from disk, converts to pandas DataFrame as intermediate
        step and outputs a list of dicts (docs).

        Args:
        -----
        file_path : str
            Path of the parquet file to read.
        verbose : bool
            If True, prints the shape and memory usage of the loaded data.

        Returns:
        --------
        List[dict]
            One dict per row, keyed by column name.
        '''
        df = pd.read_parquet(file_path)
        vector_labels = ['content_vector', 'image_vector', 'content_embedding']
        for label in vector_labels:
            if label in df.columns:
                # Cells exposing tolist() (e.g. numpy arrays) become plain
                # Python lists; anything else, such as a null cell, is left
                # untouched instead of raising AttributeError.
                df[label] = df[label].apply(
                    lambda x: x.tolist() if hasattr(x, 'tolist') else x
                )
        if verbose:
            memory_usage = round(df.memory_usage().sum()/(1024*1024),2)
            # df.shape yields the same tuple as df.values.shape without
            # materializing the whole frame as a single array.
            print(f'Shape of data: {df.shape}')
            print(f'Memory Usage: {memory_usage}+ MB')
        return df.to_dict('records')

    def load_json(self, file_path: str):
        '''
        Loads json file from disk and returns the decoded content.

        The file is read as UTF-8 so the result does not depend on the
        platform's default encoding.
        '''
        with open(file_path, encoding='utf-8') as f:
            return json.load(f)

    def save_as_json(self, 
                     file_path: str, 
                     data: Union[List[dict], dict], 
                     indent: int=4,
                     overwrite: bool=False
                     ) -> None:
        '''
        Saves data to disk as a json file. Data can be a list of dicts or a single dict.

        Args:
        -----
        file_path : str
            Output path to save file. If it does not end in ".json", any
            existing extension is replaced by ".json" (or ".json" is
            appended when there is none).
        data : Union[List[dict], dict]
            JSON-serializable data to write.
        indent : int
            Indentation passed through to json.dump.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.

        Raises:
        -------
        FileExistsError
            If the final output path already exists and overwrite is False.
        '''
        # Compare against the dotted suffix so that a name such as "myjson"
        # is not mistaken for a file that already has the extension.
        if not file_path.endswith('.json'):
            file_path = self._rename_file_extension(file_path, 'json')
        self._check_file_path(file_path, overwrite=overwrite)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=indent)
        logger.info(f'Data saved as json file here: {file_path}')

class Utilities: 
    '''
    Small stateless helper methods.
    '''

    def create_video_url(self, video_id: str, playlist_id: str):
        '''
        Builds the YouTube watch-page link for an episode within a playlist.

        Args:
        -----
        video_id : str
            YouTube id of the episode's video; used as the "v" query parameter.
        playlist_id : str
            YouTube id of the playlist; used as the "list" query parameter.

        Returns the assembled URL as a string. Both ids are inserted exactly
        as given, without any URL-encoding.
        '''
        query = f'v={video_id}&list={playlist_id}'
        return f'https://www.youtube.com/watch?{query}'