File size: 971 Bytes
283e21a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import util
import pandas as pd
import os

class Library(object):
    """Holds a library of records as a raw DataFrame plus a cleaned copy.

    ``raw_lib`` is assigned by ``load_from_file`` or ``load_from_query``;
    ``clean_lib`` is assigned by ``clean_library``. Neither attribute exists
    until the corresponding method has been called.
    """

    def load_from_file(self, library_name):
        """Read a parquet file into ``self.raw_lib``.

        Args:
            library_name: File name, joined onto the relative ``./data``
                directory (so it resolves against the current working
                directory).
        """
        parquet_path = os.path.join('./data', library_name)
        self.raw_lib = pd.read_parquet(parquet_path)

    def load_from_query(self, query_string, max_results):
        """Store the result of ``util.query_to_df`` in ``self.raw_lib``.

        Args:
            query_string: Passed through unchanged to ``util.query_to_df``.
            max_results: Passed through unchanged to ``util.query_to_df``.
        """
        self.raw_lib = util.query_to_df(query_string, max_results)

    def clean_library(self):
        """Build ``self.clean_lib`` from ``self.raw_lib``.

        Keeps a fixed subset of columns, runs ``util.cleanse`` over the
        ``title`` and ``summary`` text, and appends three derived columns:
        ``hyph_in_summary``, ``hyph_in_title`` and ``msc_tags``.
        ``self.raw_lib`` itself is left untouched because the work is done
        on a copy.

        Raises:
            AttributeError: If no ``load_from_*`` method has set ``raw_lib``.
            KeyError: If ``raw_lib`` lacks any of the required columns.
        """
        # Only these columns survive; everything else in raw_lib is discarded.
        kept_columns = [
            'title',
            'summary',
            'authors',
            'primary_category',
            'categories',
        ]
        cleaned = self.raw_lib[kept_columns].copy()

        # Normalise the free-text columns in place (title first, then summary).
        for text_column in ('title', 'summary'):
            cleaned[text_column] = cleaned[text_column].apply(util.cleanse)

        # Derived columns computed from the already-cleansed text
        # (summary first, then title, which fixes the output column order).
        hyphen_targets = (
            ('hyph_in_summary', 'summary'),
            ('hyph_in_title', 'title'),
        )
        for derived_column, source_column in hyphen_targets:
            cleaned[derived_column] = cleaned[source_column].apply(util.find_hyph)

        # Two-step mapping of the categories column: find_msc, then msc_to_eng.
        # NOTE(review): presumably MSC codes translated to English labels --
        # confirm against util.
        msc_found = cleaned['categories'].apply(util.find_msc)
        cleaned['msc_tags'] = msc_found.apply(util.msc_to_eng)

        self.clean_lib = cleaned