ArXivRecommenderSystem / library_class.py
Michael-Geis
created lib class, msc dict
283e21a
raw
history blame
971 Bytes
import util
import pandas as pd
import os
class Library:
    """Hold a raw library DataFrame and a cleaned copy derived from it.

    Attributes:
        raw_lib: DataFrame set by ``load_from_file`` or ``load_from_query``;
            ``None`` until one of them has been called.
        clean_lib: DataFrame set by ``clean_library``; ``None`` until then.
    """

    def __init__(self):
        # Define both attributes up front so that the "nothing loaded yet"
        # state is explicit instead of the attributes simply not existing.
        self.raw_lib = None
        self.clean_lib = None

    def load_from_file(self, library_name, data_dir='./data'):
        """Load the raw library from a parquet file.

        Args:
            library_name: File name of the parquet file inside ``data_dir``.
            data_dir: Directory containing the file. Defaults to ``'./data'``,
                which is resolved relative to the current working directory.
        """
        self.raw_lib = pd.read_parquet(os.path.join(data_dir, library_name))

    def load_from_query(self, query_string, max_results):
        """Load the raw library from the result of ``util.query_to_df``.

        Both arguments are forwarded unchanged to ``util.query_to_df``.
        NOTE(review): presumably this runs a search query and caps the number
        of rows at ``max_results`` -- confirm against ``util``.
        """
        self.raw_lib = util.query_to_df(query_string, max_results)

    def clean_library(self):
        """Build ``self.clean_lib`` from ``self.raw_lib``.

        The result is a copy of the columns ``title``, ``summary``,
        ``authors``, ``primary_category`` and ``categories`` in which:

        * ``title`` and ``summary`` have been passed through ``util.cleanse``;
        * ``hyph_in_summary`` / ``hyph_in_title`` hold ``util.find_hyph`` of
          the cleansed summary / title;
        * ``msc_tags`` holds ``util.msc_to_eng(util.find_msc(categories))``.

        ``self.raw_lib`` itself is not modified.

        Raises:
            AttributeError: If no library has been loaded yet. This is the
                exception type that was raised in that situation before the
                explicit check existed, so existing handlers keep working.
            KeyError: If ``raw_lib`` lacks one of the required columns
                (raised by pandas column selection).
        """
        if self.raw_lib is None:
            raise AttributeError(
                "raw_lib has not been loaded; call load_from_file() or "
                "load_from_query() before clean_library()"
            )

        # Keep only the columns the cleaned library is built from.
        cols = ['title', 'summary', 'authors', 'primary_category', 'categories']
        cleaned = self.raw_lib[cols].copy()

        # Cleanse the text columns first: the hyphen columns below are
        # computed from the cleansed text, not from the raw text.
        for text_col in ('title', 'summary'):
            cleaned[text_col] = cleaned[text_col].apply(util.cleanse)

        cleaned['hyph_in_summary'] = cleaned['summary'].apply(util.find_hyph)
        cleaned['hyph_in_title'] = cleaned['title'].apply(util.find_hyph)

        # Two-step mapping: extract MSC codes from the categories entry, then
        # translate them (presumably to English descriptions -- see util).
        msc_codes = cleaned['categories'].apply(util.find_msc)
        cleaned['msc_tags'] = msc_codes.apply(util.msc_to_eng)

        self.clean_lib = cleaned