"""Build a text corpus from the local PIRA and ONU datasets plus web search.

The PIRA and ONU corpora are read from disk once, when this module is
imported, and are kept in the module-level lists ``pira_corpus`` and
``onu_corpus``.
"""
import glob
import os

import pandas as pd

from .sourcer import search_web

root_dir = 'data/datasets'

# PIRA: the ``text`` column of the simplified CSV, as a plain list.
pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
pira_corpus = pira_df.text.to_list()

# ONU: the full contents of every ``.txt`` file in the ``onu`` directory.
txt_path = os.path.join(root_dir, 'onu')
# glob returns matches in arbitrary order; sort so the corpus order is
# reproducible across runs and machines.
filenames = sorted(glob.glob(os.path.join(txt_path, '*.txt')))

onu_corpus = []
for filename in filenames:
    # Explicit encoding: the default depends on the platform locale.
    # NOTE(review): assumes the ONU files are UTF-8 encoded - confirm.
    with open(filename, 'r', encoding='utf-8') as f:
        onu_corpus.append(f.read())


def gen_corpus(query: str, pira: bool = True, ONU: bool = True,
               web: bool = True) -> list:
    """Return the documents from the selected sources as one new list.

    Sources are concatenated in a fixed order: PIRA, then ONU, then the
    web search results. The module-level corpora are copied from, never
    modified.

    Args:
        query: Search string passed to ``search_web``; only used when
            ``web`` is true.
        pira: Include the entries of ``pira_corpus``.
        ONU: Include the entries of ``onu_corpus``.
        web: Include whatever ``search_web(query)`` yields (presumably a
            list of document texts - verify against ``sourcer``).

    Returns:
        A new list holding the items of every selected source.

    Raises:
        ValueError: If ``pira``, ``ONU`` and ``web`` are all false, since
            no source would be left to build a corpus from.
    """
    if not (pira or ONU or web):
        raise ValueError(
            'At least one of pira, ONU or web must be True to build a corpus'
        )

    corpus = []
    if pira:
        corpus.extend(pira_corpus)
    if ONU:
        corpus.extend(onu_corpus)
    if web:
        corpus.extend(search_web(query))
    return corpus