autosumm / corpora /corpora.py
mhsvieira's picture
Add ONU PDFs
8883a1c
from .sourcer import search_web
import pandas as pd
import os
import glob
# Folder holding the bundled datasets. The path is relative, so it is
# resolved against the process's current working directory.
root_dir = 'data/datasets'

# PIRA corpus: the `text` column of pira_simplified.csv, one entry per row.
pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
pira_corpus = pira_df.text.to_list()

# ONU corpus: the full contents of every .txt file directly under
# <root_dir>/onu, one list entry per file.
txt_path = os.path.join(root_dir, 'onu')
# glob returns matches in arbitrary (filesystem) order; sorting keeps the
# order of onu_corpus reproducible from run to run and machine to machine.
filenames = sorted(glob.glob(txt_path + '/*.txt'))
onu_corpus = []
for filename in filenames:
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the extracted .txt files are UTF-8 -- confirm.
    with open(filename, 'r', encoding='utf-8') as f:
        onu_corpus.append(f.read())
def gen_corpus(query: str, pira: bool = True, ONU: bool = True, web: bool = True) -> list:
    """Assemble a corpus from the selected sources.

    Sources are concatenated in a fixed order: PIRA, then ONU, then web.
    A fresh list is built on every call, so the module-level corpora are
    never mutated.

    Args:
        query: Passed to ``search_web``; only used when ``web`` is true.
        pira: Include the entries of the module-level ``pira_corpus``.
        ONU: Include the entries of the module-level ``onu_corpus``.
        web: Include whatever ``search_web(query)`` returns (presumably an
            iterable of documents -- verify against ``sourcer.search_web``).

    Returns:
        A new list with the items of every selected source.

    Raises:
        ValueError: If no source is selected at all.
    """
    # Fail fast instead of silently handing back an empty corpus.
    if not (pira or ONU or web):
        raise ValueError(
            "gen_corpus needs at least one source: pira, ONU or web"
        )

    corpus = []
    if pira:
        corpus.extend(pira_corpus)
    if ONU:
        corpus.extend(onu_corpus)
    if web:
        corpus.extend(search_web(query))
    return corpus