File size: 733 Bytes
e539b70
 
 
8883a1c
e539b70
 
8883a1c
e539b70
8883a1c
 
 
 
 
 
 
 
 
e539b70
 
 
 
 
 
 
8883a1c
e539b70
8883a1c
e539b70
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from .sourcer import search_web
import pandas as pd
import os
import glob

# Base directory that holds the local datasets loaded below.
root_dir = 'data/datasets'

# Pira dataset: the values of the CSV's `text` column, as a plain list.
pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
pira_corpus = pira_df.text.to_list()

# ONU dataset: one entry per `.txt` file found directly inside the `onu` folder.
txt_path = os.path.join(root_dir, 'onu')
filenames = glob.glob(txt_path + '/*.txt')


def _read_text(path):
    """Return the full contents of the text file at *path*."""
    # NOTE(review): no encoding is passed, so the platform default applies —
    # confirm the dataset files match it.
    with open(path, 'r') as handle:
        return handle.read()


# Entries follow the order in which glob returned the file names.
onu_corpus = [_read_text(path) for path in filenames]

def gen_corpus(query: str, pira: bool = True, ONU: bool = True, web: bool = True) -> list:
    """Assemble the corpus for *query* from the selected sources.

    Sources are concatenated in a fixed order: Pira, then ONU, then web.
    A new list is built on every call, so the module-level ``pira_corpus``
    and ``onu_corpus`` lists are never modified by this function.

    Args:
        query: Passed to ``search_web``; only used when ``web`` is true.
        pira: Include the entries of the module-level ``pira_corpus``.
        ONU: Include the entries of the module-level ``onu_corpus``.
        web: Include whatever ``search_web(query)`` returns.

    Returns:
        A list holding the entries of every selected source.

    Raises:
        ValueError: If no source is selected (all three flags are false).
    """
    # Fail loudly instead of silently handing back an empty corpus.
    if not (pira or ONU or web):
        raise ValueError(
            "gen_corpus needs at least one source: pira, ONU or web"
        )

    corpus = []
    if pira:
        corpus.extend(pira_corpus)
    if ONU:
        corpus.extend(onu_corpus)
    if web:
        # The web lookup is the only source that depends on the query.
        corpus.extend(search_web(query))

    return corpus