File size: 1,342 Bytes
780c913
 
fe19632
 
419f9af
 
 
 
fe19632
 
419f9af
780c913
 
 
 
 
fe19632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419f9af
fe19632
 
 
 
 
 
 
419f9af
 
 
fe19632
 
 
419f9af
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os

# Optional extra metadata (for instance the URL of the original document),
# keyed by the path of the PDF it belongs to. get_sources() merges the
# matching entry into the record it builds for that file.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": {
        "url": "https://www.with.org/tao_te_ching_en.pdf",
    },
    "data/Confucianism/Analects of Confucius.pdf": {
        "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
    },
}


def get_domains():
    """Return the domains available under the ``data`` directory.

    A domain is an immediate sub-directory of ``data``. Directories nested
    deeper are not domains of their own, because ``parse_domain`` always
    uses the first path component below ``data``.

    Returns:
        The domain names with underscores replaced by spaces, which is the
        same spelling ``get_sources`` stores under the ``"domain"`` key.
        The list is empty when ``data`` does not exist or has no
        sub-directories.
    """
    # os.walk yields the top directory first; only that first entry lists the
    # immediate sub-directories. A missing directory yields nothing at all,
    # hence the default.
    _root, top_level_dirs, _files = next(os.walk("data"), ("data", [], []))
    return [dir_name.replace("_", " ") for dir_name in top_level_dirs]


def get_sources():
    """Collect one record for every PDF found anywhere under ``data``.

    Returns:
        A list of dicts, one per ``.pdf`` file, each containing:

        * ``"domain"`` - result of ``parse_domain`` for the file path,
        * ``"name"`` - result of ``parse_name`` for the file path,
        * ``"file_path"`` - the path as produced by ``os.path.join``,
        * any extra keys registered for that file in
          ``metadata_by_file_path`` (for instance ``"url"``).

        The list is empty when ``data`` does not exist or holds no PDFs.
    """
    sources = []
    for root, _dirs, files in os.walk("data"):
        for filename in files:
            if not filename.endswith(".pdf"):
                continue

            file_path = os.path.join(root, filename)
            print("file_path", file_path)

            # The metadata table is keyed with "/" separators, while os.walk
            # builds paths with os.sep. Normalise the lookup key so the
            # metadata is also found where os.sep is not "/" (e.g. Windows).
            metadata_key = file_path.replace(os.sep, "/")
            sources.append(
                {
                    "domain": parse_domain(file_path),
                    "name": parse_name(file_path),
                    "file_path": file_path,
                    **metadata_by_file_path.get(metadata_key, {}),
                }
            )

    return sources


def parse_name(source: str) -> str:
    """Turn a file path into a display name.

    The directory part and the final extension are dropped, and every
    underscore in what remains is replaced by a space.
    """
    stem = os.path.splitext(os.path.basename(source))[0]
    return " ".join(stem.split("_"))


def parse_domain(source: str) -> str:
    """Return the domain encoded in a source path.

    The domain is the second ``os.sep``-separated component of ``source``,
    with underscores replaced by spaces.

    Raises:
        IndexError: if ``source`` contains no ``os.sep`` at all.
    """
    # Only the second component is needed, so stop splitting after it.
    components = source.split(os.sep, 2)
    second_component = components[1]
    return " ".join(second_component.split("_"))