# LOUIS SANNA
# fix(bugs)
# 419f9af
import os
# Optional extra metadata for the index, keyed by the path of the PDF
# (e.g. "data/<domain>/<file>.pdf"). get_sources() merges the entry whose key
# matches a file's path into the record it builds for that file, so any key
# placed here (for instance "url") ends up on that record.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": {
        "url": "https://www.with.org/tao_te_ching_en.pdf",
    },
    "data/Confucianism/Analects of Confucius.pdf": {
        "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
    },
}
def get_domains():
    """Return the names of the folders located directly under ``data``.

    Only the first level is listed. parse_domain() takes the domain from the
    first path component below ``data``, so walking the whole tree (as this
    function used to) would also report nested sub-folders as domains.

    Names are returned exactly as they appear on disk; underscores are not
    replaced with spaces here.

    Returns:
        list[str]: directory names in the order os.walk reports them. Empty
        when ``data`` does not exist or has no sub-directories.
    """
    # The first tuple yielded by os.walk describes "data" itself. os.walk
    # yields nothing when the directory is missing, hence the default.
    _, top_level_dirs, _ = next(os.walk("data"), ("data", [], []))
    return list(top_level_dirs)
def get_sources():
    """Collect one record per ``.pdf`` file found anywhere under ``data``.

    Each record is a dict containing:
      - "domain": result of parse_domain(file_path)
      - "name": result of parse_name(file_path)
      - "file_path": the path built with os.path.join while walking
      - every key registered for that file in metadata_by_file_path

    The metadata keys are written with forward slashes, whereas os.path.join
    uses the platform separator. The lookup therefore tries the exact path
    first and then a forward-slash form of it; without the second attempt no
    metadata entry could match on a platform whose separator is not "/".

    Returns:
        list[dict]: the records, in the order os.walk visits the files.
    """
    sources = []
    for root, _dirs, files in os.walk("data"):
        for file_name in files:
            if not file_name.endswith(".pdf"):
                continue
            file_path = os.path.join(root, file_name)
            print("file_path", file_path)
            # No-op where os.sep is already "/".
            portable_path = file_path.replace(os.sep, "/")
            extra = metadata_by_file_path.get(
                file_path, metadata_by_file_path.get(portable_path, {})
            )
            sources.append(
                {
                    "domain": parse_domain(file_path),
                    "name": parse_name(file_path),
                    "file_path": file_path,
                    # Spread last so registered metadata lands on the record.
                    **extra,
                }
            )
    return sources
def parse_name(source: str) -> str:
    """Derive a display name from a file path.

    Takes the final path component, drops its last extension, and turns every
    underscore into a space.

    Args:
        source: Path (or bare file name) of the document.

    Returns:
        The file's base name without extension, underscores shown as spaces.
    """
    stem = os.path.splitext(os.path.basename(source))[0]
    # Splitting on "_" and re-joining with " " swaps each underscore one-for-one.
    return " ".join(stem.split("_"))
def parse_domain(source: str) -> str:
    """Derive the domain label from a file path.

    The domain is the second ``os.sep``-separated component of ``source``
    (index 1), with every underscore turned into a space. For the paths built
    by get_sources() ("data" + separator + folder + ...), that component is
    the folder directly under ``data``.

    Args:
        source: Path split on the platform separator.

    Returns:
        The second path component, underscores shown as spaces.

    Raises:
        IndexError: If ``source`` contains no ``os.sep`` and so has no second
            component.
    """
    # Only the second component is needed, so stop splitting right after it.
    segments = source.split(os.sep, 2)
    return " ".join(segments[1].split("_"))