LOUIS SANNA committed on
Commit
419f9af
1 Parent(s): a193066
Files changed (2) hide show
  1. anyqa/build_index.py +3 -1
  2. anyqa/config.py +11 -5
anyqa/build_index.py CHANGED
@@ -41,7 +41,9 @@ def parse_data():
41
  for chunk in doc_chunks:
42
  chunk.metadata["name"] = source["name"]
43
  chunk.metadata["domain"] = source["domain"]
44
- chunk.metadata["url"] = source.get("url", None)
 
 
45
  chunk.metadata["page_number"] = chunk.metadata["page"]
46
  chunk.metadata["short_name"] = chunk.metadata["name"]
47
  docs.append(chunk)
 
41
  for chunk in doc_chunks:
42
  chunk.metadata["name"] = source["name"]
43
  chunk.metadata["domain"] = source["domain"]
44
+ url = source.get("url", None)
45
+ if url:
46
+ chunk.metadata["url"] = source.get("url", None)
47
  chunk.metadata["page_number"] = chunk.metadata["page"]
48
  chunk.metadata["short_name"] = chunk.metadata["name"]
49
  docs.append(chunk)
anyqa/config.py CHANGED
@@ -2,10 +2,13 @@ import os
2
 
3
  # can be used to add metadata to the index, for instance URL
4
  metadata_by_file_path = {
5
- "data/Daoism/Tao_Te_Ching.pdf": { "url": "https://www.with.org/tao_te_ching_en.pdf" },
6
- "data/Confucianism/Analects of Confucius.pdf": { "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf" },
 
 
7
  }
8
 
 
9
  def get_domains():
10
  domains = []
11
  for root, dirs, files in os.walk("data"):
@@ -26,7 +29,7 @@ def get_sources():
26
  "domain": parse_domain(file_path),
27
  "name": parse_name(file_path),
28
  "file_path": file_path,
29
- **metadata_by_file_path.get(file_path, {})
30
  }
31
  )
32
 
@@ -34,8 +37,11 @@ def get_sources():
34
 
35
 
36
  def parse_name(source: str) -> str:
37
- return source.split("/")[-1].split(".")[0].replace("_", " ")
 
 
38
 
39
 
40
  def parse_domain(source: str) -> str:
41
- return source.split("/")[1].replace("_", " ")
 
 
2
 
3
  # can be used to add metadata to the index, for instance URL
4
  metadata_by_file_path = {
5
+ "data/Daoism/Tao_Te_Ching.pdf": {"url": "https://www.with.org/tao_te_ching_en.pdf"},
6
+ "data/Confucianism/Analects of Confucius.pdf": {
7
+ "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf"
8
+ },
9
  }
10
 
11
+
12
  def get_domains():
13
  domains = []
14
  for root, dirs, files in os.walk("data"):
 
29
  "domain": parse_domain(file_path),
30
  "name": parse_name(file_path),
31
  "file_path": file_path,
32
+ **metadata_by_file_path.get(file_path, {}),
33
  }
34
  )
35
 
 
37
 
38
 
39
  def parse_name(source: str) -> str:
40
+ filename = os.path.basename(source)
41
+ name, _ = os.path.splitext(filename)
42
+ return name.replace("_", " ")
43
 
44
 
45
  def parse_domain(source: str) -> str:
46
+ domain = source.split(os.sep)[1]
47
+ return domain.replace("_", " ")