neuralworm committed on
Commit
8e59f09
1 Parent(s): 0567ebe
Files changed (1) hide show
  1. util.py +37 -0
util.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+
4
def process_json_files(start, end, step, strip_in_braces=False, strip_diacritics=False):
    """Load numbered JSON book files from ``texts/`` and collect their contents.

    For every ``i`` in ``range(start, end + 1, step)`` the file
    ``texts/<i>.json`` is read (``i`` zero-padded to at least two digits).
    The nested ``"text"`` value is flattened into one string, optionally
    filtered, and the book is included only if that string is non-blank.

    Args:
        start: First book number to load.
        end: Last book number to load (inclusive).
        step: Increment between book numbers; passed straight to ``range``,
            so ``0`` raises ``ValueError``.
        strip_in_braces: If true, text enclosed in square brackets
            (``[...]``) is removed before the emptiness check.
        strip_diacritics: If true, every character other than the Hebrew
            letters U+05D0..U+05EA and the space is removed before the
            emptiness check.

    Returns:
        A list with one dict per processed file, in iteration order:
        ``{"book", "title", "text"}`` for a book with non-blank text, or
        ``{"error": message}`` when the file is missing, is not valid JSON,
        or does not have the expected structure. Books whose flattened text
        is blank produce no entry at all.

    NOTE(review): the returned ``"text"`` is the raw value from the file, not
    the filtered string, so the two ``strip_*`` flags only influence whether
    a book is skipped. Kept as-is to preserve the return shape for callers;
    confirm whether the filtered text was meant to be returned.
    """
    base_path = "texts"
    results = []

    # Compile once instead of on every loop iteration.
    bracketed = re.compile(r"\[.*?\]", flags=re.DOTALL)
    non_hebrew = re.compile(r"[^\u05D0-\u05EA ]+")

    for i in range(start, end + 1, step):
        file_name = f"{base_path}/{i:02}.json"

        # Keep the try body limited to the operations that raise these errors,
        # and release the file handle before any further processing.
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            results.append({"error": f"File {file_name} not found."})
            continue
        except json.JSONDecodeError as e:
            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
            continue

        # A JSON root that is not an object, or a "text" value that is not an
        # iterable of iterables of strings, surfaces here as AttributeError or
        # TypeError. Report it per file instead of aborting the whole batch.
        try:
            text_blocks = data.get("text", [])
            full_text = " ".join(" ".join(block) for block in text_blocks)
        except (AttributeError, TypeError) as e:
            results.append({"error": f"File {file_name} has an unexpected structure: {e}"})
            continue

        if strip_in_braces:
            full_text = bracketed.sub("", full_text)
        if strip_diacritics:
            full_text = non_hebrew.sub("", full_text)

        # Only keep books that still have text after the optional filtering.
        if full_text.strip():
            results.append({
                "book": i,
                "title": data.get("title", "No title"),
                "text": data.get("text", "No text"),
            })

    return results