Spaces:
Running
Running
Alexander Sanchez committed on
Commit ·
456bf68
1
Parent(s): 83f560f
1886 documents created in the corpus
Browse files- .DS_Store +0 -0
- corpus/corpus.json +0 -0
- corpus_loader.py +8 -8
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
corpus/corpus.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
corpus_loader.py
CHANGED
|
@@ -42,9 +42,9 @@ class CorpusLoader:
|
|
| 42 |
pairs.extend(self._normalize(data, source=f.stem))
|
| 43 |
elif isinstance(data, dict): # un solo documento
|
| 44 |
pairs.append(self._normalize_one(data, source=f.stem))
|
| 45 |
-
print(f"
|
| 46 |
except Exception as e:
|
| 47 |
-
print(f"
|
| 48 |
|
| 49 |
# ── CSV ───────────────────────────────────────────────────────────────
|
| 50 |
for f in sorted(self.corpus_path.glob("*.csv")):
|
|
@@ -53,16 +53,16 @@ class CorpusLoader:
|
|
| 53 |
reader = csv.DictReader(fh)
|
| 54 |
rows = list(reader)
|
| 55 |
pairs.extend(self._normalize(rows, source=f.stem))
|
| 56 |
-
print(f"
|
| 57 |
except Exception as e:
|
| 58 |
-
print(f"
|
| 59 |
|
| 60 |
# ── TXT pareado ───────────────────────────────────────────────────────
|
| 61 |
htr_files = sorted(self.corpus_path.glob("*.htr.txt"))
|
| 62 |
for htr_file in htr_files:
|
| 63 |
gt_file = htr_file.with_suffix("").with_suffix(".gt.txt")
|
| 64 |
if not gt_file.exists():
|
| 65 |
-
print(f"
|
| 66 |
continue
|
| 67 |
try:
|
| 68 |
htr_text = htr_file.read_text(encoding="utf-8").strip()
|
|
@@ -77,12 +77,12 @@ class CorpusLoader:
|
|
| 77 |
"source": "txt",
|
| 78 |
})
|
| 79 |
except Exception as e:
|
| 80 |
-
print(f"
|
| 81 |
|
| 82 |
if htr_files:
|
| 83 |
-
print(f"
|
| 84 |
|
| 85 |
-
print(f"\n
|
| 86 |
return pairs
|
| 87 |
|
| 88 |
# ── helpers ───────────────────────────────────────────────────────────────
|
|
|
|
| 42 |
pairs.extend(self._normalize(data, source=f.stem))
|
| 43 |
elif isinstance(data, dict): # un solo documento
|
| 44 |
pairs.append(self._normalize_one(data, source=f.stem))
|
| 45 |
+
print(f" JSON cargado: {f.name} ({len(data)} pares)")
|
| 46 |
except Exception as e:
|
| 47 |
+
print(f" Error leyendo {f.name}: {e}")
|
| 48 |
|
| 49 |
# ── CSV ───────────────────────────────────────────────────────────────
|
| 50 |
for f in sorted(self.corpus_path.glob("*.csv")):
|
|
|
|
| 53 |
reader = csv.DictReader(fh)
|
| 54 |
rows = list(reader)
|
| 55 |
pairs.extend(self._normalize(rows, source=f.stem))
|
| 56 |
+
print(f" CSV cargado: {f.name} ({len(rows)} pares)")
|
| 57 |
except Exception as e:
|
| 58 |
+
print(f" Error leyendo {f.name}: {e}")
|
| 59 |
|
| 60 |
# ── TXT pareado ───────────────────────────────────────────────────────
|
| 61 |
htr_files = sorted(self.corpus_path.glob("*.htr.txt"))
|
| 62 |
for htr_file in htr_files:
|
| 63 |
gt_file = htr_file.with_suffix("").with_suffix(".gt.txt")
|
| 64 |
if not gt_file.exists():
|
| 65 |
+
print(f" Sin GT para {htr_file.name}, omitido.")
|
| 66 |
continue
|
| 67 |
try:
|
| 68 |
htr_text = htr_file.read_text(encoding="utf-8").strip()
|
|
|
|
| 77 |
"source": "txt",
|
| 78 |
})
|
| 79 |
except Exception as e:
|
| 80 |
+
print(f" Error leyendo {htr_file.name}: {e}")
|
| 81 |
|
| 82 |
if htr_files:
|
| 83 |
+
print(f" TXT cargados: {len(htr_files)} pares")
|
| 84 |
|
| 85 |
+
print(f"\n Total pares cargados desde disco: {len(pairs)}")
|
| 86 |
return pairs
|
| 87 |
|
| 88 |
# ── helpers ───────────────────────────────────────────────────────────────
|