Alexander Sanchez committed on
Commit
456bf68
·
1 Parent(s): 83f560f

1886 documents created in the corpus

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. corpus/corpus.json +0 -0
  3. corpus_loader.py +8 -8
.DS_Store ADDED
Binary file (6.15 kB). View file
 
corpus/corpus.json ADDED
The diff for this file is too large to render. See raw diff
 
corpus_loader.py CHANGED
@@ -42,9 +42,9 @@ class CorpusLoader:
42
  pairs.extend(self._normalize(data, source=f.stem))
43
  elif isinstance(data, dict): # un solo documento
44
  pairs.append(self._normalize_one(data, source=f.stem))
45
- print(f"βœ… JSON cargado: {f.name} ({len(data)} pares)")
46
  except Exception as e:
47
- print(f"❌ Error leyendo {f.name}: {e}")
48
 
49
  # ── CSV ───────────────────────────────────────────────────────────────
50
  for f in sorted(self.corpus_path.glob("*.csv")):
@@ -53,16 +53,16 @@ class CorpusLoader:
53
  reader = csv.DictReader(fh)
54
  rows = list(reader)
55
  pairs.extend(self._normalize(rows, source=f.stem))
56
- print(f"βœ… CSV cargado: {f.name} ({len(rows)} pares)")
57
  except Exception as e:
58
- print(f"❌ Error leyendo {f.name}: {e}")
59
 
60
  # ── TXT pareado ───────────────────────────────────────────────────────
61
  htr_files = sorted(self.corpus_path.glob("*.htr.txt"))
62
  for htr_file in htr_files:
63
  gt_file = htr_file.with_suffix("").with_suffix(".gt.txt")
64
  if not gt_file.exists():
65
- print(f"⚠ Sin GT para {htr_file.name}, omitido.")
66
  continue
67
  try:
68
  htr_text = htr_file.read_text(encoding="utf-8").strip()
@@ -77,12 +77,12 @@ class CorpusLoader:
77
  "source": "txt",
78
  })
79
  except Exception as e:
80
- print(f"❌ Error leyendo {htr_file.name}: {e}")
81
 
82
  if htr_files:
83
- print(f"βœ… TXT cargados: {len(htr_files)} pares")
84
 
85
- print(f"\nπŸ“š Total pares cargados desde disco: {len(pairs)}")
86
  return pairs
87
 
88
  # ── helpers ───────────────────────────────────────────────────────────────
 
42
  pairs.extend(self._normalize(data, source=f.stem))
43
  elif isinstance(data, dict): # un solo documento
44
  pairs.append(self._normalize_one(data, source=f.stem))
45
+ print(f" JSON cargado: {f.name} ({len(data)} pares)")
46
  except Exception as e:
47
+ print(f" Error leyendo {f.name}: {e}")
48
 
49
  # ── CSV ───────────────────────────────────────────────────────────────
50
  for f in sorted(self.corpus_path.glob("*.csv")):
 
53
  reader = csv.DictReader(fh)
54
  rows = list(reader)
55
  pairs.extend(self._normalize(rows, source=f.stem))
56
+ print(f" CSV cargado: {f.name} ({len(rows)} pares)")
57
  except Exception as e:
58
+ print(f" Error leyendo {f.name}: {e}")
59
 
60
  # ── TXT pareado ───────────────────────────────────────────────────────
61
  htr_files = sorted(self.corpus_path.glob("*.htr.txt"))
62
  for htr_file in htr_files:
63
  gt_file = htr_file.with_suffix("").with_suffix(".gt.txt")
64
  if not gt_file.exists():
65
+ print(f" Sin GT para {htr_file.name}, omitido.")
66
  continue
67
  try:
68
  htr_text = htr_file.read_text(encoding="utf-8").strip()
 
77
  "source": "txt",
78
  })
79
  except Exception as e:
80
+ print(f" Error leyendo {htr_file.name}: {e}")
81
 
82
  if htr_files:
83
+ print(f" TXT cargados: {len(htr_files)} pares")
84
 
85
+ print(f"\n Total pares cargados desde disco: {len(pairs)}")
86
  return pairs
87
 
88
  # ── helpers ───────────────────────────────────────────────────────────────