Commit
·
2148ddd
1
Parent(s):
bf7b066
Modified model load for custom languages with spaCy. Languages should load successfully now.
Browse files
tools/load_spacy_model_custom_recognisers.py
CHANGED
|
@@ -99,7 +99,7 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
|
|
| 99 |
base_lang = _base_language_code(lang_norm)
|
| 100 |
|
| 101 |
candidates_by_lang = {
|
| 102 |
-
# English
|
| 103 |
"en": [
|
| 104 |
"en_core_web_lg",
|
| 105 |
"en_core_web_trf",
|
|
@@ -110,7 +110,7 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
|
|
| 110 |
"en_trf": ["en_core_web_trf"],
|
| 111 |
"en_md": ["en_core_web_md"],
|
| 112 |
"en_sm": ["en_core_web_sm"],
|
| 113 |
-
# Major languages (news pipelines)
|
| 114 |
"ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
|
| 115 |
"da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
|
| 116 |
"de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
|
|
@@ -156,11 +156,20 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
|
|
| 156 |
candidates = candidates_by_lang["xx"]
|
| 157 |
|
| 158 |
last_error = None
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
# Try importable package first (fast-path when installed as a package)
|
| 161 |
try:
|
| 162 |
module = __import__(candidate)
|
| 163 |
-
print(f"Successfully imported spaCy model: {candidate}")
|
| 164 |
return module.load()
|
| 165 |
except Exception as e:
|
| 166 |
last_error = e
|
|
@@ -168,38 +177,34 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
|
|
| 168 |
# Try spacy.load if package is linked/installed
|
| 169 |
try:
|
| 170 |
nlp = spacy.load(candidate)
|
| 171 |
-
print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
|
| 172 |
-
return nlp
|
| 173 |
-
except Exception as e:
|
| 174 |
-
last_error = e
|
| 175 |
-
|
| 176 |
-
# Check if model is already downloaded before attempting to download
|
| 177 |
-
try:
|
| 178 |
-
# Try to load the model to see if it's already available
|
| 179 |
-
nlp = spacy.load(candidate)
|
| 180 |
-
print(f"Model {candidate} is already available, skipping download")
|
| 181 |
return nlp
|
| 182 |
except OSError:
|
| 183 |
# Model not found, proceed with download
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
except Exception as e:
|
|
|
|
| 186 |
last_error = e
|
| 187 |
continue
|
| 188 |
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
print(f"Successfully downloaded and loaded spaCy model: {candidate}")
|
| 195 |
-
return nlp
|
| 196 |
-
except Exception as e:
|
| 197 |
-
last_error = e
|
| 198 |
-
continue
|
| 199 |
|
| 200 |
-
raise RuntimeError(
|
| 201 |
-
f"Failed to load spaCy model for language '{language}'. Last error: {last_error}"
|
| 202 |
-
)
|
| 203 |
|
| 204 |
|
| 205 |
# Language-aware spaCy model loader
|
|
|
|
| 99 |
base_lang = _base_language_code(lang_norm)
|
| 100 |
|
| 101 |
candidates_by_lang = {
|
| 102 |
+
# English - prioritize lg, then trf, then md, then sm
|
| 103 |
"en": [
|
| 104 |
"en_core_web_lg",
|
| 105 |
"en_core_web_trf",
|
|
|
|
| 110 |
"en_trf": ["en_core_web_trf"],
|
| 111 |
"en_md": ["en_core_web_md"],
|
| 112 |
"en_sm": ["en_core_web_sm"],
|
| 113 |
+
# Major languages (news pipelines) - prioritize lg, then md, then sm
|
| 114 |
"ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
|
| 115 |
"da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
|
| 116 |
"de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
|
|
|
|
| 156 |
candidates = candidates_by_lang["xx"]
|
| 157 |
|
| 158 |
last_error = None
|
| 159 |
+
print(
|
| 160 |
+
f"Attempting to load spaCy model for language '{language}' with candidates: {candidates}"
|
| 161 |
+
)
|
| 162 |
+
print(
|
| 163 |
+
"Note: Models are prioritized by size (lg > md > sm) - will stop after first successful load"
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
for i, candidate in enumerate(candidates):
|
| 167 |
+
print(f"Trying candidate {i+1}/{len(candidates)}: {candidate}")
|
| 168 |
+
|
| 169 |
# Try importable package first (fast-path when installed as a package)
|
| 170 |
try:
|
| 171 |
module = __import__(candidate)
|
| 172 |
+
print(f"✓ Successfully imported spaCy model: {candidate}")
|
| 173 |
return module.load()
|
| 174 |
except Exception as e:
|
| 175 |
last_error = e
|
|
|
|
| 177 |
# Try spacy.load if package is linked/installed
|
| 178 |
try:
|
| 179 |
nlp = spacy.load(candidate)
|
| 180 |
+
print(f"✓ Successfully loaded spaCy model via spacy.load: {candidate}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
return nlp
|
| 182 |
except OSError:
|
| 183 |
# Model not found, proceed with download
|
| 184 |
+
print(f"Model {candidate} not found, attempting to download...")
|
| 185 |
+
try:
|
| 186 |
+
download(candidate)
|
| 187 |
+
print(f"✓ Successfully downloaded spaCy model: {candidate}")
|
| 188 |
+
# Try to load the downloaded model
|
| 189 |
+
nlp = spacy.load(candidate)
|
| 190 |
+
print(f"✓ Successfully loaded downloaded spaCy model: {candidate}")
|
| 191 |
+
return nlp
|
| 192 |
+
except Exception as download_error:
|
| 193 |
+
print(f"✗ Failed to download or load {candidate}: {download_error}")
|
| 194 |
+
last_error = download_error
|
| 195 |
+
continue
|
| 196 |
except Exception as e:
|
| 197 |
+
print(f"✗ Failed to load {candidate}: {e}")
|
| 198 |
last_error = e
|
| 199 |
continue
|
| 200 |
|
| 201 |
+
# Provide more helpful error message
|
| 202 |
+
error_msg = f"Failed to load spaCy model for language '{language}'"
|
| 203 |
+
if last_error:
|
| 204 |
+
error_msg += f". Last error: {last_error}"
|
| 205 |
+
error_msg += f". Tried candidates: {candidates}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
+
raise RuntimeError(error_msg)
|
|
|
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
# Language-aware spaCy model loader
|