seanpedrickcase committed on
Commit
2148ddd
·
1 Parent(s): bf7b066

Modified model load for custom languages with spaCy. Languages should load successfully now.

Browse files
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -99,7 +99,7 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
99
  base_lang = _base_language_code(lang_norm)
100
 
101
  candidates_by_lang = {
102
- # English
103
  "en": [
104
  "en_core_web_lg",
105
  "en_core_web_trf",
@@ -110,7 +110,7 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
110
  "en_trf": ["en_core_web_trf"],
111
  "en_md": ["en_core_web_md"],
112
  "en_sm": ["en_core_web_sm"],
113
- # Major languages (news pipelines)
114
  "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
115
  "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
116
  "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
@@ -156,11 +156,20 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
156
  candidates = candidates_by_lang["xx"]
157
 
158
  last_error = None
159
- for candidate in candidates:
 
 
 
 
 
 
 
 
 
160
  # Try importable package first (fast-path when installed as a package)
161
  try:
162
  module = __import__(candidate)
163
- print(f"Successfully imported spaCy model: {candidate}")
164
  return module.load()
165
  except Exception as e:
166
  last_error = e
@@ -168,38 +177,34 @@ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
168
  # Try spacy.load if package is linked/installed
169
  try:
170
  nlp = spacy.load(candidate)
171
- print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
172
- return nlp
173
- except Exception as e:
174
- last_error = e
175
-
176
- # Check if model is already downloaded before attempting to download
177
- try:
178
- # Try to load the model to see if it's already available
179
- nlp = spacy.load(candidate)
180
- print(f"Model {candidate} is already available, skipping download")
181
  return nlp
182
  except OSError:
183
  # Model not found, proceed with download
184
- pass
 
 
 
 
 
 
 
 
 
 
 
185
  except Exception as e:
 
186
  last_error = e
187
  continue
188
 
189
- # Attempt to download then load
190
- try:
191
- print(f"Downloading spaCy model: {candidate}")
192
- download(candidate)
193
- nlp = spacy.load(candidate)
194
- print(f"Successfully downloaded and loaded spaCy model: {candidate}")
195
- return nlp
196
- except Exception as e:
197
- last_error = e
198
- continue
199
 
200
- raise RuntimeError(
201
- f"Failed to load spaCy model for language '{language}'. Last error: {last_error}"
202
- )
203
 
204
 
205
  # Language-aware spaCy model loader
 
99
  base_lang = _base_language_code(lang_norm)
100
 
101
  candidates_by_lang = {
102
+ # English - prioritize lg, then trf, then md, then sm
103
  "en": [
104
  "en_core_web_lg",
105
  "en_core_web_trf",
 
110
  "en_trf": ["en_core_web_trf"],
111
  "en_md": ["en_core_web_md"],
112
  "en_sm": ["en_core_web_sm"],
113
+ # Major languages (news pipelines) - prioritize lg, then md, then sm
114
  "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
115
  "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
116
  "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
 
156
  candidates = candidates_by_lang["xx"]
157
 
158
  last_error = None
159
+ print(
160
+ f"Attempting to load spaCy model for language '{language}' with candidates: {candidates}"
161
+ )
162
+ print(
163
+ "Note: Models are prioritized by size (lg > md > sm) - will stop after first successful load"
164
+ )
165
+
166
+ for i, candidate in enumerate(candidates):
167
+ print(f"Trying candidate {i+1}/{len(candidates)}: {candidate}")
168
+
169
  # Try importable package first (fast-path when installed as a package)
170
  try:
171
  module = __import__(candidate)
172
+ print(f"✓ Successfully imported spaCy model: {candidate}")
173
  return module.load()
174
  except Exception as e:
175
  last_error = e
 
177
  # Try spacy.load if package is linked/installed
178
  try:
179
  nlp = spacy.load(candidate)
180
+ print(f"✓ Successfully loaded spaCy model via spacy.load: {candidate}")
 
 
 
 
 
 
 
 
 
181
  return nlp
182
  except OSError:
183
  # Model not found, proceed with download
184
+ print(f"Model {candidate} not found, attempting to download...")
185
+ try:
186
+ download(candidate)
187
+ print(f"✓ Successfully downloaded spaCy model: {candidate}")
188
+ # Try to load the downloaded model
189
+ nlp = spacy.load(candidate)
190
+ print(f"✓ Successfully loaded downloaded spaCy model: {candidate}")
191
+ return nlp
192
+ except Exception as download_error:
193
+ print(f"✗ Failed to download or load {candidate}: {download_error}")
194
+ last_error = download_error
195
+ continue
196
  except Exception as e:
197
+ print(f"✗ Failed to load {candidate}: {e}")
198
  last_error = e
199
  continue
200
 
201
+ # Provide more helpful error message
202
+ error_msg = f"Failed to load spaCy model for language '{language}'"
203
+ if last_error:
204
+ error_msg += f". Last error: {last_error}"
205
+ error_msg += f". Tried candidates: {candidates}"
 
 
 
 
 
206
 
207
+ raise RuntimeError(error_msg)
 
 
208
 
209
 
210
  # Language-aware spaCy model loader