theislab
/

Nicheformer

@@ -277,61 +277,73 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         """
         if isinstance(data, ad.AnnData):
             adata = data.copy()
-            print("READING")
-            print(f"modality dtype: {adata.obs['modality'].dtype}")
-            print(f"specie dtype: {adata.obs['specie'].dtype}")
-            print(f"assay dtype: {adata.obs['assay'].dtype}")
             # Align with reference model if available
             if hasattr(self, '_load_reference_model'):
                 reference_model = self._load_reference_model()
                 if reference_model is not None:
                     # Concatenate and then remove the reference
                     adata = ad.concat([reference_model, adata], join='outer', axis=0)
                     adata = adata[1:]
-            print("AFTER CONCATENATION")
-            print(f"modality dtype: {adata.obs['modality'].dtype}")
-            print(f"specie dtype: {adata.obs['specie'].dtype}")
-            print(f"assay dtype: {adata.obs['assay'].dtype}")
             # Get gene expression data
             X = adata.X
             # Get metadata for special tokens
-            # Print column types
-            print("\nColumn types:")
-            if 'modality' in adata.obs.columns:
-                print(f"modality type: {type(adata.obs['modality'])} with dtype: {adata.obs['modality'].dtype}")
-            if 'specie' in adata.obs.columns:
-                print(f"specie type: {type(adata.obs['specie'])} with dtype: {adata.obs['specie'].dtype}")
-            if 'assay' in adata.obs.columns:
-                print(f"assay type: {type(adata.obs['assay'])} with dtype: {adata.obs['assay'].dtype}")
             modality = adata.obs['modality'] if 'modality' in adata.obs.columns else None
             species = adata.obs['specie'] if 'specie' in adata.obs.columns else None
             technology = adata.obs['assay'] if 'assay' in adata.obs.columns else None
-            print(f"Modality: {modality}")
-            print(f"Species: {species}")
-            print(f"Technology: {technology}")
             # Use integer values directly if available
-            if modality is not None and pd.api.types.is_numeric_dtype(modality):
-                modality_tokens = modality.astype(int).tolist()
             else:
-                modality_tokens = [self.modality_dict.get(m, self._vocabulary["[PAD]"]) for m in modality] if modality is not None else None
-            if species is not None and pd.api.types.is_numeric_dtype(species):
-                species_tokens = species.astype(int).tolist()
-                print(f"Species tokens: {species_tokens}")
             else:
-                species_tokens = [self.species_dict.get(s, self._vocabulary["[PAD]"]) for s in species] if species is not None else None
-                print(f"Species tokens resort: {species_tokens}")
-            if technology is not None and pd.api.types.is_numeric_dtype(technology):
-                technology_tokens = technology.astype(int).tolist()
-                print(f"Technology tokens: {technology_tokens}")
             else:
-                technology_tokens = [self.technology_dict.get(t, self._vocabulary["[PAD]"]) for t in technology] if technology is not None else None
-                print(f"Technology tokens resort: {technology_tokens}")
         else:
             X = data
             modality_tokens = None

         """
         if isinstance(data, ad.AnnData):
             adata = data.copy()
             # Align with reference model if available
             if hasattr(self, '_load_reference_model'):
                 reference_model = self._load_reference_model()
                 if reference_model is not None:
+                    # Store original column types before concatenation
+                    original_types = {}
+                    for col in ['modality', 'specie', 'assay']:
+                        if col in adata.obs.columns:
+                            original_types[col] = adata.obs[col].dtype
                     # Concatenate and then remove the reference
                     adata = ad.concat([reference_model, adata], join='outer', axis=0)
                     adata = adata[1:]
+                    # Restore original column types after concatenation
+                    for col, dtype in original_types.items():
+                        if col in adata.obs.columns:
+                            try:
+                                adata.obs[col] = adata.obs[col].astype(dtype)
+                            except Exception as e:
+                                print(f"Warning: Could not convert {col} back to {dtype}: {e}")
             # Get gene expression data
             X = adata.X
             # Get metadata for special tokens
             modality = adata.obs['modality'] if 'modality' in adata.obs.columns else None
             species = adata.obs['specie'] if 'specie' in adata.obs.columns else None
             technology = adata.obs['assay'] if 'assay' in adata.obs.columns else None
             # Use integer values directly if available
+            if modality is not None:
+                try:
+                    if pd.api.types.is_numeric_dtype(modality):
+                        modality_tokens = modality.astype(int).tolist()
+                    else:
+                        modality_tokens = [self.modality_dict.get(m, self._vocabulary["[PAD]"]) for m in modality]
+                except Exception as e:
+                    print(f"Warning: Error processing modality tokens: {e}")
+                    modality_tokens = [self._vocabulary["[PAD]"]] * len(adata)
             else:
+                modality_tokens = None
+            if species is not None:
+                try:
+                    if pd.api.types.is_numeric_dtype(species):
+                        species_tokens = species.astype(int).tolist()
+                    else:
+                        species_tokens = [self.species_dict.get(s, self._vocabulary["[PAD]"]) for s in species]
+                except Exception as e:
+                    print(f"Warning: Error processing species tokens: {e}")
+                    species_tokens = [self._vocabulary["[PAD]"]] * len(adata)
             else:
+                species_tokens = None
+            if technology is not None:
+                try:
+                    if pd.api.types.is_numeric_dtype(technology):
+                        technology_tokens = technology.astype(int).tolist()
+                    else:
+                        technology_tokens = [self.technology_dict.get(t, self._vocabulary["[PAD]"]) for t in technology]
+                except Exception as e:
+                    print(f"Warning: Error processing technology tokens: {e}")
+                    technology_tokens = [self._vocabulary["[PAD]"]] * len(adata)
             else:
+                technology_tokens = None
         else:
             X = data
             modality_tokens = None