Spaces:

QSBench
/

Circuit_Family_Classifier

Running

App Files Files Community

QSBench committed 4 days ago

Commit

a63cf6b

verified ·

1 Parent(s): 170aab6

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -28

app.py CHANGED Viewed

@@ -75,55 +75,90 @@ def sync_ml_metrics(ds_name: str):
     defaults = [f for f in ["gate_entropy", "meyer_wallach", "adjacency", "depth", "cx_count"] if f in valid_features]
     return gr.update(choices=valid_features, value=defaults)
 def train_classifier(ds_name: str, features: List[str]):
-    if not features: return None, "### ❌ Select features first."
     assets = load_all_assets(ds_name)
     df = assets["df"]
-    # Automatically determine available classes in the dataset, excluding empty values
-    available_in_df = df['circuit_type_requested'].dropna().unique()
-    # Filter: keep only those that are in our list of interests (case-insensitive)
-    # Or simply take all available types if we want universality
-    train_df = df[df['circuit_type_requested'].isin(available_in_df)].dropna(subset=features)
-    if train_df.empty:
-        return None, f"### ❌ Error: No data found for features {features}. Check if these columns are empty in the dataset."
-    X, y = train_df[features], train_df['circuit_type_requested']
-    # Check number of classes
-    if len(y.unique()) < 2:
-        return None, f"### ❌ Error: Need at least 2 classes to train. Found only: {y.unique()}"
     le = LabelEncoder()
     y_encoded = le.fit_transform(y)
-    try:
-        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
-    except ValueError as e:
-        return None, f"### ❌ Split Error: {str(e)}"
-    clf = RandomForestClassifier(n_estimators=100, max_depth=12, n_jobs=-1).fit(X_train, y_train)
     preds = clf.predict(X_test)
     sns.set_theme(style="whitegrid")
     fig, axes = plt.subplots(1, 2, figsize=(20, 8))
     cm = confusion_matrix(y_test, preds)
-    sns.heatmap(cm, annot=True, fmt='d', cmap='magma',
-                xticklabels=le.classes_, yticklabels=le.classes_,
-                ax=axes[0], cbar=False)
     axes[0].set_title(f"Confusion Matrix (Acc: {accuracy_score(y_test, preds):.2%})")
     importances = clf.feature_importances_
-    idx = np.argsort(importances)[-10:]
-    axes[1].barh([features[i] for i in idx], importances[idx], color='#3498db')
-    axes[1].set_title("Feature Importance")
     plt.tight_layout()
-    report = classification_report(y_test, preds, target_names=le.classes_)
-    return fig, f"### 🏆 Results for {ds_name}\n```\n{report}\n```"
 def update_explorer(ds_name: str, split_name: str):
     assets = load_all_assets(ds_name)

     defaults = [f for f in ["gate_entropy", "meyer_wallach", "adjacency", "depth", "cx_count"] if f in valid_features]
     return gr.update(choices=valid_features, value=defaults)
+Судя по ошибке Found only: ['mixed'], в вашем столбце circuit_type_requested вместо конкретных названий семейств (QFT, HEA и т.д.) записано значение 'mixed'. Это часто случается в демонстрационных подмножествах, где данные уже перемешаны и помечены общим тегом.
+Для классификации нам нужны исходные метки. В датасетах QSBench они обычно находятся в столбце circuit_type_resolved.
+Вот обновленный код функции train_classifier с исправленной логикой выбора столбца и более надежной обработкой ошибок.
+Исправленный код (App Code)
+Python
 def train_classifier(ds_name: str, features: List[str]):
+    if not features:
+        return None, "### ❌ Error: No features selected. Please pick structural metrics."
     assets = load_all_assets(ds_name)
     df = assets["df"]
+    # Prefer 'circuit_type_resolved' when that column exists; otherwise fall back to 'circuit_type_requested' (which may hold only the generic 'mixed' tag)
+    target_col = 'circuit_type_resolved' if 'circuit_type_resolved' in df.columns else 'circuit_type_requested'
+    # Drop rows with a missing value in any selected feature or in the target column
+    train_df = df.dropna(subset=features + [target_col])
+    # Drop the generic 'mixed' label only when at least one other label is present
+    unique_types = train_df[target_col].unique()
+    if 'mixed' in unique_types and len(unique_types) > 1:
+        train_df = train_df[train_df[target_col] != 'mixed']
+    X = train_df[features]
+    y = train_df[target_col]
+    # Classification needs at least 2 distinct classes; otherwise return (None, error markdown)
+    current_classes = y.unique()
+    if len(current_classes) < 2:
+        return None, f"### ❌ Classification Error\nFound only one class: `{current_classes}` in column `{target_col}`. " \
+                     "Try a different dataset or check if the source file has labels."
+    # Encode string labels as integers; le.classes_ keeps the original names for plot and report labels
     le = LabelEncoder()
     y_encoded = le.fit_transform(y)
+    class_names = le.classes_
+    # 80/20 train/test split with a fixed seed, stratified on the encoded labels
+    try:
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
+        )
+    except ValueError:
+        # Stratified split raised ValueError (e.g. a class with too few samples): retry unstratified. NOTE(review): the test set may then miss some classes, so class_names may not line up with the confusion_matrix / classification_report output - confirm
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y_encoded, test_size=0.2, random_state=42
+        )
+    # Fit a random forest with a fixed random_state, then predict on the held-out split
+    clf = RandomForestClassifier(n_estimators=100, max_depth=12, n_jobs=-1, random_state=42)
+    clf.fit(X_train, y_train)
     preds = clf.predict(X_test)
+    # Two side-by-side plots: confusion matrix (axes[0]) and feature importances (axes[1])
     sns.set_theme(style="whitegrid")
     fig, axes = plt.subplots(1, 2, figsize=(20, 8))
+    # Plot 1: confusion matrix heatmap, titled with the test-set accuracy
     cm = confusion_matrix(y_test, preds)
+    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis',
+                xticklabels=class_names, yticklabels=class_names, ax=axes[0], cbar=False)
     axes[0].set_title(f"Confusion Matrix (Acc: {accuracy_score(y_test, preds):.2%})")
+    axes[0].set_xlabel("Predicted Label")
+    axes[0].set_ylabel("True Label")
+    # Plot 2: the (up to) 10 largest feature importances; argsort is ascending, so the most important feature is the last bar
     importances = clf.feature_importances_
+    indices = np.argsort(importances)[-10:]
+    axes[1].barh([features[i] for i in indices], importances[indices], color='#2ecc71')
+    axes[1].set_title("Top-10 Discriminative Features")
     plt.tight_layout()
+    # Text report; NOTE(review): despite its name, report_dict is presumably a plain string (output_dict is not set) - it is embedded verbatim in the code fence below
+    report_dict = classification_report(y_test, preds, target_names=class_names)
+    summary = f"### 🏆 Classifier Results: {ds_name}\n" \
+              f"**Target Column used:** `{target_col}`\n" \
+              f"**Accuracy:** {accuracy_score(y_test, preds):.2%}\n\n" \
+              f"**Report:**\n```\n{report_dict}\n```"
+    return fig, summary
 def update_explorer(ds_name: str, split_name: str):
     assets = load_all_assets(ds_name)