codemichaeld committed · verified · Commit 672b8b5 · 1 parent: c31eee4

Update app.py

Files changed (1): app.py (+304 −142)
app.py CHANGED
@@ -10,6 +10,8 @@ from huggingface_hub import HfApi, hf_hub_download
from safetensors.torch import load_file, save_file
import torch
import torch.nn.functional as F
try:
    from modelscope.hub.file_download import model_file_download as ms_file_download
    from modelscope.hub.api import HubApi as ModelScopeApi
@@ -17,64 +19,192 @@ try:
except ImportError:
    MODELScope_AVAILABLE = False

- def low_rank_decomposition(weight, rank=128):
-     """Improved LoRA decomposition that maintains compatibility with existing merge scripts."""
-     if weight.ndim != 2:
        return None, None

    try:
-         # Convert to float32 for numerical stability during SVD
-         weight_f32 = weight.float()
-
-         # Perform SVD
-         U, S, Vh = torch.linalg.svd(weight_f32, full_matrices=False)
-
-         # Ensure rank doesn't exceed available singular values
-         actual_rank = min(rank, len(S))
-         if actual_rank < 8:
            return None, None

-         # Create LoRA matrices using standard factorization
-         A = Vh[:actual_rank, :].contiguous()
-         B = U[:, :actual_rank] @ torch.diag(S[:actual_rank])

-         return A.to(torch.float16), B.to(torch.float16)
    except Exception as e:
-         print(f"Decomposition error: {e}")
-         return None, None

def extract_correction_factors(original_weight, fp8_weight):
-     """Extract per-channel/tensor correction factors instead of LoRA decomposition for VAE."""
    with torch.no_grad():
-         # Convert to float32 for precision
        orig = original_weight.float()
        quant = fp8_weight.float()
-
-         # Compute error (what needs to be added to FP8 to recover original)
        error = orig - quant

-         # Skip if error is negligible
        error_norm = torch.norm(error)
        orig_norm = torch.norm(orig)
-         if orig_norm > 1e-6 and error_norm / orig_norm < 0.01:
            return None

-         # For 4D tensors (common in VAE), compute per-channel correction
        if orig.ndim == 4:
-             # Channel dimension is typically dimension 0 (output channels)
-             channel_dim = 0
            channel_mean = error.mean(dim=tuple(i for i in range(1, orig.ndim)), keepdim=True)
            return channel_mean.to(original_weight.dtype)
-         # For 2D tensors, use per-row correction
        elif orig.ndim == 2:
            row_mean = error.mean(dim=1, keepdim=True)
            return row_mean.to(original_weight.dtype)
        else:
-             # For bias/batchnorm etc., use scalar correction
            return error.mean().to(original_weight.dtype)

def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_format, lora_rank=128, architecture="auto", progress=gr.Progress()):
-     progress(0.1, desc="Starting FP8 conversion with LoRA extraction...")
    try:
        def read_safetensors_metadata(path):
            with open(path, 'rb') as f:
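For contrast with the error-based helper added later in this commit, the removed `low_rank_decomposition` above factorized the weight matrix itself, so everything beyond the kept rank was simply discarded. A minimal standalone sketch of that removed idea (illustrative shapes, plain PyTorch, not the app's exact code path):

```python
import torch

# Sketch of the removed approach: factor the weight itself, not the quantization error.
W = torch.randn(512, 256)
U, S, Vh = torch.linalg.svd(W.float(), full_matrices=False)
r = 128
A = Vh[:r, :]                      # (r, in_features)
B = U[:, :r] @ torch.diag(S[:r])   # (out_features, r)

# B @ A only approximates W; the truncated singular values are lost entirely,
# which is why the new code in this commit factorizes the (much smaller) FP8 error instead.
print((W - B @ A).norm() / W.norm())
```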
@@ -89,105 +219,111 @@ def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_forma
        state_dict = load_file(safetensors_path)
        progress(0.4, desc="Loaded weights.")

-         if fp8_format == "e5m2":
-             fp8_dtype = torch.float8_e5m2
-         else:
-             fp8_dtype = torch.float8_e4m3fn

        sd_fp8 = {}
        lora_weights = {}
        correction_factors = {}
-         total = len(state_dict)
        stats = {
-             "total_layers": total,
            "eligible_layers": 0,
            "processed_layers": 0,
            "correction_layers": 0,
            "skipped_layers": [],
-             "architecture_detected": ""
        }

-         # Auto-detect architecture if needed
-         if architecture == "auto":
-             model_keys = " ".join(state_dict.keys()).lower()
-             if "text" in model_keys or "emb" in model_keys:
-                 architecture = "text_encoder"
-             elif "vae" in model_keys or "encoder" in model_keys or "decoder" in model_keys:
-                 architecture = "vae"
-             elif "attn" in model_keys or "transformer" in model_keys:
-                 architecture = "transformer"
-             else:
-                 architecture = "all"
-
-         stats["architecture_detected"] = architecture
-         use_correction = architecture == "vae"

        for i, key in enumerate(state_dict):
            progress(0.4 + 0.4 * (i / total), desc=f"Processing {i+1}/{total}...")
            weight = state_dict[key]
-             lower_key = key.lower()

            if weight.dtype in [torch.float16, torch.float32, torch.bfloat16]:
-                 fp8_weight = weight.to(fp8_dtype)
-                 sd_fp8[key] = fp8_weight

-                 # Determine if this layer should be processed based on architecture
-                 should_process = False

-                 if architecture == "text_encoder":
-                     should_process = "text" in lower_key or "emb" in lower_key or "encoder" in lower_key or "attn" in lower_key
-                 elif architecture == "transformer":
-                     should_process = "attn" in lower_key or "transformer" in lower_key or "mlp" in lower_key
-                 elif architecture == "vae":
-                     should_process = "vae" in lower_key or "decoder" in lower_key or "encoder" in lower_key or "conv" in lower_key
-                 elif architecture == "all":
-                     should_process = True
-                 else:  # "auto" fallback
-                     should_process = True

                if should_process:
-                     if use_correction:
-                         # For VAE, use correction factors instead of LoRA
-                         corr = extract_correction_factors(weight, fp8_weight)
-                         if corr is not None:
-                             correction_factors[f"correction.{key}"] = corr
-                             stats["correction_layers"] += 1
-                             stats["processed_layers"] += 1
-                     else:
-                         # For other architectures, use LoRA
-                         stats["eligible_layers"] += 1

-                         # Handle 2D tensors with standard LoRA
-                         if weight.ndim == 2:
                            try:
-                                 # Adjust rank for smaller matrices
-                                 adjusted_rank = lora_rank
-                                 if min(weight.shape) < lora_rank:
-                                     adjusted_rank = max(8, min(weight.shape) // 2)

-                                 A, B = low_rank_decomposition(weight, rank=adjusted_rank)
                                if A is not None and B is not None:
-                                     lora_weights[f"lora_A.{key}"] = A
-                                     lora_weights[f"lora_B.{key}"] = B
                                    stats["processed_layers"] += 1
                                else:
                                    stats["skipped_layers"].append(f"{key}: decomposition failed")
                            except Exception as e:
                                stats["skipped_layers"].append(f"{key}: error - {str(e)}")
-                         # Skip 4D tensors for non-VAE architectures
-                         elif weight.ndim == 4:
-                             stats["skipped_layers"].append(f"{key}: 4D tensor skipped for non-VAE architecture")
            else:
                sd_fp8[key] = weight
                stats["skipped_layers"].append(f"{key}: non-float dtype")

        base_name = os.path.splitext(os.path.basename(safetensors_path))[0]
        fp8_path = os.path.join(output_dir, f"{base_name}-fp8-{fp8_format}.safetensors")

-         # Save FP8 model
        save_file(sd_fp8, fp8_path, metadata={"format": "pt", "fp8_format": fp8_format, **metadata})

-         # Save LoRA weights if any were generated
        if lora_weights:
            lora_path = os.path.join(output_dir, f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors")
            lora_metadata = {
@@ -199,7 +335,6 @@ def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_forma
            }
            save_file(lora_weights, lora_path, metadata=lora_metadata)

-         # Save correction factors if any were generated (for VAE)
        if correction_factors:
            correction_path = os.path.join(output_dir, f"{base_name}-correction-{architecture}.safetensors")
            correction_metadata = {
@@ -210,24 +345,25 @@ def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_forma
            }
            save_file(correction_factors, correction_path, metadata=correction_metadata)

-         progress(0.9, desc="Saved FP8 and LoRA/correction files.")
-         progress(1.0, desc="✅ FP8 + LoRA/correction extraction complete!")

        stats_msg = f"FP8 ({fp8_format}) with precision recovery saved.\n"
-         stats_msg += f"Architecture detected: {stats['architecture_detected']}\n"

-         if use_correction:
            stats_msg += f"Correction factors generated for {stats['correction_layers']} layers."
        else:
-             stats_msg += f"Processed {stats['processed_layers']}/{stats['eligible_layers']} eligible layers with LoRA rank {lora_rank}."

        if stats['processed_layers'] == 0 and stats['correction_layers'] == 0:
-             stats_msg += "\n⚠️ No precision recovery weights were generated. Try a different architecture selection or parameters."

        return True, stats_msg, stats

    except Exception as e:
-         import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return False, error_msg, None
@@ -336,12 +472,13 @@ def process_and_upload_fp8(

        # Determine which precision recovery file was generated
        precision_recovery_file = ""
-         precision_recovery_type = "LoRA"
-         if stats.get("correction_layers", 0) > 0:
            precision_recovery_file = f"{base_name}-correction-{architecture}.safetensors"
            precision_recovery_type = "Correction Factors"
-         elif stats.get("processed_layers", 0) > 0:
            precision_recovery_file = f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors"

        readme = f"""---
library_name: diffusers
@@ -358,42 +495,51 @@ tags:
- **FP8 Format**: `{fp8_format.upper()}`
- **Architecture**: {architecture}
- **Precision Recovery Type**: {precision_recovery_type}
- - **Precision Recovery File**: `{precision_recovery_file}`
- **FP8 File**: `{fp8_filename}`

## Usage (Inference)
```python
from safetensors.torch import load_file
import torch

# Load FP8 model
fp8_state = load_file("{fp8_filename}")
- # Load precision recovery file
- recovery_state = load_file("{precision_recovery_file}") if "{precision_recovery_file}" else {{}}

# Reconstruct high-precision weights
reconstructed = {{}}
for key in fp8_state:
-     fp8_weight = fp8_state[key].to(torch.float32)

    if recovery_state:
        # For LoRA approach
-         if "lora_A" in recovery_state:
-             if f"lora_A.{{key}}" in recovery_state and f"lora_B.{{key}}" in recovery_state:
-                 A = recovery_state[f"lora_A.{{key}}"].to(torch.float32)
-                 B = recovery_state[f"lora_B.{{key}}"].to(torch.float32)
-                 lora_weight = B @ A
-                 reconstructed[key] = fp8_weight + lora_weight
-             else:
-                 reconstructed[key] = fp8_weight
        # For correction factor approach
        elif f"correction.{{key}}" in recovery_state:
            correction = recovery_state[f"correction.{{key}}"].to(torch.float32)
-             reconstructed[key] = fp8_weight + correction
        else:
-             reconstructed[key] = fp8_weight
    else:
-         reconstructed[key] = fp8_weight
```
- > Requires PyTorch ≥ 2.1 for FP8 support.

"""
-
        with open(os.path.join(output_dir, "README.md"), "w") as f:
            f.write(readme)

@@ -407,17 +553,22 @@ for key in fp8_state:
        )

        progress(1.0, desc="✅ Done!")
        result_html = f"""
✅ Success!
Model uploaded to: <a href="{repo_url_final}" target="_blank">{new_repo_id}</a>
- Includes: FP8 model + precision recovery ({precision_recovery_type}).
"""

        return gr.HTML(result_html), "✅ FP8 + precision recovery upload successful!", msg

    except Exception as e:
-         import traceback
-         error_details = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
-         return None, error_details, ""

    finally:
        if temp_dir:
@@ -425,8 +576,8 @@ Includes: FP8 model + precision recovery ({precision_recovery_type}).
            shutil.rmtree(output_dir, ignore_errors=True)

with gr.Blocks(title="FP8 + Precision Recovery Extractor") as demo:
-     gr.Markdown("# 🔄 FP8 Pruner with Architecture-Specific Precision Recovery")
-     gr.Markdown("Convert `.safetensors` → **FP8** + **precision recovery** (LoRA or correction factors). Supports Hugging Face ↔ ModelScope.")

    with gr.Row():
        with gr.Column():
@@ -436,13 +587,15 @@ with gr.Blocks(title="FP8 + Precision Recovery Extractor") as demo:

            with gr.Accordion("Advanced Settings", open=True):
                fp8_format = gr.Radio(["e4m3fn", "e5m2"], value="e5m2", label="FP8 Format")
-                 lora_rank = gr.Slider(minimum=8, maximum=256, step=8, value=128, label="LoRA Rank (for text/transformers)")
                architecture = gr.Dropdown(
                    choices=[
                        ("Auto-detect architecture", "auto"),
                        ("Text Encoder (LoRA)", "text_encoder"),
                        ("Transformer blocks (LoRA)", "transformer"),
                        ("VAE (Correction Factors)", "vae"),
                        ("All layers (LoRA where applicable)", "all")
                    ],
                    value="auto",
@@ -451,11 +604,12 @@ with gr.Blocks(title="FP8 + Precision Recovery Extractor") as demo:

            with gr.Accordion("Authentication", open=False):
                hf_token = gr.Textbox(label="Hugging Face Token", type="password")
-                 modelscope_token = gr.Textbox(label="ModelScope Token (optional)", type="password", visible=MODELScope_AVAILABLE)

        with gr.Column():
            target_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Target")
-             new_repo_id = gr.Textbox(label="New Repo ID", placeholder="user/model-fp8")
            private_repo = gr.Checkbox(label="Private Repository (HF only)", value=False)

    status_output = gr.Markdown()
@@ -485,31 +639,39 @@ with gr.Blocks(title="FP8 + Precision Recovery Extractor") as demo:

    gr.Examples(
        examples=[
-             ["huggingface", "https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/text_encoder", "model.safetensors", "e5m2", 96, "text_encoder", "huggingface"],
-             ["huggingface", "https://huggingface.co/stabilityai/sdxl-vae", "diffusion_pytorch_model.safetensors", "e4m3fn", 64, "vae", "huggingface"],
-             ["huggingface", "https://huggingface.co/Yabo/FramePainter/tree/main", "unet_diffusion_pytorch_model.safetensors", "e5m2", 128, "transformer", "huggingface"]
        ],
-         inputs=[source_type, repo_url, safetensors_filename, fp8_format, lora_rank, architecture, target_type],
        label="Example Conversions"
    )

    gr.Markdown("""
-     ## 💡 Architecture-Specific Precision Recovery

-     This tool automatically selects the best precision recovery method based on architecture:

-     - **Text Encoder & Transformers**: Uses **LoRA decomposition** (best for attention layers)
-       - Higher ranks (96-128) recommended for text encoders
-       - Medium ranks (64-128) for transformers

-     - **VAE**: Uses **per-channel correction factors** (better for convolutional layers)
-       - No rank parameter needed - automatically computes channel-wise corrections
-       - Works with 4D convolutional weights that LoRA cannot handle well

-     - **Auto-detect**: Analyzes model structure to select appropriate method

-     > **Note**: VAE models typically contain 4D convolutional weights that don't work well with standard LoRA.
-     > The correction factor approach used for VAE matches the successful method from the attached file.
    """)

demo.launch()
 
from safetensors.torch import load_file, save_file
import torch
import torch.nn.functional as F
+ import traceback
+ import math
try:
    from modelscope.hub.file_download import model_file_download as ms_file_download
    from modelscope.hub.api import HubApi as ModelScopeApi

except ImportError:
    MODELScope_AVAILABLE = False

+ def get_fp8_dtype(fp8_format):
+     """Get torch FP8 dtype."""
+     if fp8_format == "e5m2":
+         return torch.float8_e5m2
+     else:
+         return torch.float8_e4m3fn
+
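`get_fp8_dtype` maps the UI choice onto one of PyTorch's two FP8 dtypes, which trade precision for range. A quick way to see the difference (a minimal sketch, assuming PyTorch ≥ 2.1, where these dtypes and their `finfo` are available):

```python
import torch

# e4m3fn keeps more mantissa bits (finer steps); e5m2 keeps more exponent bits (wider range).
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(dtype, "max:", info.max, "eps:", info.eps)
```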
+ def quantize_and_get_error(weight, fp8_dtype):
+     """Quantize weight to FP8 and return both quantized weight and error."""
+     weight_fp8 = weight.to(fp8_dtype)
+     weight_dequantized = weight_fp8.to(weight.dtype)
+     error = weight - weight_dequantized
+     return weight_fp8, error
+
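`quantize_and_get_error` relies on the fact that casting to FP8 and back is lossy, but the loss is recoverable if the difference is stored alongside the FP8 copy. A minimal sketch of that round trip (assumes PyTorch ≥ 2.1):

```python
import torch

# The dequantized FP8 copy plus the recorded error reproduces the original weight.
w = torch.randn(4, 4, dtype=torch.float32)
w_fp8 = w.to(torch.float8_e4m3fn)        # lossy FP8 cast
error = w - w_fp8.to(torch.float32)      # what the cast threw away
restored = w_fp8.to(torch.float32) + error
assert torch.allclose(restored, w)       # adding the error back recovers the original
```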
+ def low_rank_decomposition_error(error_tensor, rank=32, min_error_threshold=1e-6):
+     """Decompose error tensor with proper rank reduction."""
+     if error_tensor.ndim not in [2, 4]:
        return None, None

    try:
+         # Calculate error magnitude
+         error_norm = torch.norm(error_tensor.float())
+         if error_norm < min_error_threshold:
            return None, None

+         # For 2D tensors (linear layers)
+         if error_tensor.ndim == 2:
+             U, S, Vh = torch.linalg.svd(error_tensor.float(), full_matrices=False)
+
+             # Calculate rank based on variance explained (keep 95% of error)
+             total_variance = torch.sum(S ** 2)
+             cumulative = torch.cumsum(S ** 2, dim=0)
+             keep_components = torch.sum(cumulative <= 0.95 * total_variance).item() + 1
+
+             # Limit rank to much smaller than original
+             max_rank = min(error_tensor.shape)
+             actual_rank = min(rank, keep_components, max_rank // 2)
+
+             if actual_rank < 2:
+                 return None, None
+
+             A = Vh[:actual_rank, :].contiguous()
+             B = U[:, :actual_rank] @ torch.diag(S[:actual_rank]).contiguous()
+
+             return A, B

+         # For 4D convolutions
+         elif error_tensor.ndim == 4:
+             out_ch, in_ch, kH, kW = error_tensor.shape
+
+             # Reshape to 2D for decomposition
+             error_2d = error_tensor.view(out_ch, in_ch * kH * kW)
+             U, S, Vh = torch.linalg.svd(error_2d.float(), full_matrices=False)
+
+             # Calculate rank based on variance explained (90% for conv)
+             total_variance = torch.sum(S ** 2)
+             cumulative = torch.cumsum(S ** 2, dim=0)
+             keep_components = torch.sum(cumulative <= 0.90 * total_variance).item() + 1
+
+             # Use even lower rank for conv
+             max_rank = min(error_2d.shape)
+             actual_rank = min(rank // 2, keep_components, max_rank // 4)
+
+             if actual_rank < 2:
+                 return None, None
+
+             A = Vh[:actual_rank, :].contiguous()
+             B = U[:, :actual_rank] @ torch.diag(S[:actual_rank]).contiguous()
+
+             # Reshape back for convolutional format
+             if kH == 1 and kW == 1:
+                 B = B.view(out_ch, actual_rank, 1, 1)
+                 A = A.view(actual_rank, in_ch, 1, 1)
+             else:
+                 B = B.view(out_ch, actual_rank, 1, 1)
+                 A = A.view(actual_rank, in_ch, kH, kW)
+
+             return A, B
+
    except Exception as e:
+         print(f"Error decomposition failed: {e}")
+
+     return None, None
106
  def extract_correction_factors(original_weight, fp8_weight):
107
+ """Extract simple correction factors for VAE."""
108
  with torch.no_grad():
 
109
  orig = original_weight.float()
110
  quant = fp8_weight.float()
 
 
111
  error = orig - quant
112
 
 
113
  error_norm = torch.norm(error)
114
  orig_norm = torch.norm(orig)
115
+ if orig_norm > 1e-6 and error_norm / orig_norm < 0.001:
116
  return None
117
 
118
+ # For 4D tensors (VAE), compute per-channel correction
119
  if orig.ndim == 4:
 
 
120
  channel_mean = error.mean(dim=tuple(i for i in range(1, orig.ndim)), keepdim=True)
121
  return channel_mean.to(original_weight.dtype)
 
122
  elif orig.ndim == 2:
123
  row_mean = error.mean(dim=1, keepdim=True)
124
  return row_mean.to(original_weight.dtype)
125
  else:
 
126
  return error.mean().to(original_weight.dtype)
127
 
128
+ def get_architecture_settings(architecture, base_rank):
129
+ """Get optimal settings for different architectures."""
130
+ settings = {
131
+ "text_encoder": {
132
+ "rank": base_rank,
133
+ "error_threshold": 5e-5,
134
+ "min_rank": 8,
135
+ "max_rank_factor": 0.4,
136
+ "method": "lora"
137
+ },
138
+ "transformer": {
139
+ "rank": base_rank,
140
+ "error_threshold": 1e-5,
141
+ "min_rank": 12,
142
+ "max_rank_factor": 0.35,
143
+ "method": "lora"
144
+ },
145
+ "vae": {
146
+ "rank": base_rank // 2,
147
+ "error_threshold": 1e-4,
148
+ "min_rank": 4,
149
+ "max_rank_factor": 0.3,
150
+ "method": "correction"
151
+ },
152
+ "unet_conv": {
153
+ "rank": base_rank // 3,
154
+ "error_threshold": 2e-5,
155
+ "min_rank": 8,
156
+ "max_rank_factor": 0.25,
157
+ "method": "lora"
158
+ },
159
+ "auto": {
160
+ "rank": base_rank,
161
+ "error_threshold": 1e-5,
162
+ "min_rank": 8,
163
+ "max_rank_factor": 0.3,
164
+ "method": "lora"
165
+ },
166
+ "all": {
167
+ "rank": base_rank,
168
+ "error_threshold": 1e-5,
169
+ "min_rank": 8,
170
+ "max_rank_factor": 0.3,
171
+ "method": "lora"
172
+ }
173
+ }
174
+
175
+ return settings.get(architecture, settings["auto"])
176
+
177
+ def should_process_layer(key, weight, architecture):
178
+ """Determine if layer should be processed for LoRA/correction."""
179
+ lower_key = key.lower()
180
+
181
+ # Skip biases and normalization layers
182
+ if 'bias' in key or 'norm' in key.lower() or 'bn' in key.lower():
183
+ return False
184
+
185
+ if weight.numel() < 100:
186
+ return False
187
+
188
+ # Architecture-specific filtering
189
+ if architecture == "text_encoder":
190
+ return ('text' in lower_key or 'emb' in lower_key or
191
+ 'encoder' in lower_key or 'attn' in lower_key)
192
+ elif architecture == "transformer":
193
+ return ('attn' in lower_key or 'transformer' in lower_key or
194
+ 'mlp' in lower_key or 'to_out' in lower_key)
195
+ elif architecture == "vae":
196
+ return ('vae' in lower_key or 'encoder' in lower_key or
197
+ 'decoder' in lower_key or 'conv' in lower_key)
198
+ elif architecture == "unet_conv":
199
+ return ('conv' in lower_key or 'resnet' in lower_key or
200
+ 'downsample' in lower_key or 'upsample' in lower_key)
201
+ elif architecture in ["all", "auto"]:
202
+ return True
203
+
204
+ return False
205
+
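As a quick illustration of the filter above (the tensor names are made up, and `should_process_layer` from this diff is assumed to be in scope), only sufficiently large non-bias, non-norm weights matching the architecture keywords pass:

```python
import torch

# Illustrative keys; assumes should_process_layer defined above is importable/in scope.
state_dict = {
    "text_model.encoder.layers.0.self_attn.q_proj.weight": torch.randn(768, 768),
    "text_model.encoder.layers.0.self_attn.q_proj.bias": torch.randn(768),
    "text_model.encoder.layers.0.layer_norm1.weight": torch.randn(768),
}
eligible = [k for k, w in state_dict.items()
            if should_process_layer(k, w, "text_encoder")]
# Only q_proj.weight survives: the bias and the norm weight are filtered out.
```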
def convert_safetensors_to_fp8_with_lora(safetensors_path, output_dir, fp8_format, lora_rank=128, architecture="auto", progress=gr.Progress()):
+     progress(0.1, desc="Starting FP8 conversion with error recovery...")
    try:
        def read_safetensors_metadata(path):
            with open(path, 'rb') as f:

        state_dict = load_file(safetensors_path)
        progress(0.4, desc="Loaded weights.")

+         # Auto-detect architecture if needed
+         if architecture == "auto":
+             model_keys = " ".join(state_dict.keys()).lower()
+             if "vae" in model_keys or ("encoder" in model_keys and "decoder" in model_keys):
+                 architecture = "vae"
+             elif "text" in model_keys or "emb" in model_keys:
+                 architecture = "text_encoder"
+             elif "attn" in model_keys or "transformer" in model_keys:
+                 architecture = "transformer"
+             elif "conv" in model_keys or "resnet" in model_keys:
+                 architecture = "unet_conv"
+             else:
+                 architecture = "all"
+
+         settings = get_architecture_settings(architecture, lora_rank)
+         fp8_dtype = get_fp8_dtype(fp8_format)

        sd_fp8 = {}
        lora_weights = {}
        correction_factors = {}
        stats = {
+             "total_layers": len(state_dict),
            "eligible_layers": 0,
+             "layers_with_error": 0,
            "processed_layers": 0,
            "correction_layers": 0,
            "skipped_layers": [],
+             "architecture": architecture,
+             "method": settings["method"],
+             "error_magnitudes": []
        }

+         total = len(state_dict)

        for i, key in enumerate(state_dict):
            progress(0.4 + 0.4 * (i / total), desc=f"Processing {i+1}/{total}...")
            weight = state_dict[key]

            if weight.dtype in [torch.float16, torch.float32, torch.bfloat16]:
+                 # Quantize to FP8 and calculate error
+                 weight_fp8, error = quantize_and_get_error(weight, fp8_dtype)
+                 sd_fp8[key] = weight_fp8
+
+                 # Calculate error magnitude
+                 error_norm = torch.norm(error.float())
+                 weight_norm = torch.norm(weight.float())
+                 relative_error = (error_norm / weight_norm).item() if weight_norm > 0 else 0

+                 stats["error_magnitudes"].append({
+                     "key": key,
+                     "relative_error": relative_error
+                 })

+                 # Check if layer should be processed
+                 should_process = should_process_layer(key, weight, architecture)

                if should_process:
+                     stats["eligible_layers"] += 1
+
+                     # Only process if error is significant
+                     if relative_error > settings["error_threshold"]:
+                         stats["layers_with_error"] += 1

+                         if settings["method"] == "correction":
+                             # Use correction factors for VAE
+                             correction = extract_correction_factors(weight, weight_fp8)
+                             if correction is not None:
+                                 correction_factors[f"correction.{key}"] = correction
+                                 stats["correction_layers"] += 1
+                                 stats["processed_layers"] += 1
+                         else:
+                             # Use LoRA decomposition for other architectures
                            try:
+                                 A, B = low_rank_decomposition_error(
+                                     error,
+                                     rank=settings["rank"],
+                                     min_error_threshold=settings["error_threshold"]
+                                 )

                                if A is not None and B is not None:
+                                     lora_weights[f"lora_A.{key}"] = A.to(torch.float16)
+                                     lora_weights[f"lora_B.{key}"] = B.to(torch.float16)
                                    stats["processed_layers"] += 1
                                else:
                                    stats["skipped_layers"].append(f"{key}: decomposition failed")
                            except Exception as e:
                                stats["skipped_layers"].append(f"{key}: error - {str(e)}")
+                     else:
+                         stats["skipped_layers"].append(f"{key}: error too small ({relative_error:.6f})")
            else:
                sd_fp8[key] = weight
                stats["skipped_layers"].append(f"{key}: non-float dtype")

+         # Calculate average error
+         if stats["error_magnitudes"]:
+             errors = [e["relative_error"] for e in stats["error_magnitudes"]]
+             stats["avg_error"] = sum(errors) / len(errors) if errors else 0
+             stats["max_error"] = max(errors) if errors else 0
+
        base_name = os.path.splitext(os.path.basename(safetensors_path))[0]
        fp8_path = os.path.join(output_dir, f"{base_name}-fp8-{fp8_format}.safetensors")

        save_file(sd_fp8, fp8_path, metadata={"format": "pt", "fp8_format": fp8_format, **metadata})

+         # Save precision recovery weights
        if lora_weights:
            lora_path = os.path.join(output_dir, f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors")
            lora_metadata = {

            }
            save_file(lora_weights, lora_path, metadata=lora_metadata)

        if correction_factors:
            correction_path = os.path.join(output_dir, f"{base_name}-correction-{architecture}.safetensors")
            correction_metadata = {

            }
            save_file(correction_factors, correction_path, metadata=correction_metadata)

+         progress(0.9, desc="Saved FP8 and precision recovery files.")
+         progress(1.0, desc="✅ FP8 + precision recovery extraction complete!")

        stats_msg = f"FP8 ({fp8_format}) with precision recovery saved.\n"
+         stats_msg += f"Architecture: {architecture}\n"
+         stats_msg += f"Method: {settings['method']}\n"
+         stats_msg += f"Average quantization error: {stats.get('avg_error', 0):.6f}\n"

+         if settings["method"] == "correction":
            stats_msg += f"Correction factors generated for {stats['correction_layers']} layers."
        else:
+             stats_msg += f"LoRA generated for {stats['processed_layers']}/{stats['eligible_layers']} eligible layers (rank {lora_rank})."

        if stats['processed_layers'] == 0 and stats['correction_layers'] == 0:
+             stats_msg += "\n⚠️ No precision recovery weights were generated. FP8 quantization error may be too small."

        return True, stats_msg, stats

    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return False, error_msg, None

 
473
  # Determine which precision recovery file was generated
474
  precision_recovery_file = ""
475
+ precision_recovery_type = ""
476
+ if stats.get("method") == "correction" and stats.get("correction_layers", 0) > 0:
477
  precision_recovery_file = f"{base_name}-correction-{architecture}.safetensors"
478
  precision_recovery_type = "Correction Factors"
479
+ elif stats.get("method") == "lora" and stats.get("processed_layers", 0) > 0:
480
  precision_recovery_file = f"{base_name}-lora-r{lora_rank}-{architecture}.safetensors"
481
+ precision_recovery_type = "LoRA"
482
 
483
  readme = f"""---
484
  library_name: diffusers
 
495
  - **FP8 Format**: `{fp8_format.upper()}`
496
  - **Architecture**: {architecture}
497
  - **Precision Recovery Type**: {precision_recovery_type}
498
+ - **Precision Recovery File**: `{precision_recovery_file}` if available
499
  - **FP8 File**: `{fp8_filename}`
500
+
501
  ## Usage (Inference)
502
  ```python
503
  from safetensors.torch import load_file
504
  import torch
505
+
506
  # Load FP8 model
507
  fp8_state = load_file("{fp8_filename}")
508
+
509
+ # Load precision recovery file if available
510
+ recovery_state = {{}}
511
+ if "{precision_recovery_file}":
512
+ recovery_state = load_file("{precision_recovery_file}")
513
+
514
  # Reconstruct high-precision weights
515
  reconstructed = {{}}
516
  for key in fp8_state:
517
+ # Dequantize FP8 to target precision
518
+ fp_weight = fp8_state[key].to(torch.float32)
519
+
520
  if recovery_state:
521
  # For LoRA approach
522
+ if f"lora_A.{{key}}" in recovery_state and f"lora_B.{{key}}" in recovery_state:
523
+ A = recovery_state[f"lora_A.{{key}}"].to(torch.float32)
524
+ B = recovery_state[f"lora_B.{{key}}"].to(torch.float32)
525
+ error_correction = B @ A
526
+ reconstructed[key] = fp_weight + error_correction
 
 
 
527
  # For correction factor approach
528
  elif f"correction.{{key}}" in recovery_state:
529
  correction = recovery_state[f"correction.{{key}}"].to(torch.float32)
530
+ reconstructed[key] = fp_weight + correction
531
  else:
532
+ reconstructed[key] = fp_weight
533
  else:
534
+ reconstructed[key] = fp_weight
535
+
536
+ print("Model reconstructed with FP8 error recovery")
537
  ```
538
+
539
+ > **Note**: This precision recovery targets FP8 quantization errors.
540
+ > Average quantization error: {stats.get('avg_error', 0):.6f}
541
  """
542
+
543
  with open(os.path.join(output_dir, "README.md"), "w") as f:
544
  f.write(readme)
545
 
 
        )

        progress(1.0, desc="✅ Done!")
+
        result_html = f"""
✅ Success!
Model uploaded to: <a href="{repo_url_final}" target="_blank">{new_repo_id}</a>
+ Includes: FP8 model + precision recovery ({precision_recovery_type}).
+ Average quantization error: {stats.get('avg_error', 0):.6f}
"""
+
+         if stats['processed_layers'] > 0 or stats['correction_layers'] > 0:
+             result_html += f"<br>Precision recovery applied to {stats['processed_layers'] + stats['correction_layers']} layers."
+
        return gr.HTML(result_html), "✅ FP8 + precision recovery upload successful!", msg

    except Exception as e:
+         error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
+         return None, error_msg, ""

    finally:
        if temp_dir:

            shutil.rmtree(output_dir, ignore_errors=True)

with gr.Blocks(title="FP8 + Precision Recovery Extractor") as demo:
+     gr.Markdown("# 🔄 FP8 Converter with Architecture-Specific Precision Recovery")
+     gr.Markdown("Convert models to **FP8** with **error-based precision recovery**.")

    with gr.Row():
        with gr.Column():

            with gr.Accordion("Advanced Settings", open=True):
                fp8_format = gr.Radio(["e4m3fn", "e5m2"], value="e5m2", label="FP8 Format")
+                 lora_rank = gr.Slider(minimum=8, maximum=256, step=8, value=128,
+                                       label="LoRA Rank (for text/transformers)")
                architecture = gr.Dropdown(
                    choices=[
                        ("Auto-detect architecture", "auto"),
                        ("Text Encoder (LoRA)", "text_encoder"),
                        ("Transformer blocks (LoRA)", "transformer"),
                        ("VAE (Correction Factors)", "vae"),
+                         ("UNet Convolutions (LoRA)", "unet_conv"),
                        ("All layers (LoRA where applicable)", "all")
                    ],
                    value="auto",

            with gr.Accordion("Authentication", open=False):
                hf_token = gr.Textbox(label="Hugging Face Token", type="password")
+                 modelscope_token = gr.Textbox(label="ModelScope Token (optional)", type="password",
+                                               visible=MODELScope_AVAILABLE)

        with gr.Column():
            target_type = gr.Radio(["huggingface", "modelscope"], value="huggingface", label="Target")
+             new_repo_id = gr.Textbox(label="New Repo ID", placeholder="user/model-fp8-precision")
            private_repo = gr.Checkbox(label="Private Repository (HF only)", value=False)

    status_output = gr.Markdown()

    gr.Examples(
        examples=[
+             ["huggingface", "https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/text_encoder",
+              "model.safetensors", "e5m2", 96, "text_encoder"],
+             ["huggingface", "https://huggingface.co/stabilityai/sdxl-vae",
+              "diffusion_pytorch_model.safetensors", "e4m3fn", 64, "vae"],
+             ["huggingface", "https://huggingface.co/Yabo/FramePainter/tree/main",
+              "unet_diffusion_pytorch_model.safetensors", "e5m2", 128, "transformer"]
        ],
+         inputs=[source_type, repo_url, safetensors_filename, fp8_format, lora_rank, architecture],
        label="Example Conversions"
    )

    gr.Markdown("""
+     ## 🎯 What This Tool Does
+
+     Unlike traditional LoRA fine-tuning, this tool:

+     1. **Quantizes** the model to FP8 (loses precision)
+     2. **Measures** the quantization error for each weight
+     3. **Extracts recovery weights** that specifically recover this error
+     4. **Only applies** recovery where error is significant (>0.001%)

+     ## 💡 Recommended Settings

+     - **Text Encoders**: rank 64-96 (text is sensitive)
+     - **Transformers**: rank 96-128
+     - **VAE**: Uses correction factors (no rank needed)
+     - **UNet Convolutions**: rank 32-64

+     ## ⚠️ Important Notes

+     - This recovers **FP8 quantization errors**, not fine-tuning changes
+     - If FP8 error is tiny (<0.0001%), recovery may not be generated
+     - Higher rank ≠ better for error recovery (use recommended ranges)
    """)

demo.launch()
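The four steps listed in the help text can be exercised end to end on a toy state dict. A minimal sketch that mirrors the quantize → measure → decompose → recover flow (hypothetical layer name, illustrative rank, assumes PyTorch ≥ 2.1):

```python
import torch

# Toy end-to-end sketch of the FP8 + error-recovery idea on a single linear weight.
torch.manual_seed(0)
state_dict = {"blocks.0.attn.to_q.weight": torch.randn(256, 256, dtype=torch.float16)}

fp8_sd, lora_sd = {}, {}
for key, w in state_dict.items():
    w_fp8 = w.to(torch.float8_e4m3fn)          # 1. quantize
    error = (w - w_fp8.to(w.dtype)).float()    # 2. measure the quantization error
    U, S, Vh = torch.linalg.svd(error, full_matrices=False)
    r = 32                                     # 3. low-rank factorization of the error
    lora_sd[f"lora_A.{key}"] = Vh[:r, :].to(torch.float16)
    lora_sd[f"lora_B.{key}"] = (U[:, :r] @ torch.diag(S[:r])).to(torch.float16)
    fp8_sd[key] = w_fp8

# 4. recovery at load time: dequantized FP8 weight + B @ A
key = "blocks.0.attn.to_q.weight"
recovered = fp8_sd[key].to(torch.float32) + (
    lora_sd[f"lora_B.{key}"].float() @ lora_sd[f"lora_A.{key}"].float()
)
print((recovered - state_dict[key].float()).norm() / state_dict[key].float().norm())
```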