Skip to content

API Reference: Benchmarks

Comparison Suite

compare_to

╔══════════════════════════════════════════════════════════════════════════════╗ ║ MHRQI - Multi-scale Hierarchical Representation of Quantum Images ║ ║ Classical Denoiser Comparison: BM3D, NL-Means, SRAD ║ ║ ║ ║ Author: Keno S. Jose ║ ║ License: Apache 2.0 ║ ╚══════════════════════════════════════════════════════════════════════════════╝

BenchmarkSuite

Class-based interface for running classical denoiser benchmarks.

Source code in mhrqi/benchmarks/compare_to.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
class BenchmarkSuite:
    """
    Class-based interface for running classical denoiser benchmarks.

    On construction the noisy (and optional reference) image is normalized to
    float [0, 1], the output directory is created when requested, and a
    homogeneous ROI is pre-computed for the region-based speckle metrics.
    """

    def __init__(self, noisy_image, reference_image=None, save_dir=None):
        self.noisy_image = to_float01(noisy_image)
        self.reference_image = None if reference_image is None else to_float01(reference_image)
        self.save_dir = save_dir
        self.results = []

        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)

        # Fall back to the whole image when no homogeneous window is found.
        try:
            self.roi = auto_homogeneous_roi(self.noisy_image)
        except RuntimeError:
            height, width = self.noisy_image.shape[0], self.noisy_image.shape[1]
            self.roi = (0, 0, height, width)

    def run(self, methods="all", proposed_image=None):
        """
        Run benchmarks and return results.
        """
        denoisers = {"bm3d": denoise_bm3d, "nlmeans": denoise_nlmeans, "srad": denoise_srad}

        if methods == "all":
            selected = list(denoisers)
        elif isinstance(methods, str):
            selected = [methods]
        else:
            selected = methods

        # (label, callable) pairs; unknown method names are silently skipped.
        jobs = [("Original", None)]
        jobs.extend((name, denoisers[name]) for name in selected if name in denoisers)

        proposed_float = None
        if proposed_image is not None:
            jobs.append(("Proposed", None))
            proposed_float = to_float01(proposed_image)

        self.results = []
        for label, denoise in jobs:
            if label == "Original":
                output = self.noisy_image
            elif label == "Proposed":
                output = proposed_float
            else:
                output = denoise(self.noisy_image)

            self.results.append(
                {
                    "name": label,
                    "metrics": self._compute_metrics(output),
                    "image": to_uint8(output),
                }
            )
        return self.results

    def _compute_metrics(self, res_img):
        """Compute the no-reference metric set, plus FSIM/SSIM when a reference exists."""
        metrics = {}
        metrics["NIQE"] = plots.compute_niqe(res_img)
        metrics["SSI"] = plots.compute_ssi(self.noisy_image, res_img, self.roi)
        metrics["SMPI"] = plots.compute_smpi(self.noisy_image, res_img)
        metrics["ENL"] = plots.compute_enl(res_img, self.roi)
        metrics["CNR"] = plots.compute_cnr(res_img)[0]
        # OMQDI returns (combined, EPF, NSF) in that order.
        metrics["OMQDI"], metrics["EPF"], metrics["NSF"] = OMQDI(self.noisy_image, res_img)
        metrics["EPI"] = plots.compute_epi(self.noisy_image, res_img)

        if self.reference_image is None:
            # Full-reference metrics are undefined without a clean reference.
            metrics["FSIM"] = np.nan
            metrics["SSIM"] = np.nan
        else:
            metrics["FSIM"] = plots.compute_fsim(self.reference_image, res_img)
            metrics["SSIM"] = plots.compute_ssim(self.reference_image, res_img)
        return metrics

    def save_reports(self, prefix="report"):
        """Save comparison reports and images."""
        if not self.save_dir:
            raise ValueError("save_dir must be set to save reports")

        for entry in self.results:
            target = os.path.join(self.save_dir, f"{prefix}_{entry['name']}.png")
            cv2.imwrite(target, entry["image"])

        # The unfiltered original is the baseline, not a result row.
        shown = [r for r in self.results if r["name"] != "Original"]

        reports = []
        if self.reference_image is not None:
            reports.append(
                (
                    to_uint8(self.reference_image),
                    ["FSIM", "SSIM"],
                    "Full Reference Metrics",
                    f"{prefix}_full_ref",
                )
            )
        reports.append(
            (
                to_uint8(self.noisy_image),
                ["SSI", "SMPI", "NSF", "ENL", "CNR"],
                "Speckle Reduction Metrics",
                f"{prefix}_speckle",
            )
        )
        reports.append(
            (
                to_uint8(self.noisy_image),
                ["EPF", "EPI", "OMQDI"],
                "Structural Similarity Metrics",
                f"{prefix}_structural",
            )
        )

        for base_img, metric_names, title, stem in reports:
            plots.MetricsPlotter.save_summary_report(
                base_img, shown, metric_names, title, stem, self.save_dir
            )

run(methods='all', proposed_image=None)

Run benchmarks and return results.

Source code in mhrqi/benchmarks/compare_to.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def run(self, methods="all", proposed_image=None):
    """
    Run benchmarks and return results.
    """
    denoisers = {"bm3d": denoise_bm3d, "nlmeans": denoise_nlmeans, "srad": denoise_srad}

    if methods == "all":
        selected = list(denoisers)
    elif isinstance(methods, str):
        selected = [methods]
    else:
        selected = methods

    # (label, callable) pairs; unknown method names are silently skipped.
    jobs = [("Original", None)]
    jobs.extend((name, denoisers[name]) for name in selected if name in denoisers)

    proposed_float = None
    if proposed_image is not None:
        jobs.append(("Proposed", None))
        proposed_float = to_float01(proposed_image)

    self.results = []
    for label, denoise in jobs:
        if label == "Original":
            output = self.noisy_image
        elif label == "Proposed":
            output = proposed_float
        else:
            output = denoise(self.noisy_image)

        self.results.append(
            {"name": label, "metrics": self._compute_metrics(output), "image": to_uint8(output)}
        )
    return self.results

save_reports(prefix='report')

Save comparison reports and images.

Source code in mhrqi/benchmarks/compare_to.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def save_reports(self, prefix="report"):
    """Save comparison reports and images."""
    if not self.save_dir:
        raise ValueError("save_dir must be set to save reports")

    for entry in self.results:
        target = os.path.join(self.save_dir, f"{prefix}_{entry['name']}.png")
        cv2.imwrite(target, entry["image"])

    # The unfiltered original is the baseline, not a result row.
    shown = [r for r in self.results if r["name"] != "Original"]

    reports = []
    if self.reference_image is not None:
        reports.append(
            (
                to_uint8(self.reference_image),
                ["FSIM", "SSIM"],
                "Full Reference Metrics",
                f"{prefix}_full_ref",
            )
        )
    reports.append(
        (
            to_uint8(self.noisy_image),
            ["SSI", "SMPI", "NSF", "ENL", "CNR"],
            "Speckle Reduction Metrics",
            f"{prefix}_speckle",
        )
    )
    reports.append(
        (
            to_uint8(self.noisy_image),
            ["EPF", "EPI", "OMQDI"],
            "Structural Similarity Metrics",
            f"{prefix}_structural",
        )
    )

    for base_img, metric_names, title, stem in reports:
        plots.MetricsPlotter.save_summary_report(
            base_img, shown, metric_names, title, stem, self.save_dir
        )

En(lvlCoeffs, alpha=0.8)

Compute the weighted energy for one level of wavelet decomposition.

Parameters:

Name Type Description Default
lvlCoeffs tuple

Tuple of (LHn, HLn, HHn) coefficient arrays.

required
alpha

Weight for the HH sub-band relative to LH and HL. Recommended value 0.8 per ISBN 0139353224.

0.8

Returns:

Type Description
float

Weighted energy scalar.

Source code in mhrqi/benchmarks/compare_to.py
357
358
359
360
361
362
363
364
365
366
367
368
369
370
def En(lvlCoeffs: tuple, alpha=0.8) -> float:
    """
    Compute the weighted energy for one level of wavelet decomposition.

    Args:
        lvlCoeffs: Tuple of (LHn, HLn, HHn) coefficient arrays.
        alpha: Weight for the HH sub-band relative to LH and HL.
               Recommended value 0.8 per ISBN 0139353224.

    Returns:
        Weighted energy scalar.
    """
    LH, HL, HH = lvlCoeffs
    # The diagonal (HH) sub-band gets weight alpha; LH and HL share the rest.
    edge_energy = (1 - alpha) * (sbEn(LH) + sbEn(HL)) / 2
    diag_energy = alpha * sbEn(HH)
    return edge_energy + diag_energy

OMQDI(X, Y, C=1e-10)

Compute the Objective Measure of Quality of Denoised Images.

Reference: DOI 10.1016/j.bspc.2021.102962

Parameters:

Name Type Description Default
X array

Noisy input image of shape (M, N).

required
Y array

Denoised output image of shape (M, N).

required
C

Small constant to avoid division by zero.

1e-10

Returns:

Name Type Description
Tuple (OMQDI, Q1, Q2)
  • OMQDI: Combined metric Q1 + Q2, ideal value 2, range [1, 2].
  • Q1: Edge-Preservation Factor, ideal value 1, range [0, 1].
  • Q2: Noise-Suppression Factor, ideal value 1, range [0, 1].
Source code in mhrqi/benchmarks/compare_to.py
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
def OMQDI(X: np.array, Y: np.array, C=1e-10) -> tuple:
    """
    Compute the Objective Measure of Quality of Denoised Images.

    Reference: DOI 10.1016/j.bspc.2021.102962

    Args:
        X: Noisy input image of shape (M, N).
        Y: Denoised output image of shape (M, N).
        C: Small constant to avoid division by zero.

    Returns:
        Tuple (OMQDI, Q1, Q2):
            - OMQDI: Combined metric Q1 + Q2, ideal value 2, range [1, 2].
            - Q1: Edge-Preservation Factor, ideal value 1, range [0, 1].
            - Q2: Noise-Suppression Factor, ideal value 1, range [0, 1].
    """
    wavelet = getCDF97()

    # Multi-level detail energy of each image (approximation band dropped).
    detailX = pywt.wavedec2(X, wavelet, level=3)[1:]
    detailY = pywt.wavedec2(Y, wavelet, level=3)[1:]
    SX = S(detailX)
    SY = S(detailY)

    # Residual noise power before and after denoising.
    npX = noise_power(X)
    npY = noise_power(Y)

    Q1 = (2 * SX * SY + C) / (SX**2 + SY**2 + C)
    Q2 = (npX - npY) ** 2 / (npX**2 + npY**2 + C)
    return (Q1 + Q2, Q1, Q2)

S(decompCoeffs, alpha=0.8)

Compute the cumulative multi-level wavelet energy.

Parameters:

Name Type Description Default
decompCoeffs list

List of (LHn, HLn, HHn) tuples for each decomposition level, ordered from coarsest to finest.

required
alpha

Weight for the HH sub-band. See En().

0.8

Returns:

Type Description
float

Cumulative energy scalar.

Source code in mhrqi/benchmarks/compare_to.py
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
def S(decompCoeffs: list, alpha=0.8) -> float:
    """
    Compute the cumulative multi-level wavelet energy.

    Args:
        decompCoeffs: List of (LHn, HLn, HHn) tuples for each decomposition level,
                      ordered from coarsest to finest.
        alpha: Weight for the HH sub-band. See En().

    Returns:
        Cumulative energy scalar.
    """
    # Level n (1-based) contributes with weight 2^(3 - n), i.e. 2^(2 - i).
    return sum(2 ** (2 - i) * En(level, alpha) for i, level in enumerate(decompCoeffs))

auto_homogeneous_roi(img, win=20, stride=10)

Find the most homogeneous window in the image.

Selects the window with lowest coefficient of variation (CoV), with a center-bias penalty to prefer central regions.

Parameters:

Name Type Description Default
img

Grayscale float image in [0, 1].

required
win

Window size in pixels.

20
stride

Step size for the sliding window search.

10

Returns:

Type Description

Tuple (y, x, h, w) of the selected ROI.

Raises:

Type Description
RuntimeError

If no valid homogeneous region is found.

Source code in mhrqi/benchmarks/compare_to.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def auto_homogeneous_roi(img, win=20, stride=10):
    """
    Find the most homogeneous window in the image.

    Scans win x win patches on a stride grid and scores each by its
    coefficient of variation (CoV) plus a small center-distance penalty,
    so that among similarly flat patches the more central one wins.

    Args:
        img: Grayscale float image in [0, 1].
        win: Window size in pixels.
        stride: Step size for the sliding window search.

    Returns:
        Tuple (y, x, h, w) of the selected ROI.

    Raises:
        RuntimeError: If no valid homogeneous region is found.
    """
    H, W = img.shape
    eps = 1e-6
    max_dist = np.sqrt((H / 2) ** 2 + (W / 2) ** 2)

    best_cost = np.inf
    best_roi = None
    for top in range(0, H - win, stride):
        for left in range(0, W - win, stride):
            patch = img[top : top + win, left : left + win]
            mean = patch.mean()
            # Skip near-black and near-saturated patches.
            if mean < eps or mean > 0.95:
                continue
            cov = patch.std() / mean
            center_dist = np.sqrt(
                (top + win / 2 - H / 2) ** 2 + (left + win / 2 - W / 2) ** 2
            )
            cost = cov + 0.1 * (center_dist / max_dist)
            if cost < best_cost:
                best_cost = cost
                best_roi = (top, left, win, win)

    if best_roi is None:
        raise RuntimeError("Failed to find homogeneous ROI")

    return best_roi

compare_to(image_input, **kwargs)

Legacy wrapper for BenchmarkSuite.

Source code in mhrqi/benchmarks/compare_to.py
245
246
247
248
249
250
251
252
253
254
255
def compare_to(image_input, **kwargs):
    """Legacy wrapper for BenchmarkSuite."""
    reference = kwargs.get("reference_image")
    out_dir = kwargs.get("save_dir")

    suite = BenchmarkSuite(image_input, reference_image=reference, save_dir=out_dir)
    results = suite.run(
        methods=kwargs.get("methods", "all"),
        proposed_image=kwargs.get("proposed_img"),
    )

    # Saving is opt-out to preserve the legacy default behavior.
    if kwargs.get("save", True):
        suite.save_reports(prefix=kwargs.get("save_prefix", "denoised"))
    return results

getCDF97(weight=1)

Return the CDF 9/7 biorthogonal wavelet used by OMQDI.

Parameters:

Name Type Description Default
weight

Optional scalar weight applied to all filter coefficients.

1

Returns:

Type Description

pywt.Wavelet object configured with CDF 9/7 filter banks.

Source code in mhrqi/benchmarks/compare_to.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
def getCDF97(weight=1):
    """
    Return the CDF 9/7 biorthogonal wavelet used by OMQDI.

    Args:
        weight: Optional scalar weight applied to all filter coefficients.

    Returns:
        pywt.Wavelet object configured with CDF 9/7 filter banks.
    """
    # Filter banks in pywt order: analysis LP, analysis HP,
    # synthesis LP, synthesis HP. Each is 10 taps, zero-padded.
    filter_banks = (
        [
            0,
            0.026748757411,
            -0.016864118443,
            -0.078223266529,
            0.266864118443,
            0.602949018236,
            0.266864118443,
            -0.078223266529,
            -0.016864118443,
            0.026748757411,
        ],
        [
            0,
            0.091271763114,
            -0.057543526229,
            -0.591271763114,
            1.11508705,
            -0.591271763114,
            -0.057543526229,
            0.091271763114,
            0,
            0,
        ],
        [
            0,
            -0.091271763114,
            -0.057543526229,
            0.591271763114,
            1.11508705,
            0.591271763114,
            -0.057543526229,
            -0.091271763114,
            0,
            0,
        ],
        [
            0,
            0.026748757411,
            0.016864118443,
            -0.078223266529,
            -0.266864118443,
            0.602949018236,
            -0.266864118443,
            -0.078223266529,
            0.016864118443,
            0.026748757411,
        ],
    )

    weighted = [np.array(bank) * weight for bank in filter_banks]
    return pywt.Wavelet("CDF97", weighted)

local_mean(img, window=3, pad_mode='reflect')

Compute the local mean of pixel intensities using a uniform kernel.

Parameters:

Name Type Description Default
img array

Input image of shape (M, N).

required
window

Kernel size.

3
pad_mode

Padding mode for convolution.

'reflect'

Returns:

Type Description
array

Local mean array of shape (M, N).

Source code in mhrqi/benchmarks/compare_to.py
392
393
394
395
396
397
398
399
400
401
402
403
404
def local_mean(img: np.array, window=3, pad_mode="reflect") -> np.array:
    """
    Compute the local mean of pixel intensities using a uniform kernel.

    Args:
        img: Input image of shape (M, N).
        window: Kernel size.
        pad_mode: Padding mode for convolution.

    Returns:
        Local mean array of shape (M, N).
    """
    # Uniform averaging kernel: every tap weighs 1 / window^2.
    kernel = np.full((window, window), 1 / window**2)
    return convolve(img, kernel, mode=pad_mode)

local_variance(img)

Compute the local variance of an image.

Parameters:

Name Type Description Default
img array

Input image of shape (M, N).

required

Returns:

Type Description
array

Local variance array of shape (M, N).

Source code in mhrqi/benchmarks/compare_to.py
407
408
409
410
411
412
413
414
415
416
417
418
def local_variance(img: np.array) -> np.array:
    """
    Compute the local variance of an image.

    Uses the identity Var[X] = E[X^2] - (E[X])^2 with uniform local means.

    Args:
        img: Input image of shape (M, N).

    Returns:
        Local variance array of shape (M, N).
    """
    mean_of_squares = local_mean(np.square(img))
    square_of_mean = np.square(local_mean(img))
    return mean_of_squares - square_of_mean

noise_power(img)

Estimate the noise power (σ̂) of an image as the mean local variance.

Parameters:

Name Type Description Default
img array

Input image of shape (M, N).

required

Returns:

Type Description
float

Estimated noise power scalar.

Source code in mhrqi/benchmarks/compare_to.py
421
422
423
424
425
426
427
428
429
430
431
def noise_power(img: np.array) -> float:
    """
    Estimate the noise power (σ̂) of an image as the mean local variance.

    Args:
        img: Input image of shape (M, N).

    Returns:
        Estimated noise power scalar.
    """
    variance_map = local_variance(img)
    return np.mean(variance_map)

sbEn(coeffs)

Compute the energy of a wavelet sub-band.

Parameters:

Name Type Description Default
coeffs array

2D array of wavelet coefficients for the sub-band.

required

Returns:

Type Description
float

Sub-band energy as a scalar.

Source code in mhrqi/benchmarks/compare_to.py
343
344
345
346
347
348
349
350
351
352
353
354
def sbEn(coeffs: np.array) -> float:
    """
    Compute the energy of a wavelet sub-band.

    The energy is log(1 + mean squared coefficient magnitude).

    Args:
        coeffs: 2D array of wavelet coefficients for the sub-band.

    Returns:
        Sub-band energy as a scalar.
    """
    rows, cols = coeffs.shape
    mean_square = np.sum(np.square(coeffs)) / (rows * cols)
    return np.log(1 + mean_square)

Statistical Benchmarks

statistical_benchmark

╔══════════════════════════════════════════════════════════════════════════════╗ ║ Statistical Treatment: Wilcoxon Signed-Rank Test ║ ║ Benchmark MHRQI vs State-of-the-Art on Medical OCT Images ║ ╚══════════════════════════════════════════════════════════════════════════════╝

Folder Structure: benchmark/ └── YYYYMMDD_HHMMSS/ ├── cnv1/ │ ├── original.png │ ├── denoised_bm3d.png │ ├── denoised_nlmeans.png │ ├── denoised_srad.png │ ├── denoised_proposed.png │ └── report_*.png ├── cnv2/ │ └── ... └── metrics/ ├── raw_results.json ├── summary.json ├── speckle_consistency_metrics.png └── no-reference_quality.png

SOTA Reference: BM3D (for Full-Reference metrics)

create_results_table(all_results, metrics_dir)

Create summary tables for all metrics.

Source code in mhrqi/benchmarks/statistical_benchmark.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
def create_results_table(all_results, metrics_dir):
    """
    Create summary tables for all metrics and run Wilcoxon significance tests.

    Args:
        all_results: Mapping of image name -> {method -> {metric -> value}}.
        metrics_dir: Directory where statistical_results.json and summary.json
                     are written.

    Returns:
        Tuple (summary, stat_results):
            - summary: {method: {metric: {"mean", "std", "n"}}} aggregates.
            - stat_results: List of per-(category, competitor, metric)
              Wilcoxon test outcomes (also saved as statistical_results.json).
    """
    # Aggregate metrics across all images; NaN entries are dropped so a
    # missing metric on one image does not poison the mean/std.
    all_metrics_list = ALL_SPECKLE_METRICS + STRUCTURAL_METRICS
    method_metrics = {m: {k: [] for k in all_metrics_list} for m in METHODS}

    for _img_name, methods in all_results.items():
        for method in METHODS:
            if method in methods:
                for metric in all_metrics_list:
                    val = methods[method].get(metric, float("nan"))
                    if not np.isnan(val):
                        method_metrics[method][metric].append(val)

    # Per-method mean/std/count for every metric.
    summary = {}
    for method in METHODS:
        summary[method] = {}
        for metric in all_metrics_list:
            vals = method_metrics[method][metric]
            if vals:
                summary[method][metric] = {
                    "mean": np.mean(vals),
                    "std": np.std(vals),
                    "n": len(vals),
                }
            else:
                summary[method][metric] = {"mean": float("nan"), "std": float("nan"), "n": 0}

    # Print table
    print("\n" + "=" * 100)
    print("BENCHMARK RESULTS (Mean ± Std across all images)")
    print("=" * 100)

    all_metrics = all_metrics_list
    header = f"{'Method':<12}" + "".join(f"{m:<15}" for m in all_metrics)
    print(header)
    # BUGFIX: rule was hard-coded to 80 chars and did not match the header width.
    print("-" * len(header))

    for method in METHODS:
        row = f"{method:<12}"
        for metric in all_metrics:
            s = summary[method][metric]
            if s["n"] > 0:
                row += f"{s['mean']:.4f}±{s['std']:.4f}  "
            else:
                # BUGFIX: pad to 15 chars to line up with value cells (was 14).
                row += f"{'N/A':<15}"
        print(row)

    # ==========================================================================
    # STATISTICAL SIGNIFICANCE (Wilcoxon Signed-Rank Test)
    # All metrics are paired samples (same images, different methods)
    # ==========================================================================

    print("\n" + "=" * 90)
    print("STATISTICAL SIGNIFICANCE (Wilcoxon Signed-Rank Test)")
    print("=" * 90)

    # Define metric categories with hypotheses
    metric_categories = [
        {
            "name": "Speckle Reduction (Lower Better)",
            "metrics": SPECKLE_METRICS_LOWER,
            "higher_better": False,  # Lower is better for SSI, SMPI
            "H0": "MHRQI achieves the same speckle reduction as [competitor]",
            "H1": "MHRQI achieves different speckle reduction than [competitor]",
        },
        {
            "name": "Speckle Reduction (Higher Better)",
            "metrics": SPECKLE_METRICS_HIGHER,
            "higher_better": True,  # Higher is better for ENL, CNR, NSF
            "H0": "MHRQI achieves the same speckle reduction as [competitor]",
            "H1": "MHRQI achieves different speckle reduction than [competitor]",
        },
        {
            "name": "Structural Similarity",
            "metrics": STRUCTURAL_METRICS,
            "higher_better": True,  # Higher is better for EPF, EPI, OMQDI
            "H0": "MHRQI achieves the same structural preservation as [competitor]",
            "H1": "MHRQI achieves different structural preservation than [competitor]",
        },
        # Naturalness removed (NIQE computed but not reported - biased for medical images)
    ]

    # Store results for summary
    stat_results = []

    for category in metric_categories:
        print(f"\n{'─' * 90}")
        print(f"Category: {category['name']}")
        print(f"  H₀: {category['H0']}")
        print(f"  H₁: {category['H1']}")
        print(f"  Direction: {'Higher' if category['higher_better'] else 'Lower'} is better")
        print(f"{'─' * 90}")

        for other_method in ["bm3d", "nlmeans", "srad"]:
            print(f"\n  {other_method.upper()} vs MHRQI:")

            for metric in category["metrics"]:
                stat, p_val, diffs = wilcoxon_test(all_results, "proposed", other_method, metric)

                if p_val is not None:
                    mean_diff = np.mean(diffs)

                    # Interpret based on direction
                    if category["higher_better"]:
                        mhrqi_better = mean_diff > 0
                    else:
                        mhrqi_better = mean_diff < 0  # Lower is better

                    # Determine significance and language
                    if p_val < 0.05:
                        decision = "Reject H₀"
                        if mhrqi_better:
                            interpretation = "MHRQI significantly better"
                        else:
                            interpretation = f"{other_method} significantly better"
                    else:
                        decision = "Fail to reject H₀"
                        interpretation = "Comparable (no significant difference)"

                    sig = (
                        "***"
                        if p_val < 0.001
                        else "**"
                        if p_val < 0.01
                        else "*"
                        if p_val < 0.05
                        else "n.s."
                    )

                    print(f"    {metric:<10}: p={p_val:.4f} ({sig:<4}) → {decision}")
                    # BUGFIX: separator added — value and interpretation previously
                    # ran together ("Δ=+0.1234MHRQI significantly better").
                    print(f"               Mean Δ={mean_diff:+.4f} → {interpretation}")

                    stat_results.append(
                        {
                            "category": category["name"],
                            "competitor": other_method,
                            "metric": metric,
                            "p_value": p_val,
                            "mean_diff": mean_diff,
                            "interpretation": interpretation,
                            "significant": p_val < 0.05,
                        }
                    )
                else:
                    print(f"    {metric:<10}: insufficient data (need ≥3 paired samples)")

    # Save statistical results. The default= hook converts numpy scalar types
    # (np.bool_, np.floating) that json cannot serialize natively.
    with open(os.path.join(metrics_dir, "statistical_results.json"), "w") as f:
        json.dump(
            stat_results,
            f,
            indent=2,
            default=lambda x: int(x)
            if isinstance(x, (bool, np.bool_))
            else float(x)
            if isinstance(x, np.floating)
            else x,
        )

    # Save summary (means/stds may be numpy floats).
    with open(os.path.join(metrics_dir, "summary.json"), "w") as f:
        json.dump(
            summary, f, indent=2, default=lambda x: float(x) if isinstance(x, np.floating) else x
        )

    return summary, stat_results

create_summary_heatmap(stat_results, metrics_dir)

Create a WIN/TIE/LOSS heatmap from pre-computed statistical results.

Parameters:

Name Type Description Default
stat_results

List of dicts from create_results_table() (same structure as statistical_results.json).

required
metrics_dir

Directory where the PNG is saved.

required
Source code in mhrqi/benchmarks/statistical_benchmark.py
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
def create_summary_heatmap(stat_results, metrics_dir):
    """
    Create a WIN/TIE/LOSS heatmap from pre-computed statistical results.

    Args:
        stat_results: List of dicts from create_results_table() (same structure
                      as statistical_results.json). Each dict must contain the
                      keys "competitor", "metric", "category", and
                      "interpretation".
        metrics_dir:  Directory where the PNG is saved
                      (benchmark_summary_heatmap.png).
    """
    # Imported lazily so matplotlib is only required when a plot is requested.
    import matplotlib.colors as mcolors
    import matplotlib.pyplot as plt
    from matplotlib.patches import Patch

    # Nothing to plot — bail out quietly instead of producing an empty figure.
    if not stat_results:
        print("  No statistical results to plot; skipping heatmap.")
        return

    competitors = sorted({d["competitor"] for d in stat_results})
    metrics = sorted({d["metric"] for d in stat_results})

    # Preserve category grouping: re-sort metrics so metrics sharing a category
    # are adjacent (sort() is stable, so alphabetical order is kept within
    # each category).
    categories = {d["metric"]: d["category"] for d in stat_results}
    metrics.sort(key=lambda x: categories[x])

    # Build matrix: 2 = WIN, 1 = TIE, 0 = LOSS.
    # Cells default to 0 (LOSS), including competitor/metric pairs absent
    # from stat_results.
    heatmap_data = np.zeros((len(competitors), len(metrics)))
    for d in stat_results:
        row = competitors.index(d["competitor"])
        col = metrics.index(d["metric"])
        interp = d["interpretation"].lower()
        if "mhrqi significantly better" in interp:
            heatmap_data[row, col] = 2
        elif "comparable" in interp:
            heatmap_data[row, col] = 1
        # else 0 (competitor better)

    fig, ax = plt.subplots(figsize=(14, 7))

    # Discrete 3-color map (red/yellow/green); BoundaryNorm maps the cell
    # values 0, 1, 2 onto the three colors exactly.
    cmap = mcolors.ListedColormap(["#FF8A8A", "#FFFBD1", "#A3EBB1"])
    bounds = [-0.5, 0.5, 1.5, 2.5]
    norm = mcolors.BoundaryNorm(bounds, cmap.N)

    ax.imshow(heatmap_data, cmap=cmap, norm=norm, aspect="auto")

    # Grid lines: minor ticks at cell boundaries drawn as thick white lines,
    # which visually separates the cells.
    ax.set_xticks(np.arange(len(metrics) + 1) - 0.5, minor=True)
    ax.set_yticks(np.arange(len(competitors) + 1) - 0.5, minor=True)
    ax.grid(which="minor", color="white", linestyle="-", linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    # Axis labels: metric names along x, competitor names (upper-cased) along y.
    ax.set_xticks(np.arange(len(metrics)))
    ax.set_yticks(np.arange(len(competitors)))
    ax.set_xticklabels(metrics, fontsize=11, fontweight="bold", rotation=0)
    ax.set_yticklabels([c.upper() for c in competitors], fontsize=11, fontweight="bold")

    # Cell annotations: write the verdict text inside every cell.
    for i in range(len(competitors)):
        for j in range(len(metrics)):
            val = heatmap_data[i, j]
            label = "WIN" if val == 2 else "TIE" if val == 1 else "LOSS"
            ax.text(
                j,
                i,
                label,
                ha="center",
                va="center",
                color="#333333",
                fontsize=11,
                fontweight="bold",
            )

    # Category span headers: find the starting column of each contiguous
    # category run (metrics were sorted by category above).
    unique_cats, cat_starts, current_cat = [], [], None
    for i, m in enumerate(metrics):
        if categories[m] != current_cat:
            unique_cats.append(categories[m])
            cat_starts.append(i)
            current_cat = categories[m]

    # Draw a horizontal bracket above each category span, with the category
    # name centered over it (negative y places it above the top row).
    for i, cat in enumerate(unique_cats):
        start = cat_starts[i]
        end = cat_starts[i + 1] if i + 1 < len(cat_starts) else len(metrics)
        mid = (start + end - 1) / 2
        ax.annotate(
            "",
            xy=(start - 0.4, -0.6),
            xytext=(end - 0.6, -0.6),
            xycoords="data",
            textcoords="data",
            arrowprops={"arrowstyle": "-", "color": "gray", "lw": 1.5},
        )
        ax.text(
            mid,
            -0.8,
            cat,
            ha="center",
            va="bottom",
            fontsize=10,
            fontweight="bold",
            style="italic",
            color="#555555",
        )

    # NOTE(review): spines[:] slice syntax requires matplotlib >= 3.4.
    ax.spines[:].set_visible(False)
    ax.set_title(
        "MHRQI Performance Benchmark Summary\n(Statistical Significance vs SOTA)",
        fontsize=15,
        fontweight="bold",
        pad=50,
    )

    # Legend placed below the axes; colors match the ListedColormap above.
    legend_elements = [
        Patch(facecolor="#A3EBB1", label="MHRQI Significantly Better (p < 0.05)"),
        Patch(facecolor="#FFFBD1", label="Competitive / No Significant Difference"),
        Patch(facecolor="#FF8A8A", label="Competitor Significantly Better (p < 0.05)"),
    ]
    ax.legend(
        handles=legend_elements,
        loc="upper center",
        bbox_to_anchor=(0.5, -0.15),
        ncol=3,
        frameon=False,
        fontsize=10,
    )

    plt.tight_layout()
    output_path = os.path.join(metrics_dir, "benchmark_summary_heatmap.png")
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print("  Saved: benchmark_summary_heatmap.png")

create_visualization(all_results, metrics_dir)

Create separate visualizations for each metric category.

Source code in mhrqi/benchmarks/statistical_benchmark.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
def create_visualization(all_results, metrics_dir):
    """
    Create separate visualizations for each metric category.

    One PNG figure is saved per metric group; each figure contains one bar
    subplot per metric showing the mean +/- std of that metric across all
    benchmarked images, one bar per method.

    Args:
        all_results: Dict mapping image name -> method name -> metrics dict.
        metrics_dir: Directory where the PNG figures are saved.
    """
    # Imported lazily so matplotlib is only required when plotting.
    # (Dropped unused mcolors / Patch imports.)
    import matplotlib.pyplot as plt

    methods = METHODS

    metric_groups = [
        ("Speckle Reduction (Lower Better)", SPECKLE_METRICS_LOWER, False),
        ("Speckle Reduction (Higher Better)", SPECKLE_METRICS_HIGHER, True),
        ("Structural Similarity Metrics", STRUCTURAL_METRICS, True),
        # Naturalness removed (NIQE computed but not reported)
    ]

    for group_name, metrics, higher_better in metric_groups:
        method_means = {m: [] for m in methods}
        method_stds = {m: [] for m in methods}

        # Aggregate mean/std per (metric, method) across all images,
        # skipping NaN entries and methods missing for an image.
        for metric in metrics:
            for method in methods:
                vals = []
                for img_results in all_results.values():
                    if method in img_results:
                        v = img_results[method].get(metric, float("nan"))
                        if not np.isnan(v):
                            vals.append(v)
                if vals:
                    # Clip to 10000 so a single runaway metric does not
                    # flatten every other bar in the chart.
                    m_val = min(np.mean(vals), 10000)
                    s_val = min(np.std(vals), 10000)
                    method_means[method].append(m_val)
                    method_stds[method].append(s_val)
                else:
                    method_means[method].append(0)
                    method_stds[method].append(0)

        # Skip the whole group if no method produced any data.
        if all(all(v == 0 for v in method_means[m]) for m in methods):
            continue

        width = 0.6

        fig, axes = plt.subplots(len(metrics), 1, figsize=(8, 4 * len(metrics)), sharex=False)
        if len(metrics) == 1:
            axes = [axes]  # subplots() returns a bare Axes for a single row

        colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728"]
        # NOTE(review): assumes METHODS order is (bm3d, nlmeans, srad, mhrqi);
        # keep these display labels in sync with METHODS.
        labels = ["BM3D", "NL-Means", "SRAD", "MHRQI (Ours)"]

        for idx, (ax, metric) in enumerate(zip(axes, metrics)):
            means = [method_means[m][idx] for m in methods]
            stds = [method_stds[m][idx] for m in methods]

            # One bar per method; each metric gets its own subplot.
            ax.bar(methods, means, width, yerr=stds, capsize=5, color=colors)

            ax.set_title(f"Metric: {metric}", fontsize=12, fontweight="bold")
            ax.set_ylabel("Score", fontsize=10)
            ax.grid(axis="y", alpha=0.3)

            # Replace raw method keys with human-readable labels.
            ax.set_xticks(np.arange(len(methods)))
            ax.set_xticklabels(labels)

            direction_note = "↑ Higher is better" if higher_better else "↓ Lower is better"
            ax.annotate(
                direction_note,
                xy=(0.98, 0.02),
                xycoords="axes fraction",
                ha="right",
                fontsize=9,
                style="italic",
                color="gray",
            )

        fig.suptitle(group_name, fontsize=14, fontweight="bold", y=0.98)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        # Derive a filesystem-safe file name from the group title.
        filename = (
            group_name.lower()
            .replace(" ", "_")
            .replace("/", "_")
            .replace("(", "")
            .replace(")", "")
            .replace("=", "")
            + ".png"
        )
        plt.savefig(os.path.join(metrics_dir, filename), dpi=150)
        plt.close()

        # Bug fix: previously printed a literal placeholder instead of the
        # actual saved file name.
        print(f"  Saved: {filename}")

    print(f"\nVisualizations saved to: {metrics_dir}")

run_benchmark(n=64, _strength=None)

Run benchmark on all medical images and collect metrics.

Folder structure: - benchmark/timestamp/imagename/ for each image - benchmark/timestamp/metrics/ for aggregated stats

Parameters:

Name Type Description Default
n

Image size

64
_strength

Denoiser strength parameter (currently unused; kept for interface compatibility)

required

Returns:

Name Type Description
results

Dict mapping image -> method -> metrics

base_dir

Path to benchmark output directory

metrics_dir

Path to the aggregated metrics subdirectory inside base_dir

Source code in mhrqi/benchmarks/statistical_benchmark.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def run_benchmark(n=64, _strength=None):
    """
    Run benchmark on all medical images and collect metrics.

    Folder structure:
    - benchmark/timestamp/imagename/ for each image
    - benchmark/timestamp/metrics/ for aggregated stats

    Args:
        n: Image size passed through to the MHRQI pipeline.
        _strength: Denoiser strength parameter (currently unused in this
                   function; kept for interface compatibility).

    Returns:
        all_results: Dict mapping image name -> method name -> metrics dict.
        base_dir: Path to the benchmark output directory
                  (benchmark/<timestamp>).
        metrics_dir: Path to the aggregated metrics subdirectory
                     (base_dir/metrics).
    """
    # Create base directory: benchmark/timestamp/ (matching runs/ format)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")  # Same as runs/ folder
    base_dir = os.path.join("benchmark", timestamp)
    metrics_dir = os.path.join(base_dir, "metrics")
    os.makedirs(metrics_dir, exist_ok=True)

    all_results = {}

    for img_path in MEDICAL_IMAGES:
        # Image name = basename with ".jpeg" stripped; other extensions
        # (e.g. ".png") are kept as-is in the folder name.
        img_name = os.path.basename(img_path).replace(".jpeg", "")
        img_dir = os.path.join(base_dir, img_name)
        os.makedirs(img_dir, exist_ok=True)

        print(f"\n{'=' * 60}")
        print(f"Processing: {img_name}")
        print(f"Output: {img_dir}")
        print(f"{'=' * 60}")

        # Run MHRQI pipeline
        orig, recon, run_dir = main.main(
            shots=1000,
            n=n,
            d=2,
            denoise=True,
            use_shots=False,
            fast=True,
            verbose_plots=False,
            img_path=img_path,
            run_comparison=False,
        )

        # Save original
        cv2.imwrite(os.path.join(img_dir, "original.png"), orig)

        # Run comparison with BM3D as SOTA reference for FR metrics
        noisy_img = compare_to.to_float01(orig)

        # Run comparison (no synthetic clean reference)
        comparison_results = compare_to.compare_to(
            noisy_img,
            proposed_img=compare_to.to_float01(recon),
            methods="all",
            plot=True,  # Generate report plots
            save=True,
            save_prefix="denoised",
            save_dir=img_dir,
            reference_image=None,  # No synthetic reference
        )

        # Extract metrics for each method
        img_results = {}
        for r in comparison_results:
            method_name = r["name"]
            img_results[method_name] = r["metrics"]

        all_results[img_name] = img_results

    # Save raw results to metrics folder; NaN metric values become JSON null
    # since NaN is not valid JSON.
    with open(os.path.join(metrics_dir, "raw_results.json"), "w") as f:
        serializable = {}
        for img, methods in all_results.items():
            serializable[img] = {}
            for method, metrics in methods.items():
                serializable[img][method] = {
                    k: float(v) if not np.isnan(v) else None for k, v in metrics.items()
                }
        json.dump(serializable, f, indent=2)

    return all_results, base_dir, metrics_dir

wilcoxon_test(all_results, method1, method2, metric)

Perform Wilcoxon signed-rank test between two methods on a specific metric.

Source code in mhrqi/benchmarks/statistical_benchmark.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def wilcoxon_test(all_results, method1, method2, metric):
    """
    Perform Wilcoxon signed-rank test between two methods on a specific metric.

    Args:
        all_results: Dict mapping image name -> method name -> metrics dict.
        method1: Name of the first method (differences are method1 - method2).
        method2: Name of the second method.
        metric: Metric key to compare.

    Returns:
        (statistic, p_value, diffs) on success; (None, None, diffs) when
        fewer than 3 paired samples exist or the test raises.
    """
    # Collect paired differences over images where both methods report a
    # non-NaN value for this metric.
    paired_diffs = []
    for per_method in all_results.values():
        if method1 not in per_method and method2 not in per_method:
            continue
        if method1 not in per_method or method2 not in per_method:
            continue
        a = per_method[method1].get(metric, float("nan"))
        b = per_method[method2].get(metric, float("nan"))
        if np.isnan(a) or np.isnan(b):
            continue
        paired_diffs.append(a - b)

    # The signed-rank test needs at least 3 paired samples to be meaningful.
    if len(paired_diffs) < 3:
        return None, None, paired_diffs

    try:
        statistic, p_value = stats.wilcoxon(paired_diffs)
    except Exception as e:
        # Best-effort: e.g. all differences zero makes the test undefined.
        print(f"Wilcoxon test failed: {e}")
        return None, None, paired_diffs
    return statistic, p_value, paired_diffs