Skip to content

API Reference: Benchmarks

Comparison Suite

compare_to

╔══════════════════════════════════════════════════════════════════════════════╗ ║ MHRQI - Multi-scale Hierarchical Representation of Quantum Images ║ ║ Classical Denoiser Comparison: BM3D, NL-Means, SRAD ║ ║ ║ ║ Author: Keno S. Jose ║ ║ License: Apache 2.0 ║ ╚══════════════════════════════════════════════════════════════════════════════╝

BenchmarkSuite

Class-based interface for running classical denoiser benchmarks.

Source code in mhrqi/benchmarks/compare_to.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
class BenchmarkSuite:
    """
    Class-based interface for running classical denoiser benchmarks.

    On construction the noisy (and optional reference) image is normalized to
    float [0, 1], the output directory is created when requested, and a
    homogeneous ROI is pre-computed for the region-based speckle metrics.
    """

    def __init__(self, noisy_image, reference_image=None, save_dir=None):
        self.noisy_image = to_float01(noisy_image)
        self.reference_image = None if reference_image is None else to_float01(reference_image)
        self.save_dir = save_dir
        self.results = []

        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)

        # Fall back to the whole image when no homogeneous window is found.
        try:
            self.roi = auto_homogeneous_roi(self.noisy_image)
        except RuntimeError:
            height, width = self.noisy_image.shape[0], self.noisy_image.shape[1]
            self.roi = (0, 0, height, width)

    def run(self, methods="all", proposed_image=None):
        """
        Run benchmarks and return results.
        """
        denoisers = {"bm3d": denoise_bm3d, "nlmeans": denoise_nlmeans, "srad": denoise_srad}

        if methods == "all":
            selected = list(denoisers)
        elif isinstance(methods, str):
            selected = [methods]
        else:
            selected = methods

        # (label, callable) pairs; unknown method names are silently skipped.
        jobs = [("Original", None)]
        jobs.extend((name, denoisers[name]) for name in selected if name in denoisers)

        proposed_float = None
        if proposed_image is not None:
            jobs.append(("Proposed", None))
            proposed_float = to_float01(proposed_image)

        self.results = []
        for label, denoise in jobs:
            if label == "Original":
                output = self.noisy_image
            elif label == "Proposed":
                output = proposed_float
            else:
                output = denoise(self.noisy_image)

            self.results.append(
                {
                    "name": label,
                    "metrics": self._compute_metrics(output),
                    "image": to_uint8(output),
                }
            )
        return self.results

    def _compute_metrics(self, res_img):
        """Compute the no-reference metric set, plus FSIM/SSIM when a reference exists."""
        metrics = {}
        metrics["NIQE"] = plots.compute_niqe(res_img)
        metrics["SSI"] = plots.compute_ssi(self.noisy_image, res_img, self.roi)
        metrics["SMPI"] = plots.compute_smpi(self.noisy_image, res_img)
        metrics["ENL"] = plots.compute_enl(res_img, self.roi)
        metrics["CNR"] = plots.compute_cnr(res_img)[0]
        # OMQDI returns (combined, EPF, NSF) in that order.
        metrics["OMQDI"], metrics["EPF"], metrics["NSF"] = OMQDI(self.noisy_image, res_img)
        metrics["EPI"] = plots.compute_epi(self.noisy_image, res_img)

        if self.reference_image is None:
            # Full-reference metrics are undefined without a clean reference.
            metrics["FSIM"] = np.nan
            metrics["SSIM"] = np.nan
        else:
            metrics["FSIM"] = plots.compute_fsim(self.reference_image, res_img)
            metrics["SSIM"] = plots.compute_ssim(self.reference_image, res_img)
        return metrics

    def save_reports(self, prefix="report"):
        """Save comparison reports and images."""
        if not self.save_dir:
            raise ValueError("save_dir must be set to save reports")

        for entry in self.results:
            target = os.path.join(self.save_dir, f"{prefix}_{entry['name']}.png")
            cv2.imwrite(target, entry["image"])

        # The unfiltered original is the baseline, not a result row.
        shown = [r for r in self.results if r["name"] != "Original"]

        reports = []
        if self.reference_image is not None:
            reports.append(
                (
                    to_uint8(self.reference_image),
                    ["FSIM", "SSIM"],
                    "Full Reference Metrics",
                    f"{prefix}_full_ref",
                )
            )
        reports.append(
            (
                to_uint8(self.noisy_image),
                ["SSI", "SMPI", "NSF", "ENL", "CNR"],
                "Speckle Reduction Metrics",
                f"{prefix}_speckle",
            )
        )
        reports.append(
            (
                to_uint8(self.noisy_image),
                ["EPF", "EPI", "OMQDI"],
                "Structural Similarity Metrics",
                f"{prefix}_structural",
            )
        )

        for base_img, metric_names, title, stem in reports:
            plots.MetricsPlotter.save_summary_report(
                base_img, shown, metric_names, title, stem, self.save_dir
            )

run(methods='all', proposed_image=None)

Run benchmarks and return results.

Source code in mhrqi/benchmarks/compare_to.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def run(self, methods="all", proposed_image=None):
    """
    Run benchmarks and return results.
    """
    denoisers = {"bm3d": denoise_bm3d, "nlmeans": denoise_nlmeans, "srad": denoise_srad}

    if methods == "all":
        selected = list(denoisers)
    elif isinstance(methods, str):
        selected = [methods]
    else:
        selected = methods

    # (label, callable) pairs; unknown method names are silently skipped.
    jobs = [("Original", None)]
    jobs.extend((name, denoisers[name]) for name in selected if name in denoisers)

    proposed_float = None
    if proposed_image is not None:
        jobs.append(("Proposed", None))
        proposed_float = to_float01(proposed_image)

    self.results = []
    for label, denoise in jobs:
        if label == "Original":
            output = self.noisy_image
        elif label == "Proposed":
            output = proposed_float
        else:
            output = denoise(self.noisy_image)

        self.results.append(
            {"name": label, "metrics": self._compute_metrics(output), "image": to_uint8(output)}
        )
    return self.results

save_reports(prefix='report')

Save comparison reports and images.

Source code in mhrqi/benchmarks/compare_to.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def save_reports(self, prefix="report"):
    """Save comparison reports and images."""
    if not self.save_dir:
        raise ValueError("save_dir must be set to save reports")

    for entry in self.results:
        target = os.path.join(self.save_dir, f"{prefix}_{entry['name']}.png")
        cv2.imwrite(target, entry["image"])

    # The unfiltered original is the baseline, not a result row.
    shown = [r for r in self.results if r["name"] != "Original"]

    reports = []
    if self.reference_image is not None:
        reports.append(
            (
                to_uint8(self.reference_image),
                ["FSIM", "SSIM"],
                "Full Reference Metrics",
                f"{prefix}_full_ref",
            )
        )
    reports.append(
        (
            to_uint8(self.noisy_image),
            ["SSI", "SMPI", "NSF", "ENL", "CNR"],
            "Speckle Reduction Metrics",
            f"{prefix}_speckle",
        )
    )
    reports.append(
        (
            to_uint8(self.noisy_image),
            ["EPF", "EPI", "OMQDI"],
            "Structural Similarity Metrics",
            f"{prefix}_structural",
        )
    )

    for base_img, metric_names, title, stem in reports:
        plots.MetricsPlotter.save_summary_report(
            base_img, shown, metric_names, title, stem, self.save_dir
        )

En(lvlCoeffs, alpha=0.8)

Compute the weighted energy for one level of wavelet decomposition.

Parameters:

Name Type Description Default
lvlCoeffs tuple

Tuple of (LHn, HLn, HHn) coefficient arrays.

required
alpha

Weight for the HH sub-band relative to LH and HL. Recommended value 0.8 per ISBN 0139353224.

0.8

Returns:

Type Description
float

Weighted energy scalar.

Source code in mhrqi/benchmarks/compare_to.py
357
358
359
360
361
362
363
364
365
366
367
368
369
370
def En(lvlCoeffs: tuple, alpha=0.8) -> float:
    """
    Compute the weighted energy for one level of wavelet decomposition.

    Args:
        lvlCoeffs: Tuple of (LHn, HLn, HHn) coefficient arrays.
        alpha: Weight for the HH sub-band relative to LH and HL.
               Recommended value 0.8 per ISBN 0139353224.

    Returns:
        Weighted energy scalar.
    """
    LH, HL, HH = lvlCoeffs
    # The diagonal (HH) sub-band gets weight alpha; LH and HL share the rest.
    edge_energy = (1 - alpha) * (sbEn(LH) + sbEn(HL)) / 2
    diag_energy = alpha * sbEn(HH)
    return edge_energy + diag_energy

OMQDI(X, Y, C=1e-10)

Compute the Objective Measure of Quality of Denoised Images.

Reference: DOI 10.1016/j.bspc.2021.102962

Parameters:

Name Type Description Default
X array

Noisy input image of shape (M, N).

required
Y array

Denoised output image of shape (M, N).

required
C

Small constant to avoid division by zero.

1e-10

Returns:

Name Type Description
Tuple (OMQDI, Q1, Q2)
  • OMQDI: Combined metric Q1 + Q2, ideal value 2, range [1, 2].
  • Q1: Edge-Preservation Factor, ideal value 1, range [0, 1].
  • Q2: Noise-Suppression Factor, ideal value 1, range [0, 1].
Source code in mhrqi/benchmarks/compare_to.py
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
def OMQDI(X: np.array, Y: np.array, C=1e-10) -> tuple:
    """
    Compute the Objective Measure of Quality of Denoised Images.

    Reference: DOI 10.1016/j.bspc.2021.102962

    Args:
        X: Noisy input image of shape (M, N).
        Y: Denoised output image of shape (M, N).
        C: Small constant to avoid division by zero.

    Returns:
        Tuple (OMQDI, Q1, Q2):
            - OMQDI: Combined metric Q1 + Q2, ideal value 2, range [1, 2].
            - Q1: Edge-Preservation Factor, ideal value 1, range [0, 1].
            - Q2: Noise-Suppression Factor, ideal value 1, range [0, 1].
    """
    wavelet = getCDF97()

    # Multi-level detail energy of each image (approximation band dropped).
    detailX = pywt.wavedec2(X, wavelet, level=3)[1:]
    detailY = pywt.wavedec2(Y, wavelet, level=3)[1:]
    SX = S(detailX)
    SY = S(detailY)

    # Residual noise power before and after denoising.
    npX = noise_power(X)
    npY = noise_power(Y)

    Q1 = (2 * SX * SY + C) / (SX**2 + SY**2 + C)
    Q2 = (npX - npY) ** 2 / (npX**2 + npY**2 + C)
    return (Q1 + Q2, Q1, Q2)

S(decompCoeffs, alpha=0.8)

Compute the cumulative multi-level wavelet energy.

Parameters:

Name Type Description Default
decompCoeffs list

List of (LHn, HLn, HHn) tuples for each decomposition level, ordered from coarsest to finest.

required
alpha

Weight for the HH sub-band. See En().

0.8

Returns:

Type Description
float

Cumulative energy scalar.

Source code in mhrqi/benchmarks/compare_to.py
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
def S(decompCoeffs: list, alpha=0.8) -> float:
    """
    Compute the cumulative multi-level wavelet energy.

    Args:
        decompCoeffs: List of (LHn, HLn, HHn) tuples for each decomposition level,
                      ordered from coarsest to finest.
        alpha: Weight for the HH sub-band. See En().

    Returns:
        Cumulative energy scalar.
    """
    # Level n (1-based) contributes with weight 2^(3 - n), i.e. 2^(2 - i).
    return sum(2 ** (2 - i) * En(level, alpha) for i, level in enumerate(decompCoeffs))

auto_homogeneous_roi(img, win=20, stride=10)

Find the most homogeneous window in the image.

Selects the window with lowest coefficient of variation (CoV), with a center-bias penalty to prefer central regions.

Parameters:

Name Type Description Default
img

Grayscale float image in [0, 1].

required
win

Window size in pixels.

20
stride

Step size for the sliding window search.

10

Returns:

Type Description

Tuple (y, x, h, w) of the selected ROI.

Raises:

Type Description
RuntimeError

If no valid homogeneous region is found.

Source code in mhrqi/benchmarks/compare_to.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def auto_homogeneous_roi(img, win=20, stride=10):
    """
    Find the most homogeneous window in the image.

    Scans win x win patches on a stride grid and scores each by its
    coefficient of variation (CoV) plus a small center-distance penalty,
    so that among similarly flat patches the more central one wins.

    Args:
        img: Grayscale float image in [0, 1].
        win: Window size in pixels.
        stride: Step size for the sliding window search.

    Returns:
        Tuple (y, x, h, w) of the selected ROI.

    Raises:
        RuntimeError: If no valid homogeneous region is found.
    """
    H, W = img.shape
    eps = 1e-6
    max_dist = np.sqrt((H / 2) ** 2 + (W / 2) ** 2)

    best_cost = np.inf
    best_roi = None
    for top in range(0, H - win, stride):
        for left in range(0, W - win, stride):
            patch = img[top : top + win, left : left + win]
            mean = patch.mean()
            # Skip near-black and near-saturated patches.
            if mean < eps or mean > 0.95:
                continue
            cov = patch.std() / mean
            center_dist = np.sqrt(
                (top + win / 2 - H / 2) ** 2 + (left + win / 2 - W / 2) ** 2
            )
            cost = cov + 0.1 * (center_dist / max_dist)
            if cost < best_cost:
                best_cost = cost
                best_roi = (top, left, win, win)

    if best_roi is None:
        raise RuntimeError("Failed to find homogeneous ROI")

    return best_roi

compare_to(image_input, **kwargs)

Legacy wrapper for BenchmarkSuite.

Source code in mhrqi/benchmarks/compare_to.py
245
246
247
248
249
250
251
252
253
254
255
def compare_to(image_input, **kwargs):
    """Legacy wrapper for BenchmarkSuite."""
    reference = kwargs.get("reference_image")
    out_dir = kwargs.get("save_dir")

    suite = BenchmarkSuite(image_input, reference_image=reference, save_dir=out_dir)
    results = suite.run(
        methods=kwargs.get("methods", "all"),
        proposed_image=kwargs.get("proposed_img"),
    )

    # Saving is opt-out to preserve the legacy default behavior.
    if kwargs.get("save", True):
        suite.save_reports(prefix=kwargs.get("save_prefix", "denoised"))
    return results

getCDF97(weight=1)

Return the CDF 9/7 biorthogonal wavelet used by OMQDI.

Parameters:

Name Type Description Default
weight

Optional scalar weight applied to all filter coefficients.

1

Returns:

Type Description

pywt.Wavelet object configured with CDF 9/7 filter banks.

Source code in mhrqi/benchmarks/compare_to.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
def getCDF97(weight=1):
    """
    Return the CDF 9/7 biorthogonal wavelet used by OMQDI.

    Args:
        weight: Optional scalar weight applied to all filter coefficients.

    Returns:
        pywt.Wavelet object configured with CDF 9/7 filter banks.
    """
    # Filter banks in pywt order: analysis LP, analysis HP,
    # synthesis LP, synthesis HP. Each is 10 taps, zero-padded.
    filter_banks = (
        [
            0,
            0.026748757411,
            -0.016864118443,
            -0.078223266529,
            0.266864118443,
            0.602949018236,
            0.266864118443,
            -0.078223266529,
            -0.016864118443,
            0.026748757411,
        ],
        [
            0,
            0.091271763114,
            -0.057543526229,
            -0.591271763114,
            1.11508705,
            -0.591271763114,
            -0.057543526229,
            0.091271763114,
            0,
            0,
        ],
        [
            0,
            -0.091271763114,
            -0.057543526229,
            0.591271763114,
            1.11508705,
            0.591271763114,
            -0.057543526229,
            -0.091271763114,
            0,
            0,
        ],
        [
            0,
            0.026748757411,
            0.016864118443,
            -0.078223266529,
            -0.266864118443,
            0.602949018236,
            -0.266864118443,
            -0.078223266529,
            0.016864118443,
            0.026748757411,
        ],
    )

    weighted = [np.array(bank) * weight for bank in filter_banks]
    return pywt.Wavelet("CDF97", weighted)

local_mean(img, window=3, pad_mode='reflect')

Compute the local mean of pixel intensities using a uniform kernel.

Parameters:

Name Type Description Default
img array

Input image of shape (M, N).

required
window

Kernel size.

3
pad_mode

Padding mode for convolution.

'reflect'

Returns:

Type Description
array

Local mean array of shape (M, N).

Source code in mhrqi/benchmarks/compare_to.py
392
393
394
395
396
397
398
399
400
401
402
403
404
def local_mean(img: np.array, window=3, pad_mode="reflect") -> np.array:
    """
    Compute the local mean of pixel intensities using a uniform kernel.

    Args:
        img: Input image of shape (M, N).
        window: Kernel size.
        pad_mode: Padding mode for convolution.

    Returns:
        Local mean array of shape (M, N).
    """
    # Uniform averaging kernel: every tap weighs 1 / window^2.
    kernel = np.full((window, window), 1 / window**2)
    return convolve(img, kernel, mode=pad_mode)

local_variance(img)

Compute the local variance of an image.

Parameters:

Name Type Description Default
img array

Input image of shape (M, N).

required

Returns:

Type Description
array

Local variance array of shape (M, N).

Source code in mhrqi/benchmarks/compare_to.py
407
408
409
410
411
412
413
414
415
416
417
418
def local_variance(img: np.array) -> np.array:
    """
    Compute the local variance of an image.

    Uses the identity Var[X] = E[X^2] - (E[X])^2 with uniform local means.

    Args:
        img: Input image of shape (M, N).

    Returns:
        Local variance array of shape (M, N).
    """
    mean_of_squares = local_mean(np.square(img))
    square_of_mean = np.square(local_mean(img))
    return mean_of_squares - square_of_mean

noise_power(img)

Estimate the noise power (σ̂) of an image as the mean local variance.

Parameters:

Name Type Description Default
img array

Input image of shape (M, N).

required

Returns:

Type Description
float

Estimated noise power scalar.

Source code in mhrqi/benchmarks/compare_to.py
421
422
423
424
425
426
427
428
429
430
431
def noise_power(img: np.array) -> float:
    """
    Estimate the noise power (σ̂) of an image as the mean local variance.

    Args:
        img: Input image of shape (M, N).

    Returns:
        Estimated noise power scalar.
    """
    variance_map = local_variance(img)
    return np.mean(variance_map)

sbEn(coeffs)

Compute the energy of a wavelet sub-band.

Parameters:

Name Type Description Default
coeffs array

2D array of wavelet coefficients for the sub-band.

required

Returns:

Type Description
float

Sub-band energy as a scalar.

Source code in mhrqi/benchmarks/compare_to.py
343
344
345
346
347
348
349
350
351
352
353
354
def sbEn(coeffs: np.array) -> float:
    """
    Compute the energy of a wavelet sub-band.

    The energy is log(1 + mean squared coefficient magnitude).

    Args:
        coeffs: 2D array of wavelet coefficients for the sub-band.

    Returns:
        Sub-band energy as a scalar.
    """
    rows, cols = coeffs.shape
    mean_square = np.sum(np.square(coeffs)) / (rows * cols)
    return np.log(1 + mean_square)

Statistical Benchmarks

statistical_benchmark

╔══════════════════════════════════════════════════════════════════════════════╗ ║ Statistical Treatment: Wilcoxon Signed-Rank Test ║ ║ Benchmark MHRQI vs State-of-the-Art on Medical OCT Images ║ ╚══════════════════════════════════════════════════════════════════════════════╝

Folder Structure: benchmark/ └── YYYYMMDD_HHMMSS/ ├── cnv1/ │ ├── original.png │ ├── denoised_bm3d.png │ ├── denoised_nlmeans.png │ ├── denoised_srad.png │ ├── denoised_proposed.png │ └── report_*.png ├── cnv2/ │ └── ... └── metrics/ ├── raw_results.json ├── summary.json ├── speckle_consistency_metrics.png └── no-reference_quality.png

SOTA Reference: BM3D (for Full-Reference metrics)

create_results_table(all_results, metrics_dir)

Create summary tables for all metrics.

Source code in mhrqi/benchmarks/statistical_benchmark.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
def create_results_table(all_results, metrics_dir):
    """
    Create summary tables for all metrics and run Wilcoxon significance tests.

    Args:
        all_results: Mapping of image name -> {method -> {metric -> value}}.
        metrics_dir: Directory where statistical_results.json and summary.json
                     are written.

    Returns:
        Tuple (summary, stat_results):
            - summary: {method: {metric: {"mean", "std", "n"}}} aggregates.
            - stat_results: List of per-(category, competitor, metric)
              Wilcoxon test outcomes (also saved as statistical_results.json).
    """
    # Aggregate metrics across all images; NaN entries are dropped so a
    # missing metric on one image does not poison the mean/std.
    all_metrics_list = ALL_SPECKLE_METRICS + STRUCTURAL_METRICS
    method_metrics = {m: {k: [] for k in all_metrics_list} for m in METHODS}

    for _img_name, methods in all_results.items():
        for method in METHODS:
            if method in methods:
                for metric in all_metrics_list:
                    val = methods[method].get(metric, float("nan"))
                    if not np.isnan(val):
                        method_metrics[method][metric].append(val)

    # Per-method mean/std/count for every metric.
    summary = {}
    for method in METHODS:
        summary[method] = {}
        for metric in all_metrics_list:
            vals = method_metrics[method][metric]
            if vals:
                summary[method][metric] = {
                    "mean": np.mean(vals),
                    "std": np.std(vals),
                    "n": len(vals),
                }
            else:
                summary[method][metric] = {"mean": float("nan"), "std": float("nan"), "n": 0}

    # Print table
    print("\n" + "=" * 100)
    print("BENCHMARK RESULTS (Mean ± Std across all images)")
    print("=" * 100)

    all_metrics = all_metrics_list
    header = f"{'Method':<12}" + "".join(f"{m:<15}" for m in all_metrics)
    print(header)
    # BUGFIX: rule was hard-coded to 80 chars and did not match the header width.
    print("-" * len(header))

    for method in METHODS:
        row = f"{method:<12}"
        for metric in all_metrics:
            s = summary[method][metric]
            if s["n"] > 0:
                row += f"{s['mean']:.4f}±{s['std']:.4f}  "
            else:
                # BUGFIX: pad to 15 chars to line up with value cells (was 14).
                row += f"{'N/A':<15}"
        print(row)

    # ==========================================================================
    # STATISTICAL SIGNIFICANCE (Wilcoxon Signed-Rank Test)
    # All metrics are paired samples (same images, different methods)
    # ==========================================================================

    print("\n" + "=" * 90)
    print("STATISTICAL SIGNIFICANCE (Wilcoxon Signed-Rank Test)")
    print("=" * 90)

    # Define metric categories with hypotheses
    metric_categories = [
        {
            "name": "Speckle Reduction (Lower Better)",
            "metrics": SPECKLE_METRICS_LOWER,
            "higher_better": False,  # Lower is better for SSI, SMPI
            "H0": "MHRQI achieves the same speckle reduction as [competitor]",
            "H1": "MHRQI achieves different speckle reduction than [competitor]",
        },
        {
            "name": "Speckle Reduction (Higher Better)",
            "metrics": SPECKLE_METRICS_HIGHER,
            "higher_better": True,  # Higher is better for ENL, CNR, NSF
            "H0": "MHRQI achieves the same speckle reduction as [competitor]",
            "H1": "MHRQI achieves different speckle reduction than [competitor]",
        },
        {
            "name": "Structural Similarity",
            "metrics": STRUCTURAL_METRICS,
            "higher_better": True,  # Higher is better for EPF, EPI, OMQDI
            "H0": "MHRQI achieves the same structural preservation as [competitor]",
            "H1": "MHRQI achieves different structural preservation than [competitor]",
        },
        # Naturalness removed (NIQE computed but not reported - biased for medical images)
    ]

    # Store results for summary
    stat_results = []

    for category in metric_categories:
        print(f"\n{'─' * 90}")
        print(f"Category: {category['name']}")
        print(f"  H₀: {category['H0']}")
        print(f"  H₁: {category['H1']}")
        print(f"  Direction: {'Higher' if category['higher_better'] else 'Lower'} is better")
        print(f"{'─' * 90}")

        for other_method in ["bm3d", "nlmeans", "srad"]:
            print(f"\n  {other_method.upper()} vs MHRQI:")

            for metric in category["metrics"]:
                stat, p_val, diffs = wilcoxon_test(all_results, "proposed", other_method, metric)

                if p_val is not None:
                    mean_diff = np.mean(diffs)

                    # Interpret based on direction
                    if category["higher_better"]:
                        mhrqi_better = mean_diff > 0
                    else:
                        mhrqi_better = mean_diff < 0  # Lower is better

                    # Determine significance and language
                    if p_val < 0.05:
                        decision = "Reject H₀"
                        if mhrqi_better:
                            interpretation = "MHRQI significantly better"
                        else:
                            interpretation = f"{other_method} significantly better"
                    else:
                        decision = "Fail to reject H₀"
                        interpretation = "Comparable (no significant difference)"

                    sig = (
                        "***"
                        if p_val < 0.001
                        else "**"
                        if p_val < 0.01
                        else "*"
                        if p_val < 0.05
                        else "n.s."
                    )

                    print(f"    {metric:<10}: p={p_val:.4f} ({sig:<4}) → {decision}")
                    # BUGFIX: separator added — value and interpretation previously
                    # ran together ("Δ=+0.1234MHRQI significantly better").
                    print(f"               Mean Δ={mean_diff:+.4f} → {interpretation}")

                    stat_results.append(
                        {
                            "category": category["name"],
                            "competitor": other_method,
                            "metric": metric,
                            "p_value": p_val,
                            "mean_diff": mean_diff,
                            "interpretation": interpretation,
                            "significant": p_val < 0.05,
                        }
                    )
                else:
                    print(f"    {metric:<10}: insufficient data (need ≥3 paired samples)")

    # Save statistical results. The default= hook converts numpy scalar types
    # (np.bool_, np.floating) that json cannot serialize natively.
    with open(os.path.join(metrics_dir, "statistical_results.json"), "w") as f:
        json.dump(
            stat_results,
            f,
            indent=2,
            default=lambda x: int(x)
            if isinstance(x, (bool, np.bool_))
            else float(x)
            if isinstance(x, np.floating)
            else x,
        )

    # Save summary (means/stds may be numpy floats).
    with open(os.path.join(metrics_dir, "summary.json"), "w") as f:
        json.dump(
            summary, f, indent=2, default=lambda x: float(x) if isinstance(x, np.floating) else x
        )

    return summary, stat_results

create_summary_heatmap(stat_results, metrics_dir)

Create a WIN/TIE/LOSS heatmap from pre-computed statistical results.

Parameters:

Name Type Description Default
stat_results

List of dicts from create_results_table() (same structure as statistical_results.json).

required
metrics_dir

Directory where the PNG is saved.

required
Source code in mhrqi/benchmarks/statistical_benchmark.py
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
def create_summary_heatmap(stat_results, metrics_dir):
    """
    Create a WIN/TIE/LOSS heatmap from pre-computed statistical results.

    Args:
        stat_results: List of dicts from create_results_table() (same structure
                      as statistical_results.json). Each dict must contain the
                      keys "competitor", "metric", "category", and
                      "interpretation".
        metrics_dir:  Directory where the PNG is saved
                      (benchmark_summary_heatmap.png).
    """
    # Imported lazily so matplotlib is only required when a plot is requested.
    import matplotlib.colors as mcolors
    import matplotlib.pyplot as plt
    from matplotlib.patches import Patch

    # Nothing to plot — bail out quietly instead of producing an empty figure.
    if not stat_results:
        print("  No statistical results to plot; skipping heatmap.")
        return

    competitors = sorted({d["competitor"] for d in stat_results})
    metrics = sorted({d["metric"] for d in stat_results})

    # Preserve category grouping: re-sort metrics so metrics sharing a category
    # are adjacent (sort() is stable, so alphabetical order is kept within
    # each category).
    categories = {d["metric"]: d["category"] for d in stat_results}
    metrics.sort(key=lambda x: categories[x])

    # Build matrix: 2 = WIN, 1 = TIE, 0 = LOSS.
    # Cells default to 0 (LOSS), including competitor/metric pairs absent
    # from stat_results.
    heatmap_data = np.zeros((len(competitors), len(metrics)))
    for d in stat_results:
        row = competitors.index(d["competitor"])
        col = metrics.index(d["metric"])
        interp = d["interpretation"].lower()
        if "mhrqi significantly better" in interp:
            heatmap_data[row, col] = 2
        elif "comparable" in interp:
            heatmap_data[row, col] = 1
        # else 0 (competitor better)

    fig, ax = plt.subplots(figsize=(14, 7))

    # Discrete 3-color map (red/yellow/green); BoundaryNorm maps the cell
    # values 0, 1, 2 onto the three colors exactly.
    cmap = mcolors.ListedColormap(["#FF8A8A", "#FFFBD1", "#A3EBB1"])
    bounds = [-0.5, 0.5, 1.5, 2.5]
    norm = mcolors.BoundaryNorm(bounds, cmap.N)

    ax.imshow(heatmap_data, cmap=cmap, norm=norm, aspect="auto")

    # Grid lines: minor ticks at cell boundaries drawn as thick white lines,
    # which visually separates the cells.
    ax.set_xticks(np.arange(len(metrics) + 1) - 0.5, minor=True)
    ax.set_yticks(np.arange(len(competitors) + 1) - 0.5, minor=True)
    ax.grid(which="minor", color="white", linestyle="-", linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    # Axis labels: metric names along x, competitor names (upper-cased) along y.
    ax.set_xticks(np.arange(len(metrics)))
    ax.set_yticks(np.arange(len(competitors)))
    ax.set_xticklabels(metrics, fontsize=11, fontweight="bold", rotation=0)
    ax.set_yticklabels([c.upper() for c in competitors], fontsize=11, fontweight="bold")

    # Cell annotations: write the verdict text inside every cell.
    for i in range(len(competitors)):
        for j in range(len(metrics)):
            val = heatmap_data[i, j]
            label = "WIN" if val == 2 else "TIE" if val == 1 else "LOSS"
            ax.text(
                j,
                i,
                label,
                ha="center",
                va="center",
                color="#333333",
                fontsize=11,
                fontweight="bold",
            )

    # Category span headers: find the starting column of each contiguous
    # category run (metrics were sorted by category above).
    unique_cats, cat_starts, current_cat = [], [], None
    for i, m in enumerate(metrics):
        if categories[m] != current_cat:
            unique_cats.append(categories[m])
            cat_starts.append(i)
            current_cat = categories[m]

    # Draw a horizontal bracket above each category span, with the category
    # name centered over it (negative y places it above the top row).
    for i, cat in enumerate(unique_cats):
        start = cat_starts[i]
        end = cat_starts[i + 1] if i + 1 < len(cat_starts) else len(metrics)
        mid = (start + end - 1) / 2
        ax.annotate(
            "",
            xy=(start - 0.4, -0.6),
            xytext=(end - 0.6, -0.6),
            xycoords="data",
            textcoords="data",
            arrowprops={"arrowstyle": "-", "color": "gray", "lw": 1.5},
        )
        ax.text(
            mid,
            -0.8,
            cat,
            ha="center",
            va="bottom",
            fontsize=10,
            fontweight="bold",
            style="italic",
            color="#555555",
        )

    # NOTE(review): spines[:] slice syntax requires matplotlib >= 3.4.
    ax.spines[:].set_visible(False)
    ax.set_title(
        "MHRQI Performance Benchmark Summary\n(Statistical Significance vs SOTA)",
        fontsize=15,
        fontweight="bold",
        pad=50,
    )

    # Legend placed below the axes; colors match the ListedColormap above.
    legend_elements = [
        Patch(facecolor="#A3EBB1", label="MHRQI Significantly Better (p < 0.05)"),
        Patch(facecolor="#FFFBD1", label="Competitive / No Significant Difference"),
        Patch(facecolor="#FF8A8A", label="Competitor Significantly Better (p < 0.05)"),
    ]
    ax.legend(
        handles=legend_elements,
        loc="upper center",
        bbox_to_anchor=(0.5, -0.15),
        ncol=3,
        frameon=False,
        fontsize=10,
    )

    plt.tight_layout()
    output_path = os.path.join(metrics_dir, "benchmark_summary_heatmap.png")
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print("  Saved: benchmark_summary_heatmap.png")

create_visualization(all_results, metrics_dir)

Create separate visualizations for each metric category.

Source code in mhrqi/benchmarks/statistical_benchmark.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
def create_visualization(all_results, metrics_dir):
    """
    Create separate visualizations for each metric category.

    One PNG figure is saved per metric group; each figure contains one bar
    subplot per metric showing the mean +/- std of that metric across all
    benchmarked images, one bar per method.

    Args:
        all_results: Dict mapping image name -> method name -> metrics dict.
        metrics_dir: Directory where the PNG figures are saved.
    """
    # Imported lazily so matplotlib is only required when plotting.
    # (Dropped unused mcolors / Patch imports.)
    import matplotlib.pyplot as plt

    methods = METHODS

    metric_groups = [
        ("Speckle Reduction (Lower Better)", SPECKLE_METRICS_LOWER, False),
        ("Speckle Reduction (Higher Better)", SPECKLE_METRICS_HIGHER, True),
        ("Structural Similarity Metrics", STRUCTURAL_METRICS, True),
        # Naturalness removed (NIQE computed but not reported)
    ]

    for group_name, metrics, higher_better in metric_groups:
        method_means = {m: [] for m in methods}
        method_stds = {m: [] for m in methods}

        # Aggregate mean/std per (metric, method) across all images,
        # skipping NaN entries and methods missing for an image.
        for metric in metrics:
            for method in methods:
                vals = []
                for img_results in all_results.values():
                    if method in img_results:
                        v = img_results[method].get(metric, float("nan"))
                        if not np.isnan(v):
                            vals.append(v)
                if vals:
                    # Clip to 10000 so a single runaway metric does not
                    # flatten every other bar in the chart.
                    m_val = min(np.mean(vals), 10000)
                    s_val = min(np.std(vals), 10000)
                    method_means[method].append(m_val)
                    method_stds[method].append(s_val)
                else:
                    method_means[method].append(0)
                    method_stds[method].append(0)

        # Skip the whole group if no method produced any data.
        if all(all(v == 0 for v in method_means[m]) for m in methods):
            continue

        width = 0.6

        fig, axes = plt.subplots(len(metrics), 1, figsize=(8, 4 * len(metrics)), sharex=False)
        if len(metrics) == 1:
            axes = [axes]  # subplots() returns a bare Axes for a single row

        colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728"]
        # NOTE(review): assumes METHODS order is (bm3d, nlmeans, srad, mhrqi);
        # keep these display labels in sync with METHODS.
        labels = ["BM3D", "NL-Means", "SRAD", "MHRQI (Ours)"]

        for idx, (ax, metric) in enumerate(zip(axes, metrics)):
            means = [method_means[m][idx] for m in methods]
            stds = [method_stds[m][idx] for m in methods]

            # One bar per method; each metric gets its own subplot.
            ax.bar(methods, means, width, yerr=stds, capsize=5, color=colors)

            ax.set_title(f"Metric: {metric}", fontsize=12, fontweight="bold")
            ax.set_ylabel("Score", fontsize=10)
            ax.grid(axis="y", alpha=0.3)

            # Replace raw method keys with human-readable labels.
            ax.set_xticks(np.arange(len(methods)))
            ax.set_xticklabels(labels)

            direction_note = "↑ Higher is better" if higher_better else "↓ Lower is better"
            ax.annotate(
                direction_note,
                xy=(0.98, 0.02),
                xycoords="axes fraction",
                ha="right",
                fontsize=9,
                style="italic",
                color="gray",
            )

        fig.suptitle(group_name, fontsize=14, fontweight="bold", y=0.98)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        # Derive a filesystem-safe file name from the group title.
        filename = (
            group_name.lower()
            .replace(" ", "_")
            .replace("/", "_")
            .replace("(", "")
            .replace(")", "")
            .replace("=", "")
            + ".png"
        )
        plt.savefig(os.path.join(metrics_dir, filename), dpi=150)
        plt.close()

        # Bug fix: previously printed a literal placeholder instead of the
        # actual saved file name.
        print(f"  Saved: {filename}")

    print(f"\nVisualizations saved to: {metrics_dir}")

run_benchmark(n=64, _strength=None)

Run benchmark on all medical images and collect metrics.

Folder structure: - benchmark/timestamp/imagename/ for each image - benchmark/timestamp/metrics/ for aggregated stats

Parameters:

Name Type Description Default
n

Image size

64
_strength

Denoiser strength parameter (currently unused; kept for interface compatibility)

required

Returns:

Name Type Description
results

Dict mapping image -> method -> metrics

base_dir

Path to benchmark output directory

metrics_dir

Path to the aggregated metrics subdirectory inside base_dir

Source code in mhrqi/benchmarks/statistical_benchmark.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def run_benchmark(n=64, _strength=None):
    """
    Run benchmark on all medical images and collect metrics.

    Folder structure:
    - benchmark/timestamp/imagename/ for each image
    - benchmark/timestamp/metrics/ for aggregated stats

    Args:
        n: Image size passed through to the MHRQI pipeline.
        _strength: Denoiser strength parameter (currently unused in this
                   function; kept for interface compatibility).

    Returns:
        all_results: Dict mapping image name -> method name -> metrics dict.
        base_dir: Path to the benchmark output directory
                  (benchmark/<timestamp>).
        metrics_dir: Path to the aggregated metrics subdirectory
                     (base_dir/metrics).
    """
    # Create base directory: benchmark/timestamp/ (matching runs/ format)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")  # Same as runs/ folder
    base_dir = os.path.join("benchmark", timestamp)
    metrics_dir = os.path.join(base_dir, "metrics")
    os.makedirs(metrics_dir, exist_ok=True)

    all_results = {}

    for img_path in MEDICAL_IMAGES:
        # Image name = basename with ".jpeg" stripped; other extensions
        # (e.g. ".png") are kept as-is in the folder name.
        img_name = os.path.basename(img_path).replace(".jpeg", "")
        img_dir = os.path.join(base_dir, img_name)
        os.makedirs(img_dir, exist_ok=True)

        print(f"\n{'=' * 60}")
        print(f"Processing: {img_name}")
        print(f"Output: {img_dir}")
        print(f"{'=' * 60}")

        # Run MHRQI pipeline
        orig, recon, run_dir = main.main(
            shots=1000,
            n=n,
            d=2,
            denoise=True,
            use_shots=False,
            fast=True,
            verbose_plots=False,
            img_path=img_path,
            run_comparison=False,
        )

        # Save original
        cv2.imwrite(os.path.join(img_dir, "original.png"), orig)

        # Run comparison with BM3D as SOTA reference for FR metrics
        noisy_img = compare_to.to_float01(orig)

        # Run comparison (no synthetic clean reference)
        comparison_results = compare_to.compare_to(
            noisy_img,
            proposed_img=compare_to.to_float01(recon),
            methods="all",
            plot=True,  # Generate report plots
            save=True,
            save_prefix="denoised",
            save_dir=img_dir,
            reference_image=None,  # No synthetic reference
        )

        # Extract metrics for each method
        img_results = {}
        for r in comparison_results:
            method_name = r["name"]
            img_results[method_name] = r["metrics"]

        all_results[img_name] = img_results

    # Save raw results to metrics folder; NaN metric values become JSON null
    # since NaN is not valid JSON.
    with open(os.path.join(metrics_dir, "raw_results.json"), "w") as f:
        serializable = {}
        for img, methods in all_results.items():
            serializable[img] = {}
            for method, metrics in methods.items():
                serializable[img][method] = {
                    k: float(v) if not np.isnan(v) else None for k, v in metrics.items()
                }
        json.dump(serializable, f, indent=2)

    return all_results, base_dir, metrics_dir

wilcoxon_test(all_results, method1, method2, metric)

Perform Wilcoxon signed-rank test between two methods on a specific metric.

Source code in mhrqi/benchmarks/statistical_benchmark.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def wilcoxon_test(all_results, method1, method2, metric):
    """
    Perform Wilcoxon signed-rank test between two methods on a specific metric.

    Args:
        all_results: Dict mapping image name -> method name -> metrics dict.
        method1: Name of the first method (differences are method1 - method2).
        method2: Name of the second method.
        metric: Metric key to compare.

    Returns:
        (statistic, p_value, diffs) on success; (None, None, diffs) when
        fewer than 3 paired samples exist or the test raises.
    """
    # Collect paired differences over images where both methods report a
    # non-NaN value for this metric.
    paired_diffs = []
    for per_method in all_results.values():
        if method1 not in per_method and method2 not in per_method:
            continue
        if method1 not in per_method or method2 not in per_method:
            continue
        a = per_method[method1].get(metric, float("nan"))
        b = per_method[method2].get(metric, float("nan"))
        if np.isnan(a) or np.isnan(b):
            continue
        paired_diffs.append(a - b)

    # The signed-rank test needs at least 3 paired samples to be meaningful.
    if len(paired_diffs) < 3:
        return None, None, paired_diffs

    try:
        statistic, p_value = stats.wilcoxon(paired_diffs)
    except Exception as e:
        # Best-effort: e.g. all differences zero makes the test undefined.
        print(f"Wilcoxon test failed: {e}")
        return None, None, paired_diffs
    return statistic, p_value, paired_diffs