From acdeb1129a605480e554b4479bf34fb33e2d01f7 Mon Sep 17 00:00:00 2001
From: Jiao77
Date: Mon, 20 Oct 2025 10:08:35 -0400
Subject: [PATCH] add GPU performance benchmark

---
 benchmark_grid.json                       | 108 +++++++++++-----------
 docs/description/Performance_Benchmark.md |  42 +++++++++
 2 files changed, 96 insertions(+), 54 deletions(-)

diff --git a/benchmark_grid.json b/benchmark_grid.json
index 493d645..df58920 100644
--- a/benchmark_grid.json
+++ b/benchmark_grid.json
@@ -2,91 +2,91 @@
   {
     "backbone": "vgg16",
     "attention": "none",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 351.6519069671631,
-    "single_ms_std": 1.8778125281542124,
-    "fpn_ms_mean": 719.3304697672526,
-    "fpn_ms_std": 3.949980966745213,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 4.528331756591797,
+    "single_ms_std": 0.018315389112121477,
+    "fpn_ms_mean": 8.5052490234375,
+    "fpn_ms_std": 0.0024987359059474757,
+    "runs": 5
   },
   {
     "backbone": "vgg16",
     "attention": "se",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 349.7585455576579,
-    "single_ms_std": 1.9950684383137551,
-    "fpn_ms_mean": 721.4130560557047,
-    "fpn_ms_std": 2.7448351792281374,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 3.79791259765625,
+    "single_ms_std": 0.014929344228397397,
+    "fpn_ms_mean": 7.117033004760742,
+    "fpn_ms_std": 0.0039580356539625425,
+    "runs": 5
   },
   {
     "backbone": "vgg16",
     "attention": "cbam",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 354.4490337371826,
-    "single_ms_std": 1.4903953036396786,
-    "fpn_ms_mean": 744.7629769643148,
-    "fpn_ms_std": 29.3233387791729,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 3.7283897399902344,
+    "single_ms_std": 0.01896289713396852,
+    "fpn_ms_mean": 6.954669952392578,
+    "fpn_ms_std": 0.0946284511822057,
+    "runs": 5
   },
   {
     "backbone": "resnet34",
     "attention": "none",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 90.98696708679199,
-    "single_ms_std": 0.41179110533866975,
-    "fpn_ms_mean": 117.2173023223877,
-    "fpn_ms_std": 0.40632490569423124,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 2.3172378540039062,
+    "single_ms_std": 0.03704733205002756,
+    "fpn_ms_mean": 2.7330875396728516,
+    "fpn_ms_std": 0.006544318567008118,
+    "runs": 5
   },
   {
     "backbone": "resnet34",
     "attention": "se",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 90.78375498453777,
-    "single_ms_std": 0.4705899743190883,
-    "fpn_ms_mean": 115.90576171875,
-    "fpn_ms_std": 1.3081578935341862,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 2.3345470428466797,
+    "single_ms_std": 0.01149701754726714,
+    "fpn_ms_mean": 2.7266979217529297,
+    "fpn_ms_std": 0.0040167693497949,
+    "runs": 5
   },
   {
     "backbone": "resnet34",
     "attention": "cbam",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 96.49538993835449,
-    "single_ms_std": 3.17170034860506,
-    "fpn_ms_mean": 111.08938852945964,
-    "fpn_ms_std": 1.0126843546619573,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 2.4645328521728516,
+    "single_ms_std": 0.03573384703501215,
+    "fpn_ms_mean": 2.7351856231689453,
+    "fpn_ms_std": 0.004198875420141471,
+    "runs": 5
   },
   {
     "backbone": "efficientnet_b0",
     "attention": "none",
-    "places": "backbone_high,desc_head",
-    "single_ms_mean": 40.451606114705406,
-    "single_ms_std": 1.5293525027201111,
-    "fpn_ms_mean": 127.30161348978679,
-    "fpn_ms_std": 0.08508800981401025,
-    "runs": 3
+    "places": "backbone_high",
+    "single_ms_mean": 3.6920547485351562,
+    "single_ms_std": 0.06926683030174544,
+    "fpn_ms_mean": 4.38084602355957,
"fpn_ms_std": 0.021533091774855868, + "runs": 5 }, { "backbone": "efficientnet_b0", "attention": "se", - "places": "backbone_high,desc_head", - "single_ms_mean": 46.480417251586914, - "single_ms_std": 0.2622188910897682, - "fpn_ms_mean": 142.35099156697592, - "fpn_ms_std": 6.611047958580852, - "runs": 3 + "places": "backbone_high", + "single_ms_mean": 3.7618160247802734, + "single_ms_std": 0.05971848107723002, + "fpn_ms_mean": 4.3704986572265625, + "fpn_ms_std": 0.02873211962906253, + "runs": 5 }, { "backbone": "efficientnet_b0", "attention": "cbam", - "places": "backbone_high,desc_head", - "single_ms_mean": 47.10610707600912, - "single_ms_std": 0.47150733957171853, - "fpn_ms_mean": 150.99199612935385, - "fpn_ms_std": 12.465987661773038, - "runs": 3 + "places": "backbone_high", + "single_ms_mean": 3.9876937866210938, + "single_ms_std": 0.07599183707384338, + "fpn_ms_mean": 4.412364959716797, + "fpn_ms_std": 0.023552763127197434, + "runs": 5 } ] \ No newline at end of file diff --git a/docs/description/Performance_Benchmark.md b/docs/description/Performance_Benchmark.md index 0d2841e..958d937 100644 --- a/docs/description/Performance_Benchmark.md +++ b/docs/description/Performance_Benchmark.md @@ -64,6 +64,48 @@ PYTHONPATH=. uv run python tests/benchmark_grid.py \ 运行会同时输出控制台摘要并保存 JSON:`benchmark_grid.json`。 +## GPU 测试结果(A100) + +最后更新:2025-01-XX +设备:NVIDIA A100(CUDA) +输入:1×3×512×512 随机张量 +重复次数:5(每组) +注意力放置位置:backbone_high + +> 说明:本测试在 A100 GPU 上进行,展示了不同骨干网络和注意力模块组合在 GPU 上的推理性能。 + +### 结果汇总(ms) + +| Backbone | Attention | Single Mean ± Std | FPN Mean ± Std | +|--------------------|-----------|------------------:|---------------:| +| vgg16 | none | 4.53 ± 0.02 | 8.51 ± 0.002 | +| vgg16 | se | 3.80 ± 0.01 | 7.12 ± 0.004 | +| vgg16 | cbam | 3.73 ± 0.02 | 6.95 ± 0.09 | +| resnet34 | none | 2.32 ± 0.04 | 2.73 ± 0.007 | +| resnet34 | se | 2.33 ± 0.01 | 2.73 ± 0.004 | +| resnet34 | cbam | 2.46 ± 0.04 | 2.74 ± 0.004 | +| efficientnet_b0 | none | 3.69 ± 0.07 | 4.38 ± 0.02 | +| efficientnet_b0 | se | 3.76 ± 0.06 | 4.37 ± 0.03 | +| efficientnet_b0 | cbam | 3.99 ± 0.08 | 4.41 ± 0.02 | + +复现实验: + +```zsh +PYTHONPATH=. uv run python tests/benchmark_grid.py \ + --device cuda --image-size 512 --runs 5 \ + --backbones vgg16 resnet34 efficientnet_b0 \ + --attentions none se cbam \ + --places backbone_high +``` + +### GPU 测试观察 + +- **ResNet34 表现最佳**:在 GPU 上,ResNet34 在单尺度和 FPN 路径上都表现出色,单尺度约 2.3ms,FPN 约 2.7ms。 +- **VGG16 在 GPU 上仍有明显开销**:尽管在 GPU 上加速,VGG16 仍然是三种骨干中最慢的,单尺度约 3.7-4.5ms。 +- **EfficientNet-B0 表现中等**:在 GPU 上介于 VGG16 和 ResNet34 之间,单尺度约 3.7-4.0ms。 +- **注意力模块影响较小**:在 GPU 上,注意力模块(SE、CBAM)对性能的影响相对较小,FPN 路径上的差异尤其不明显。 +- **FPN 开销相对可控**:在 GPU 上,FPN 路径相比单尺度的额外开销较小,ResNet34 仅增加约 18%。 + ## 观察与解读 - vgg16 明显最慢,FPN 额外的横向/上采样代价在 CPU 上更突出(>2×)。 - resnet34 在单尺度上显著快于 vgg16,FPN 增幅较小(约 +25%)。