## M1 Pro (old 22c96b4)

make -j && ./scripts/bench-all.sh 8

Running memcpy benchmark

memcpy:   39.10 GB/s (heat-up)
memcpy:   44.75 GB/s ( 1 thread)
memcpy:   44.78 GB/s ( 1 thread)
memcpy:   44.97 GB/s ( 2 thread)
memcpy:   48.04 GB/s ( 3 thread)
memcpy:   50.55 GB/s ( 4 thread)
memcpy:   55.20 GB/s ( 5 thread)
memcpy:   65.60 GB/s ( 6 thread)
memcpy:   70.64 GB/s ( 7 thread)
memcpy:   73.34 GB/s ( 8 thread)
sum:    -5120002535.000000


make -j && ./scripts/bench-all.sh 1 0 0

Running ggml_mul_mat benchmark with 1 threads

  64 x   64: Q4_0   237.1 GFLOPS (128 runs) | Q4_1   168.6 GFLOPS (128 runs)
  64 x   64: Q5_0   136.4 GFLOPS (128 runs) | Q5_1   135.6 GFLOPS (128 runs) | Q8_0   243.1 GFLOPS (128 runs)
  64 x   64: F16    140.4 GFLOPS (128 runs) | F32    316.6 GFLOPS (128 runs)
 128 x  128: Q4_0   496.6 GFLOPS (128 runs) | Q4_1   348.6 GFLOPS (128 runs)
 128 x  128: Q5_0   273.2 GFLOPS (128 runs) | Q5_1   274.1 GFLOPS (128 runs) | Q8_0   505.1 GFLOPS (128 runs)
 128 x  128: F16    300.4 GFLOPS (128 runs) | F32    653.9 GFLOPS (128 runs)
 256 x  256: Q4_0   791.7 GFLOPS (128 runs) | Q4_1   615.3 GFLOPS (128 runs)
 256 x  256: Q5_0   651.0 GFLOPS (128 runs) | Q5_1   674.7 GFLOPS (128 runs) | Q8_0   803.1 GFLOPS (128 runs)
 256 x  256: F16    869.6 GFLOPS (128 runs) | F32    957.2 GFLOPS (128 runs)
 512 x  512: Q4_0   973.3 GFLOPS (128 runs) | Q4_1   897.9 GFLOPS (128 runs)
 512 x  512: Q5_0  1078.8 GFLOPS (128 runs) | Q5_1   998.4 GFLOPS (128 runs) | Q8_0   752.4 GFLOPS (128 runs)
 512 x  512: F16    892.5 GFLOPS (128 runs) | F32   1399.6 GFLOPS (128 runs)
1024 x 1024: Q4_0  1402.7 GFLOPS (128 runs) | Q4_1  1218.5 GFLOPS (128 runs)
1024 x 1024: Q5_0  1444.8 GFLOPS (128 runs) | Q5_1  1444.7 GFLOPS (128 runs) | Q8_0  1395.7 GFLOPS (128 runs)
1024 x 1024: F16   1524.1 GFLOPS (128 runs) | F32   1726.6 GFLOPS (128 runs)
2048 x 2048: Q4_0  1479.4 GFLOPS ( 87 runs) | Q4_1  1378.5 GFLOPS ( 81 runs)
2048 x 2048: Q5_0  1454.6 GFLOPS ( 85 runs) | Q5_1  1462.9 GFLOPS ( 86 runs) | Q8_0  1483.2 GFLOPS ( 87 runs)
2048 x 2048: F16   1488.0 GFLOPS ( 87 runs) | F32   1538.2 GFLOPS ( 90 runs)
4096 x 4096: Q4_0  1509.7 GFLOPS ( 11 runs) | Q4_1  1433.0 GFLOPS ( 11 runs)
4096 x 4096: Q5_0  1422.4 GFLOPS ( 11 runs) | Q5_1  1437.0 GFLOPS ( 11 runs) | Q8_0  1523.0 GFLOPS ( 12 runs)
4096 x 4096: F16   1551.3 GFLOPS ( 12 runs) | F32   1451.0 GFLOPS ( 11 runs)

|    CPU | Config |         Model |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|    --- |    --- |           --- | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M1 Pro |  METAL |          tiny |   1 |   0 |   32.44 |    1.71 |    0.43 |    0.04 | 8a67c55c |
| M1 Pro |  METAL |          base |   1 |   0 |   63.54 |    2.62 |    0.71 |    0.06 | 8a67c55c |
| M1 Pro |  METAL |         small |   1 |   0 |  200.30 |    5.34 |    1.72 |    0.17 | 8a67c55c |
| M1 Pro |  METAL |        medium |   1 |   0 |  580.06 |   11.71 |    4.18 |    0.45 | 8a67c55c |


make -j && ./scripts/bench-all.sh 1 1 1

|    CPU | Config |         Model |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|    --- |    --- |           --- | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M1 Pro |  METAL |          tiny |   1 |   1 |   22.09 |    1.84 |    0.43 |    0.03 | 8a67c55c |
| M1 Pro |  METAL |          base |   1 |   1 |   40.57 |    2.22 |    0.44 |    0.04 | 8a67c55c |
| M1 Pro |  METAL |         small |   1 |   1 |  135.15 |    4.23 |    0.95 |    0.12 | 8a67c55c |
| M1 Pro |  METAL |        medium |   1 |   1 |  395.18 |    9.14 |    2.21 |    0.30 | 8a67c55c |


## M2 Ultra

make -j && ./scripts/bench-all.sh 8

Running memcpy benchmark

memcpy:   48.01 GB/s (heat-up)
memcpy:   56.00 GB/s ( 1 thread)
memcpy:   56.20 GB/s ( 1 thread)
memcpy:  102.69 GB/s ( 2 thread)
memcpy:  140.32 GB/s ( 3 thread)
memcpy:  179.04 GB/s ( 4 thread)
memcpy:  159.61 GB/s ( 5 thread)
memcpy:  159.02 GB/s ( 6 thread)
memcpy:  180.29 GB/s ( 7 thread)
memcpy:  198.10 GB/s ( 8 thread)
sum:    -5119999345.000000


make -j && ./scripts/bench-all.sh 1

Running ggml_mul_mat benchmark with 1 threads

  64 x   64: Q4_0    37.7 GFLOPS (128 runs) | Q4_1    36.0 GFLOPS (128 runs)
  64 x   64: Q5_0    20.1 GFLOPS (128 runs) | Q5_1    19.8 GFLOPS (128 runs) | Q8_0    39.5 GFLOPS (128 runs)
  64 x   64: F16     29.9 GFLOPS (128 runs) | F32     22.6 GFLOPS (128 runs)
 128 x  128: Q4_0    71.0 GFLOPS (128 runs) | Q4_1    62.2 GFLOPS (128 runs)
 128 x  128: Q5_0    33.4 GFLOPS (128 runs) | Q5_1    31.6 GFLOPS (128 runs) | Q8_0    79.8 GFLOPS (128 runs)
 128 x  128: F16     52.4 GFLOPS (128 runs) | F32     32.7 GFLOPS (128 runs)
 256 x  256: Q4_0    88.6 GFLOPS (128 runs) | Q4_1    77.2 GFLOPS (128 runs)
 256 x  256: Q5_0    40.3 GFLOPS (128 runs) | Q5_1    36.8 GFLOPS (128 runs) | Q8_0   102.5 GFLOPS (128 runs)
 256 x  256: F16     64.6 GFLOPS (128 runs) | F32     36.4 GFLOPS (128 runs)
 512 x  512: Q4_0    94.7 GFLOPS (128 runs) | Q4_1    83.6 GFLOPS (128 runs)
 512 x  512: Q5_0    45.9 GFLOPS (128 runs) | Q5_1    41.3 GFLOPS (128 runs) | Q8_0   112.8 GFLOPS (128 runs)
 512 x  512: F16     72.3 GFLOPS (128 runs) | F32     37.7 GFLOPS (128 runs)
1024 x 1024: Q4_0    98.9 GFLOPS ( 47 runs) | Q4_1    88.2 GFLOPS ( 42 runs)
1024 x 1024: Q5_0    49.0 GFLOPS ( 23 runs) | Q5_1    43.9 GFLOPS ( 21 runs) | Q8_0   121.0 GFLOPS ( 57 runs)
1024 x 1024: F16     72.6 GFLOPS ( 34 runs) | F32     36.0 GFLOPS ( 17 runs)
2048 x 2048: Q4_0   101.3 GFLOPS (  6 runs) | Q4_1    90.0 GFLOPS (  6 runs)
2048 x 2048: Q5_0    50.8 GFLOPS (  3 runs) | Q5_1    45.3 GFLOPS (  3 runs) | Q8_0   124.1 GFLOPS (  8 runs)
2048 x 2048: F16     70.7 GFLOPS (  5 runs) | F32     30.4 GFLOPS (  3 runs)
4096 x 4096: Q4_0   101.7 GFLOPS (  3 runs) | Q4_1    90.3 GFLOPS (  3 runs)
4096 x 4096: Q5_0    52.2 GFLOPS (  3 runs) | Q5_1    45.7 GFLOPS (  3 runs) | Q8_0   123.0 GFLOPS (  3 runs)
4096 x 4096: F16     60.3 GFLOPS (  3 runs) | F32     29.8 GFLOPS (  3 runs)


make -j && ./scripts/bench-all.sh 1 1 0

|      CPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M2 ULTRA |  METAL |          tiny       |   1 |   0 |    8.10 |    1.03 |    0.25 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     tiny-q5_0       |   1 |   0 |    8.53 |    1.02 |    0.26 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     tiny-q5_1       |   1 |   0 |    8.67 |    1.00 |    0.26 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     tiny-q8_0       |   1 |   0 |    9.32 |    1.02 |    0.26 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |          base       |   1 |   0 |   15.50 |    1.51 |    0.40 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |     base-q5_0       |   1 |   0 |   16.63 |    1.45 |    0.40 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |     base-q5_1       |   1 |   0 |   16.76 |    1.44 |    0.39 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |     base-q8_0       |   1 |   0 |   15.73 |    1.43 |    0.38 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |         small       |   1 |   0 |   45.43 |    2.93 |    0.83 |    0.05 | f14ae77f |
| M2 ULTRA |  METAL |    small-q5_0       |   1 |   0 |   49.78 |    2.85 |    0.84 |    0.06 | f14ae77f |
| M2 ULTRA |  METAL |    small-q5_1       |   1 |   0 |   50.22 |    2.85 |    0.84 |    0.06 | f14ae77f |
| M2 ULTRA |  METAL |    small-q8_0       |   1 |   0 |   47.08 |    2.78 |    0.83 |    0.05 | f14ae77f |
| M2 ULTRA |  METAL |        medium       |   1 |   0 |  125.19 |    6.10 |    1.88 |    0.12 | f14ae77f |
| M2 ULTRA |  METAL |   medium-q5_0       |   1 |   0 |  142.49 |    5.59 |    1.90 |    0.14 | f14ae77f |
| M2 ULTRA |  METAL |   medium-q5_1       |   1 |   0 |  142.63 |    5.68 |    1.92 |    0.14 | f14ae77f |
| M2 ULTRA |  METAL |   medium-q8_0       |   1 |   0 |  130.98 |    5.83 |    1.87 |    0.13 | f14ae77f |
| M2 ULTRA |  METAL |    medium-dis       |   1 |   0 |  113.95 |    0.88 |    0.24 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |      large-v2       |   1 |   0 |  239.27 |    8.97 |    2.92 |    0.21 | f14ae77f |
| M2 ULTRA |  METAL | large-v2-q5_0       |   1 |   0 |  275.07 |    8.56 |    2.92 |    0.24 | f14ae77f |
| M2 ULTRA |  METAL | large-v2-q5_1       |   1 |   0 |  274.28 |    8.62 |    2.93 |    0.24 | f14ae77f |
| M2 ULTRA |  METAL | large-v2-q8_0       |   1 |   0 |  248.90 |    8.32 |    2.81 |    0.22 | f14ae77f |
| M2 ULTRA |  METAL |  large-v2-dis       |   1 |   0 |  214.26 |    0.97 |    0.27 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL | large-v3-turbo      |   1 |   0 |  222.47 |    1.49 |    0.45 |    0.03 | f14ae77f |
| M2 ULTRA |  METAL | large-v3-turbo-q5_0 |   1 |   0 |  250.56 |    1.35 |    0.45 |    0.04 | f14ae77f |
| M2 ULTRA |  METAL | large-v3-turbo-q8_0 |   1 |   0 |  228.57 |    1.33 |    0.43 |    0.03 | f14ae77f |

make -j && ./scripts/bench-all.sh 1 1 1

|      CPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M2 ULTRA |  METAL |          tiny       |   1 |   1 |    6.03 |    0.86 |    0.20 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     tiny-q5_0       |   1 |   1 |    6.46 |    0.84 |    0.21 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     tiny-q5_1       |   1 |   1 |    6.46 |    0.85 |    0.21 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     tiny-q8_0       |   1 |   1 |    6.14 |    0.88 |    0.20 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |          base       |   1 |   1 |   10.87 |    1.24 |    0.31 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |     base-q5_0       |   1 |   1 |   11.98 |    1.18 |    0.31 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |     base-q5_1       |   1 |   1 |   12.07 |    1.18 |    0.31 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |     base-q8_0       |   1 |   1 |   11.13 |    1.19 |    0.30 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL |         small       |   1 |   1 |   31.46 |    2.37 |    0.63 |    0.04 | f14ae77f |
| M2 ULTRA |  METAL |    small-q5_0       |   1 |   1 |   36.16 |    2.31 |    0.65 |    0.04 | f14ae77f |
| M2 ULTRA |  METAL |    small-q5_1       |   1 |   1 |   36.57 |    2.31 |    0.65 |    0.04 | f14ae77f |
| M2 ULTRA |  METAL |    small-q8_0       |   1 |   1 |   32.94 |    2.27 |    0.63 |    0.04 | f14ae77f |
| M2 ULTRA |  METAL |        medium       |   1 |   1 |   89.86 |    4.92 |    1.41 |    0.09 | f14ae77f |
| M2 ULTRA |  METAL |   medium-q5_0       |   1 |   1 |  107.12 |    4.72 |    1.42 |    0.10 | f14ae77f |
| M2 ULTRA |  METAL |   medium-q5_1       |   1 |   1 |  107.00 |    4.70 |    1.42 |    0.10 | f14ae77f |
| M2 ULTRA |  METAL |   medium-q8_0       |   1 |   1 |   94.93 |    4.56 |    1.37 |    0.09 | f14ae77f |
| M2 ULTRA |  METAL |    medium-dis       |   1 |   1 |   79.66 |    0.78 |    0.20 |    0.01 | f14ae77f |
| M2 ULTRA |  METAL |      large-v2       |   1 |   1 |  170.06 |    7.13 |    2.15 |    0.16 | f14ae77f |
| M2 ULTRA |  METAL | large-v2-q5_0       |   1 |   1 |  205.16 |    6.80 |    2.18 |    0.20 | f14ae77f |
| M2 ULTRA |  METAL | large-v2-q5_1       |   1 |   1 |  204.22 |    6.69 |    2.16 |    0.20 | f14ae77f |
| M2 ULTRA |  METAL | large-v2-q8_0       |   1 |   1 |  179.78 |    6.35 |    2.13 |    0.18 | f14ae77f |
| M2 ULTRA |  METAL |  large-v2-dis       |   1 |   1 |  148.11 |    0.89 |    0.22 |    0.02 | f14ae77f |
| M2 ULTRA |  METAL | large-v3-turbo      |   1 |   1 |  149.23 |    1.29 |    0.34 |    0.03 | f14ae77f |
| M2 ULTRA |  METAL | large-v3-turbo-q5_0 |   1 |   1 |  180.77 |    1.13 |    0.35 |    0.03 | f14ae77f |
| M2 ULTRA |  METAL | large-v3-turbo-q8_0 |   1 |   1 |  158.66 |    1.10 |    0.33 |    0.03 | f14ae77f |


## M4 Max

make -j && ./scripts/bench-all.sh 8

Running memcpy benchmark

memcpy:   57.23 GB/s (heat-up)
memcpy:   68.85 GB/s ( 1 thread)
memcpy:   70.00 GB/s ( 1 thread)
memcpy:  104.83 GB/s ( 2 thread)
memcpy:  124.54 GB/s ( 3 thread)
memcpy:  144.30 GB/s ( 4 thread)
memcpy:  141.24 GB/s ( 5 thread)
memcpy:  147.03 GB/s ( 6 thread)
memcpy:  147.18 GB/s ( 7 thread)
memcpy:  149.83 GB/s ( 8 thread)
sum:    -5120001475.000000


make -j && ./scripts/bench-all.sh 1

Running ggml_mul_mat benchmark with 1 threads

  64 x   64: Q4_0    49.6 GFLOPS (128 runs) | Q4_1    46.8 GFLOPS (128 runs)
  64 x   64: Q5_0    28.1 GFLOPS (128 runs) | Q5_1    26.8 GFLOPS (128 runs) | Q8_0    52.3 GFLOPS (128 runs)
  64 x   64: F16     38.1 GFLOPS (128 runs) | F32     26.0 GFLOPS (128 runs)
 128 x  128: Q4_0    87.6 GFLOPS (128 runs) | Q4_1    79.9 GFLOPS (128 runs)
 128 x  128: Q5_0    44.7 GFLOPS (128 runs) | Q5_1    41.6 GFLOPS (128 runs) | Q8_0    98.9 GFLOPS (128 runs)
 128 x  128: F16     64.1 GFLOPS (128 runs) | F32     35.4 GFLOPS (128 runs)
 256 x  256: Q4_0   104.2 GFLOPS (128 runs) | Q4_1    92.3 GFLOPS (128 runs)
 256 x  256: Q5_0    57.3 GFLOPS (128 runs) | Q5_1    51.5 GFLOPS (128 runs) | Q8_0   127.7 GFLOPS (128 runs)
 256 x  256: F16     71.4 GFLOPS (128 runs) | F32     40.6 GFLOPS (128 runs)
 512 x  512: Q4_0   109.5 GFLOPS (128 runs) | Q4_1    98.0 GFLOPS (128 runs)
 512 x  512: Q5_0    62.4 GFLOPS (128 runs) | Q5_1    54.6 GFLOPS (128 runs) | Q8_0   135.0 GFLOPS (128 runs)
 512 x  512: F16     82.6 GFLOPS (128 runs) | F32     44.6 GFLOPS (128 runs)
1024 x 1024: Q4_0   112.1 GFLOPS ( 53 runs) | Q4_1   100.9 GFLOPS ( 47 runs)
1024 x 1024: Q5_0    65.4 GFLOPS ( 31 runs) | Q5_1    56.7 GFLOPS ( 27 runs) | Q8_0   140.9 GFLOPS ( 66 runs)
1024 x 1024: F16     88.0 GFLOPS ( 41 runs) | F32     43.4 GFLOPS ( 21 runs)
2048 x 2048: Q4_0   113.4 GFLOPS (  7 runs) | Q4_1   102.0 GFLOPS (  6 runs)
2048 x 2048: Q5_0    67.1 GFLOPS (  4 runs) | Q5_1    57.7 GFLOPS (  4 runs) | Q8_0   142.7 GFLOPS (  9 runs)
2048 x 2048: F16     84.6 GFLOPS (  5 runs) | F32     37.5 GFLOPS (  3 runs)
4096 x 4096: Q4_0   113.8 GFLOPS (  3 runs) | Q4_1   102.0 GFLOPS (  3 runs)
4096 x 4096: Q5_0    67.7 GFLOPS (  3 runs) | Q5_1    58.0 GFLOPS (  3 runs) | Q8_0   142.9 GFLOPS (  3 runs)
4096 x 4096: F16     73.7 GFLOPS (  3 runs) | F32     36.1 GFLOPS (  3 runs)


make -j && ./scripts/bench-all.sh 1 1 0

|    CPU |  Config |         Model  |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|    --- |     --- |           ---  | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M4 Max |   METAL |          tiny  |   1 |   0 |   10.75 |    0.87 |    0.24 |    0.01 | 47af2fb7 |
| M4 Max |   METAL |     tiny-q8_0  |   1 |   0 |   11.15 |    0.85 |    0.24 |    0.01 | 47af2fb7 |
| M4 Max |   METAL |          base  |   1 |   0 |   20.12 |    1.34 |    0.36 |    0.02 | 47af2fb7 |
| M4 Max |   METAL |     base-q8_0  |   1 |   0 |   20.40 |    1.25 |    0.37 |    0.02 | 47af2fb7 |
| M4 Max |   METAL |         small  |   1 |   0 |   63.80 |    2.75 |    0.77 |    0.06 | 47af2fb7 |
| M4 Max |   METAL |    small-q8_0  |   1 |   0 |   65.46 |    2.43 |    0.77 |    0.06 | 47af2fb7 |
| M4 Max |   METAL |        medium  |   1 |   0 |  184.43 |    6.21 |    1.82 |    0.15 | 47af2fb7 |
| M4 Max |   METAL |   medium-q8_0  |   1 |   0 |  190.19 |    5.76 |    1.86 |    0.15 | 47af2fb7 |
| M4 Max |   METAL |      large-v2  |   1 |   0 |  344.05 |   10.64 |    3.07 |    0.26 | 47af2fb7 |
| M4 Max |   METAL | large-v2-q8_0  |   1 |   0 |  355.43 |    8.83 |    3.03 |    0.27 | 47af2fb7 |
| M4 Max |   METAL | large-v3-turbo |   1 |   0 |  306.64 |    1.82 |    0.49 |    0.04 | 47af2fb7 |


make -j && ./scripts/bench-all.sh 1 1 1

|    CPU |  Config |         Model  |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|    --- |     --- |           ---  | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M4 Max |   METAL |          tiny  |   1 |   1 |    8.23 |    0.72 |    0.16 |    0.01 | 47af2fb7 |
| M4 Max |   METAL |     tiny-q8_0  |   1 |   1 |    8.38 |    0.68 |    0.16 |    0.01 | 47af2fb7 |
| M4 Max |   METAL |          base  |   1 |   1 |   15.66 |    1.16 |    0.26 |    0.02 | 47af2fb7 |
| M4 Max |   METAL |     base-q8_0  |   1 |   1 |   15.88 |    1.08 |    0.27 |    0.02 | 47af2fb7 |
| M4 Max |   METAL |         small  |   1 |   1 |   50.34 |    2.38 |    0.54 |    0.05 | 47af2fb7 |
| M4 Max |   METAL |    small-q8_0  |   1 |   1 |   51.90 |    1.98 |    0.54 |    0.05 | 47af2fb7 |
| M4 Max |   METAL |        medium  |   1 |   1 |  149.55 |    5.59 |    1.30 |    0.12 | 47af2fb7 |
| M4 Max |   METAL |   medium-q8_0  |   1 |   1 |  154.34 |    4.65 |    1.28 |    0.13 | 47af2fb7 |
| M4 Max |   METAL |      large-v2  |   1 |   1 |  291.28 |    9.16 |    2.14 |    0.22 | 47af2fb7 |
| M4 Max |   METAL | large-v2-q8_0  |   1 |   1 |  301.06 |    7.21 |    2.08 |    0.23 | 47af2fb7 |
| M4 Max |   METAL | large-v3-turbo |   1 |   1 |  256.23 |    1.61 |    0.38 |    0.04 | 47af2fb7 |


## M5 Max

make -j && ./scripts/bench-all.sh 1 1 0

|    CPU |  Config |         Model  |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|    --- |     --- |           ---  | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M5 Max |   METAL |          tiny  |   1 |   0 |    4.88 |    0.65 |    0.17 |    0.01 | f14ae77f |
| M5 Max |   METAL |     tiny-q8_0  |   1 |   0 |    4.84 |    0.63 |    0.17 |    0.01 | f14ae77f |
| M5 Max |   METAL |          base  |   1 |   0 |    8.95 |    1.02 |    0.24 |    0.01 | f14ae77f |
| M5 Max |   METAL |     base-q8_0  |   1 |   0 |    9.12 |    0.94 |    0.24 |    0.01 | f14ae77f |
| M5 Max |   METAL |         small  |   1 |   0 |   25.61 |    2.15 |    0.52 |    0.03 | f14ae77f |
| M5 Max |   METAL |    small-q8_0  |   1 |   0 |   25.77 |    1.93 |    0.50 |    0.03 | f14ae77f |
| M5 Max |   METAL |        medium  |   1 |   0 |   73.96 |    4.61 |    1.16 |    0.08 | f14ae77f |
| M5 Max |   METAL |   medium-q8_0  |   1 |   0 |   74.89 |    3.94 |    1.12 |    0.08 | f14ae77f |
| M5 Max |   METAL |      large-v2  |   1 |   0 |  132.06 |    6.91 |    1.86 |    0.13 | f14ae77f |
| M5 Max |   METAL | large-v2-q8_0  |   1 |   0 |  132.56 |    6.00 |    1.76 |    0.13 | f14ae77f |
| M5 Max |   METAL | large-v3-turbo |   1 |   0 |  119.34 |    1.30 |    0.32 |    0.02 | f14ae77f |


make -j && ./scripts/bench-all.sh 1 1 1

|    CPU |  Config |         Model  |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|    --- |     --- |           ---  | --- | --- |     --- |     --- |     --- |     --- |     --- |
| M5 Max |   METAL |          tiny  |   1 |   1 |    4.31 |    0.59 |    0.13 |    0.01 | f14ae77f |
| M5 Max |   METAL |     tiny-q8_0  |   1 |   1 |    4.51 |    0.55 |    0.12 |    0.01 | f14ae77f |
| M5 Max |   METAL |          base  |   1 |   1 |    7.77 |    0.91 |    0.20 |    0.01 | f14ae77f |
| M5 Max |   METAL |     base-q8_0  |   1 |   1 |    7.67 |    0.78 |    0.19 |    0.01 | f14ae77f |
| M5 Max |   METAL |         small  |   1 |   1 |   20.90 |    1.76 |    0.40 |    0.03 | f14ae77f |
| M5 Max |   METAL |    small-q8_0  |   1 |   1 |   21.32 |    1.62 |    0.38 |    0.03 | f14ae77f |
| M5 Max |   METAL |        medium  |   1 |   1 |   60.40 |    3.98 |    0.89 |    0.07 | f14ae77f |
| M5 Max |   METAL |   medium-q8_0  |   1 |   1 |   60.72 |    3.35 |    0.86 |    0.07 | f14ae77f |
| M5 Max |   METAL |      large-v2  |   1 |   1 |  110.57 |    6.06 |    1.41 |    0.12 | f14ae77f |
| M5 Max |   METAL | large-v2-q8_0  |   1 |   1 |  110.92 |    5.00 |    1.31 |    0.12 | f14ae77f |
| M5 Max |   METAL | large-v3-turbo |   1 |   1 |   98.36 |    1.19 |    0.27 |    0.02 | f14ae77f |


## RTX 5090

make -j && ./scripts/bench-all.sh 1 1 0

|      GPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| RTX 5090 |   CUDA |          tiny       |   1 |   0 |    2.17 |    0.38 |    0.10 |    0.00 | f14ae77f |
| RTX 5090 |   CUDA |     tiny-q8_0       |   1 |   0 |    2.31 |    0.37 |    0.10 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |          base       |   1 |   0 |    3.94 |    0.56 |    0.17 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |     base-q8_0       |   1 |   0 |    4.13 |    0.53 |    0.14 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |         small       |   1 |   0 |   12.06 |    1.09 |    0.34 |    0.02 | f14ae77f |
| RTX 5090 |   CUDA |    small-q8_0       |   1 |   0 |   12.50 |    1.11 |    0.30 |    0.02 | f14ae77f |
| RTX 5090 |   CUDA |        medium       |   1 |   0 |   33.08 |    2.38 |    0.70 |    0.04 | f14ae77f |
| RTX 5090 |   CUDA |   medium-q8_0       |   1 |   0 |   32.57 |    2.26 |    0.62 |    0.04 | f14ae77f |
| RTX 5090 |   CUDA |      large-v2       |   1 |   0 |   54.27 |    3.68 |    1.03 |    0.06 | f14ae77f |
| RTX 5090 |   CUDA | large-v2-q8_0       |   1 |   0 |   53.11 |    3.22 |    0.89 |    0.06 | f14ae77f |
| RTX 5090 |   CUDA | large-v3-turbo      |   1 |   0 |   50.56 |    0.58 |    0.15 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA | large-v3-turbo-q8_0 |   1 |   0 |   49.39 |    0.49 |    0.13 |    0.01 | f14ae77f |

make -j && ./scripts/bench-all.sh 1 1 1

|      GPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| RTX 5090 |   CUDA |          tiny       |   1 |   1 |    1.29 |    0.31 |    0.07 |    0.00 | f14ae77f |
| RTX 5090 |   CUDA |     tiny-q8_0       |   1 |   1 |    1.45 |    0.31 |    0.07 |    0.00 | f14ae77f |
| RTX 5090 |   CUDA |          base       |   1 |   1 |    2.15 |    0.44 |    0.13 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |     base-q8_0       |   1 |   1 |    2.27 |    0.43 |    0.10 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |         small       |   1 |   1 |    5.54 |    0.83 |    0.26 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |    small-q8_0       |   1 |   1 |    5.95 |    0.84 |    0.22 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA |        medium       |   1 |   1 |   15.43 |    1.81 |    0.53 |    0.02 | f14ae77f |
| RTX 5090 |   CUDA |   medium-q8_0       |   1 |   1 |   14.71 |    1.66 |    0.46 |    0.03 | f14ae77f |
| RTX 5090 |   CUDA |      large-v2       |   1 |   1 |   24.73 |    2.92 |    0.81 |    0.04 | f14ae77f |
| RTX 5090 |   CUDA | large-v2-q8_0       |   1 |   1 |   23.35 |    2.43 |    0.67 |    0.04 | f14ae77f |
| RTX 5090 |   CUDA | large-v3-turbo      |   1 |   1 |   21.36 |    0.49 |    0.13 |    0.01 | f14ae77f |
| RTX 5090 |   CUDA | large-v3-turbo-q8_0 |   1 |   1 |   20.07 |    0.39 |    0.10 |    0.01 | f14ae77f |


## DGX Spark

make -j && ./scripts/bench-all.sh 1 1 0

|      GPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| DGX Spk. |   CUDA |          tiny       |   1 |   0 |    9.00 |    0.85 |    0.14 |    0.01 | f5b477ab |
| DGX Spk. |   CUDA |     tiny-q8_0       |   1 |   0 |    8.86 |    0.83 |    0.12 |    0.01 | f5b477ab |
| DGX Spk. |   CUDA |          base       |   1 |   0 |   18.48 |    1.38 |    0.22 |    0.02 | f5b477ab |
| DGX Spk. |   CUDA |     base-q8_0       |   1 |   0 |   17.28 |    1.22 |    0.19 |    0.02 | f5b477ab |
| DGX Spk. |   CUDA |         small       |   1 |   0 |   56.43 |    3.01 |    0.51 |    0.04 | f5b477ab |
| DGX Spk. |   CUDA |    small-q8_0       |   1 |   0 |   55.70 |    2.68 |    0.44 |    0.04 | f5b477ab |
| DGX Spk. |   CUDA |        medium       |   1 |   0 |  160.20 |    7.52 |    1.25 |    0.11 | f5b477ab |
| DGX Spk. |   CUDA |   medium-q8_0       |   1 |   0 |  150.84 |    6.01 |    1.01 |    0.12 | f5b477ab |
| DGX Spk. |   CUDA |      large-v2       |   1 |   0 |  276.42 |   12.29 |    2.16 |    0.20 | f5b477ab |
| DGX Spk. |   CUDA | large-v2-q8_0       |   1 |   0 |  264.92 |    9.32 |    1.67 |    0.20 | f5b477ab |
| DGX Spk. |   CUDA | large-v3-turbo      |   1 |   0 |  264.90 |    2.03 |    0.37 |    0.03 | f5b477ab |
| DGX Spk. |   CUDA | large-v3-turbo-q8_0 |   1 |   0 |  253.56 |    1.48 |    0.27 |    0.03 | f5b477ab |

|      GPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| DGX Spk. |   CUDA |          tiny       |   1 |   0 |    9.79 |    0.65 |    0.14 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |     tiny-q8_0       |   1 |   0 |    8.97 |    0.56 |    0.12 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |          base       |   1 |   0 |   18.58 |    1.04 |    0.22 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |     base-q8_0       |   1 |   0 |   17.36 |    0.88 |    0.18 |    0.02 | f14ae77f |
| DGX Spk. |   CUDA |         small       |   1 |   0 |   56.78 |    2.33 |    0.51 |    0.04 | f14ae77f |
| DGX Spk. |   CUDA |    small-q8_0       |   1 |   0 |   55.47 |    1.99 |    0.43 |    0.04 | f14ae77f |
| DGX Spk. |   CUDA |        medium       |   1 |   0 |  158.21 |    5.71 |    1.23 |    0.11 | f14ae77f |
| DGX Spk. |   CUDA |   medium-q8_0       |   1 |   0 |  151.17 |    4.54 |    0.97 |    0.11 | f14ae77f |
| DGX Spk. |   CUDA |      large-v2       |   1 |   0 |  269.59 |   10.48 |    2.13 |    0.20 | f14ae77f |
| DGX Spk. |   CUDA | large-v2-q8_0       |   1 |   0 |  262.82 |    7.43 |    1.61 |    0.20 | f14ae77f |
| DGX Spk. |   CUDA | large-v3-turbo      |   1 |   0 |  263.91 |    1.80 |    0.37 |    0.03 | f14ae77f |
| DGX Spk. |   CUDA | large-v3-turbo-q8_0 |   1 |   0 |  252.89 |    1.23 |    0.26 |    0.03 | f14ae77f |

make -j && ./scripts/bench-all.sh 1 1 1

|      GPU | Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|      --- |    --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| DGX Spk. |   CUDA |          tiny       |   1 |   1 |    2.72 |    0.56 |    0.13 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |     tiny-q8_0       |   1 |   1 |    2.55 |    0.47 |    0.11 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |          base       |   1 |   1 |    5.08 |    0.90 |    0.20 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |     base-q8_0       |   1 |   1 |    4.38 |    0.72 |    0.16 |    0.01 | f14ae77f |
| DGX Spk. |   CUDA |         small       |   1 |   1 |   16.95 |    2.00 |    0.47 |    0.02 | f14ae77f |
| DGX Spk. |   CUDA |    small-q8_0       |   1 |   1 |   15.67 |    1.67 |    0.39 |    0.02 | f14ae77f |
| DGX Spk. |   CUDA |        medium       |   1 |   1 |   53.12 |    5.10 |    1.24 |    0.06 | f14ae77f |
| DGX Spk. |   CUDA |   medium-q8_0       |   1 |   1 |   43.64 |    3.87 |    0.91 |    0.05 | f14ae77f |
| DGX Spk. |   CUDA |      large-v2       |   1 |   1 |  102.15 |    9.58 |    2.02 |    0.08 | f14ae77f |
| DGX Spk. |   CUDA | large-v2-q8_0       |   1 |   1 |   93.86 |    6.54 |    1.49 |    0.08 | f14ae77f |
| DGX Spk. |   CUDA | large-v3-turbo      |   1 |   1 |   90.29 |    1.69 |    0.36 |    0.02 | f14ae77f |
| DGX Spk. |   CUDA | large-v3-turbo-q8_0 |   1 |   1 |   82.79 |    1.13 |    0.25 |    0.01 | f14ae77f |


## V100

GGML_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 0

|  GPU |    Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|  --- |       --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| V100 | AVX2 CUDA |          tiny       |   8 |   0 |    5.99 |    1.01 |    0.30 |    0.01 | dc8dda60 |
| V100 | AVX2 CUDA |     tiny-q5_1       |   8 |   0 |    6.07 |    1.00 |    0.26 |    0.01 | dc8dda60 |
| V100 | AVX2 CUDA |          base       |   8 |   0 |   10.96 |    1.44 |    0.43 |    0.02 | dc8dda60 |
| V100 | AVX2 CUDA |     base-q5_1       |   8 |   0 |   11.11 |    1.41 |    0.37 |    0.02 | dc8dda60 |
| V100 | AVX2 CUDA |         small       |   8 |   0 |   31.04 |    2.84 |    0.86 |    0.04 | dc8dda60 |
| V100 | AVX2 CUDA |    small-q5_1       |   8 |   0 |   31.69 |    2.82 |    0.71 |    0.04 | dc8dda60 |
| V100 | AVX2 CUDA |        medium       |   8 |   0 |   83.95 |    6.05 |    1.82 |    0.09 | dc8dda60 |
| V100 | AVX2 CUDA |   medium-q5_0       |   8 |   0 |   85.86 |    5.58 |    1.45 |    0.10 | dc8dda60 |
| V100 | AVX2 CUDA |      large-v2       |   8 |   0 |  138.50 |    8.70 |    2.71 |    0.15 | dc8dda60 |
| V100 | AVX2 CUDA | large-v2-q5_0       |   8 |   0 |  142.31 |    7.82 |    2.03 |    0.16 | dc8dda60 |
| V100 | AVX2 CUDA | large-v3-turbo      |   8 |   0 |  128.39 |    1.42 |    0.44 |    0.02 | dc8dda60 |
| V100 | AVX2 CUDA | large-v3-turbo-q5_0 |   8 |   0 |  131.24 |    1.17 |    0.33 |    0.03 | dc8dda60 |


GGML_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 1

|  GPU |    Config |         Model       |  Th |  FA |    Enc. |    Dec. |    Bch5 |      PP |  Commit |
|  --- |       --- |           ---       | --- | --- |     --- |     --- |     --- |     --- |     --- |
| V100 | AVX2 CUDA |          tiny       |   8 |   1 |    4.85 |    0.97 |    0.26 |    0.01 | dc8dda60 |
| V100 | AVX2 CUDA |     tiny-q5_1       |   8 |   1 |    4.97 |    0.89 |    0.19 |    0.01 | dc8dda60 |
| V100 | AVX2 CUDA |          base       |   8 |   1 |    7.23 |    1.28 |    0.35 |    0.02 | dc8dda60 |
| V100 | AVX2 CUDA |     base-q5_1       |   8 |   1 |    7.38 |    1.24 |    0.26 |    0.02 | dc8dda60 |
| V100 | AVX2 CUDA |         small       |   8 |   1 |   20.87 |    2.44 |    0.71 |    0.03 | dc8dda60 |
| V100 | AVX2 CUDA |    small-q5_1       |   8 |   1 |   19.80 |    2.35 |    0.51 |    0.03 | dc8dda60 |
| V100 | AVX2 CUDA |        medium       |   8 |   1 |   54.56 |    5.31 |    1.46 |    0.06 | dc8dda60 |
| V100 | AVX2 CUDA |   medium-q5_0       |   8 |   1 |   56.09 |    4.67 |    1.05 |    0.07 | dc8dda60 |
| V100 | AVX2 CUDA |      large-v2       |   8 |   1 |   87.05 |    7.65 |    2.16 |    0.10 | dc8dda60 |
| V100 | AVX2 CUDA | large-v2-q5_0       |   8 |   1 |   94.65 |    6.60 |    1.47 |    0.11 | dc8dda60 |
| V100 | AVX2 CUDA | large-v3-turbo      |   8 |   1 |   76.46 |    1.29 |    0.37 |    0.02 | dc8dda60 |
| V100 | AVX2 CUDA | large-v3-turbo-q5_0 |   8 |   1 |   79.62 |    1.03 |    0.23 |    0.02 | dc8dda60 |
