MATLAB 的 conv/conv2/convn 本身并不用 FFT；它们实现的是“滑动求和”的直接卷积（time-domain）并在底层用高度优化的 C 循环与并行机制来提速。加速主要来自：多线程/向量化的内部实现、对 GPU/tall/distributed 数组的原生支持；当核或信号很大时，应改用频域卷积（FFT、overlap-add/save）或能自动切换算法的函数/工具。

MATLAB的内置算法为何更“快”

直接法 + 内部并行：conv 等函数按定义做逐点乘加（O(M·N)），但实现是编译代码并带隐式多线程优化；对中小尺寸核，这通常比 FFT 更快。另外有第三方测评指出 MATLAB 内部对 conv* 采用滑窗并配合隐式多线程（非官方，但与实测一致）。

直接法 + 内部并行：conv 等函数按定义做逐点乘加（O(M·N)），但实现是编译代码并带隐式多线程优化；对中小尺寸核，这通常比 FFT 更快。另外有第三方测评指出 MATLAB 内部对 conv* 采用滑窗并配合隐式多线程（非官方，但与实测一致）。

GPU / Tall / Distributed：把输入放到 gpuArray 上可让 conv 在 GPU 上执行；也支持 thread-based、tall 和分布式数组，用更多核/更多内存堆栈来提速或放大规模。

如何测出 conv（直接法） vs. FFT 卷积（含 overlap-add）的分界点

function bench_conv_vs_fft()
% BENCH_CONV_VS_FFT
% 在你的硬件/版本上基准比较：
% 1) conv（直接卷积，CPU/GPU）
% 2) 一次性 FFT 卷积（CPU/GPU）
% 3) Overlap-Add FFT 卷积（CPU/GPU）
%
% 结果：打印出“FFT 比 conv 更快”的近似分界核大小，并画出时间对比图。
% 你可以修改“用户参数”以匹配你的真实数据规模。

%% ==== 用户参数 ====
mode = "sweepKernel";     % "sweepKernel" 固定信号长度扫核大小；或 "sweepSignal" 固定核扫信号长度
N_fixed = 2^18;           % 固定信号长度（sweepKernel 模式下使用）
K_list  = [3 5 9 15 31 63 127 255 511 1023 2047];  % 扫描的核长度（奇数常见，可按需改）
K_fixed = 255;            % 固定核长度（sweepSignal 模式下使用）
N_list  = round(logspace(3,6,10)); % 扫描的信号长度

use_overlap_add = true;   % 是否包含 overlap-add（分块 FFT）方案
OA_block_target = 1<<14;  % 期望块长的量级（会自动取最近的合适 FFT 点）

rng(0);                   % 固定随机种子，保证可复现
nRepsSafety = 1;          % 大问题规模时，避免超长基准，timeit 自带多次测量

hasGPU = exist('gpuDeviceCount','file') && gpuDeviceCount>0;

%% ==== 生成尺寸序列 ====
if mode=="sweepKernel"
    N_vec = N_fixed*ones(size(K_list));
    K_vec = K_list(:);
    x_label = 'Kernel length K';
    x_ticks = K_list;
elseif mode=="sweepSignal"
    N_vec = N_list(:);
    K_vec = K_fixed*ones(size(N_list));
    x_label = 'Signal length N';
    x_ticks = N_list;
else
    error('mode 必须是 "sweepKernel" 或 "sweepSignal"');
end
nCases = numel(N_vec);

%% ==== 结果数组 ====
t_conv_cpu   = nan(nCases,1);
t_fft_cpu    = nan(nCases,1);
t_oa_cpu     = nan(nCases,1);
t_conv_gpu   = nan(nCases,1);
t_fft_gpu    = nan(nCases,1);
t_oa_gpu     = nan(nCases,1);

%% ==== 主循环 ====
for i = 1:nCases
    N = N_vec(i);
    K = K_vec(i);

    % 随机双精度数据（更接近实际）
    x = randn(N,1);
    h = randn(K,1);

    % -------- CPU：conv 直接法 --------
    t_conv_cpu(i) = timeit(@() conv(x,h,'full'), nRepsSafety);

    % -------- CPU：一次性 FFT 卷积 --------
    t_fft_cpu(i) = timeit(@() localFFTConv(x,h), nRepsSafety);

    % -------- CPU：Overlap-Add --------
    if use_overlap_add
        t_oa_cpu(i) = timeit(@() localOverlapAdd(x,h,OA_block_target), nRepsSafety);
    end

    % -------- GPU（若可用）--------
    if hasGPU
        xg = gpuArray(x); hg = gpuArray(h);

        t_conv_gpu(i) = gputimeit(@() conv(xg,hg,'full'));

        t_fft_gpu(i)  = gputimeit(@() localFFTConv(xg,hg));

        if use_overlap_add
            t_oa_gpu(i) = gputimeit(@() localOverlapAdd(xg,hg,OA_block_target));
        end

        % 清理 GPU 内存压力
        reset(gpuDevice);
    end

    % 正确性抽检（避免静默数值问题）
    % 仅在 CPU 上做一次检查，允许 FFT 有轻微浮点误差
    if i==1
        y_ref = conv(x,h,'full');
        y_f   = localFFTConv(x,h);
        err_f = norm(y_ref - y_f, inf) / max(1, norm(y_ref, inf));
        assert(err_f < 1e-9, '一次性 FFT 卷积结果误差过大：%g', err_f);

        if use_overlap_add
            y_oa  = localOverlapAdd(x,h,OA_block_target);
            err_oa = norm(y_ref - y_oa, inf) / max(1, norm(y_ref, inf));
            assert(err_oa < 1e-9, 'Overlap-Add 卷积结果误差过大：%g', err_oa);
        end
    end

    fprintf('Case %d/%d: N=%d, K=%d | conv=%.4gs, fft=%.4gs, oa=%.4gs%s\n', ...
        i, nCases, N, K, t_conv_cpu(i), t_fft_cpu(i), t_oa_cpu(i), ...
        tern(hasGPU, sprintf(' | GPU conv=%.4gs, fft=%.4gs, oa=%.4gs', t_conv_gpu(i), t_fft_gpu(i), t_oa_gpu(i)), ''));
end

%% ==== 统计并打印分界点（CPU/GPU） ====
fprintf('\n=== 分界点（以“快过 10%%”为准）===\n');
if mode=="sweepKernel"
    idx_cpu_fft = find(t_fft_cpu./t_conv_cpu < 0.9, 1, 'first');
    idx_cpu_oa  = find(t_oa_cpu./t_conv_cpu  < 0.9, 1, 'first');
    if ~isempty(idx_cpu_fft), fprintf('CPU：一次性 FFT 在 K ≈ %d 起更快\n', x_ticks(idx_cpu_fft)); end
    if use_overlap_add && ~isempty(idx_cpu_oa), fprintf('CPU：Overlap-Add 在 K ≈ %d 起更快\n', x_ticks(idx_cpu_oa)); end
    if hasGPU
        idx_gpu_fft = find(t_fft_gpu./t_conv_gpu < 0.9, 1, 'first');
        idx_gpu_oa  = find(t_oa_gpu./t_conv_gpu  < 0.9, 1, 'first');
        if ~isempty(idx_gpu_fft), fprintf('GPU：一次性 FFT 在 K ≈ %d 起更快\n', x_ticks(idx_gpu_fft)); end
        if use_overlap_add && ~isempty(idx_gpu_oa), fprintf('GPU：Overlap-Add 在 K ≈ %d 起更快\n', x_ticks(idx_gpu_oa)); end
    end
else
    idx_cpu_fft = find(t_fft_cpu./t_conv_cpu < 0.9, 1, 'first');
    idx_cpu_oa  = find(t_oa_cpu./t_conv_cpu  < 0.9, 1, 'first');
    if ~isempty(idx_cpu_fft), fprintf('CPU：一次性 FFT 在 N ≈ %d 起更快\n', x_ticks(idx_cpu_fft)); end
    if use_overlap_add && ~isempty(idx_cpu_oa), fprintf('CPU：Overlap-Add 在 N ≈ %d 起更快\n', x_ticks(idx_cpu_oa)); end
    if hasGPU
        idx_gpu_fft = find(t_fft_gpu./t_conv_gpu < 0.9, 1, 'first');
        idx_gpu_oa  = find(t_oa_gpu./t_conv_gpu  < 0.9, 1, 'first');
        if ~isempty(idx_gpu_fft), fprintf('GPU：一次性 FFT 在 N ≈ %d 起更快\n', x_ticks(idx_gpu_fft)); end
        if use_overlap_add && ~isempty(idx_gpu_oa), fprintf('GPU：Overlap-Add 在 N ≈ %d 起更快\n', x_ticks(idx_gpu_oa)); end
    end
end

%% ==== 画图（CPU / 可选 GPU） ====
figure('Name','conv vs FFT benchmark'); hold on; grid on;
if mode=="sweepKernel"
    xval = K_vec;
else
    xval = N_vec;
end

% CPU
plot(xval, t_conv_cpu, '-', 'DisplayName','CPU conv');
plot(xval, t_fft_cpu,  '-', 'DisplayName','CPU FFT');
if use_overlap_add
    plot(xval, t_oa_cpu,   '-', 'DisplayName','CPU FFT (overlap-add)');
end

% GPU
if hasGPU
    plot(xval, t_conv_gpu, '--', 'DisplayName','GPU conv');
    plot(xval, t_fft_gpu,  '--', 'DisplayName','GPU FFT');
    if use_overlap_add
        plot(xval, t_oa_gpu,   '--', 'DisplayName','GPU FFT (overlap-add)');
    end
end

set(gca,'XScale','log','YScale','log');
xlabel(x_label); ylabel('Time (s)');
legend('Location','best');
title(sprintf('N (fixed or sweep) / K (fixed or sweep): N=%s, K=%s', num2str(N_fixed), num2str(K_fixed)));

end % main function

%% ====== 辅助函数 ======

function y = localFFTConv(x,h)
% 一次性 FFT 卷积（full 长度）
n = numel(x) + numel(h) - 1;
nfft = 2^nextpow2(n);
y = ifft( fft(x, nfft) .* fft(h, nfft) );
y = y(1:n);
% 保持原类型（支持 gpuArray）
if ~isa(y, 'gpuArray')
    y = real(y); % CPU 上去除数值噪声的虚部
end
end

function y = localOverlapAdd(x,h,OA_block_target)
% Overlap-Add（分块 FFT），full 长度
Nx = numel(x); M = numel(h);
L  = max(1, OA_block_target);            % 期望块长
nfft = 2^nextpow2(L + M - 1);            % 实际 FFT 长度
L  = nfft - M + 1;                        % 真正每块放入的样本数
H  = fft(h, nfft);
y  = zeros(Nx + M - 1, 1, 'like', x);     % 和 x 同类型（支持 gpuArray）

pos = 1;
while pos <= Nx
    Lb = min(L, Nx - pos + 1);
    xb = x(pos : pos + Lb - 1);
    Yb = ifft( fft(xb, nfft) .* H );
    y(pos : pos + Lb + M - 2) = y(pos : pos + Lb + M - 2) + Yb(1 : Lb + M - 1);
    pos = pos + Lb;
end

if ~isa(y, 'gpuArray'), y = real(y); end
end

function s = tern(cond, a, b)
if cond, s = a; else, s = b; end
end

站内导航

MATLAB的内置conv算法

MATLAB的内置算法为何更“快”

如何测出 conv（直接法） vs. FFT 卷积（含 overlap-add）的分界点

MATLAB的内置算法为何更“快”

如何测出 conv（直接法） vs. FFT 卷积（含 overlap-add） 的分界点

如何测出 conv（直接法） vs. FFT 卷积（含 overlap-add）的分界点