From 6450e8abbf609089f679c4b9b95d6de51a5cdfbc Mon Sep 17 00:00:00 2001
From: Stephen Gutekanst <stephen@hexops.com>
Date: Sun, 29 Dec 2024 15:15:56 -0700
Subject: [PATCH] Audio: rewrite sample mixing to use SIMD properly

Signed-off-by: Stephen Gutekanst <stephen@hexops.com>
---
 build.zig.zon                   |   4 +-
 examples/hardware-check/App.zig |   4 +-
 examples/piano/App.zig          |   4 +-
 src/Audio.zig                   | 270 ++++++++++++++++++++++++++------
 src/main.zig                    |   4 +-
 5 files changed, 234 insertions(+), 52 deletions(-)

diff --git a/build.zig.zon b/build.zig.zon
index 7c9f69cc..9a6bf020 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -70,8 +70,8 @@
             .lazy = true,
         },
         .mach_opus = .{
-            .url = "https://pkg.machengine.org/mach-opus/278dc3f47f6924fa92fa9b6e20c143407c9889f7.tar.gz",
-            .hash = "12201343abe6c023540da4c1cf0a419439a9d4b18dd29ad10cf46befee8868a2e85e",
+            .url = "https://pkg.machengine.org/mach-opus/04b01d6bea29f7280a6a21cf306e2225b9e706b3.tar.gz",
+            .hash = "12200bd51cc3fc645a71e962cc71b055e9f33d72145d0d69a1840ca8a374d332749c",
             .lazy = true,
         },
         .mach_example_assets = .{
diff --git a/examples/hardware-check/App.zig b/examples/hardware-check/App.zig
index 83afa2a8..293ff105 100644
--- a/examples/hardware-check/App.zig
+++ b/examples/hardware-check/App.zig
@@ -275,7 +275,7 @@ pub fn tick(
             sprite.objects.delete(sprite_id);
 
             // Play a new sound
-            const samples = try app.allocator.dupe(f32, app.sfx.samples);
+            const samples = try app.allocator.alignedAlloc(f32, mach.Audio.alignment, app.sfx.samples.len);
             @memcpy(samples, app.sfx.samples);
             audio.buffers.lock();
             defer audio.buffers.unlock();
@@ -283,7 +283,7 @@ pub fn tick(
                 .samples = samples,
                 .channels = app.sfx.channels,
             });
-            _ = sound_id; // autofix
+            _ = sound_id;
             app.score += 1;
         } else {
             var transform = Mat4x4.ident;
diff --git a/examples/piano/App.zig b/examples/piano/App.zig
index 29abe586..214853d1 100644
--- a/examples/piano/App.zig
+++ b/examples/piano/App.zig
@@ -203,13 +203,13 @@ pub fn tick(
     window.queue.submit(&[_]*gpu.CommandBuffer{command});
 }
 
-fn fillTone(app: *App, audio: *mach.Audio, frequency: f32) ![]const f32 {
+fn fillTone(app: *App, audio: *mach.Audio, frequency: f32) ![]align(mach.Audio.alignment) const f32 {
     const channels = audio.player.channels().len;
     const sample_rate: f32 = @floatFromInt(audio.player.sampleRate());
     const duration: f32 = 1.5 * @as(f32, @floatFromInt(channels)) * sample_rate; // play the tone for 1.5s
     const gain = 0.1;
 
-    const samples = try app.allocator.alloc(f32, @intFromFloat(duration));
+    const samples = try app.allocator.alignedAlloc(f32, mach.Audio.alignment, @intFromFloat(duration));
 
     var i: usize = 0;
     while (i < samples.len) : (i += channels) {
diff --git a/src/Audio.zig b/src/Audio.zig
index 07c29668..3e8f695e 100644
--- a/src/Audio.zig
+++ b/src/Audio.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const mach = @import("main.zig");
 const sysaudio = mach.sysaudio;
+const testing = mach.testing;
 
 pub const Opus = @import("mach-opus");
 
@@ -15,9 +16,7 @@ pub const mach_systems = .{ .init, .tick, .deinit };
 /// aligned to simd_vector_length * @sizeOf(f32).
 pub const simd_vector_length = std.simd.suggestVectorLength(f32) orelse 1;
 
-/// The number of f32s which should be reserved for padding at the start of an []f32 buffer, assuming
-/// it is @alignOf(f32) / 4-byte aligned, in order to achieve @Vector(simd_vector_length, f32) alignment.
-pub const simd_vector_f32_buffer_padding = (simd_vector_length - (4 % simd_vector_length)) % simd_vector_length;
+pub const alignment = simd_vector_length * @sizeOf(f32);
 
 const log = std.log.scoped(mach_module);
 
@@ -33,7 +32,7 @@ buffers: mach.Objects(
     .{},
     struct {
         /// The actual audio samples
-        samples: []const f32 align(simd_vector_length),
+        samples: []align(alignment) const f32,
 
         /// The number of channels in the samples buffer
         channels: u8,
@@ -62,7 +61,7 @@ player: sysaudio.Player,
 allocator: std.mem.Allocator,
 ctx: sysaudio.Context,
 output: SampleBuffer,
-mixing_buffer: ?std.ArrayListUnmanaged(f32) = null,
+mixing_buffer: ?std.ArrayListAlignedUnmanaged(f32, alignment) = null,
 shutdown: std.atomic.Value(bool) = .init(false),
 mod: mach.Mod(Audio),
 driver_needs_num_samples: usize = 0,
@@ -146,11 +145,11 @@ pub fn tick(audio: *Audio, audio_mod: mach.Mod(Audio)) !void {
     // Ensure our f32 mixing buffer has enough space for the samples we will render right now.
     // This will allocate to grow but never shrink.
     var mixing_buffer = if (audio.mixing_buffer) |*b| b else blk: {
-        const b = try std.ArrayListUnmanaged(f32).initCapacity(allocator, simd_vector_f32_buffer_padding + render_num_samples);
+        const b = try std.ArrayListAlignedUnmanaged(f32, alignment).initCapacity(allocator, render_num_samples);
         audio.mixing_buffer = b;
         break :blk &audio.mixing_buffer.?;
     };
-    try mixing_buffer.resize(allocator, simd_vector_f32_buffer_padding + render_num_samples); // grows, but never shrinks
+    try mixing_buffer.resize(allocator, render_num_samples); // grows, but never shrinks
 
     // Zero the mixing buffer to silence: if no audio is mixed in below, then we want silence
     // not undefined memory noise.
@@ -167,26 +166,20 @@ pub fn tick(audio: *Audio, audio_mod: mach.Mod(Audio)) !void {
             if (!buffer.playing) continue;
             defer audio.buffers.setValue(buf_id, buffer);
 
-            const channels_diff = player_channels - buffer.channels + 1;
-            const mixing_buffer_len = mixing_buffer.items.len - simd_vector_f32_buffer_padding;
-            const to_read = (@min(buffer.samples.len - buffer.index, mixing_buffer_len - simd_vector_f32_buffer_padding) / channels_diff) + @rem(@min(buffer.samples.len - buffer.index, mixing_buffer_len), channels_diff);
-            if (buffer.channels == 1 and player_channels > 1) {
-                // Duplicate samples for mono sounds
-                var i: usize = simd_vector_f32_buffer_padding;
-                for (buffer.samples[buffer.index..][0..to_read]) |sample| {
-                    mixSamplesDuplicate(mixing_buffer.items[i..][0..player_channels], sample * buffer.volume);
-                    i += player_channels;
-                }
-            } else {
-                mixSamples(mixing_buffer.items[simd_vector_f32_buffer_padding..to_read], buffer.samples[buffer.index..][0..to_read], buffer.volume);
-            }
-
-            if (buffer.index + to_read >= buffer.samples.len) {
+            const new_index = mixSamples(
+                mixing_buffer.items,
+                player_channels,
+                buffer.samples,
+                buffer.index,
+                buffer.channels,
+                buffer.volume,
+            );
+            if (new_index >= buffer.samples.len) {
                 // No longer playing, we've read all samples
                 did_state_change = true;
                 buffer.playing = false;
                 buffer.index = 0;
-            } else buffer.index = buffer.index + to_read;
+            } else buffer.index = new_index;
         }
     }
     if (did_state_change) if (audio.on_state_change) |f| audio_mod.run(f);
@@ -195,10 +188,10 @@ pub fn tick(audio: *Audio, audio_mod: mach.Mod(Audio)) !void {
     // samples to the format the driver expects.
     const out_buffer_len = render_num_samples * player.format().size();
     const out_buffer = try audio.output.writableWithSize(out_buffer_len); // TODO(audio): handle potential OOM here better
-    std.debug.assert((mixing_buffer.items.len - simd_vector_f32_buffer_padding) == render_num_samples);
+    std.debug.assert(mixing_buffer.items.len == render_num_samples);
     sysaudio.convertTo(
         f32,
-        mixing_buffer.items[simd_vector_f32_buffer_padding..],
+        mixing_buffer.items[0..],
         player.format(),
         out_buffer[0..out_buffer_len], // writableWithSize may return a larger slice than needed
     );
@@ -264,28 +257,217 @@ fn writeFn(audio_opaque: ?*anyopaque, output: []u8) void {
     }
 }
 
+/// Mixes audio samples using SIMD. Returns the src_index progressed by the number of samples
+/// consumed.
 inline fn mixSamples(
-    a: []align(simd_vector_length) f32,
-    b: []align(simd_vector_length) const f32,
-    volume: f32,
-) void {
-    std.debug.assert(a.len >= b.len);
+    /// The destination where audio buffers should be mixed into. This buffer will be populated with
+    /// as many samples from src as possible, until either dst is full or src has no more available.
+    dst: []align(alignment) f32,
+    /// The number of channels in the dst buffer.
+    dst_channels: u8,
+    /// The audio buffer whose samples src[src_index..] should be mixed into the dst
+    src: []align(alignment) const f32,
+    src_index: usize,
+    /// The number of channels in the src buffer
+    src_channels: u8,
+    /// The volume/gain that should be applied to samples in src before mixing them into dst.
+    src_volume: f32,
+) usize {
+    const dst_frames = dst.len / dst_channels;
+    const src_frames = (src.len - src_index) / src_channels;
+    const frames_to_process = @min(dst_frames, src_frames);
+    const samples_to_process = frames_to_process * src_channels;
+
+    if (samples_to_process == 0) return src_index;
+
     const Vec = @Vector(simd_vector_length, f32);
-    const vec_blocks_len = b.len - (b.len % simd_vector_length);
-    var i: usize = 0;
-    while (i < vec_blocks_len) : (i += simd_vector_length) {
-        const b_vec: Vec = b[i..][0..simd_vector_length].*;
-        const a_vec: *Vec = @ptrCast(@alignCast(a[i..][0..simd_vector_length]));
-        a_vec.* += b_vec * @as(Vec, @splat(volume));
+    const volume_vec: Vec = @splat(src_volume);
+
+    var current_index = src_index;
+
+    // Handle unaligned start if necessary, since src[src_index..] may not be SIMD aligned - so
+    // we handle the starting portion with scalars instead.
+    const src_ptr: [*]align(alignment) const f32 = @ptrCast(src.ptr);
+    const misalignment = (@intFromPtr(src_ptr + current_index) % alignment) / @sizeOf(f32);
+    if (misalignment != 0) {
+        const scalar_count = alignment / @sizeOf(f32) - misalignment;
+        const end_index = @min(current_index + scalar_count, src_index + samples_to_process);
+
+        while (current_index < end_index) : (current_index += 1) {
+            const src_sample = src[current_index] * src_volume;
+            const frame_index = (current_index - src_index) / src_channels;
+            const dst_index = frame_index * dst_channels;
+
+            var channel: u8 = 0;
+            while (channel < dst_channels) : (channel += 1) {
+                const src_channel = if (channel < src_channels) channel else channel % src_channels;
+                if (src_channel == (current_index - src_index) % src_channels) {
+                    dst[dst_index + channel] += src_sample;
+                }
+            }
+        }
     }
+
+    // SIMD processing for aligned portion
+    const remaining_samples = samples_to_process - (current_index - src_index);
+    const vec_samples = remaining_samples / simd_vector_length;
+    const vec_count = vec_samples * simd_vector_length;
+    var vec_index: usize = 0;
+    while (vec_index < vec_count) : (vec_index += simd_vector_length) {
+        const src_offset = current_index + vec_index;
+        const src_vec: Vec = src[src_offset..][0..simd_vector_length].*;
+        const scaled_vec = src_vec * volume_vec;
+
+        const frame_index = (src_offset - src_index) / src_channels;
+        var dst_base = frame_index * dst_channels;
+        var i: usize = 0;
+        while (i < simd_vector_length) : (i += 1) {
+            const sample = scaled_vec[i];
+            const src_channel = (src_offset - src_index + i) % src_channels;
+            var channel: u8 = 0;
+
+            while (channel < dst_channels) : (channel += 1) {
+                const dst_channel = if (channel < src_channels) channel else channel % src_channels;
+                if (dst_channel == src_channel) dst[dst_base + channel] += sample;
+            }
+            if (src_channel == src_channels - 1) dst_base += dst_channels;
+        }
+    }
+    current_index += vec_count;
+
+    // Handle remaining samples, similar to how we may need to handle an unaligned start we also
+    // need to handle an unaligned end - if dst wants more samples but not a full SIMD vector worth
+    // at the end.
+    while (current_index < src_index + samples_to_process) : (current_index += 1) {
+        const src_sample = src[current_index] * src_volume;
+        const frame_index = (current_index - src_index) / src_channels;
+        const dst_index = frame_index * dst_channels;
+
+        var channel: u8 = 0;
+        while (channel < dst_channels) : (channel += 1) {
+            const src_channel = if (channel < src_channels) channel else channel % src_channels;
+            if (src_channel == (current_index - src_index) % src_channels) {
+                dst[dst_index + channel] += src_sample;
+            }
+        }
+    }
+
+    return current_index;
 }
 
-inline fn mixSamplesDuplicate(a: []align(simd_vector_length) f32, b: f32) void {
-    const Vec = @Vector(simd_vector_length, f32);
-    const vec_blocks_len = a.len - (a.len % simd_vector_length);
-    var i: usize = 0;
-    while (i < vec_blocks_len) : (i += simd_vector_length) {
-        const a_vec: *Vec = @ptrCast(@alignCast(a[i..][0..simd_vector_length]));
-        a_vec.* += @as(Vec, @splat(b));
-    }
+test "mixSamples - basic mono to mono mixing" {
+    var dst_buffer align(alignment) = [_]f32{0} ** 16;
+    const src_buffer align(alignment) = [_]f32{ 1.0, 2.0, 3.0, 4.0 } ** 4;
+
+    const new_index = mixSamples(
+        &dst_buffer,
+        1, // dst_channels
+        &src_buffer,
+        0, // src_index
+        1, // src_channels
+        0.5, // src_volume
+    );
+
+    try testing.expect(usize, 16).eql(new_index);
+    try testing.expect(f32, 0.5).eql(dst_buffer[0]);
+    try testing.expect(f32, 1.0).eql(dst_buffer[1]);
+    try testing.expect(f32, 1.5).eql(dst_buffer[2]);
+    try testing.expect(f32, 2.0).eql(dst_buffer[3]);
+}
+
+test "mixSamples - stereo to stereo mixing" {
+    var dst_buffer align(alignment) = [_]f32{0} ** 16;
+    const src_buffer align(alignment) = [_]f32{ 1.0, -1.0, 2.0, -2.0, 3.0, -3.0, 4.0, -4.0 } ** 2;
+
+    const new_index = mixSamples(
+        &dst_buffer,
+        2, // dst_channels
+        &src_buffer,
+        0, // src_index
+        2, // src_channels
+        1.0, // src_volume
+    );
+
+    try testing.expect(usize, 16).eql(new_index);
+    try testing.expect(f32, 1.0).eql(dst_buffer[0]); // Left
+    try testing.expect(f32, -1.0).eql(dst_buffer[1]); // Right
+    try testing.expect(f32, 2.0).eql(dst_buffer[2]); // Left
+    try testing.expect(f32, -2.0).eql(dst_buffer[3]); // Right
+}
+
+test "mixSamples - mono to stereo mixing (channel duplication)" {
+    var dst_buffer align(alignment) = [_]f32{0} ** 16;
+    const src_buffer align(alignment) = [_]f32{ 1.0, 2.0, 3.0, 4.0 } ** 2;
+
+    const new_index = mixSamples(
+        &dst_buffer,
+        2, // dst_channels
+        &src_buffer,
+        0, // src_index
+        1, // src_channels
+        1.0, // src_volume
+    );
+
+    try testing.expect(usize, 8).eql(new_index);
+    try testing.expect(f32, 1.0).eql(dst_buffer[0]); // Left
+    try testing.expect(f32, 1.0).eql(dst_buffer[1]); // Right
+    try testing.expect(f32, 2.0).eql(dst_buffer[2]); // Left
+    try testing.expect(f32, 2.0).eql(dst_buffer[3]); // Right
+}
+
+test "mixSamples - partial buffer processing" {
+    var dst_buffer align(alignment) = [_]f32{0} ** 8;
+    const src_buffer align(alignment) = [_]f32{ 1.0, 2.0, 3.0, 4.0 } ** 4;
+
+    const new_index = mixSamples(
+        &dst_buffer,
+        1, // dst_channels
+        &src_buffer,
+        4, // src_index
+        1, // src_channels
+        1.0, // src_volume
+    );
+
+    try testing.expect(usize, 12).eql(new_index);
+    try testing.expect(f32, 1.0).eql(dst_buffer[0]);
+    try testing.expect(f32, 2.0).eql(dst_buffer[1]);
+    try testing.expect(f32, 3.0).eql(dst_buffer[2]);
+}
+
+test "mixSamples - mixing with volume adjustment" {
+    var dst_buffer align(alignment) = [_]f32{0} ** 8;
+    const src_buffer align(alignment) = [_]f32{ 1.0, 2.0, 3.0, 4.0 } ** 2;
+
+    const new_index = mixSamples(
+        &dst_buffer,
+        1, // dst_channels
+        &src_buffer,
+        0, // src_index
+        1, // src_channels
+        0.5, // src_volume
+    );
+
+    try testing.expect(usize, 8).eql(new_index);
+    try testing.expect(f32, 0.5).eql(dst_buffer[0]);
+    try testing.expect(f32, 1.0).eql(dst_buffer[1]);
+    try testing.expect(f32, 1.5).eql(dst_buffer[2]);
+}
+
+test "mixSamples - accumulation test" {
+    var dst_buffer align(alignment) = [_]f32{1.0} ** 8;
+    const src_buffer align(alignment) = [_]f32{ 1.0, 2.0, 3.0, 4.0 } ** 2;
+
+    const new_index = mixSamples(
+        &dst_buffer,
+        1, // dst_channels
+        &src_buffer,
+        0, // src_index
+        1, // src_channels
+        1.0, // src_volume
+    );
+
+    try testing.expect(usize, 8).eql(new_index);
+    try testing.expect(f32, 2.0).eql(dst_buffer[0]);
+    try testing.expect(f32, 3.0).eql(dst_buffer[1]);
+    try testing.expect(f32, 4.0).eql(dst_buffer[2]);
 }
diff --git a/src/main.zig b/src/main.zig
index 13de49f5..0124e81c 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -42,8 +42,8 @@ test {
     _ = gpu;
     _ = sysaudio;
     _ = sysgpu;
-    // TODO(object)
-    // _ = gfx;
+    _ = gfx;
+    _ = Audio;
     _ = Audio;
     _ = math;
     _ = testing;