Driver/receive: Implement audio reorder/jitter buffer (#156)

This PR introduces a new `VoiceTick` event which collects and reorders all RTP packets to smooth over network instability, as well as to synchronise user audio streams. Raw packet events have been moved to `RtpPacket`, while `SpeakingUpdate`s have been removed as they can be easily computed using the `silent`/`speaking` audio maps included in each event. Closes #146.
2023-01-09 00:22:30 +00:00
parent ab18f9e092
commit c60c454cf5
19 changed files with 923 additions and 611 deletions
--- a/src/events/context/data/mod.rs
+++ b/src/events/context/data/mod.rs
@@ -6,13 +6,13 @@ mod disconnect;
 #[cfg(feature = "receive")]
 mod rtcp;
 #[cfg(feature = "receive")]
-mod speaking;
+mod rtp;
 #[cfg(feature = "receive")]
 mod voice;

 #[cfg(feature = "receive")]
-use discortp::{rtcp::Rtcp, rtp::Rtp};
+use bytes::Bytes;

 pub use self::{connect::*, disconnect::*};
 #[cfg(feature = "receive")]
-pub use self::{rtcp::*, speaking::*, voice::*};
+pub use self::{rtcp::*, rtp::*, voice::*};
--- a/src/events/context/data/rtcp.rs
+++ b/src/events/context/data/rtcp.rs
@@ -1,3 +1,5 @@
+use discortp::rtcp::RtcpPacket;
+
 use super::*;

 #[derive(Clone, Debug, Eq, PartialEq)]
@@ -5,11 +7,22 @@ use super::*;
 /// Telemetry/statistics packet, received from another stream (detailed in `packet`).
 /// `payload_offset` contains the true payload location within the raw packet's `payload()`,
 /// to allow manual decoding of `Rtcp` packet bodies.
-pub struct RtcpData<'a> {
+pub struct RtcpData {
    /// Raw RTCP packet data.
-    pub packet: &'a Rtcp,
+    pub packet: Bytes,
    /// Byte index into the packet body (after headers) for where the payload begins.
    pub payload_offset: usize,
    /// Number of bytes at the end of the packet to discard.
    pub payload_end_pad: usize,
 }
+
+impl RtcpData {
+    /// Create a zero-copy view of the inner RTCP packet.
+    ///
+    /// This allows easy access to packet header fields, taking them from the underlying
+    /// `Bytes` as needed while handling endianness etc. Panics if `RtcpPacket::new` rejects `packet`.
+    pub fn rtcp(&'_ self) -> RtcpPacket<'_> {
+        RtcpPacket::new(&self.packet)
+            .expect("FATAL: leaked illegally small RTCP packet from UDP Rx task.")
+    }
+}
--- a/src/events/context/data/rtp.rs
+++ b/src/events/context/data/rtp.rs
@@ -0,0 +1,30 @@
+use discortp::rtp::RtpPacket;
+
+use super::*;
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+#[non_exhaustive]
+/// Opus audio packet, received from another stream (detailed in `packet`).
+/// `payload_offset` contains the true payload location within the raw packet's `payload()`,
+/// if extensions or raw packet data are required.
+pub struct RtpData {
+    /// Raw RTP packet data.
+    ///
+    /// Includes the SSRC (i.e., sender) of this packet.
+    pub packet: Bytes,
+    /// Byte index into the packet body (after headers) for where the payload begins.
+    pub payload_offset: usize,
+    /// Number of bytes at the end of the packet to discard.
+    pub payload_end_pad: usize,
+}
+
+impl RtpData {
+    /// Create a zero-copy view of the inner RTP packet.
+    ///
+    /// This allows easy access to packet header fields, taking them from the underlying
+    /// `Bytes` as needed while handling endianness etc.
+    pub fn rtp(&'_ self) -> RtpPacket<'_> {
+        RtpPacket::new(&self.packet)
+            .expect("FATAL: leaked illegally small RTP packet from UDP Rx task.")
+    }
+}
--- a/src/events/context/data/speaking.rs
+++ b/src/events/context/data/speaking.rs
@@ -1,14 +0,0 @@
-#[derive(Clone, Debug, Eq, Hash, PartialEq)]
-#[non_exhaustive]
-/// Speaking state transition, describing whether a given source has started/stopped
-/// transmitting. This fires in response to a silent burst, or the first packet
-/// breaking such a burst.
-pub struct SpeakingUpdateData {
-    /// Whether this user is currently speaking.
-    pub speaking: bool,
-    /// Synchronisation Source of the user who has begun speaking.
-    ///
-    /// This must be combined with another event class to map this back to
-    /// its original UserId.
-    pub ssrc: u32,
-}
--- a/src/events/context/data/voice.rs
+++ b/src/events/context/data/voice.rs
@@ -1,28 +1,38 @@
+use std::collections::{HashMap, HashSet};
+
 use super::*;

 #[derive(Clone, Debug, Eq, PartialEq)]
 #[non_exhaustive]
-/// Opus audio packet, received from another stream (detailed in `packet`).
-/// `payload_offset` contains the true payload location within the raw packet's `payload()`,
-/// if extensions or raw packet data are required.
+/// Audio data from all users in a voice channel, fired every 20ms.
 ///
-/// Valid audio data (`Some(audio)` where `audio.len >= 0`) contains up to 20ms of 16-bit stereo PCM audio
-/// at 48kHz, using native endianness. Songbird will not send audio for silent regions, these should
-/// be inferred using [`SpeakingUpdate`]s (and filled in by the user if required using arrays of zeroes).
+/// Songbird implements a jitter buffer to synchronise user packets, smooth out network latency, and
+/// handle packet reordering by the network. Packet playout via this event is delayed by approximately
+/// [`Config::playout_buffer_length`]` * 20ms` from its original arrival.
 ///
-/// If `audio.len() == 0`, then this packet arrived out-of-order. If `None`, songbird was not configured
-/// to decode received packets.
-///
-/// [`SpeakingUpdate`]: crate::events::CoreEvent::SpeakingUpdate
-pub struct VoiceData<'a> {
-    /// Decoded audio from this packet.
-    pub audio: &'a Option<Vec<i16>>,
-    /// Raw RTP packet data.
-    ///
-    /// Includes the SSRC (i.e., sender) of this packet.
-    pub packet: &'a Rtp,
-    /// Byte index into the packet body (after headers) for where the payload begins.
-    pub payload_offset: usize,
-    /// Number of bytes at the end of the packet to discard.
-    pub payload_end_pad: usize,
+/// [`Config::playout_buffer_length`]: crate::Config::playout_buffer_length
+pub struct VoiceTick {
+    /// Decoded voice data and source packets sent by each user.
+    pub speaking: HashMap<u32, VoiceData>,
+
+    /// Set of all SSRCs currently known in the call who aren't included in [`Self::speaking`].
+    pub silent: HashSet<u32>,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+#[non_exhaustive]
+/// Voice packet and audio data for a single user, from a single tick.
+pub struct VoiceData {
+    /// RTP packet clocked out for this tick.
+    ///
+    /// If `None`, then the packet was lost, and [`Self::decoded_voice`] may include
+    /// around one codec delay's worth of audio.
+    pub packet: Option<RtpData>,
+    /// PCM audio obtained from a user.
+    ///
+    /// Valid audio data (`Some(audio)` where `audio.len >= 0`) typically contains 20ms of 16-bit stereo PCM audio
+    /// at 48kHz, using native endianness. Channels are interleaved (i.e., `L, R, L, R, ...`).
+    ///
+    /// This value will be `None` if Songbird is not configured to decode audio.
+    pub decoded_voice: Option<Vec<i16>>,
 }
--- a/src/events/context/internal_data.rs
+++ b/src/events/context/internal_data.rs
@@ -41,53 +41,36 @@ impl<'a> From<&'a InternalDisconnect> for DisconnectData<'a> {
 #[cfg(feature = "receive")]
 mod receive {
    use super::*;
-    use discortp::{rtcp::Rtcp, rtp::Rtp};
-
-    #[derive(Clone, Debug, Eq, Hash, PartialEq)]
-    pub struct InternalSpeakingUpdate {
-        pub ssrc: u32,
-        pub speaking: bool,
-    }
+    use bytes::Bytes;

    #[derive(Clone, Debug, Eq, PartialEq)]
-    pub struct InternalVoicePacket {
-        pub audio: Option<Vec<i16>>,
-        pub packet: Rtp,
+    pub struct InternalRtpPacket {
+        pub packet: Bytes,
        pub payload_offset: usize,
        pub payload_end_pad: usize,
    }

    #[derive(Clone, Debug, Eq, PartialEq)]
    pub struct InternalRtcpPacket {
-        pub packet: Rtcp,
+        pub packet: Bytes,
        pub payload_offset: usize,
        pub payload_end_pad: usize,
    }

-    impl<'a> From<&'a InternalSpeakingUpdate> for SpeakingUpdateData {
-        fn from(val: &'a InternalSpeakingUpdate) -> Self {
+    impl<'a> From<&'a InternalRtpPacket> for RtpData {
+        fn from(val: &'a InternalRtpPacket) -> Self {
            Self {
-                speaking: val.speaking,
-                ssrc: val.ssrc,
-            }
-        }
-    }
-
-    impl<'a> From<&'a InternalVoicePacket> for VoiceData<'a> {
-        fn from(val: &'a InternalVoicePacket) -> Self {
-            Self {
-                audio: &val.audio,
-                packet: &val.packet,
+                packet: val.packet.clone(),
                payload_offset: val.payload_offset,
                payload_end_pad: val.payload_end_pad,
            }
        }
    }

-    impl<'a> From<&'a InternalRtcpPacket> for RtcpData<'a> {
+    impl<'a> From<&'a InternalRtcpPacket> for RtcpData {
        fn from(val: &'a InternalRtcpPacket) -> Self {
            Self {
-                packet: &val.packet,
+                packet: val.packet.clone(),
                payload_offset: val.payload_offset,
                payload_end_pad: val.payload_end_pad,
            }
--- a/src/events/context/mod.rs
+++ b/src/events/context/mod.rs
@@ -33,18 +33,16 @@ pub enum EventContext<'a> {
    SpeakingStateUpdate(Speaking),

    #[cfg(feature = "receive")]
-    /// Speaking state transition, describing whether a given source has started/stopped
-    /// transmitting. This fires in response to a silent burst, or the first packet
-    /// breaking such a burst.
-    SpeakingUpdate(SpeakingUpdateData),
+    /// Reordered and decoded audio packets, received every 20ms.
+    VoiceTick(VoiceTick),

    #[cfg(feature = "receive")]
    /// Opus audio packet, received from another stream.
-    VoicePacket(VoiceData<'a>),
+    RtpPacket(RtpData),

    #[cfg(feature = "receive")]
    /// Telemetry/statistics packet, received from another stream.
-    RtcpPacket(RtcpData<'a>),
+    RtcpPacket(RtcpData),

    /// Fired whenever a client disconnects.
    ClientDisconnect(ClientDisconnect),
@@ -63,9 +61,9 @@ pub enum EventContext<'a> {
 pub enum CoreContext {
    SpeakingStateUpdate(Speaking),
    #[cfg(feature = "receive")]
-    SpeakingUpdate(InternalSpeakingUpdate),
+    VoiceTick(VoiceTick),
    #[cfg(feature = "receive")]
-    VoicePacket(InternalVoicePacket),
+    RtpPacket(InternalRtpPacket),
    #[cfg(feature = "receive")]
    RtcpPacket(InternalRtcpPacket),
    ClientDisconnect(ClientDisconnect),
@@ -79,10 +77,9 @@ impl<'a> CoreContext {
        match self {
            Self::SpeakingStateUpdate(evt) => EventContext::SpeakingStateUpdate(*evt),
            #[cfg(feature = "receive")]
-            Self::SpeakingUpdate(evt) =>
-                EventContext::SpeakingUpdate(SpeakingUpdateData::from(evt)),
+            Self::VoiceTick(evt) => EventContext::VoiceTick(evt.clone()),
            #[cfg(feature = "receive")]
-            Self::VoicePacket(evt) => EventContext::VoicePacket(VoiceData::from(evt)),
+            Self::RtpPacket(evt) => EventContext::RtpPacket(RtpData::from(evt)),
            #[cfg(feature = "receive")]
            Self::RtcpPacket(evt) => EventContext::RtcpPacket(RtcpData::from(evt)),
            Self::ClientDisconnect(evt) => EventContext::ClientDisconnect(*evt),
@@ -102,9 +99,9 @@ impl EventContext<'_> {
        match self {
            Self::SpeakingStateUpdate(_) => Some(CoreEvent::SpeakingStateUpdate),
            #[cfg(feature = "receive")]
-            Self::SpeakingUpdate(_) => Some(CoreEvent::SpeakingUpdate),
+            Self::VoiceTick(_) => Some(CoreEvent::VoiceTick),
            #[cfg(feature = "receive")]
-            Self::VoicePacket(_) => Some(CoreEvent::VoicePacket),
+            Self::RtpPacket(_) => Some(CoreEvent::RtpPacket),
            #[cfg(feature = "receive")]
            Self::RtcpPacket(_) => Some(CoreEvent::RtcpPacket),
            Self::ClientDisconnect(_) => Some(CoreEvent::ClientDisconnect),
--- a/src/events/core.rs
+++ b/src/events/core.rs
@@ -9,14 +9,11 @@
 /// when a client leaves the session ([`ClientDisconnect`]).
 ///
 /// When the `"receive"` feature is enabled, songbird can also handle voice packets
-#[cfg_attr(feature = "receive", doc = "([`VoicePacket`](Self::VoicePacket)),")]
-#[cfg_attr(not(feature = "receive"), doc = "(`VoicePacket`),")]
-/// detect speech starting/stopping
-#[cfg_attr(
-    feature = "receive",
-    doc = "([`SpeakingUpdate`](Self::SpeakingUpdate)),"
-)]
-#[cfg_attr(not(feature = "receive"), doc = "(`SpeakingUpdate`),")]
+#[cfg_attr(feature = "receive", doc = "([`RtpPacket`](Self::RtpPacket)),")]
+#[cfg_attr(not(feature = "receive"), doc = "(`RtpPacket`),")]
+/// decode and track speaking users
+#[cfg_attr(feature = "receive", doc = "([`VoiceTick`](Self::VoiceTick)),")]
+#[cfg_attr(not(feature = "receive"), doc = "(`VoiceTick`),")]
 /// and handle telemetry data
 #[cfg_attr(feature = "receive", doc = "([`RtcpPacket`](Self::RtcpPacket)).")]
 #[cfg_attr(not(feature = "receive"), doc = "(`RtcpPacket`).")]
@@ -49,9 +46,9 @@ pub enum CoreEvent {
    SpeakingStateUpdate,

    #[cfg(feature = "receive")]
-    /// Fires when a source starts speaking, or stops speaking
-    /// (*i.e.*, 5 consecutive silent frames).
-    SpeakingUpdate,
+    /// Fires every 20ms, containing the scheduled voice packet and decoded audio
+    /// data for each live user.
+    VoiceTick,

    #[cfg(feature = "receive")]
    /// Fires on receipt of a voice packet from another stream in the voice call.
@@ -59,7 +56,7 @@ pub enum CoreEvent {
    /// As RTP packets do not map to Discord's notion of users, SSRCs must be mapped
    /// back using the user IDs seen through client connection, disconnection,
    /// or speaking state update.
-    VoicePacket,
+    RtpPacket,

    #[cfg(feature = "receive")]
    /// Fires on receipt of an RTCP packet, containing various call stats