Driver/receive: Implement audio reorder/jitter buffer (#156)
This PR introduces a new `VoiceTick` event which collects and reorders all RTP packets to smooth over network instability, as well as to synchronise user audio streams. Raw packet events have been moved to `RtpPacket`, while `SpeakingUpdate`s have been removed as they can be easily computed using the `silent`/`speaking` audio maps included in each event. Closes #146.
This commit is contained in:
@@ -6,13 +6,13 @@ mod disconnect;
|
||||
#[cfg(feature = "receive")]
|
||||
mod rtcp;
|
||||
#[cfg(feature = "receive")]
|
||||
mod speaking;
|
||||
mod rtp;
|
||||
#[cfg(feature = "receive")]
|
||||
mod voice;
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
use discortp::{rtcp::Rtcp, rtp::Rtp};
|
||||
use bytes::Bytes;
|
||||
|
||||
pub use self::{connect::*, disconnect::*};
|
||||
#[cfg(feature = "receive")]
|
||||
pub use self::{rtcp::*, speaking::*, voice::*};
|
||||
pub use self::{rtcp::*, rtp::*, voice::*};
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use discortp::rtcp::RtcpPacket;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
@@ -5,11 +7,22 @@ use super::*;
|
||||
/// Telemetry/statistics packet, received from another stream (detailed in `packet`).
|
||||
/// `payload_offset` contains the true payload location within the raw packet's `payload()`,
|
||||
/// to allow manual decoding of `Rtcp` packet bodies.
|
||||
pub struct RtcpData<'a> {
|
||||
pub struct RtcpData {
|
||||
/// Raw RTCP packet data.
|
||||
pub packet: &'a Rtcp,
|
||||
pub packet: Bytes,
|
||||
/// Byte index into the packet body (after headers) for where the payload begins.
|
||||
pub payload_offset: usize,
|
||||
/// Number of bytes at the end of the packet to discard.
|
||||
pub payload_end_pad: usize,
|
||||
}
|
||||
|
||||
impl RtcpData {
|
||||
/// Create a zero-copy view of the inner RTCP packet.
|
||||
///
|
||||
/// This allows easy access to packet header fields, taking them from the underlying
|
||||
/// `Bytes` as needed while handling endianness etc.
|
||||
pub fn rtcp(&'_ self) -> RtcpPacket<'_> {
|
||||
RtcpPacket::new(&self.packet)
|
||||
.expect("FATAL: leaked illegally small RTP packet from UDP Rx task.")
|
||||
}
|
||||
}
|
||||
|
||||
30
src/events/context/data/rtp.rs
Normal file
30
src/events/context/data/rtp.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
use discortp::rtp::RtpPacket;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
#[non_exhaustive]
|
||||
/// Opus audio packet, received from another stream (detailed in `packet`).
|
||||
/// `payload_offset` contains the true payload location within the raw packet's `payload()`,
|
||||
/// if extensions or raw packet data are required.
|
||||
pub struct RtpData {
|
||||
/// Raw RTP packet data.
|
||||
///
|
||||
/// Includes the SSRC (i.e., sender) of this packet.
|
||||
pub packet: Bytes,
|
||||
/// Byte index into the packet body (after headers) for where the payload begins.
|
||||
pub payload_offset: usize,
|
||||
/// Number of bytes at the end of the packet to discard.
|
||||
pub payload_end_pad: usize,
|
||||
}
|
||||
|
||||
impl RtpData {
|
||||
/// Create a zero-copy view of the inner RTP packet.
|
||||
///
|
||||
/// This allows easy access to packet header fields, taking them from the underlying
|
||||
/// `Bytes` as needed while handling endianness etc.
|
||||
pub fn rtp(&'_ self) -> RtpPacket<'_> {
|
||||
RtpPacket::new(&self.packet)
|
||||
.expect("FATAL: leaked illegally small RTP packet from UDP Rx task.")
|
||||
}
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
|
||||
#[non_exhaustive]
|
||||
/// Speaking state transition, describing whether a given source has started/stopped
|
||||
/// transmitting. This fires in response to a silent burst, or the first packet
|
||||
/// breaking such a burst.
|
||||
pub struct SpeakingUpdateData {
|
||||
/// Whether this user is currently speaking.
|
||||
pub speaking: bool,
|
||||
/// Synchronisation Source of the user who has begun speaking.
|
||||
///
|
||||
/// This must be combined with another event class to map this back to
|
||||
/// its original UserId.
|
||||
pub ssrc: u32,
|
||||
}
|
||||
@@ -1,28 +1,38 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
#[non_exhaustive]
|
||||
/// Opus audio packet, received from another stream (detailed in `packet`).
|
||||
/// `payload_offset` contains the true payload location within the raw packet's `payload()`,
|
||||
/// if extensions or raw packet data are required.
|
||||
/// Audio data from all users in a voice channel, fired every 20ms.
|
||||
///
|
||||
/// Valid audio data (`Some(audio)` where `audio.len >= 0`) contains up to 20ms of 16-bit stereo PCM audio
|
||||
/// at 48kHz, using native endianness. Songbird will not send audio for silent regions, these should
|
||||
/// be inferred using [`SpeakingUpdate`]s (and filled in by the user if required using arrays of zeroes).
|
||||
/// Songbird implements a jitter buffer to synchronise user packets, smooth out network latency, and
|
||||
/// handle packet reordering by the network. Packet playout via this event is delayed by approximately
|
||||
/// [`Config::playout_buffer_length`]` * 20ms` from its original arrival.
|
||||
///
|
||||
/// If `audio.len() == 0`, then this packet arrived out-of-order. If `None`, songbird was not configured
|
||||
/// to decode received packets.
|
||||
///
|
||||
/// [`SpeakingUpdate`]: crate::events::CoreEvent::SpeakingUpdate
|
||||
pub struct VoiceData<'a> {
|
||||
/// Decoded audio from this packet.
|
||||
pub audio: &'a Option<Vec<i16>>,
|
||||
/// Raw RTP packet data.
|
||||
///
|
||||
/// Includes the SSRC (i.e., sender) of this packet.
|
||||
pub packet: &'a Rtp,
|
||||
/// Byte index into the packet body (after headers) for where the payload begins.
|
||||
pub payload_offset: usize,
|
||||
/// Number of bytes at the end of the packet to discard.
|
||||
pub payload_end_pad: usize,
|
||||
/// [`Config::playout_buffer_length`]: crate::Config::playout_buffer_length
|
||||
pub struct VoiceTick {
|
||||
/// Decoded voice data and source packets sent by each user.
|
||||
pub speaking: HashMap<u32, VoiceData>,
|
||||
|
||||
/// Set of all SSRCs currently known in the call who aren't included in [`Self::speaking`].
|
||||
pub silent: HashSet<u32>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
#[non_exhaustive]
|
||||
/// Voice packet and audio data for a single user, from a single tick.
|
||||
pub struct VoiceData {
|
||||
/// RTP packet clocked out for this tick.
|
||||
///
|
||||
/// If `None`, then the packet was lost, and [`Self::decoded_voice`] may include
|
||||
/// around one codec delay's worth of audio.
|
||||
pub packet: Option<RtpData>,
|
||||
/// PCM audio obtained from a user.
|
||||
///
|
||||
/// Valid audio data (`Some(audio)` where `audio.len() >= 0`) typically contains 20ms of 16-bit stereo PCM audio
|
||||
/// at 48kHz, using native endianness. Channels are interleaved (i.e., `L, R, L, R, ...`).
|
||||
///
|
||||
/// This value will be `None` if Songbird is not configured to decode audio.
|
||||
pub decoded_voice: Option<Vec<i16>>,
|
||||
}
|
||||
|
||||
@@ -41,53 +41,36 @@ impl<'a> From<&'a InternalDisconnect> for DisconnectData<'a> {
|
||||
#[cfg(feature = "receive")]
|
||||
mod receive {
|
||||
use super::*;
|
||||
use discortp::{rtcp::Rtcp, rtp::Rtp};
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
|
||||
pub struct InternalSpeakingUpdate {
|
||||
pub ssrc: u32,
|
||||
pub speaking: bool,
|
||||
}
|
||||
use bytes::Bytes;
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct InternalVoicePacket {
|
||||
pub audio: Option<Vec<i16>>,
|
||||
pub packet: Rtp,
|
||||
pub struct InternalRtpPacket {
|
||||
pub packet: Bytes,
|
||||
pub payload_offset: usize,
|
||||
pub payload_end_pad: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct InternalRtcpPacket {
|
||||
pub packet: Rtcp,
|
||||
pub packet: Bytes,
|
||||
pub payload_offset: usize,
|
||||
pub payload_end_pad: usize,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a InternalSpeakingUpdate> for SpeakingUpdateData {
|
||||
fn from(val: &'a InternalSpeakingUpdate) -> Self {
|
||||
impl<'a> From<&'a InternalRtpPacket> for RtpData {
|
||||
fn from(val: &'a InternalRtpPacket) -> Self {
|
||||
Self {
|
||||
speaking: val.speaking,
|
||||
ssrc: val.ssrc,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a InternalVoicePacket> for VoiceData<'a> {
|
||||
fn from(val: &'a InternalVoicePacket) -> Self {
|
||||
Self {
|
||||
audio: &val.audio,
|
||||
packet: &val.packet,
|
||||
packet: val.packet.clone(),
|
||||
payload_offset: val.payload_offset,
|
||||
payload_end_pad: val.payload_end_pad,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a InternalRtcpPacket> for RtcpData<'a> {
|
||||
impl<'a> From<&'a InternalRtcpPacket> for RtcpData {
|
||||
fn from(val: &'a InternalRtcpPacket) -> Self {
|
||||
Self {
|
||||
packet: &val.packet,
|
||||
packet: val.packet.clone(),
|
||||
payload_offset: val.payload_offset,
|
||||
payload_end_pad: val.payload_end_pad,
|
||||
}
|
||||
|
||||
@@ -33,18 +33,16 @@ pub enum EventContext<'a> {
|
||||
SpeakingStateUpdate(Speaking),
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
/// Speaking state transition, describing whether a given source has started/stopped
|
||||
/// transmitting. This fires in response to a silent burst, or the first packet
|
||||
/// breaking such a burst.
|
||||
SpeakingUpdate(SpeakingUpdateData),
|
||||
/// Reordered and decoded audio packets, received every 20ms.
|
||||
VoiceTick(VoiceTick),
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
/// Opus audio packet, received from another stream.
|
||||
VoicePacket(VoiceData<'a>),
|
||||
RtpPacket(RtpData),
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
/// Telemetry/statistics packet, received from another stream.
|
||||
RtcpPacket(RtcpData<'a>),
|
||||
RtcpPacket(RtcpData),
|
||||
|
||||
/// Fired whenever a client disconnects.
|
||||
ClientDisconnect(ClientDisconnect),
|
||||
@@ -63,9 +61,9 @@ pub enum EventContext<'a> {
|
||||
pub enum CoreContext {
|
||||
SpeakingStateUpdate(Speaking),
|
||||
#[cfg(feature = "receive")]
|
||||
SpeakingUpdate(InternalSpeakingUpdate),
|
||||
VoiceTick(VoiceTick),
|
||||
#[cfg(feature = "receive")]
|
||||
VoicePacket(InternalVoicePacket),
|
||||
RtpPacket(InternalRtpPacket),
|
||||
#[cfg(feature = "receive")]
|
||||
RtcpPacket(InternalRtcpPacket),
|
||||
ClientDisconnect(ClientDisconnect),
|
||||
@@ -79,10 +77,9 @@ impl<'a> CoreContext {
|
||||
match self {
|
||||
Self::SpeakingStateUpdate(evt) => EventContext::SpeakingStateUpdate(*evt),
|
||||
#[cfg(feature = "receive")]
|
||||
Self::SpeakingUpdate(evt) =>
|
||||
EventContext::SpeakingUpdate(SpeakingUpdateData::from(evt)),
|
||||
Self::VoiceTick(evt) => EventContext::VoiceTick(evt.clone()),
|
||||
#[cfg(feature = "receive")]
|
||||
Self::VoicePacket(evt) => EventContext::VoicePacket(VoiceData::from(evt)),
|
||||
Self::RtpPacket(evt) => EventContext::RtpPacket(RtpData::from(evt)),
|
||||
#[cfg(feature = "receive")]
|
||||
Self::RtcpPacket(evt) => EventContext::RtcpPacket(RtcpData::from(evt)),
|
||||
Self::ClientDisconnect(evt) => EventContext::ClientDisconnect(*evt),
|
||||
@@ -102,9 +99,9 @@ impl EventContext<'_> {
|
||||
match self {
|
||||
Self::SpeakingStateUpdate(_) => Some(CoreEvent::SpeakingStateUpdate),
|
||||
#[cfg(feature = "receive")]
|
||||
Self::SpeakingUpdate(_) => Some(CoreEvent::SpeakingUpdate),
|
||||
Self::VoiceTick(_) => Some(CoreEvent::VoiceTick),
|
||||
#[cfg(feature = "receive")]
|
||||
Self::VoicePacket(_) => Some(CoreEvent::VoicePacket),
|
||||
Self::RtpPacket(_) => Some(CoreEvent::RtpPacket),
|
||||
#[cfg(feature = "receive")]
|
||||
Self::RtcpPacket(_) => Some(CoreEvent::RtcpPacket),
|
||||
Self::ClientDisconnect(_) => Some(CoreEvent::ClientDisconnect),
|
||||
|
||||
@@ -9,14 +9,11 @@
|
||||
/// when a client leaves the session ([`ClientDisconnect`]).
|
||||
///
|
||||
/// When the `"receive"` feature is enabled, songbird can also handle voice packets
|
||||
#[cfg_attr(feature = "receive", doc = "([`VoicePacket`](Self::VoicePacket)),")]
|
||||
#[cfg_attr(not(feature = "receive"), doc = "(`VoicePacket`),")]
|
||||
/// detect speech starting/stopping
|
||||
#[cfg_attr(
|
||||
feature = "receive",
|
||||
doc = "([`SpeakingUpdate`](Self::SpeakingUpdate)),"
|
||||
)]
|
||||
#[cfg_attr(not(feature = "receive"), doc = "(`SpeakingUpdate`),")]
|
||||
#[cfg_attr(feature = "receive", doc = "([`RtpPacket`](Self::RtpPacket)),")]
|
||||
#[cfg_attr(not(feature = "receive"), doc = "(`RtpPacket`),")]
|
||||
/// decode and track speaking users
|
||||
#[cfg_attr(feature = "receive", doc = "([`VoiceTick`](Self::VoiceTick)),")]
|
||||
#[cfg_attr(not(feature = "receive"), doc = "(`VoiceTick`),")]
|
||||
/// and handle telemetry data
|
||||
#[cfg_attr(feature = "receive", doc = "([`RtcpPacket`](Self::RtcpPacket)).")]
|
||||
#[cfg_attr(not(feature = "receive"), doc = "(`RtcpPacket`).")]
|
||||
@@ -49,9 +46,9 @@ pub enum CoreEvent {
|
||||
SpeakingStateUpdate,
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
/// Fires when a source starts speaking, or stops speaking
|
||||
/// (*i.e.*, 5 consecutive silent frames).
|
||||
SpeakingUpdate,
|
||||
/// Fires every 20ms, containing the scheduled voice packet and decoded audio
|
||||
/// data for each live user.
|
||||
VoiceTick,
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
/// Fires on receipt of a voice packet from another stream in the voice call.
|
||||
@@ -59,7 +56,7 @@ pub enum CoreEvent {
|
||||
/// As RTP packets do not map to Discord's notion of users, SSRCs must be mapped
|
||||
/// back using the user IDs seen through client connection, disconnection,
|
||||
/// or speaking state update.
|
||||
VoicePacket,
|
||||
RtpPacket,
|
||||
|
||||
#[cfg(feature = "receive")]
|
||||
/// Fires on receipt of an RTCP packet, containing various call stats
|
||||
|
||||
Reference in New Issue
Block a user