`SsrcState` objects are created on a per-user basis when "receive" is enabled, but were previously never destroyed. This PR adds shared dashmaps that the WS task uses to communicate SSRC-to-ID mappings, as well as any disconnections, to the UDP Rx task. Additionally, decoder state is pruned (by default) 1 minute after a user last speaks. This was tested using `cargo make ready` and via `examples/serenity/voice_receive/`. Closes #133
472 lines
16 KiB
Rust
472 lines
16 KiB
Rust
use super::{
|
|
error::{Error, Result},
|
|
message::*,
|
|
Config,
|
|
};
|
|
use crate::{
|
|
constants::*,
|
|
driver::{CryptoMode, DecodeMode},
|
|
events::{internal_data::*, CoreContext},
|
|
};
|
|
use audiopus::{
|
|
coder::Decoder as OpusDecoder,
|
|
error::{Error as OpusError, ErrorCode},
|
|
packet::Packet as OpusPacket,
|
|
Channels,
|
|
};
|
|
use discortp::{
|
|
demux::{self, DemuxedMut},
|
|
rtp::{RtpExtensionPacket, RtpPacket},
|
|
FromPacket,
|
|
Packet,
|
|
PacketSize,
|
|
};
|
|
use flume::Receiver;
|
|
use std::{collections::HashMap, convert::TryInto, sync::Arc, time::Duration};
|
|
use tokio::{net::UdpSocket, select, time::Instant};
|
|
use tracing::{error, instrument, trace, warn};
|
|
use xsalsa20poly1305::XSalsa20Poly1305 as Cipher;
|
|
|
|
#[derive(Debug)]
|
|
struct SsrcState {
|
|
silent_frame_count: u16,
|
|
decoder: OpusDecoder,
|
|
last_seq: u16,
|
|
decode_size: PacketDecodeSize,
|
|
prune_time: Instant,
|
|
disconnected: bool,
|
|
}
|
|
|
|
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
|
enum PacketDecodeSize {
|
|
/// Minimum frame size on Discord.
|
|
TwentyMillis,
|
|
/// Hybrid packet, sent by Firefox web client.
|
|
///
|
|
/// Likely 20ms frame + 10ms frame.
|
|
ThirtyMillis,
|
|
/// Next largest frame size.
|
|
FortyMillis,
|
|
/// Maximum Opus frame size.
|
|
SixtyMillis,
|
|
/// Maximum Opus packet size: 120ms.
|
|
Max,
|
|
}
|
|
|
|
impl PacketDecodeSize {
|
|
fn bump_up(self) -> Self {
|
|
match self {
|
|
Self::TwentyMillis => Self::ThirtyMillis,
|
|
Self::ThirtyMillis => Self::FortyMillis,
|
|
Self::FortyMillis => Self::SixtyMillis,
|
|
Self::SixtyMillis | Self::Max => Self::Max,
|
|
}
|
|
}
|
|
|
|
fn can_bump_up(self) -> bool {
|
|
self != Self::Max
|
|
}
|
|
|
|
fn len(self) -> usize {
|
|
match self {
|
|
Self::TwentyMillis => STEREO_FRAME_SIZE,
|
|
Self::ThirtyMillis => (STEREO_FRAME_SIZE / 2) * 3,
|
|
Self::FortyMillis => 2 * STEREO_FRAME_SIZE,
|
|
Self::SixtyMillis => 3 * STEREO_FRAME_SIZE,
|
|
Self::Max => 6 * STEREO_FRAME_SIZE,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Change in a source's speaking status caused by a single packet.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum SpeakingDelta {
    /// Speaking status is unchanged.
    Same,
    /// The source has just started speaking.
    Start,
    /// The source has just stopped speaking.
    Stop,
}
|
|
|
|
impl SsrcState {
|
|
fn new(pkt: &RtpPacket<'_>, state_timeout: Duration) -> Self {
|
|
Self {
|
|
silent_frame_count: 5, // We do this to make the first speech packet fire an event.
|
|
decoder: OpusDecoder::new(SAMPLE_RATE, Channels::Stereo)
|
|
.expect("Failed to create new Opus decoder for source."),
|
|
last_seq: pkt.get_sequence().into(),
|
|
decode_size: PacketDecodeSize::TwentyMillis,
|
|
prune_time: Instant::now() + state_timeout,
|
|
disconnected: false,
|
|
}
|
|
}
|
|
|
|
fn refresh_timer(&mut self, state_timeout: Duration) {
|
|
if !self.disconnected {
|
|
self.prune_time = Instant::now() + state_timeout;
|
|
}
|
|
}
|
|
|
|
fn process(
|
|
&mut self,
|
|
pkt: &RtpPacket<'_>,
|
|
data_offset: usize,
|
|
data_trailer: usize,
|
|
decode_mode: DecodeMode,
|
|
decrypted: bool,
|
|
) -> Result<(SpeakingDelta, Option<Vec<i16>>)> {
|
|
let new_seq: u16 = pkt.get_sequence().into();
|
|
let payload_len = pkt.payload().len();
|
|
|
|
let extensions = pkt.get_extension() != 0;
|
|
let seq_delta = new_seq.wrapping_sub(self.last_seq);
|
|
Ok(if seq_delta >= (1 << 15) {
|
|
// Overflow, reordered (previously missing) packet.
|
|
(SpeakingDelta::Same, Some(vec![]))
|
|
} else {
|
|
self.last_seq = new_seq;
|
|
let missed_packets = seq_delta.saturating_sub(1);
|
|
|
|
// Note: we still need to handle this for non-decoded.
|
|
// This is mainly because packet events and speaking events can be handed to the
|
|
// user.
|
|
let (audio, pkt_size) = if decode_mode.should_decrypt() && decrypted {
|
|
self.scan_and_decode(
|
|
&pkt.payload()[data_offset..payload_len - data_trailer],
|
|
extensions,
|
|
missed_packets,
|
|
decode_mode == DecodeMode::Decode,
|
|
)?
|
|
} else {
|
|
// The latter part is an upper bound, as we cannot determine
|
|
// how long packet extensions are.
|
|
// WIthout decryption, speaking detection is thus broken.
|
|
(None, payload_len - data_offset - data_trailer)
|
|
};
|
|
|
|
let delta = if pkt_size == SILENT_FRAME.len() {
|
|
// Frame is silent.
|
|
let old = self.silent_frame_count;
|
|
self.silent_frame_count =
|
|
self.silent_frame_count.saturating_add(1 + missed_packets);
|
|
|
|
if self.silent_frame_count >= 5 && old < 5 {
|
|
SpeakingDelta::Stop
|
|
} else {
|
|
SpeakingDelta::Same
|
|
}
|
|
} else {
|
|
// Frame has meaningful audio.
|
|
let out = if self.silent_frame_count >= 5 {
|
|
SpeakingDelta::Start
|
|
} else {
|
|
SpeakingDelta::Same
|
|
};
|
|
self.silent_frame_count = 0;
|
|
out
|
|
};
|
|
|
|
(delta, audio)
|
|
})
|
|
}
|
|
|
|
fn scan_and_decode(
|
|
&mut self,
|
|
data: &[u8],
|
|
extension: bool,
|
|
missed_packets: u16,
|
|
decode: bool,
|
|
) -> Result<(Option<Vec<i16>>, usize)> {
|
|
let start = if extension {
|
|
RtpExtensionPacket::new(data)
|
|
.map(|pkt| pkt.packet_size())
|
|
.ok_or_else(|| {
|
|
error!("Extension packet indicated, but insufficient space.");
|
|
Error::IllegalVoicePacket
|
|
})
|
|
} else {
|
|
Ok(0)
|
|
}?;
|
|
|
|
let pkt = if decode {
|
|
let mut out = vec![0; self.decode_size.len()];
|
|
|
|
for _ in 0..missed_packets {
|
|
let missing_frame: Option<OpusPacket> = None;
|
|
let dest_samples = (&mut out[..])
|
|
.try_into()
|
|
.expect("Decode logic will cap decode buffer size at i32::MAX.");
|
|
if let Err(e) = self.decoder.decode(missing_frame, dest_samples, false) {
|
|
warn!("Issue while decoding for missed packet: {:?}.", e);
|
|
}
|
|
}
|
|
|
|
// In general, we should expect 20 ms frames.
|
|
// However, Discord occasionally like to surprise us with something bigger.
|
|
// This is *sender-dependent behaviour*.
|
|
//
|
|
// This should scan up to find the "correct" size that a source is using,
|
|
// and then remember that.
|
|
loop {
|
|
let tried_audio_len = self.decoder.decode(
|
|
Some(data[start..].try_into()?),
|
|
(&mut out[..]).try_into()?,
|
|
false,
|
|
);
|
|
match tried_audio_len {
|
|
Ok(audio_len) => {
|
|
// Decoding to stereo: audio_len refers to sample count irrespective of channel count.
|
|
// => multiply by number of channels.
|
|
out.truncate(2 * audio_len);
|
|
|
|
break;
|
|
},
|
|
Err(OpusError::Opus(ErrorCode::BufferTooSmall)) => {
|
|
if self.decode_size.can_bump_up() {
|
|
self.decode_size = self.decode_size.bump_up();
|
|
out = vec![0; self.decode_size.len()];
|
|
} else {
|
|
error!("Received packet larger than Opus standard maximum,");
|
|
return Err(Error::IllegalVoicePacket);
|
|
}
|
|
},
|
|
Err(e) => {
|
|
error!("Failed to decode received packet: {:?}.", e);
|
|
return Err(e.into());
|
|
},
|
|
}
|
|
}
|
|
|
|
Some(out)
|
|
} else {
|
|
None
|
|
};
|
|
|
|
Ok((pkt, data.len() - start))
|
|
}
|
|
}
|
|
|
|
struct UdpRx {
|
|
cipher: Cipher,
|
|
decoder_map: HashMap<u32, SsrcState>,
|
|
config: Config,
|
|
packet_buffer: [u8; VOICE_PACKET_MAX],
|
|
rx: Receiver<UdpRxMessage>,
|
|
ssrc_signalling: Arc<SsrcTracker>,
|
|
udp_socket: UdpSocket,
|
|
}
|
|
|
|
impl UdpRx {
|
|
#[instrument(skip(self))]
|
|
async fn run(&mut self, interconnect: &mut Interconnect) {
|
|
let mut cleanup_time = Instant::now();
|
|
|
|
loop {
|
|
select! {
|
|
Ok((len, _addr)) = self.udp_socket.recv_from(&mut self.packet_buffer[..]) => {
|
|
self.process_udp_message(interconnect, len);
|
|
},
|
|
msg = self.rx.recv_async() => {
|
|
match msg {
|
|
Ok(UdpRxMessage::ReplaceInterconnect(i)) => {
|
|
*interconnect = i;
|
|
},
|
|
Ok(UdpRxMessage::SetConfig(c)) => {
|
|
self.config = c;
|
|
},
|
|
Err(flume::RecvError::Disconnected) => break,
|
|
}
|
|
},
|
|
_ = tokio::time::sleep_until(cleanup_time) => {
|
|
// periodic cleanup.
|
|
let now = Instant::now();
|
|
|
|
// check ssrc map to see if the WS task has informed us of any disconnects.
|
|
loop {
|
|
// This is structured in an odd way to prevent deadlocks.
|
|
// while-let seemed to keep the dashmap iter() alive for block scope, rather than
|
|
// just the initialiser.
|
|
let id = {
|
|
if let Some(id) = self.ssrc_signalling.disconnected_users.iter().next().map(|v| *v.key()) {
|
|
id
|
|
} else {
|
|
break;
|
|
}
|
|
};
|
|
|
|
let _ = self.ssrc_signalling.disconnected_users.remove(&id);
|
|
if let Some((_, ssrc)) = self.ssrc_signalling.user_ssrc_map.remove(&id) {
|
|
if let Some(state) = self.decoder_map.get_mut(&ssrc) {
|
|
// don't cleanup immediately: leave for later cycle
|
|
// this is key with reorder/jitter buffers where we may
|
|
// still need to decode post disconnect for ~0.2s.
|
|
state.prune_time = now + Duration::from_secs(1);
|
|
state.disconnected = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
// now remove all dead ssrcs.
|
|
self.decoder_map.retain(|_, v| v.prune_time > now);
|
|
|
|
cleanup_time = now + Duration::from_secs(5);
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
fn process_udp_message(&mut self, interconnect: &Interconnect, len: usize) {
|
|
// NOTE: errors here (and in general for UDP) are not fatal to the connection.
|
|
// Panics should be avoided due to adversarial nature of rx'd packets,
|
|
// but correct handling should not prompt a reconnect.
|
|
//
|
|
// For simplicity, we nominate the mixing context to rebuild the event
|
|
// context if it fails (hence, the `let _ =` statements.), as it will try to
|
|
// make contact every 20ms.
|
|
let crypto_mode = self.config.crypto_mode;
|
|
let packet = &mut self.packet_buffer[..len];
|
|
|
|
match demux::demux_mut(packet) {
|
|
DemuxedMut::Rtp(mut rtp) => {
|
|
if !rtp_valid(&rtp.to_immutable()) {
|
|
error!("Illegal RTP message received.");
|
|
return;
|
|
}
|
|
|
|
let packet_data = if self.config.decode_mode.should_decrypt() {
|
|
let out = crypto_mode
|
|
.decrypt_in_place(&mut rtp, &self.cipher)
|
|
.map(|(s, t)| (s, t, true));
|
|
|
|
if let Err(e) = out {
|
|
warn!("RTP decryption failed: {:?}", e);
|
|
}
|
|
|
|
out.ok()
|
|
} else {
|
|
None
|
|
};
|
|
|
|
let rtp = rtp.to_immutable();
|
|
let (rtp_body_start, rtp_body_tail, decrypted) = packet_data.unwrap_or_else(|| {
|
|
(
|
|
CryptoMode::payload_prefix_len(),
|
|
crypto_mode.payload_suffix_len(),
|
|
false,
|
|
)
|
|
});
|
|
|
|
let entry = self
|
|
.decoder_map
|
|
.entry(rtp.get_ssrc())
|
|
.or_insert_with(|| SsrcState::new(&rtp, self.config.decode_state_timeout));
|
|
|
|
// Only do this on RTP, rather than RTCP -- this pins decoder state liveness
|
|
// to *speech* rather than just presence.
|
|
entry.refresh_timer(self.config.decode_state_timeout);
|
|
|
|
if let Ok((delta, audio)) = entry.process(
|
|
&rtp,
|
|
rtp_body_start,
|
|
rtp_body_tail,
|
|
self.config.decode_mode,
|
|
decrypted,
|
|
) {
|
|
match delta {
|
|
SpeakingDelta::Start => {
|
|
drop(interconnect.events.send(EventMessage::FireCoreEvent(
|
|
CoreContext::SpeakingUpdate(InternalSpeakingUpdate {
|
|
ssrc: rtp.get_ssrc(),
|
|
speaking: true,
|
|
}),
|
|
)));
|
|
},
|
|
SpeakingDelta::Stop => {
|
|
drop(interconnect.events.send(EventMessage::FireCoreEvent(
|
|
CoreContext::SpeakingUpdate(InternalSpeakingUpdate {
|
|
ssrc: rtp.get_ssrc(),
|
|
speaking: false,
|
|
}),
|
|
)));
|
|
},
|
|
SpeakingDelta::Same => {},
|
|
}
|
|
|
|
drop(interconnect.events.send(EventMessage::FireCoreEvent(
|
|
CoreContext::VoicePacket(InternalVoicePacket {
|
|
audio,
|
|
packet: rtp.from_packet(),
|
|
payload_offset: rtp_body_start,
|
|
payload_end_pad: rtp_body_tail,
|
|
}),
|
|
)));
|
|
} else {
|
|
warn!("RTP decoding/processing failed.");
|
|
}
|
|
},
|
|
DemuxedMut::Rtcp(mut rtcp) => {
|
|
let packet_data = if self.config.decode_mode.should_decrypt() {
|
|
let out = crypto_mode.decrypt_in_place(&mut rtcp, &self.cipher);
|
|
|
|
if let Err(e) = out {
|
|
warn!("RTCP decryption failed: {:?}", e);
|
|
}
|
|
|
|
out.ok()
|
|
} else {
|
|
None
|
|
};
|
|
|
|
let (start, tail) = packet_data.unwrap_or_else(|| {
|
|
(
|
|
CryptoMode::payload_prefix_len(),
|
|
crypto_mode.payload_suffix_len(),
|
|
)
|
|
});
|
|
|
|
drop(interconnect.events.send(EventMessage::FireCoreEvent(
|
|
CoreContext::RtcpPacket(InternalRtcpPacket {
|
|
packet: rtcp.from_packet(),
|
|
payload_offset: start,
|
|
payload_end_pad: tail,
|
|
}),
|
|
)));
|
|
},
|
|
DemuxedMut::FailedParse(t) => {
|
|
warn!("Failed to parse message of type {:?}.", t);
|
|
},
|
|
DemuxedMut::TooSmall => {
|
|
warn!("Illegal UDP packet from voice server.");
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
#[instrument(skip(interconnect, rx, cipher))]
|
|
pub(crate) async fn runner(
|
|
mut interconnect: Interconnect,
|
|
rx: Receiver<UdpRxMessage>,
|
|
cipher: Cipher,
|
|
config: Config,
|
|
udp_socket: UdpSocket,
|
|
ssrc_signalling: Arc<SsrcTracker>,
|
|
) {
|
|
trace!("UDP receive handle started.");
|
|
|
|
let mut state = UdpRx {
|
|
cipher,
|
|
decoder_map: HashMap::new(),
|
|
config,
|
|
packet_buffer: [0u8; VOICE_PACKET_MAX],
|
|
rx,
|
|
ssrc_signalling,
|
|
udp_socket,
|
|
};
|
|
|
|
state.run(&mut interconnect).await;
|
|
|
|
trace!("UDP receive handle stopped.");
|
|
}
|
|
|
|
#[inline]
|
|
fn rtp_valid(packet: &RtpPacket<'_>) -> bool {
|
|
packet.get_version() == RTP_VERSION && packet.get_payload_type() == RTP_PROFILE_TYPE
|
|
}
|