use super::{ error::{Error, Result}, message::*, Config, }; use crate::{ constants::*, driver::{CryptoMode, DecodeMode}, events::{internal_data::*, CoreContext}, }; use audiopus::{ coder::Decoder as OpusDecoder, error::{Error as OpusError, ErrorCode}, packet::Packet as OpusPacket, Channels, }; use discortp::{ demux::{self, DemuxedMut}, rtp::{RtpExtensionPacket, RtpPacket}, FromPacket, Packet, PacketSize, }; use flume::Receiver; use std::{collections::HashMap, convert::TryInto, sync::Arc, time::Duration}; use tokio::{net::UdpSocket, select, time::Instant}; use tracing::{error, instrument, trace, warn}; use xsalsa20poly1305::XSalsa20Poly1305 as Cipher; #[derive(Debug)] struct SsrcState { silent_frame_count: u16, decoder: OpusDecoder, last_seq: u16, decode_size: PacketDecodeSize, prune_time: Instant, disconnected: bool, } #[derive(Clone, Copy, Debug, Eq, PartialEq)] enum PacketDecodeSize { /// Minimum frame size on Discord. TwentyMillis, /// Hybrid packet, sent by Firefox web client. /// /// Likely 20ms frame + 10ms frame. ThirtyMillis, /// Next largest frame size. FortyMillis, /// Maximum Opus frame size. SixtyMillis, /// Maximum Opus packet size: 120ms. Max, } impl PacketDecodeSize { fn bump_up(self) -> Self { match self { Self::TwentyMillis => Self::ThirtyMillis, Self::ThirtyMillis => Self::FortyMillis, Self::FortyMillis => Self::SixtyMillis, Self::SixtyMillis | Self::Max => Self::Max, } } fn can_bump_up(self) -> bool { self != Self::Max } fn len(self) -> usize { match self { Self::TwentyMillis => STEREO_FRAME_SIZE, Self::ThirtyMillis => (STEREO_FRAME_SIZE / 2) * 3, Self::FortyMillis => 2 * STEREO_FRAME_SIZE, Self::SixtyMillis => 3 * STEREO_FRAME_SIZE, Self::Max => 6 * STEREO_FRAME_SIZE, } } } #[derive(Clone, Copy, Debug, Eq, PartialEq)] enum SpeakingDelta { Same, Start, Stop, } impl SsrcState { fn new(pkt: &RtpPacket<'_>, state_timeout: Duration) -> Self { Self { silent_frame_count: 5, // We do this to make the first speech packet fire an event. decoder: OpusDecoder::new(SAMPLE_RATE, Channels::Stereo) .expect("Failed to create new Opus decoder for source."), last_seq: pkt.get_sequence().into(), decode_size: PacketDecodeSize::TwentyMillis, prune_time: Instant::now() + state_timeout, disconnected: false, } } fn refresh_timer(&mut self, state_timeout: Duration) { if !self.disconnected { self.prune_time = Instant::now() + state_timeout; } } fn process( &mut self, pkt: &RtpPacket<'_>, data_offset: usize, data_trailer: usize, decode_mode: DecodeMode, decrypted: bool, ) -> Result<(SpeakingDelta, Option>)> { let new_seq: u16 = pkt.get_sequence().into(); let payload_len = pkt.payload().len(); let extensions = pkt.get_extension() != 0; let seq_delta = new_seq.wrapping_sub(self.last_seq); Ok(if seq_delta >= (1 << 15) { // Overflow, reordered (previously missing) packet. (SpeakingDelta::Same, Some(vec![])) } else { self.last_seq = new_seq; let missed_packets = seq_delta.saturating_sub(1); // Note: we still need to handle this for non-decoded. // This is mainly because packet events and speaking events can be handed to the // user. let (audio, pkt_size) = if decode_mode.should_decrypt() && decrypted { self.scan_and_decode( &pkt.payload()[data_offset..payload_len - data_trailer], extensions, missed_packets, decode_mode == DecodeMode::Decode, )? } else { // The latter part is an upper bound, as we cannot determine // how long packet extensions are. // WIthout decryption, speaking detection is thus broken. (None, payload_len - data_offset - data_trailer) }; let delta = if pkt_size == SILENT_FRAME.len() { // Frame is silent. let old = self.silent_frame_count; self.silent_frame_count = self.silent_frame_count.saturating_add(1 + missed_packets); if self.silent_frame_count >= 5 && old < 5 { SpeakingDelta::Stop } else { SpeakingDelta::Same } } else { // Frame has meaningful audio. let out = if self.silent_frame_count >= 5 { SpeakingDelta::Start } else { SpeakingDelta::Same }; self.silent_frame_count = 0; out }; (delta, audio) }) } fn scan_and_decode( &mut self, data: &[u8], extension: bool, missed_packets: u16, decode: bool, ) -> Result<(Option>, usize)> { let start = if extension { RtpExtensionPacket::new(data) .map(|pkt| pkt.packet_size()) .ok_or_else(|| { error!("Extension packet indicated, but insufficient space."); Error::IllegalVoicePacket }) } else { Ok(0) }?; let pkt = if decode { let mut out = vec![0; self.decode_size.len()]; for _ in 0..missed_packets { let missing_frame: Option = None; let dest_samples = (&mut out[..]) .try_into() .expect("Decode logic will cap decode buffer size at i32::MAX."); if let Err(e) = self.decoder.decode(missing_frame, dest_samples, false) { warn!("Issue while decoding for missed packet: {:?}.", e); } } // In general, we should expect 20 ms frames. // However, Discord occasionally like to surprise us with something bigger. // This is *sender-dependent behaviour*. // // This should scan up to find the "correct" size that a source is using, // and then remember that. loop { let tried_audio_len = self.decoder.decode( Some(data[start..].try_into()?), (&mut out[..]).try_into()?, false, ); match tried_audio_len { Ok(audio_len) => { // Decoding to stereo: audio_len refers to sample count irrespective of channel count. // => multiply by number of channels. out.truncate(2 * audio_len); break; }, Err(OpusError::Opus(ErrorCode::BufferTooSmall)) => { if self.decode_size.can_bump_up() { self.decode_size = self.decode_size.bump_up(); out = vec![0; self.decode_size.len()]; } else { error!("Received packet larger than Opus standard maximum,"); return Err(Error::IllegalVoicePacket); } }, Err(e) => { error!("Failed to decode received packet: {:?}.", e); return Err(e.into()); }, } } Some(out) } else { None }; Ok((pkt, data.len() - start)) } } struct UdpRx { cipher: Cipher, decoder_map: HashMap, config: Config, packet_buffer: [u8; VOICE_PACKET_MAX], rx: Receiver, ssrc_signalling: Arc, udp_socket: UdpSocket, } impl UdpRx { #[instrument(skip(self))] async fn run(&mut self, interconnect: &mut Interconnect) { let mut cleanup_time = Instant::now(); loop { select! { Ok((len, _addr)) = self.udp_socket.recv_from(&mut self.packet_buffer[..]) => { self.process_udp_message(interconnect, len); }, msg = self.rx.recv_async() => { match msg { Ok(UdpRxMessage::ReplaceInterconnect(i)) => { *interconnect = i; }, Ok(UdpRxMessage::SetConfig(c)) => { self.config = c; }, Err(flume::RecvError::Disconnected) => break, } }, _ = tokio::time::sleep_until(cleanup_time) => { // periodic cleanup. let now = Instant::now(); // check ssrc map to see if the WS task has informed us of any disconnects. loop { // This is structured in an odd way to prevent deadlocks. // while-let seemed to keep the dashmap iter() alive for block scope, rather than // just the initialiser. let id = { if let Some(id) = self.ssrc_signalling.disconnected_users.iter().next().map(|v| *v.key()) { id } else { break; } }; let _ = self.ssrc_signalling.disconnected_users.remove(&id); if let Some((_, ssrc)) = self.ssrc_signalling.user_ssrc_map.remove(&id) { if let Some(state) = self.decoder_map.get_mut(&ssrc) { // don't cleanup immediately: leave for later cycle // this is key with reorder/jitter buffers where we may // still need to decode post disconnect for ~0.2s. state.prune_time = now + Duration::from_secs(1); state.disconnected = true; } } } // now remove all dead ssrcs. self.decoder_map.retain(|_, v| v.prune_time > now); cleanup_time = now + Duration::from_secs(5); }, } } } fn process_udp_message(&mut self, interconnect: &Interconnect, len: usize) { // NOTE: errors here (and in general for UDP) are not fatal to the connection. // Panics should be avoided due to adversarial nature of rx'd packets, // but correct handling should not prompt a reconnect. // // For simplicity, we nominate the mixing context to rebuild the event // context if it fails (hence, the `let _ =` statements.), as it will try to // make contact every 20ms. let crypto_mode = self.config.crypto_mode; let packet = &mut self.packet_buffer[..len]; match demux::demux_mut(packet) { DemuxedMut::Rtp(mut rtp) => { if !rtp_valid(&rtp.to_immutable()) { error!("Illegal RTP message received."); return; } let packet_data = if self.config.decode_mode.should_decrypt() { let out = crypto_mode .decrypt_in_place(&mut rtp, &self.cipher) .map(|(s, t)| (s, t, true)); if let Err(e) = out { warn!("RTP decryption failed: {:?}", e); } out.ok() } else { None }; let rtp = rtp.to_immutable(); let (rtp_body_start, rtp_body_tail, decrypted) = packet_data.unwrap_or_else(|| { ( CryptoMode::payload_prefix_len(), crypto_mode.payload_suffix_len(), false, ) }); let entry = self .decoder_map .entry(rtp.get_ssrc()) .or_insert_with(|| SsrcState::new(&rtp, self.config.decode_state_timeout)); // Only do this on RTP, rather than RTCP -- this pins decoder state liveness // to *speech* rather than just presence. entry.refresh_timer(self.config.decode_state_timeout); if let Ok((delta, audio)) = entry.process( &rtp, rtp_body_start, rtp_body_tail, self.config.decode_mode, decrypted, ) { match delta { SpeakingDelta::Start => { drop(interconnect.events.send(EventMessage::FireCoreEvent( CoreContext::SpeakingUpdate(InternalSpeakingUpdate { ssrc: rtp.get_ssrc(), speaking: true, }), ))); }, SpeakingDelta::Stop => { drop(interconnect.events.send(EventMessage::FireCoreEvent( CoreContext::SpeakingUpdate(InternalSpeakingUpdate { ssrc: rtp.get_ssrc(), speaking: false, }), ))); }, SpeakingDelta::Same => {}, } drop(interconnect.events.send(EventMessage::FireCoreEvent( CoreContext::VoicePacket(InternalVoicePacket { audio, packet: rtp.from_packet(), payload_offset: rtp_body_start, payload_end_pad: rtp_body_tail, }), ))); } else { warn!("RTP decoding/processing failed."); } }, DemuxedMut::Rtcp(mut rtcp) => { let packet_data = if self.config.decode_mode.should_decrypt() { let out = crypto_mode.decrypt_in_place(&mut rtcp, &self.cipher); if let Err(e) = out { warn!("RTCP decryption failed: {:?}", e); } out.ok() } else { None }; let (start, tail) = packet_data.unwrap_or_else(|| { ( CryptoMode::payload_prefix_len(), crypto_mode.payload_suffix_len(), ) }); drop(interconnect.events.send(EventMessage::FireCoreEvent( CoreContext::RtcpPacket(InternalRtcpPacket { packet: rtcp.from_packet(), payload_offset: start, payload_end_pad: tail, }), ))); }, DemuxedMut::FailedParse(t) => { warn!("Failed to parse message of type {:?}.", t); }, DemuxedMut::TooSmall => { warn!("Illegal UDP packet from voice server."); }, } } } #[instrument(skip(interconnect, rx, cipher))] pub(crate) async fn runner( mut interconnect: Interconnect, rx: Receiver, cipher: Cipher, config: Config, udp_socket: UdpSocket, ssrc_signalling: Arc, ) { trace!("UDP receive handle started."); let mut state = UdpRx { cipher, decoder_map: HashMap::new(), config, packet_buffer: [0u8; VOICE_PACKET_MAX], rx, ssrc_signalling, udp_socket, }; state.run(&mut interconnect).await; trace!("UDP receive handle stopped."); } #[inline] fn rtp_valid(packet: &RtpPacket<'_>) -> bool { packet.get_version() == RTP_VERSION && packet.get_payload_type() == RTP_PROFILE_TYPE }