Files
songbird/src/driver/tasks/udp_rx.rs
Kyle Simpson 4eb95d4b59 Deps: Update to Audiopus v0.3.0-rc.0 (#125)
Tested using `cargo make ready`.

Co-authored-by: André Vennberg <andre.vennberg@gmail.com>
2022-07-22 15:41:18 +01:00

427 lines
14 KiB
Rust

use super::{
error::{Error, Result},
message::*,
Config,
};
use crate::{
constants::*,
driver::DecodeMode,
events::{internal_data::*, CoreContext},
};
use audiopus::{
coder::Decoder as OpusDecoder,
error::{Error as OpusError, ErrorCode},
packet::Packet as OpusPacket,
Channels,
};
use discortp::{
demux::{self, DemuxedMut},
rtp::{RtpExtensionPacket, RtpPacket},
FromPacket,
Packet,
PacketSize,
};
use flume::Receiver;
use std::{collections::HashMap, convert::TryInto, sync::Arc};
use tokio::{net::UdpSocket, select};
use tracing::{error, instrument, trace, warn};
use xsalsa20poly1305::XSalsa20Poly1305 as Cipher;
/// Receive-side state tracked for a single RTP source (one entry per SSRC).
#[derive(Debug)]
struct SsrcState {
/// Running count of silent frames seen since the last frame carrying audio.
/// Reset to 0 by any non-silent frame; a value of 5 or more is treated as
/// "not speaking" when deciding whether to report a speaking change.
silent_frame_count: u16,
/// Opus decoder dedicated to this source (created as a stereo decoder).
decoder: OpusDecoder,
/// RTP sequence number of the most recent packet accepted as in-order.
last_seq: u16,
/// Size of the PCM buffer currently used when decoding this source's
/// packets; bumped up when the decoder reports the buffer is too small.
decode_size: PacketDecodeSize,
}
/// Candidate sizes for the PCM output buffer handed to the Opus decoder,
/// listed from smallest to largest.
///
/// Each variant is named after the packet duration it is meant to hold.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum PacketDecodeSize {
/// Minimum frame size on Discord.
TwentyMillis,
/// Hybrid packet, sent by Firefox web client.
///
/// Likely 20ms frame + 10ms frame.
ThirtyMillis,
/// Next largest frame size.
FortyMillis,
/// Maximum Opus frame size.
SixtyMillis,
/// Maximum Opus packet size: 120ms.
Max,
}
impl PacketDecodeSize {
fn bump_up(self) -> Self {
use PacketDecodeSize::*;
match self {
TwentyMillis => ThirtyMillis,
ThirtyMillis => FortyMillis,
FortyMillis => SixtyMillis,
SixtyMillis | Max => Max,
}
}
fn can_bump_up(self) -> bool {
self != PacketDecodeSize::Max
}
fn len(self) -> usize {
use PacketDecodeSize::*;
match self {
TwentyMillis => STEREO_FRAME_SIZE,
ThirtyMillis => (STEREO_FRAME_SIZE / 2) * 3,
FortyMillis => 2 * STEREO_FRAME_SIZE,
SixtyMillis => 3 * STEREO_FRAME_SIZE,
Max => 6 * STEREO_FRAME_SIZE,
}
}
}
/// Change in a source's speaking state produced by processing one packet.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum SpeakingDelta {
/// No change in speaking state.
Same,
/// A frame with audio arrived after at least 5 counted silent frames.
Start,
/// The silent-frame count has just reached 5.
Stop,
}
impl SsrcState {
fn new(pkt: RtpPacket<'_>) -> Self {
Self {
silent_frame_count: 5, // We do this to make the first speech packet fire an event.
decoder: OpusDecoder::new(SAMPLE_RATE, Channels::Stereo)
.expect("Failed to create new Opus decoder for source."),
last_seq: pkt.get_sequence().into(),
decode_size: PacketDecodeSize::TwentyMillis,
}
}
fn process(
&mut self,
pkt: RtpPacket<'_>,
data_offset: usize,
data_trailer: usize,
decode_mode: DecodeMode,
decrypted: bool,
) -> Result<(SpeakingDelta, Option<Vec<i16>>)> {
let new_seq: u16 = pkt.get_sequence().into();
let payload_len = pkt.payload().len();
let extensions = pkt.get_extension() != 0;
let seq_delta = new_seq.wrapping_sub(self.last_seq);
Ok(if seq_delta >= (1 << 15) {
// Overflow, reordered (previously missing) packet.
(SpeakingDelta::Same, Some(vec![]))
} else {
self.last_seq = new_seq;
let missed_packets = seq_delta.saturating_sub(1);
// Note: we still need to handle this for non-decoded.
// This is mainly because packet events and speaking events can be handed to the
// user.
let (audio, pkt_size) = if decode_mode.should_decrypt() && decrypted {
self.scan_and_decode(
&pkt.payload()[data_offset..payload_len - data_trailer],
extensions,
missed_packets,
decode_mode == DecodeMode::Decode,
)?
} else {
// The latter part is an upper bound, as we cannot determine
// how long packet extensions are.
// WIthout decryption, speaking detection is thus broken.
(None, payload_len - data_offset - data_trailer)
};
let delta = if pkt_size == SILENT_FRAME.len() {
// Frame is silent.
let old = self.silent_frame_count;
self.silent_frame_count =
self.silent_frame_count.saturating_add(1 + missed_packets);
if self.silent_frame_count >= 5 && old < 5 {
SpeakingDelta::Stop
} else {
SpeakingDelta::Same
}
} else {
// Frame has meaningful audio.
let out = if self.silent_frame_count >= 5 {
SpeakingDelta::Start
} else {
SpeakingDelta::Same
};
self.silent_frame_count = 0;
out
};
(delta, audio)
})
}
fn scan_and_decode(
&mut self,
data: &[u8],
extension: bool,
missed_packets: u16,
decode: bool,
) -> Result<(Option<Vec<i16>>, usize)> {
let start = if extension {
RtpExtensionPacket::new(data)
.map(|pkt| pkt.packet_size())
.ok_or_else(|| {
error!("Extension packet indicated, but insufficient space.");
Error::IllegalVoicePacket
})
} else {
Ok(0)
}?;
let pkt = if decode {
let mut out = vec![0; self.decode_size.len()];
for _ in 0..missed_packets {
let missing_frame: Option<OpusPacket> = None;
let dest_samples = (&mut out[..])
.try_into()
.expect("Decode logic will cap decode buffer size at i32::MAX.");
if let Err(e) = self.decoder.decode(missing_frame, dest_samples, false) {
warn!("Issue while decoding for missed packet: {:?}.", e);
}
}
// In general, we should expect 20 ms frames.
// However, Discord occasionally like to surprise us with something bigger.
// This is *sender-dependent behaviour*.
//
// This should scan up to find the "correct" size that a source is using,
// and then remember that.
loop {
let tried_audio_len = self.decoder.decode(
Some((&data[start..]).try_into()?),
(&mut out[..]).try_into()?,
false,
);
match tried_audio_len {
Ok(audio_len) => {
// Decoding to stereo: audio_len refers to sample count irrespective of channel count.
// => multiply by number of channels.
out.truncate(2 * audio_len);
break;
},
Err(OpusError::Opus(ErrorCode::BufferTooSmall)) => {
if self.decode_size.can_bump_up() {
self.decode_size = self.decode_size.bump_up();
out = vec![0; self.decode_size.len()];
} else {
error!("Received packet larger than Opus standard maximum,");
return Err(Error::IllegalVoicePacket);
}
},
Err(e) => {
error!("Failed to decode received packet: {:?}.", e);
return Err(e.into());
},
}
}
Some(out)
} else {
None
};
Ok((pkt, data.len() - start))
}
}
/// State owned by the UDP receive task.
struct UdpRx {
/// Cipher passed to the crypto mode when decrypting received packets in place.
cipher: Cipher,
/// Per-source state, keyed by RTP SSRC. Entries are created on first sight
/// of an SSRC.
decoder_map: HashMap<u32, SsrcState>,
// NOTE(review): `config` is read in `process_udp_message` (crypto mode and
// decode mode), so this `allow` looks stale — confirm and remove.
#[allow(dead_code)]
/// Current driver configuration; replaced when a `SetConfig` message arrives.
config: Config,
/// Scratch buffer each datagram is received into.
packet_buffer: [u8; VOICE_PACKET_MAX],
/// Control messages for this task.
rx: Receiver<UdpRxMessage>,
/// Socket that voice packets are received from.
udp_socket: Arc<UdpSocket>,
}
impl UdpRx {
    /// Drives the receive task until it is told to stop.
    ///
    /// Every iteration waits for whichever completes first: a datagram on the
    /// UDP socket, which is handed to `process_udp_message`, or a control
    /// message on `rx`. A `Poison` message, or a closed control channel, ends
    /// the loop.
    ///
    /// NOTE(review): the socket branch only matches `Ok(..)`. Under tokio's
    /// documented `select!` semantics a non-matching branch is disabled for
    /// that invocation, so after a receive error this iteration would only
    /// finish on a control message — confirm this is intended.
    #[instrument(skip(self))]
    async fn run(&mut self, interconnect: &mut Interconnect) {
        loop {
            select! {
                Ok((len, _addr)) = self.udp_socket.recv_from(&mut self.packet_buffer[..]) => {
                    self.process_udp_message(interconnect, len);
                }
                msg = self.rx.recv_async() => {
                    match msg {
                        Ok(UdpRxMessage::ReplaceInterconnect(new_interconnect)) => {
                            *interconnect = new_interconnect;
                        },
                        Ok(UdpRxMessage::SetConfig(new_config)) => {
                            self.config = new_config;
                        },
                        Ok(UdpRxMessage::Poison) | Err(_) => break,
                    }
                }
            }
        }
    }

    /// Handles the first `len` bytes of `packet_buffer` as one received
    /// datagram, forwarding the outcome to the event task as core events.
    ///
    /// RTP packets are validated, decrypted when the decode mode asks for it,
    /// run through the per-SSRC state, and reported as an optional speaking
    /// update followed by a voice packet event. RTCP packets are decrypted
    /// when requested and reported as-is. Anything else is only logged.
    fn process_udp_message(&mut self, interconnect: &Interconnect, len: usize) {
        // NOTE: errors here (and in general for UDP) are not fatal to the connection.
        // Panics should be avoided due to adversarial nature of rx'd packets,
        // but correct handling should not prompt a reconnect.
        //
        // For simplicity, we nominate the mixing context to rebuild the event
        // context if it fails (hence the discarded send result), as it will try to
        // make contact every 20ms.
        let fire = |ctx| {
            let _ = interconnect.events.send(EventMessage::FireCoreEvent(ctx));
        };

        let crypto_mode = self.config.crypto_mode;
        let packet = &mut self.packet_buffer[..len];

        match demux::demux_mut(packet) {
            DemuxedMut::Rtp(mut rtp) => {
                if !rtp_valid(rtp.to_immutable()) {
                    error!("Illegal RTP message received.");
                    return;
                }

                let decrypted_bounds = if self.config.decode_mode.should_decrypt() {
                    match crypto_mode.decrypt_in_place(&mut rtp, &self.cipher) {
                        Ok((start, tail)) => Some((start, tail, true)),
                        Err(e) => {
                            warn!("RTP decryption failed: {:?}", e);
                            None
                        },
                    }
                } else {
                    None
                };

                // Fall back to the crypto mode's nominal prefix/suffix sizes
                // when the payload was not (or could not be) decrypted.
                let (rtp_body_start, rtp_body_tail, decrypted) = match decrypted_bounds {
                    Some(bounds) => bounds,
                    None => (
                        crypto_mode.payload_prefix_len(),
                        crypto_mode.payload_suffix_len(),
                        false,
                    ),
                };

                let ssrc = rtp.get_ssrc();
                let state = self
                    .decoder_map
                    .entry(ssrc)
                    .or_insert_with(|| SsrcState::new(rtp.to_immutable()));

                let processed = state.process(
                    rtp.to_immutable(),
                    rtp_body_start,
                    rtp_body_tail,
                    self.config.decode_mode,
                    decrypted,
                );

                let (delta, audio) = match processed {
                    Ok(outcome) => outcome,
                    Err(_) => {
                        warn!("RTP decoding/processing failed.");
                        return;
                    },
                };

                let speaking_change = match delta {
                    SpeakingDelta::Start => Some(true),
                    SpeakingDelta::Stop => Some(false),
                    SpeakingDelta::Same => None,
                };

                // The speaking update, if any, goes out before the packet itself.
                if let Some(speaking) = speaking_change {
                    fire(CoreContext::SpeakingUpdate(InternalSpeakingUpdate {
                        ssrc,
                        speaking,
                    }));
                }

                fire(CoreContext::VoicePacket(InternalVoicePacket {
                    audio,
                    packet: rtp.from_packet(),
                    payload_offset: rtp_body_start,
                    payload_end_pad: rtp_body_tail,
                }));
            },
            DemuxedMut::Rtcp(mut rtcp) => {
                let decrypted_bounds = if self.config.decode_mode.should_decrypt() {
                    match crypto_mode.decrypt_in_place(&mut rtcp, &self.cipher) {
                        Ok(bounds) => Some(bounds),
                        Err(e) => {
                            warn!("RTCP decryption failed: {:?}", e);
                            None
                        },
                    }
                } else {
                    None
                };

                let (start, tail) = match decrypted_bounds {
                    Some(bounds) => bounds,
                    None => (
                        crypto_mode.payload_prefix_len(),
                        crypto_mode.payload_suffix_len(),
                    ),
                };

                fire(CoreContext::RtcpPacket(InternalRtcpPacket {
                    packet: rtcp.from_packet(),
                    payload_offset: start,
                    payload_end_pad: tail,
                }));
            },
            DemuxedMut::FailedParse(t) => {
                warn!("Failed to parse message of type {:?}.", t);
            },
            _ => {
                warn!("Illegal UDP packet from voice server.");
            },
        }
    }
}
#[instrument(skip(interconnect, rx, cipher))]
pub(crate) async fn runner(
mut interconnect: Interconnect,
rx: Receiver<UdpRxMessage>,
cipher: Cipher,
config: Config,
udp_socket: Arc<UdpSocket>,
) {
trace!("UDP receive handle started.");
let mut state = UdpRx {
cipher,
decoder_map: Default::default(),
config,
packet_buffer: [0u8; VOICE_PACKET_MAX],
rx,
udp_socket,
};
state.run(&mut interconnect).await;
trace!("UDP receive handle stopped.");
}
/// Returns `true` when the packet's RTP version equals `RTP_VERSION` and its
/// payload type equals `RTP_PROFILE_TYPE`.
#[inline]
fn rtp_valid(packet: RtpPacket<'_>) -> bool {
    if packet.get_version() != RTP_VERSION {
        return false;
    }

    packet.get_payload_type() == RTP_PROFILE_TYPE
}