nvaccess · seanbudd · Feb 11, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
@@ -82,6 +82,7 @@ EXPORTS
 	wasPlay_pause
 	wasPlay_resume
 	wasPlay_setChannelVolume
+	wasPlay_startTrimmingLeadingSilence
 	wasPlay_startup
 	wasSilence_init
 	wasSilence_playFor

@@ -0,0 +1,246 @@
+// A part of NonVisual Desktop Access (NVDA)
+// This file is covered by the GNU General Public License.
+// See the file COPYING for more details.
+// Copyright (C) 2025 NV Access Limited, gexgd0419
+
+#ifndef SILENCEDETECT_H
+#define SILENCEDETECT_H
+
+#include <windows.h>
+#include <mmreg.h>
+#include <stdint.h>
+#include <type_traits>
+#include <limits>
+
+namespace SilenceDetect {
+
+/**
+ * Compile-time wave format tag.
+ * Supports integer and floating-point formats.
+ * `SampleType` should be the smallest numeric type that can hold a sample, for example, 32-bit int for 24-bit format.
+ * Signedness of `SampleType` matters. For unsigned types, the zero point is at middle, e.g. 128 for 8-bit unsigned.
+ * `bytesPerSample` should be <= `sizeof(SampleType)` for integer formats,
+ * and == `sizeof(SampleType)` for floating-point formats.
+ * Assumes C++20 standard.
+ */
+template <typename SampleType, size_t bytesPerSample = sizeof(SampleType)>
+struct WaveFormat {
+	static_assert(std::is_arithmetic_v<SampleType>, "SampleType should be an integer or floating-point type");
+	static_assert(!(std::is_floating_point_v<SampleType> && bytesPerSample != sizeof(SampleType)),
+		"When SampleType is a floating-point type, bytesPerSample should be equal to sizeof(SampleType)");
+	static_assert(!(std::is_integral_v<SampleType> && !(bytesPerSample <= sizeof(SampleType) && bytesPerSample > 0)),
+		"When SampleType is an integer type, bytesPerSample should be less than or equal to sizeof(SampleType) and greater than 0");
+
+	typedef SampleType SampleType;
+	static constexpr size_t bytesPerSample = bytesPerSample;
+
+	static constexpr SampleType zeroPoint() {
+		// for unsigned types, zero point is at middle
+		// for signed types, zero is zero
+		if constexpr (std::is_unsigned_v<SampleType>)
+			return SampleType(1) << (bytesPerSample * 8 - 1);
+		else
+			return SampleType();
+	}
+
+	static constexpr SampleType (max)() {
+		if constexpr (std::is_floating_point_v<SampleType>) {
+			// For floating-point samples, maximum value is 1.0
+			return SampleType(1);
+		} else {
+			// Trim the maximum value to `bytesPerSample` bytes
+			return (std::numeric_limits<SampleType>::max)() >> ((sizeof(SampleType) - bytesPerSample) * 8);
+		}
+	}
+
+	static constexpr SampleType (min)() {
+		if constexpr (std::is_floating_point_v<SampleType>) {
+			// For floating-point samples, minimum value is -1.0
+			return SampleType(-1);
+		} else {
+			// Trim the minimum value to `bytesPerSample` bytes
+			return (std::numeric_limits<SampleType>::min)() >> ((sizeof(SampleType) - bytesPerSample) * 8);
+		}
+	}
+
+	static constexpr SampleType defaultThreshold() {
+		// Default threshold: 1 / 2^10 or 0.0009765625
+		if constexpr (std::is_floating_point_v<SampleType>)
+			return SampleType(1) / (1 << 10);
+		else if constexpr (bytesPerSample * 8 > 10)
+			return SampleType(1) << (bytesPerSample * 8 - 10);
+		else
+			return SampleType();
+	}
+
+	static constexpr auto toSigned(SampleType smp) {
+		if constexpr (std::is_integral_v<SampleType>) {
+			// In C++20, signed integer types must use two's complement,
+			// so the following conversion is well-defined.
+			using SignedType = std::make_signed_t<SampleType>;
+			return SignedType(smp - zeroPoint());
+		} else {
+			return smp;
+		}
+	}
+
+	static constexpr SampleType fromSigned(SampleType smp) {
+		if constexpr (std::is_integral_v<SampleType>) {
+			// Signed overflow is undefined behavior,
+			// so convert to unsigned first.
+			using UnsignedType = std::make_unsigned_t<SampleType>;
+			return SampleType(UnsignedType(smp) + zeroPoint());
+		} else {
+			return smp;
+		}
+	}
+
+	static constexpr SampleType signExtend(SampleType smp) {
+		if constexpr (std::is_unsigned_v<SampleType> || bytesPerSample == sizeof(SampleType)) {
+			return smp;
+		} else {
+			constexpr auto shift = (sizeof(SampleType) - bytesPerSample) * 8;
+			// Convert to unsigned first to prevent left-shifting negative numbers
+			using UnsignedType = std::make_unsigned_t<SampleType>;
+			return SampleType(UnsignedType(smp) << shift) >> shift;
+		}
+	}
+
+	template <class SrcFmt>
+	static constexpr SampleType convertFrom(SrcFmt::SampleType smp) {
+		using SrcType = SrcFmt::SampleType;
+		if constexpr (std::is_floating_point_v<SrcType> && std::is_floating_point_v<SampleType>) {
+			// both floating points, convert directly
+			return SampleType(smp);
+		} else if constexpr (std::is_integral_v<SrcType> && std::is_integral_v<SampleType>) {
+			// both integers, do bit shifting
+			const auto dstsmp = SrcFmt::toSigned(smp);
+			if constexpr (bytesPerSample >= SrcFmt::bytesPerSample) {
+				constexpr auto shift = (bytesPerSample - SrcFmt::bytesPerSample) * 8;
+				// Convert to unsigned target type first to prevent overflows and left-shifting negative numbers
+				using UnsignedType = std::make_unsigned_t<SampleType>;
+				return fromSigned(UnsignedType(dstsmp) << shift);
+			} else {
+				constexpr auto shift = (SrcFmt::bytesPerSample - bytesPerSample) * 8;
+				return fromSigned(dstsmp >> shift);
+			}
+		} else if constexpr (std::is_floating_point_v<SrcType> && std::is_integral_v<SampleType>) {
+			// floating point to integer, e.g. [-1.0f, 1.0f] -> [-32767, 32767]
+			return fromSigned(smp * ((max)() - zeroPoint()));
+		} else {
+			// integer to floating point, e.g. [-32768, 32767] -> [-1.0f, 1.0f)
+			return SampleType(SrcFmt::toSigned(smp) / (SrcFmt::zeroPoint() - (SrcFmt::min)()));
+		}
+	}
+};
+
+inline WORD getFormatTag(const WAVEFORMATEX* wfx) {
+	if (wfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
+		auto wfext = reinterpret_cast<const WAVEFORMATEXTENSIBLE*>(wfx);
+		if (IS_VALID_WAVEFORMATEX_GUID(&wfext->SubFormat))
+			return EXTRACT_WAVEFORMATEX_ID(&wfext->SubFormat);
+	}
+	return wfx->wFormatTag;
+}
+
+/**
+ * Return the leading silence wave data length, in bytes.
+ * Assumes the wave data to be of one channel (mono).
+ * Uses a `WaveFormat` type (`Fmt`) to determine the wave format.
+ */
+template <class Fmt>
+size_t getLeadingSilenceSizeMono(
+	const unsigned char* waveData,
+	size_t size,
+	typename Fmt::SampleType threshold
+) {
+	using SampleType = Fmt::SampleType;
+	constexpr size_t bytesPerSample = Fmt::bytesPerSample;
+
+	if (size < bytesPerSample)
+		return 0;
+
+	constexpr SampleType zeroPoint = Fmt::zeroPoint();
+	const SampleType minValue = zeroPoint - threshold, maxValue = zeroPoint + threshold;
+
+	// Check each sample
+	SampleType smp = SampleType();
+	const unsigned char* const pEnd = waveData + (size - (size % bytesPerSample));
+	for (const unsigned char* p = waveData; p < pEnd; p += bytesPerSample) {
+		memcpy(&smp, p, bytesPerSample);
+		smp = Fmt::signExtend(smp);
+		// this sample is out of range, so the previous sample is the final sample of leading silence.
+		if (smp < minValue || smp > maxValue)
+			return p - waveData;
+	}
+
+	// The whole data block is silence
+	return size;
+}
+
+/**
+ * Invoke a functor with an argument of a WaveFormat type that corresponds to the specified WAVEFORMATEX.
+ * Return false if the WAVEFORMATEX is unknown.
+ */
+template <class Func>
+bool callByWaveFormat(const WAVEFORMATEX* wfx, Func&& func) {
+	switch (getFormatTag(wfx)) {
+	case WAVE_FORMAT_PCM:
+		switch (wfx->wBitsPerSample) {
+		case 8:  // 8-bits are unsigned, others are signed
+			func(WaveFormat<uint8_t>());
+			break;
+		case 16:
+			func(WaveFormat<int16_t>());
+			break;
+		case 24:
+			func(WaveFormat<int32_t, 3>());
+			break;
+		case 32:
+			func(WaveFormat<int32_t>());
+			break;
+		default:
+			return false;
+		}
+		break;
+	case WAVE_FORMAT_IEEE_FLOAT:
+		switch (wfx->wBitsPerSample) {
+		case 32:
+			func(WaveFormat<float>());
+			break;
+		case 64:
+			func(WaveFormat<double>());
+			break;
+		default:
+			return false;
+		}
+		break;
+	default:
+		return false;
+	}
+	return true;
+}
+
+/**
+ * Return the leading silence wave data length, in bytes.
+ * Uses a `WAVEFORMATEX` to determine the wave format.
+ */
+inline size_t getLeadingSilenceSize(
+	const WAVEFORMATEX* wfx,
+	const unsigned char* waveData,
+	size_t size
+) {
+	size_t len;
+	if (!callByWaveFormat(wfx, [=, &len](auto fmtTag) {
+			using Fmt = decltype(fmtTag);
+			len = getLeadingSilenceSizeMono<Fmt>(
+				waveData, size, Fmt::defaultThreshold());
+		}))
+		return 0;
+
+	return len - len % wfx->nBlockAlign;  // round down to block (channel) boundaries
+}
+
+}  // namespace SilenceDetect
+
+#endif  // SILENCEDETECT_H
@@ -24,6 +24,7 @@ This license can be found at:
 #include <mmdeviceapi.h>
 #include <common/log.h>
 #include <random>
+#include "silenceDetect.h"
 
 /**
  * Support for audio playback using WASAPI.
@@ -194,6 +195,8 @@ class WasapiPlayer {
 	HRESULT resume();
 	HRESULT setChannelVolume(unsigned int channel, float level);
 
+	void startTrimmingLeadingSilence(bool start);
+
 	private:
 	void maybeFireCallback();
 
@@ -245,6 +248,7 @@ class WasapiPlayer {
 	unsigned int defaultDeviceChangeCount;
 	unsigned int deviceStateChangeCount;
 	bool isUsingPreferredDevice = false;
+	bool isTrimmingLeadingSilence = false;
 };
 
 WasapiPlayer::WasapiPlayer(wchar_t* endpointId, WAVEFORMATEX format,
@@ -342,6 +346,19 @@ HRESULT WasapiPlayer::feed(unsigned char* data, unsigned int size,
 		return true;
 	};
 
+	if (isTrimmingLeadingSilence) {
+		size_t silenceSize = SilenceDetect::getLeadingSilenceSize(&format, data, size);
+		if (silenceSize >= size) {
+			// The whole chunk is silence. Continue checking for silence in the next chunk.
+			remainingFrames = 0;
+		} else {
+			// Silence ends in this chunk. Skip the silence and continue.
+			data += silenceSize;
+			remainingFrames = (size - silenceSize) / format.nBlockAlign;
+			isTrimmingLeadingSilence = false;  // Stop checking for silence
+		}
+	}
+
 	while (remainingFrames > 0) {
 		UINT32 paddingFrames;
 
@@ -643,6 +660,10 @@ HRESULT WasapiPlayer::setChannelVolume(unsigned int channel, float level) {
 	return volume->SetChannelVolume(channel, level);
 }
 
+void WasapiPlayer::startTrimmingLeadingSilence(bool start) {
+	isTrimmingLeadingSilence = start;
+}
+
 HRESULT WasapiPlayer::disableCommunicationDucking(IMMDevice* device) {
 	// Disable the default ducking experience used when a communication audio
 	// session is active, as we never want NVDA's audio to be ducked.
@@ -839,6 +860,10 @@ HRESULT wasPlay_setChannelVolume(
 	return player->setChannelVolume(channel, level);
 }
 
+void wasPlay_startTrimmingLeadingSilence(WasapiPlayer* player, bool start) {
+	player->startTrimmingLeadingSilence(start);
+}
+
 /**
  * This must be called once per session at startup before wasPlay_create is
  * called.

@@ -1393,6 +1393,7 @@ For examples of how to define and use new extension points, please see the code
 |`Action` |`synthIndexReached` |Notifies when a synthesizer reaches an index during speech.|
 |`Action` |`synthDoneSpeaking` |Notifies when a synthesizer finishes speaking.|
 |`Action` |`synthChanged` |Notifies of synthesizer changes.|
+|`Action` |`pre_synthSpeak` |Notifies when the current synthesizer is about to speak something.|
 
 ### tones {#tonesExtPts}
 

@@ -45,6 +45,7 @@
 	autoDialectSwitching = boolean(default=false)
 	delayedCharacterDescriptions = boolean(default=false)
 	excludedSpeechModes = int_list(default=list())
+	trimLeadingSilence = boolean(default=true)
 
 	[[__many__]]
 		capPitchChange = integer(default=30,min=-100,max=100)

@@ -3720,6 +3720,7 @@ def __init__(self, parent):
 		#  Advanced settings panel
 		label = _("Speech")
 		speechSizer = wx.StaticBoxSizer(wx.VERTICAL, self, label=label)
+		speechBox = speechSizer.GetStaticBox()
 		speechGroup = guiHelper.BoxSizerHelper(speechSizer, sizer=speechSizer)
 		sHelper.addItem(speechGroup)
 
@@ -3750,6 +3751,14 @@ def __init__(self, parent):
 			["featureFlag", "cancelExpiredFocusSpeech"],
 		)
 
+		# Translators: This is the label for a checkbox control in the
+		#  Advanced settings panel.
+		label = _("Trim leading silence in speech audio")
+		self.trimLeadingSilenceCheckBox = speechGroup.addItem(wx.CheckBox(speechBox, label=label))
+		self.bindHelpEvent("TrimLeadingSilenceSpeech", self.trimLeadingSilenceCheckBox)
+		self.trimLeadingSilenceCheckBox.SetValue(config.conf["speech"]["trimLeadingSilence"])
+		self.trimLeadingSilenceCheckBox.defaultValue = self._getDefaultValue(["speech", "trimLeadingSilence"])
+
 		# Translators: This is the label for a group of advanced options in the
 		#  Advanced settings panel
 		label = _("Virtual Buffers")
@@ -3934,6 +3943,7 @@ def haveConfigDefaultsBeenRestored(self):
 			and self.wtStrategyCombo.isValueConfigSpecDefault()
 			and self.cancelExpiredFocusSpeechCombo.GetSelection()
 			== self.cancelExpiredFocusSpeechCombo.defaultValue
+			and self.trimLeadingSilenceCheckBox.IsChecked() == self.trimLeadingSilenceCheckBox.defaultValue
 			and self.loadChromeVBufWhenBusyCombo.isValueConfigSpecDefault()
 			and self.caretMoveTimeoutSpinControl.GetValue() == self.caretMoveTimeoutSpinControl.defaultValue
 			and self.reportTransparentColorCheckBox.GetValue()
@@ -3963,6 +3973,7 @@ def restoreToDefaults(self):
 		self.diffAlgoCombo.SetSelection(self.diffAlgoCombo.defaultValue)
 		self.wtStrategyCombo.resetToConfigSpecDefault()
 		self.cancelExpiredFocusSpeechCombo.SetSelection(self.cancelExpiredFocusSpeechCombo.defaultValue)
+		self.trimLeadingSilenceCheckBox.SetValue(self.trimLeadingSilenceCheckBox.defaultValue)
 		self.loadChromeVBufWhenBusyCombo.resetToConfigSpecDefault()
 		self.caretMoveTimeoutSpinControl.SetValue(self.caretMoveTimeoutSpinControl.defaultValue)
 		self.reportTransparentColorCheckBox.SetValue(self.reportTransparentColorCheckBox.defaultValue)
@@ -3974,6 +3985,14 @@ def restoreToDefaults(self):
 
 	def onSave(self):
 		log.debug("Saving advanced config")
+
+		if config.conf["speech"]["trimLeadingSilence"] != self.trimLeadingSilenceCheckBox.IsChecked():
+			# Reload the synthesizer if "trimLeadingSilence" changes
+			config.conf["speech"]["trimLeadingSilence"] = self.trimLeadingSilenceCheckBox.IsChecked()
+			currentSynth = getSynth()
+			if not setSynth(currentSynth.name):
+				_synthWarningDialog(currentSynth.name)
+
 		config.conf["development"]["enableScratchpadDir"] = self.scratchpadCheckBox.IsChecked()
 		selectiveUIAEventRegistrationChoice = self.selectiveUIAEventRegistrationCombo.GetSelection()
 		config.conf["UIA"]["eventRegistration"] = self.selectiveUIAEventRegistrationVals[