ESPHome 2026.5.0b1
Loading...
Searching...
No Matches
i2s_audio_speaker_standard.cpp
Go to the documentation of this file.
2
3#ifdef USE_ESP32
4
5#include <driver/i2s_std.h>
6
9
10#include "esphome/core/hal.h"
11#include "esphome/core/log.h"
12
13#include "esp_timer.h"
14
15namespace esphome::i2s_audio {
16
17static const char *const TAG = "i2s_audio.speaker.std";  // Log tag passed to the ESP_LOG* macros in this file
18
19static constexpr size_t DMA_BUFFERS_COUNT = 4;  // Used as dma_desc_num and as the number of silence preloads
20// Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight,
21// doubled so that a transient backlog never overruns the queue (which would desync the lockstep
22// invariant between i2s_event_queue_ and write_records_queue_).
23static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2;
24// Generous timeout for ``i2s_channel_write`` blocking. A buffer frees roughly every
25// DMA_BUFFER_DURATION_MS, so a multiple of that gives plenty of slack against scheduling jitter
26// without masking real failures.
27static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1));
28
31 const char *fmt_str;
32 switch (this->i2s_comm_fmt_) {
33 case I2SCommFmt::PCM:
34 fmt_str = "pcm";
35 break;
36 case I2SCommFmt::MSB:
37 fmt_str = "msb";
38 break;
39 default:
40 fmt_str = "std";
41 break;
42 }
43 ESP_LOGCONFIG(TAG, " Communication format: %s", fmt_str);
44}
45
47 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STARTING);
48
49 const uint32_t dma_buffers_duration_ms = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT;
50 // Ensure ring buffer duration is at least the duration of all DMA buffers
51 const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this->buffer_duration_ms_);
52
53 // The DMA buffers may have more bits per sample, so calculate buffer sizes based on the input audio stream info
54 const size_t bytes_per_frame = this->current_stream_info_.frames_to_bytes(1);
55 // Round the ring buffer size down to a multiple of bytes_per_frame so the wrap boundary stays frame-aligned and
56 // avoids unnecessary single-frame splices.
57 const size_t ring_buffer_size =
58 (this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
59 const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
60 const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);
61
62 bool successful_setup = false;
63
64 std::unique_ptr<audio::RingBufferAudioSource> audio_source;
65
66 // Pre-zeroed buffer used to silence-pad each DMA descriptor whenever real audio doesn't fully fill it.
67 RAMAllocator<uint8_t> silence_allocator;
68 uint8_t *silence_buffer = silence_allocator.allocate(dma_buffer_bytes);
69
70 if (silence_buffer != nullptr) {
71 memset(silence_buffer, 0, dma_buffer_bytes);
72
73 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
74 audio_source =
75 audio::RingBufferAudioSource::create(temp_ring_buffer, dma_buffer_bytes, static_cast<uint8_t>(bytes_per_frame));
76
77 if (audio_source != nullptr) {
78 // Success path: audio_source would be nullptr had the ring buffer failed to allocate
79 this->audio_ring_buffer_ = temp_ring_buffer;
80 successful_setup = true;
81 }
82 }
83
84 if (successful_setup) {
85 // Preload every DMA descriptor with silence and push a matching zero-real-frames record per buffer.
86 // This guarantees that every on_sent event has a corresponding write record from the start, so
87 // ``i2s_event_queue_`` and ``write_records_queue_`` stay in lockstep for the entire task lifetime.
88 for (size_t i = 0; i < DMA_BUFFERS_COUNT; i++) {
89 size_t bytes_loaded = 0;
90 esp_err_t err = i2s_channel_preload_data(this->tx_handle_, silence_buffer, dma_buffer_bytes, &bytes_loaded);
91 if (err != ESP_OK || bytes_loaded != dma_buffer_bytes) {
92 ESP_LOGV(TAG, "Failed to preload silence into DMA buffer %u (err=%d, loaded=%u)", (unsigned) i, (int) err,
93 (unsigned) bytes_loaded);
94 successful_setup = false;
95 break;
96 }
97 uint32_t zero_real_frames = 0;
98 if (xQueueSend(this->write_records_queue_, &zero_real_frames, 0) != pdTRUE) {
99 // Should never happen: the queue was just reset and is sized for DMA_BUFFERS_COUNT * 2 entries.
100 ESP_LOGV(TAG, "Failed to push preload write record");
101 successful_setup = false;
102 break;
103 }
104 }
105 }
106
107 if (successful_setup) {
108 // Register the on_sent callback BEFORE enabling the channel so the very first transmitted buffer
109 // generates a queued event that pairs with the first preloaded silence record.
110 const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
111 i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
112
113 if (i2s_channel_enable(this->tx_handle_) != ESP_OK) {
114 ESP_LOGV(TAG, "Failed to enable I2S channel");
115 successful_setup = false;
116 }
117 }
118
119 if (!successful_setup) {
120 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
121 } else {
122 bool stop_gracefully = false;
123 // Number of records currently in ``write_records_queue_`` that carry real audio. Used by graceful
124 // stop to wait until every real-audio buffer has been confirmed played by an ISR event.
125 uint32_t pending_real_buffers = 0;
126 uint32_t last_data_received_time = millis();
127
128 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
129
130 // Main speaker task loop. Continues while:
131 // - Paused, OR
132 // - No timeout configured, OR
133 // - Timeout hasn't elapsed since last data
134 //
135 // Always-fill model: every iteration writes exactly one DMA buffer's worth, mixing real audio
136 // and silence padding as needed. The blocking ``i2s_channel_write`` paces the loop at the DMA
137 // consumption rate, and every buffer write is matched 1:1 with a record on ``write_records_queue_``.
138 //
139 // While paused, the real-audio fill is skipped and the entire DMA buffer is filled with silence;
140 // the same blocking ``i2s_channel_write`` provides natural pacing (one buffer per ~DMA_BUFFER_DURATION_MS),
141 // so the lockstep invariant is preserved without burning CPU.
142 while (this->pause_state_ || !this->timeout_.has_value() ||
143 (millis() - last_data_received_time) <= this->timeout_.value()) {
144 uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
145
146 if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
147 // COMMAND_STOP is set both by user-initiated stop() and by the ISR when it drops a completion
148 // event (paired with ERR_DROPPED_EVENT so loop() can distinguish the two cases).
149 xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
150 ESP_LOGV(TAG, "Exiting: COMMAND_STOP received");
151 break;
152 }
153 if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
154 xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY);
155 stop_gracefully = true;
156 }
157
158 if (this->audio_stream_info_ != this->current_stream_info_) {
159 // Audio stream info changed, stop the speaker task so it will restart with the proper settings.
160 ESP_LOGV(TAG, "Exiting: stream info changed");
161 break;
162 }
163
164 // Drain ISR-stamped completion events. Each event corresponds 1:1 with a write_records_queue_
165 // entry by construction (preloaded records at startup, plus exactly one record pushed per
166 // iteration alongside exactly one DMA-buffer-sized write).
167 int64_t write_timestamp;
168 bool lockstep_broken = false;
169 while (xQueueReceive(this->i2s_event_queue_, &write_timestamp, 0)) {
170 uint32_t real_frames = 0;
171 if (xQueueReceive(this->write_records_queue_, &real_frames, 0) != pdTRUE) {
172 // Should never happen: would indicate the lockstep invariant is broken.
173 ESP_LOGV(TAG, "Event without matching write record");
175 lockstep_broken = true;
176 break;
177 }
178 if (real_frames > 0) {
179 pending_real_buffers--;
180 // Real audio is packed at the start of each DMA buffer with any silence padding on the
181 // tail, so the real audio finished playing earlier than the buffer-completion timestamp
182 // by the duration of the trailing zeros.
183 const uint32_t silence_frames = frames_per_dma_buffer - real_frames;
184 const int64_t adjusted_ts =
185 write_timestamp - this->current_stream_info_.frames_to_microseconds(silence_frames);
186 this->audio_output_callback_(real_frames, adjusted_ts);
187 }
188 }
189 if (lockstep_broken) {
190 break;
191 }
192
193 // Graceful stop: exit only after the source's exposed chunk is drained, the underlying ring
194 // buffer has nothing left to hand over, and every real-audio buffer we submitted has been
195 // confirmed played. ``has_buffered_data()`` reports whether bytes are still sitting in the ring buffer
196 // awaiting fill().
197 if (stop_gracefully && audio_source->available() == 0 && !this->has_buffered_data() &&
198 pending_real_buffers == 0) {
199 ESP_LOGV(TAG, "Exiting: graceful stop complete");
200 break;
201 }
202
203 // Compose exactly one DMA buffer's worth: drain as much real audio as the source currently
204 // exposes (may take multiple fill() calls when crossing a ring buffer wrap), then pad any
205 // remainder with silence. All writes pack into the next free DMA descriptor in order, so the
206 // descriptor ends up holding [real audio][silence padding].
207 size_t bytes_written_total = 0;
208 size_t real_bytes_total = 0;
209 bool partial_write_failure = false;
210
211 if (!this->pause_state_) {
212 while (bytes_written_total < dma_buffer_bytes) {
213 size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS) / 2, false);
214 if (bytes_read > 0) {
215 uint8_t *new_data = audio_source->mutable_data() + audio_source->available() - bytes_read;
216 this->apply_software_volume_(new_data, bytes_read);
217 this->swap_esp32_mono_samples_(new_data, bytes_read);
218 }
219
220 const size_t to_write = std::min(audio_source->available(), dma_buffer_bytes - bytes_written_total);
221 if (to_write == 0) {
222 // Ring buffer has nothing more to hand over right now; pad the rest of this DMA buffer
223 // with silence so the lockstep invariant (one write per iteration) is preserved.
224 break;
225 }
226
227 size_t bw = 0;
228 i2s_channel_write(this->tx_handle_, audio_source->data(), to_write, &bw, WRITE_TIMEOUT_TICKS);
229 if (bw != to_write) {
230 // A short real-audio write breaks DMA descriptor alignment for every subsequent event;
231 // the only safe recovery is to restart the task.
232 ESP_LOGV(TAG, "Partial real audio write: %u of %u bytes", (unsigned) bw, (unsigned) to_write);
234 partial_write_failure = true;
235 break;
236 }
237 audio_source->consume(bw);
238 bytes_written_total += bw;
239 real_bytes_total += bw;
240 }
241 if (real_bytes_total > 0) {
242 last_data_received_time = millis();
243 }
244 }
245
246 if (partial_write_failure) {
247 break;
248 }
249
250 const size_t silence_bytes = dma_buffer_bytes - bytes_written_total;
251 if (silence_bytes > 0) {
252 size_t bw = 0;
253 i2s_channel_write(this->tx_handle_, silence_buffer, silence_bytes, &bw, WRITE_TIMEOUT_TICKS);
254 if (bw != silence_bytes) {
255 // Same descriptor-alignment hazard as a partial real-audio write.
256 ESP_LOGV(TAG, "Partial silence write: %u of %u bytes", (unsigned) bw, (unsigned) silence_bytes);
258 break;
259 }
260 }
261
262 const uint32_t real_frames_in_buffer = this->current_stream_info_.bytes_to_frames(real_bytes_total);
263 // Push the matching write record. Capacity headroom in I2S_EVENT_QUEUE_COUNT guarantees this
264 // succeeds even with a transient backlog of unprocessed events; if it ever fails the lockstep
265 // invariant is broken and every subsequent timestamp would be silently wrong, so bail.
266 if (xQueueSend(this->write_records_queue_, &real_frames_in_buffer, 0) != pdTRUE) {
267 ESP_LOGV(TAG, "Exiting: write records queue full");
269 break;
270 }
271 if (real_frames_in_buffer > 0) {
272 pending_real_buffers++;
273 }
274 }
275 }
276
277 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
278
279 audio_source.reset();
280
281 if (silence_buffer != nullptr) {
282 silence_allocator.deallocate(silence_buffer, dma_buffer_bytes);
283 silence_buffer = nullptr;
284 }
285
286 xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPED);
287
288 while (true) {
289 // Continuously delay until the loop method deletes the task
290 vTaskDelay(pdMS_TO_TICKS(10));
291 }
292}
293
295 this->current_stream_info_ = audio_stream_info;
296
297 if ((this->i2s_role_ & I2S_ROLE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) { // NOLINT
298 // Can't reconfigure I2S bus, so the sample rate must match the configured value
299 ESP_LOGE(TAG, "Incompatible stream settings");
300 return ESP_ERR_NOT_SUPPORTED;
301 }
302
303 if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO &&
304 (i2s_slot_bit_width_t) audio_stream_info.get_bits_per_sample() > this->slot_bit_width_) {
305 // Currently can't handle the case when the incoming audio has more bits per sample than the configured value
306 ESP_LOGE(TAG, "Stream bits per sample must be less than or equal to the speaker's configuration");
307 return ESP_ERR_NOT_SUPPORTED;
308 }
309
310 if (!this->parent_->try_lock()) {
311 ESP_LOGE(TAG, "Parent bus is busy");
312 return ESP_ERR_INVALID_STATE;
313 }
314
315 uint32_t dma_buffer_length = audio_stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
316
317 i2s_role_t i2s_role = this->i2s_role_;
318 i2s_clock_src_t clk_src = I2S_CLK_SRC_DEFAULT;
319
320#if SOC_CLK_APLL_SUPPORTED
321 if (this->use_apll_) {
322 clk_src = i2s_clock_src_t::I2S_CLK_SRC_APLL;
323 }
324#endif // SOC_CLK_APLL_SUPPORTED
325
326 // Log DMA configuration for debugging
327 ESP_LOGV(TAG, "I2S DMA config: %zu buffers x %lu frames", (size_t) DMA_BUFFERS_COUNT,
328 (unsigned long) dma_buffer_length);
329
330 i2s_chan_config_t chan_cfg = {
331 .id = this->parent_->get_port(),
332 .role = i2s_role,
333 .dma_desc_num = DMA_BUFFERS_COUNT,
334 .dma_frame_num = dma_buffer_length,
335 .auto_clear = true,
336 .intr_priority = 3,
337 };
338
339 // Build standard I2S clock/slot/gpio configuration
340 i2s_std_clk_config_t clk_cfg = {
341 .sample_rate_hz = audio_stream_info.get_sample_rate(),
342 .clk_src = clk_src,
343 .mclk_multiple = this->mclk_multiple_,
344 };
345
346 i2s_slot_mode_t slot_mode = this->slot_mode_;
347 i2s_std_slot_mask_t slot_mask = this->std_slot_mask_;
348 if (audio_stream_info.get_channels() == 1) {
349 slot_mode = I2S_SLOT_MODE_MONO;
350 } else if (audio_stream_info.get_channels() == 2) {
351 slot_mode = I2S_SLOT_MODE_STEREO;
352 slot_mask = I2S_STD_SLOT_BOTH;
353 }
354
355 i2s_std_slot_config_t slot_cfg;
356 switch (this->i2s_comm_fmt_) {
357 case I2SCommFmt::PCM:
358 slot_cfg =
359 I2S_STD_PCM_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(), slot_mode);
360 break;
361 case I2SCommFmt::MSB:
362 slot_cfg =
363 I2S_STD_MSB_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(), slot_mode);
364 break;
365 default:
366 slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(),
367 slot_mode);
368 break;
369 }
370
371#ifdef USE_ESP32_VARIANT_ESP32
372 // There seems to be a bug on the ESP32 (non-variant) platform where setting the slot bit width higher than the
373 // bits per sample causes the audio to play too fast. Setting the ws_width to the configured slot bit width seems
374 // to make it play at the correct speed while sending more bits per slot.
375 if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
376 uint32_t configured_bit_width = static_cast<uint32_t>(this->slot_bit_width_);
377 slot_cfg.ws_width = configured_bit_width;
378 if (configured_bit_width > 16) {
379 slot_cfg.msb_right = false;
380 }
381 }
382#else
383 slot_cfg.slot_bit_width = this->slot_bit_width_;
384 if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
385 slot_cfg.ws_width = static_cast<uint32_t>(this->slot_bit_width_);
386 }
387#endif // USE_ESP32_VARIANT_ESP32
388 slot_cfg.slot_mask = slot_mask;
389
390 i2s_std_gpio_config_t gpio_cfg = this->parent_->get_pin_config();
391 gpio_cfg.dout = this->dout_pin_;
392
393 i2s_std_config_t std_cfg = {
394 .clk_cfg = clk_cfg,
395 .slot_cfg = slot_cfg,
396 .gpio_cfg = gpio_cfg,
397 };
398
399 esp_err_t err = this->init_i2s_channel_(chan_cfg, std_cfg, I2S_EVENT_QUEUE_COUNT);
400 if (err != ESP_OK) {
401 return err;
402 }
403
404 // The speaker task will enable the channel after preloading.
405
406 return ESP_OK;
407}
408
409} // namespace esphome::i2s_audio
410
411#endif // USE_ESP32
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:2053
void deallocate(T *p, size_t n)
Definition helpers.h:2110
T * allocate(size_t n)
Definition helpers.h:2080
size_t ms_to_bytes(uint32_t ms) const
Converts duration to bytes.
Definition audio.h:72
size_t frames_to_bytes(uint32_t frames) const
Converts frames to bytes.
Definition audio.h:52
uint8_t get_bits_per_sample() const
Definition audio.h:27
uint32_t frames_to_microseconds(uint32_t frames) const
Computes the duration, in microseconds, the given amount of frames represents.
Definition audio.cpp:25
uint32_t bytes_to_frames(size_t bytes) const
Convert bytes to frames.
Definition audio.h:42
uint8_t get_channels() const
Definition audio.h:28
uint32_t ms_to_frames(uint32_t ms) const
Converts duration to frames.
Definition audio.h:62
uint32_t get_sample_rate() const
Definition audio.h:29
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
i2s_std_slot_mask_t std_slot_mask_
Definition i2s_audio.h:28
i2s_slot_bit_width_t slot_bit_width_
Definition i2s_audio.h:29
i2s_mclk_multiple_t mclk_multiple_
Definition i2s_audio.h:32
static bool i2s_on_sent_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx)
Callback function used to send playback timestamps to the speaker task.
void apply_software_volume_(uint8_t *data, size_t bytes_read)
Apply software volume control using Q15 fixed-point scaling.
std::weak_ptr< ring_buffer::RingBuffer > audio_ring_buffer_
void swap_esp32_mono_samples_(uint8_t *data, size_t bytes_read)
Swap adjacent 16-bit mono samples for ESP32 (non-variant) hardware quirk.
esp_err_t init_i2s_channel_(const i2s_chan_config_t &chan_cfg, const i2s_std_config_t &std_cfg, size_t event_queue_size)
Shared I2S channel allocation, initialization, and event queue setup.
esp_err_t start_i2s_driver(audio::AudioStreamInfo &audio_stream_info) override
static std::unique_ptr< RingBuffer > create(size_t len, MemoryPreference preference=MemoryPreference::EXTERNAL_FIRST)
CallbackManager< void(uint32_t, int64_t)> audio_output_callback_
Definition speaker.h:122
audio::AudioStreamInfo audio_stream_info_
Definition speaker.h:114
auto * new_data
Definition helpers.cpp:29
uint32_t IRAM_ATTR HOT millis()
Definition hal.cpp:28
static void uint32_t