Real-Time TTS with WebSockets

Implement low-latency streaming Text-to-Speech using Deepgram’s WebSocket API.

Why Use WebSockets for TTS?

WebSockets provide a continuous audio stream flowing directly to the playback device without saving files to disk. This approach is essential for voice agents and conversational AI that require minimal latency and natural-sounding speech.

Key benefits include low latency, which allows audio playback to begin as soon as the first data chunk arrives, continuous streaming that maintains a persistent connection for rapid audio delivery, and efficient processing by streaming audio directly to playback devices.

WebSocket Implementation Examples

The following examples demonstrate how to implement real-time TTS using Deepgram’s WebSocket API:

# For help migrating to the new Python SDK, check out our migration guide:
# https://github.com/deepgram/deepgram-python-sdk/blob/main/docs/Migrating-v3-to-v5.md

import time

import numpy as np
import sounddevice as sd

from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import (
    SpeakV1ControlMessage,
    SpeakV1SocketClientResponse,
    SpeakV1TextMessage,
)

TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

# Used both in the Deepgram request and for playback, so the two values
# cannot drift apart.
SAMPLE_RATE = 48000


def main():
    """Stream TTS_TEXT through Deepgram's speak WebSocket and play the audio.

    Opens a speak connection, registers event handlers, sends the text
    followed by a Flush control message, waits a fixed five seconds for
    playback, then sends a Close control message. Any exception raised
    along the way is caught and printed rather than propagated.
    """
    try:
        # Create a Deepgram client using the API key from environment variables
        deepgram = DeepgramClient()

        # Create a websocket connection to Deepgram
        with deepgram.speak.v1.connect(
            model="aura-2-thalia-en",
            encoding="linear16",
            sample_rate=SAMPLE_RATE,
        ) as dg_connection:

            def on_message(message: SpeakV1SocketClientResponse) -> None:
                """Play binary audio chunks; log the type of any other message."""
                if isinstance(message, bytes):
                    print("Received audio chunk")
                    # Interpret the raw bytes as 16-bit samples, matching
                    # the "linear16" encoding requested above
                    array = np.frombuffer(message, dtype=np.int16)
                    # Play the audio immediately upon receiving each chunk
                    sd.play(array, SAMPLE_RATE)
                    # Block until this chunk has finished playing
                    sd.wait()
                else:
                    msg_type = getattr(message, "type", "Unknown")
                    print(f"Received {msg_type} event")

            dg_connection.on(EventType.OPEN, lambda _: print("Connection opened"))
            dg_connection.on(EventType.MESSAGE, on_message)
            dg_connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
            dg_connection.on(EventType.ERROR, lambda error: print(f"Error: {error}"))

            dg_connection.start_listening()

            # Send text to be converted to speech
            dg_connection.send_text(SpeakV1TextMessage(text=TTS_TEXT))

            # Send control messages
            dg_connection.send_control(SpeakV1ControlMessage(type="Flush"))

            # Allow time for playback
            time.sleep(5)

            dg_connection.send_control(SpeakV1ControlMessage(type="Close"))
            print("TTS stream completed")

    except Exception as e:
        # Top-level boundary of the example script: report and exit quietly
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()

For optimal text handling, see our guide on Text Chunking for TTS.

# For help migrating to the new Python SDK, check out our migration guide:
# https://github.com/deepgram/deepgram-python-sdk/blob/main/docs/Migrating-v3-to-v5.md

import time

import numpy as np
import sounddevice as sd

from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import (
    SpeakV1ControlMessage,
    SpeakV1SocketClientResponse,
    SpeakV1TextMessage,
)

TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

# Used both in the Deepgram request and for playback, so the two values
# cannot drift apart.
SAMPLE_RATE = 48000


def main():
    """Stream TTS_TEXT through Deepgram's speak WebSocket and play the audio.

    Opens a speak connection, registers event handlers, sends the text
    followed by a Flush control message, waits a fixed five seconds for
    playback, then sends a Close control message. Any exception raised
    along the way is caught and printed rather than propagated.
    """
    try:
        # Create a Deepgram client using the API key from environment variables
        deepgram = DeepgramClient()

        # Create a websocket connection to Deepgram
        with deepgram.speak.v1.connect(
            model="aura-2-thalia-en",
            encoding="linear16",
            sample_rate=SAMPLE_RATE,
        ) as dg_connection:

            def on_message(message: SpeakV1SocketClientResponse) -> None:
                """Play binary audio chunks; log the type of any other message."""
                if isinstance(message, bytes):
                    print("Received audio chunk")
                    # Interpret the raw bytes as 16-bit samples, matching
                    # the "linear16" encoding requested above
                    array = np.frombuffer(message, dtype=np.int16)
                    # Play the audio immediately upon receiving each chunk
                    sd.play(array, SAMPLE_RATE)
                    # Block until this chunk has finished playing
                    sd.wait()
                else:
                    msg_type = getattr(message, "type", "Unknown")
                    print(f"Received {msg_type} event")

            dg_connection.on(EventType.OPEN, lambda _: print("Connection opened"))
            dg_connection.on(EventType.MESSAGE, on_message)
            dg_connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
            dg_connection.on(EventType.ERROR, lambda error: print(f"Error: {error}"))

            dg_connection.start_listening()

            # Send text to be converted to speech
            dg_connection.send_text(SpeakV1TextMessage(text=TTS_TEXT))

            # Send control messages
            dg_connection.send_control(SpeakV1ControlMessage(type="Flush"))

            # Allow time for playback
            time.sleep(5)

            dg_connection.send_control(SpeakV1ControlMessage(type="Close"))
            print("TTS stream completed")

    except Exception as e:
        # Top-level boundary of the example script: report and exit quietly
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()

Movatterモバイル変換

Why Use WebSockets for TTS?

WebSocket Implementation Examples