Realtime (ultra-low-latency) speech chunker using WebRTC Voice Activity Detection (VAD)
'''
# Realtime (Ultra-low-latency) Voice Activity Detection (VAD) using WebRTC VAD
v3.2
π 17 Apr 2024

## Usage
```python
def on_segment(segment):  # 1-D np.array of np.int16 samples
    print(f'☏ CALLBACK: Got segment: {len(segment)} frames')

ch = MicChunker(verbose=True, dump_wav=True, on_segment=on_segment)
```

## Notes
- pyaudio triggers a callback we supply (mic data @ 16 kHz, 16-bit PCM, 1 channel)
- each frame is 320 samples (20 ms), one of the frame sizes WebRTC's VAD accepts @ 16 kHz
- We use WebRTC's Voice Activity Detection (VAD) to detect the presence of speech (bool) in each frame.
- We avoid allocations in the audio thread (it's a high-priority system thread)
- P consecutive good frames initiate recording, Q consecutive bad frames conclude it (see the latency sketch after this list)
- Upon concluding, the audio segment is written to a queue for processing
- A worker thread monitors the queue, triggering our on_segment callback for each segment
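
A back-of-envelope latency sketch (illustration only), using the default thresholds from `__init__` below (10 good frames to start, 16 bad frames to stop):
```python
# Each frame is 320 samples @ 16 kHz = 20 ms.
frame_ms = 320 / 16000 * 1000        # 20.0 ms per frame
start_latency_ms = 10 * frame_ms     # ~200 ms of speech before recording starts
stop_latency_ms = 16 * frame_ms      # ~320 ms of silence before a segment concludes
```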

## Terminology
I'm redefining some buzzwords here:
- In common lingo, the mic callback returns a BUFFER of FRAMES, where each FRAME has nChannels samples (1 = MONO).
- In this file, the mic callback returns a FRAME of SAMPLES, and I'm using a RING-BUFFER of FRAMES (see the toy sketch below).
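
For illustration only (the real buffer below holds `buf_s * rate // 320` frames, not 5), here is the ring-buffer-of-frames idea, including the wrap-around read that `get_frames` performs:
```python
import numpy as np

buf = np.zeros((5, 320), dtype=np.int16)   # toy size: 5 frames x 320 samples each
start, end = 3, 1                          # segment wraps past the end of the buffer
segment = np.concatenate([buf[start:], buf[:end]]).flatten()
assert segment.shape == (3 * 320,)         # frames 3, 4, 0 -> 960 samples
```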

## Sample output
```
Recording... press Ctrl+C to stop
STARTED RECORDING: start_frame=27
STOPPED RECORDING: end_frame=68, length=41 frames
☏ CALLBACK: Got segment: 41984 frames
STARTED RECORDING: start_frame=113
STOPPED RECORDING: end_frame=149, length=36 frames
☏ CALLBACK: Got segment: 36864 frames
^C👋
```

## TODOs
There is currently no provision for the user speaking for too long (> buf_s seconds): the ring buffer would wrap around and overwrite the start of the segment. A possible guard is sketched below.
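
```python
# Hypothetical guard (sketch only; max_segment_frames is NOT a setting in this file):
max_segment_frames = 30 * 16000 // 320   # cap segments at ~30 s = 1500 frames

# Inside frame_processor(), while recording, one could additionally count frames:
#   frames_recorded += 1
#   if frames_recorded >= max_segment_frames:
#       nconsec, recording = 0, False    # conclude via the normal stop path
```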
'''
import time
import queue as Queue
from threading import Thread
import wave

import numpy as np
import pyaudio
import webrtcvad

# Lightweight wrapper exposing a dict's keys as attributes (used for settings access)
class DictToObject:
    def __init__(self, dictionary):
        for key, value in dictionary.items():
            setattr(self, key, value)

class MicChunker:
    audioFormat = {
        'format': pyaudio.paInt16,
        'channels': 1,
        'rate': 16000,              # vad wants 16kHz
        'frames_per_buffer': 320    # vad wants 320 samples
    }

    def __init__(self, **kwargs):
        defaults = {
            'verbose': False,
            'dump_wav': False,
            'on_segment': lambda segment: None,  # no-op default; called with each segment
            'buf_s': 300,
            'nframes_to_prepad': 10,
            'n_good_frames_to_start_recording': 10,
            'n_bad_frames_to_end_recording': 16,
        }
        self.settings = DictToObject(defaults | kwargs)
        S, A = self.settings, DictToObject(MicChunker.audioFormat)

        # We use a 2D ringbuffer to keep the pointer-math simple
        samps_per_frame = A.frames_per_buffer
        nFrames = S.buf_s * A.rate // samps_per_frame
        self.circular_buf = np.zeros((nFrames, samps_per_frame), dtype=np.int16)

        self.vad = webrtcvad.Vad(mode=3)  # 0=off, 1=minimal, 2=low, 3=aggressive

        # We use a generator to encapsulate the state machine
        self.process_frame = self.frame_processor()
        next(self.process_frame)  # prime it

        self.pending_segments = Queue.Queue()
        if S.dump_wav or S.on_segment:
            Thread(target=self.worker, daemon=True).start()  # consumes queue

        def mic_callback(in_data, frame_count, time_info, status):
            self.process_frame.send(in_data)
            return (in_data, pyaudio.paContinue)

        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            **MicChunker.audioFormat,
            input=True,
            stream_callback=mic_callback
        )
        self.stream.start_stream()

    def frame_processor(self):
        S, frame_index, nconsec, recording = self.settings, 0, 0, False
        dprint = print if S.verbose else lambda s: None
        while True:
            in_data = yield  # one 320-sample frame of 16-bit PCM from the mic callback
            self.circular_buf[frame_index] = np.frombuffer(in_data, dtype=np.int16)
            is_speech = self.vad.is_speech(in_data, MicChunker.audioFormat['rate'])  # Needs to be 320 samples, 16kHz (I think)
            # count consecutive frames that disagree with the current recording state
            nconsec = 0 if recording == is_speech else nconsec + 1
            if nconsec == (S.n_bad_frames_to_end_recording if recording else S.n_good_frames_to_start_recording):
                nconsec = 0
                recording = not recording
                if recording:
                    start_frame = (frame_index - S.n_good_frames_to_start_recording - S.nframes_to_prepad) % len(self.circular_buf)
                    dprint(f'STARTED RECORDING: start_frame={start_frame}')
                else:
                    end_frame = (frame_index - S.n_bad_frames_to_end_recording) % len(self.circular_buf)
                    get_frames = lambda B, s, e: B[s:e] if s < e else np.concatenate([B[s:], B[:e]])
                    segment = get_frames(self.circular_buf, start_frame, end_frame).flatten()
                    self.pending_segments.put(segment)
                    dprint(f'STOPPED RECORDING: end_frame={end_frame}, length={(end_frame - start_frame) % len(self.circular_buf)} frames')
            frame_index = (frame_index + 1) % len(self.circular_buf)

    def worker(self):
        S = self.settings
        file_index = 0
        while True:
            # .get() blocks until a segment arrives; the None check is just a guard
            while (segment := self.pending_segments.get()) is None:
                time.sleep(.01)
            file_index += 1
            if S.dump_wav:
                with wave.open(f'segment-{file_index:03}.wav', 'wb') as f:
                    f.setnchannels(1)
                    f.setsampwidth(2)       # 16-bit samples
                    f.setframerate(16000)
                    f.writeframes(segment.tobytes())
            S.on_segment(segment)

    def terminate(self):
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()

if __name__ == "__main__":
    def on_segment(segment):
        print(f'☏ CALLBACK: Got segment: {len(segment)} frames')

    ch = MicChunker(verbose=True, dump_wav=True, on_segment=on_segment)

    print('Recording... press Ctrl+C to stop')
    try:
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print('👋')
        ch.terminate()