Add voice activity detection

This commit is contained in:
Jonas Herzig 2017-09-20 15:16:49 +02:00
parent c49dabbfc4
commit 80f766379d
6 changed files with 156 additions and 24 deletions

View file

@ -67,11 +67,23 @@
<td>
<select data-bind='value: voiceMode'>
<option value="cont">Continuous</option>
<option value="vad" disabled>Voice Activity</option>
<option value="vad">Voice Activity</option>
<option value="ptt">Push To Talk</option>
</td>
</tr>
<tr data-bind="style: {visibility: voiceMode() == 'ptt' ? 'visible' : 'hidden'}">
<tr data-bind="visible: voiceMode() == 'vad'">
<td colspan="2">
<div class="mic-volume-container">
<div class="mic-volume" data-bind="style: {
width: testVadLevel()*100 + '%',
background: testVadActive() ? 'green' : 'red'
}"></div>
</div>
<input type="range" min="0" max="1" step="0.01"
data-bind="value: vadLevel">
</td>
</tr>
<tr data-bind="visible: voiceMode() == 'ptt'">
<td>PTT Key</td>
<td>
<input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">

View file

@ -9,7 +9,7 @@ import ko from 'knockout'
import _dompurify from 'dompurify'
import keyboardjs from 'keyboardjs'
import { ContinuousVoiceHandler, PushToTalkVoiceHandler, initVoice } from './voice'
import { ContinuousVoiceHandler, PushToTalkVoiceHandler, VADVoiceHandler, initVoice } from './voice'
const dompurify = _dompurify(window)
@ -58,11 +58,34 @@ class SettingsDialog {
this.voiceMode = ko.observable(settings.voiceMode)
this.pttKey = ko.observable(settings.pttKey)
this.pttKeyDisplay = ko.observable(settings.pttKey)
this.vadLevel = ko.observable(settings.vadLevel)
this.testVadLevel = ko.observable(0)
this.testVadActive = ko.observable(false)
this._setupTestVad()
this.vadLevel.subscribe(() => this._setupTestVad())
}
_setupTestVad () {
if (this._testVad) {
this._testVad.end()
}
this._testVad = new VADVoiceHandler(null, this.vadLevel())
this._testVad.on('started_talking', () => this.testVadActive(true))
.on('stopped_talking', () => this.testVadActive(false))
.on('level', level => this.testVadLevel(level))
testVoiceHandler = this._testVad
}
applyTo (settings) {
settings.voiceMode = this.voiceMode()
settings.pttKey = this.pttKey()
settings.vadLevel = this.vadLevel()
}
end () {
this._testVad.end()
testVoiceHandler = null
}
recordPttKey () {
@ -89,14 +112,16 @@ class SettingsDialog {
/**
 * User preferences persisted in `window.localStorage` under keys prefixed
 * with `mumble.`.
 */
class Settings {
  /**
   * Loads every setting from storage, falling back to its default when the
   * stored value is missing (or, for `vadLevel`, not a finite number).
   */
  constructor () {
    const load = key => window.localStorage.getItem('mumble.' + key)
    this.voiceMode = load('voiceMode') || 'vad'
    this.pttKey = load('pttKey') || 'ctrl + shift'
    // localStorage hands back strings (or null). Parse the level so that it is
    // always a number, matching the numeric default, and ignore values that
    // cannot be parsed instead of passing them on.
    const storedVadLevel = Number.parseFloat(load('vadLevel'))
    this.vadLevel = Number.isFinite(storedVadLevel) ? storedVadLevel : 0.3
  }

  /** Writes the current value of every setting back to storage. */
  save () {
    const save = (key, val) => window.localStorage.setItem('mumble.' + key, val)
    save('voiceMode', this.voiceMode)
    save('pttKey', this.pttKey)
    save('vadLevel', this.vadLevel)
  }
}
@ -130,10 +155,13 @@ class GlobalBindings {
this._updateVoiceHandler()
this.settings.save()
this.settingsDialog(null)
this.closeSettings()
}
// Closes the settings dialog: if `settingsDialog()` currently yields a dialog,
// its `end()` is called, and in every case `settingsDialog` is reset to null.
this.closeSettings = () => {
  const openDialog = this.settingsDialog()
  if (openDialog) {
    openDialog.end()
  }
  this.settingsDialog(null)
}
@ -360,7 +388,7 @@ class GlobalBindings {
} else if (mode === 'ptt') {
voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
} else if (mode === 'vad') {
voiceHandler = new VADVoiceHandler(this.client, this.settings.vadLevel)
} else {
log('Unknown voice mode:', mode)
return
@ -586,15 +614,19 @@ function userToState () {
}
var voiceHandler
var testVoiceHandler
initVoice(data => {
if (testVoiceHandler) {
testVoiceHandler.write(data)
}
if (!ui.client) {
if (voiceHandler) {
voiceHandler.end()
}
voiceHandler = null
} else if (voiceHandler) {
voiceHandler.write(new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
voiceHandler.write(data)
}
}, err => {
log('Cannot initialize user media. Microphone will not work:', err)

View file

@ -1,10 +1,12 @@
import { Writable } from 'stream'
import { Writable, Transform } from 'stream'
import MicrophoneStream from 'microphone-stream'
import audioContext from 'audio-context'
import chunker from 'stream-chunker'
import Resampler from 'libsamplerate.js'
import getUserMedia from 'getusermedia'
import keyboardjs from 'keyboardjs'
import vad from 'voice-activity-detection'
import DropStream from 'drop-stream'
class VoiceHandler extends Writable {
constructor (client) {
@ -15,7 +17,29 @@ class VoiceHandler extends Writable {
_getOrCreateOutbound () {
if (!this._outbound) {
this._outbound = this._client.createVoiceStream()
if (!this._client) {
this._outbound = DropStream.obj()
this.emit('started_talking')
return this._outbound
}
this._outbound = new Resampler({
unsafe: true,
type: Resampler.Type.SINC_FASTEST,
ratio: 48000 / audioContext.sampleRate
})
const buffer2Float32Array = new Transform({
transform (data, _, callback) {
callback(null, new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
},
readableObjectMode: true
})
this._outbound
.pipe(chunker(4 * 480))
.pipe(buffer2Float32Array)
.pipe(this._client.createVoiceStream())
this.emit('started_talking')
}
return this._outbound
@ -74,24 +98,76 @@ export class PushToTalkVoiceHandler extends VoiceHandler {
}
}
/**
 * Voice handler that only forwards audio while the voice activity detector
 * reports speech.
 *
 * While inactive, incoming chunks are kept in a short backlog so that the
 * beginning of an utterance is not lost once the detector fires. Every value
 * reported through the detector's `onUpdate` callback is re-emitted as a
 * `level` event.
 */
export class VADVoiceHandler extends VoiceHandler {
  /**
   * @param client - Handed unchanged to the `VoiceHandler` constructor.
   * @param level - Used as both `minNoiseLevel` and `maxNoiseLevel` of the
   *   detector.
   */
  constructor (client, level) {
    super(client)
    const detectorOptions = {
      onVoiceStart: () => {
        console.log('vad: start')
        this._active = true
      },
      onVoiceStop: () => {
        console.log('vad: stop')
        this._stopOutbound()
        this._active = false
      },
      onUpdate: val => {
        this._level = val
        this.emit('level', val)
      },
      noiseCaptureDuration: 0,
      minNoiseLevel: level,
      maxNoiseLevel: level
    }
    this._vad = vad(audioContext, theUserMedia, detectorOptions)
    // The detector activates with a delay of roughly 125ms, so about 150ms
    // (depending on the sample rate) of past audio has to be retained.
    this._backlog = []
    this._backlogLength = 0
    // vadBufferLen * (vadDelay + 1) * bytesPerSample
    this._backlogLengthMin = 1024 * 6 * 4
  }

  /**
   * Forwards `data` to the outbound stream while active (flushing the backlog
   * first); otherwise appends it to the backlog and acknowledges immediately.
   */
  _write (data, _, callback) {
    if (!this._active) {
      // Not (yet) talking: keep the backlog topped up.
      this._backlog.push(data)
      this._backlogLength += data.length
      // Drop the oldest chunk only if the remainder is still long enough.
      const oldestLength = this._backlog[0].length
      if (this._backlogLength - oldestLength > this._backlogLengthMin) {
        this._backlog.shift()
        this._backlogLength -= oldestLength
      }
      callback()
      return
    }

    if (this._backlog.length > 0) {
      // Send what was recorded before the detector triggered.
      this._backlog.forEach(chunk => {
        this._getOrCreateOutbound().write(chunk)
      })
      this._backlog = []
      this._backlogLength = 0
    }
    this._getOrCreateOutbound().write(data, callback)
  }

  /**
   * Runs the parent's `_final`, then destroys the detector before passing the
   * parent's result on to `callback`.
   */
  _final (callback) {
    const destroyDetector = err => {
      this._vad.destroy()
      callback(err)
    }
    super._final(destroyDetector)
  }
}
var theUserMedia = null
export function initVoice (onData, onUserMediaError) {
var resampler = new Resampler({
unsafe: true,
type: Resampler.Type.SINC_FASTEST,
ratio: 48000 / audioContext.sampleRate
})
resampler.pipe(chunker(4 * 480)).on('data', data => {
onData(data)
})
getUserMedia({ audio: true }, (err, userMedia) => {
if (err) {
onUserMediaError(err)
} else {
var micStream = new MicrophoneStream(userMedia, { objectMode: true })
theUserMedia = userMedia
var micStream = new MicrophoneStream(userMedia, { objectMode: true, bufferSize: 1024 })
micStream.on('data', data => {
resampler.write(Buffer.from(data.getChannelData(0).buffer))
onData(Buffer.from(data.getChannelData(0).buffer))
})
}
})