Add voice activity detection

This commit is contained in:
Jonas Herzig 2017-09-20 15:16:49 +02:00
parent c49dabbfc4
commit 80f766379d
6 changed files with 156 additions and 24 deletions

View file

@ -11,7 +11,7 @@ Instead Websockets are used for all communications.
libopus, libcelt (0.7.1) and libsamplerate, compiled to JS via emscripten, are used for audio decoding. libopus, libcelt (0.7.1) and libsamplerate, compiled to JS via emscripten, are used for audio decoding.
Therefore, at the moment only the Opus and CELT Alpha codecs are supported. Therefore, at the moment only the Opus and CELT Alpha codecs are supported.
Quite a few features, most noticeably voice activity detection and all Quite a few features, most noticeably all
administrative functionality, are still missing. administrative functionality, are still missing.
### Installing ### Installing

View file

@ -67,11 +67,23 @@
<td> <td>
<select data-bind='value: voiceMode'> <select data-bind='value: voiceMode'>
<option value="cont">Continuous</option> <option value="cont">Continuous</option>
<option value="vad" disabled>Voice Activity</option> <option value="vad">Voice Activity</option>
<option value="ptt">Push To Talk</option> <option value="ptt">Push To Talk</option>
</td> </td>
</tr> </tr>
<tr data-bind="style: {visibility: voiceMode() == 'ptt' ? 'visible' : 'hidden'}"> <tr data-bind="visible: voiceMode() == 'vad'">
<td colspan="2">
<div class="mic-volume-container">
<div class="mic-volume" data-bind="style: {
width: testVadLevel()*100 + '%',
background: testVadActive() ? 'green' : 'red'
}"></div>
</div>
<input type="range" min="0" max="1" step="0.01"
data-bind="value: vadLevel">
</td>
</tr>
<tr data-bind="visible: voiceMode() == 'ptt'">
<td>PTT Key</td> <td>PTT Key</td>
<td> <td>
<input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey"> <input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">

View file

@ -9,7 +9,7 @@ import ko from 'knockout'
import _dompurify from 'dompurify' import _dompurify from 'dompurify'
import keyboardjs from 'keyboardjs' import keyboardjs from 'keyboardjs'
import { ContinuousVoiceHandler, PushToTalkVoiceHandler, initVoice } from './voice' import { ContinuousVoiceHandler, PushToTalkVoiceHandler, VADVoiceHandler, initVoice } from './voice'
const dompurify = _dompurify(window) const dompurify = _dompurify(window)
@ -58,11 +58,34 @@ class SettingsDialog {
this.voiceMode = ko.observable(settings.voiceMode) this.voiceMode = ko.observable(settings.voiceMode)
this.pttKey = ko.observable(settings.pttKey) this.pttKey = ko.observable(settings.pttKey)
this.pttKeyDisplay = ko.observable(settings.pttKey) this.pttKeyDisplay = ko.observable(settings.pttKey)
this.vadLevel = ko.observable(settings.vadLevel)
this.testVadLevel = ko.observable(0)
this.testVadActive = ko.observable(false)
this._setupTestVad()
this.vadLevel.subscribe(() => this._setupTestVad())
}
_setupTestVad () {
if (this._testVad) {
this._testVad.end()
}
this._testVad = new VADVoiceHandler(null, this.vadLevel())
this._testVad.on('started_talking', () => this.testVadActive(true))
.on('stopped_talking', () => this.testVadActive(false))
.on('level', level => this.testVadLevel(level))
testVoiceHandler = this._testVad
} }
applyTo (settings) { applyTo (settings) {
settings.voiceMode = this.voiceMode() settings.voiceMode = this.voiceMode()
settings.pttKey = this.pttKey() settings.pttKey = this.pttKey()
settings.vadLevel = this.vadLevel()
}
end () {
this._testVad.end()
testVoiceHandler = null
} }
recordPttKey () { recordPttKey () {
@ -89,14 +112,16 @@ class SettingsDialog {
class Settings { class Settings {
constructor () { constructor () {
const load = key => window.localStorage.getItem('mumble.' + key) const load = key => window.localStorage.getItem('mumble.' + key)
this.voiceMode = load('voiceMode') || 'cont' this.voiceMode = load('voiceMode') || 'vad'
this.pttKey = load('pttKey') || 'ctrl + shift' this.pttKey = load('pttKey') || 'ctrl + shift'
this.vadLevel = load('vadLevel') || 0.3
} }
save () { save () {
const save = (key, val) => window.localStorage.setItem('mumble.' + key, val) const save = (key, val) => window.localStorage.setItem('mumble.' + key, val)
save('voiceMode', this.voiceMode) save('voiceMode', this.voiceMode)
save('pttKey', this.pttKey) save('pttKey', this.pttKey)
save('vadLevel', this.vadLevel)
} }
} }
@ -130,10 +155,13 @@ class GlobalBindings {
this._updateVoiceHandler() this._updateVoiceHandler()
this.settings.save() this.settings.save()
this.settingsDialog(null) this.closeSettings()
} }
this.closeSettings = () => { this.closeSettings = () => {
if (this.settingsDialog()) {
this.settingsDialog().end()
}
this.settingsDialog(null) this.settingsDialog(null)
} }
@ -360,7 +388,7 @@ class GlobalBindings {
} else if (mode === 'ptt') { } else if (mode === 'ptt') {
voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey) voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
} else if (mode === 'vad') { } else if (mode === 'vad') {
voiceHandler = new VADVoiceHandler(this.client, this.settings.vadLevel)
} else { } else {
log('Unknown voice mode:', mode) log('Unknown voice mode:', mode)
return return
@ -586,15 +614,19 @@ function userToState () {
} }
var voiceHandler var voiceHandler
var testVoiceHandler
initVoice(data => { initVoice(data => {
if (testVoiceHandler) {
testVoiceHandler.write(data)
}
if (!ui.client) { if (!ui.client) {
if (voiceHandler) { if (voiceHandler) {
voiceHandler.end() voiceHandler.end()
} }
voiceHandler = null voiceHandler = null
} else if (voiceHandler) { } else if (voiceHandler) {
voiceHandler.write(new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4)) voiceHandler.write(data)
} }
}, err => { }, err => {
log('Cannot initialize user media. Microphone will not work:', err) log('Cannot initialize user media. Microphone will not work:', err)

View file

@ -1,10 +1,12 @@
import { Writable } from 'stream' import { Writable, Transform } from 'stream'
import MicrophoneStream from 'microphone-stream' import MicrophoneStream from 'microphone-stream'
import audioContext from 'audio-context' import audioContext from 'audio-context'
import chunker from 'stream-chunker' import chunker from 'stream-chunker'
import Resampler from 'libsamplerate.js' import Resampler from 'libsamplerate.js'
import getUserMedia from 'getusermedia' import getUserMedia from 'getusermedia'
import keyboardjs from 'keyboardjs' import keyboardjs from 'keyboardjs'
import vad from 'voice-activity-detection'
import DropStream from 'drop-stream'
class VoiceHandler extends Writable { class VoiceHandler extends Writable {
constructor (client) { constructor (client) {
@ -15,7 +17,29 @@ class VoiceHandler extends Writable {
_getOrCreateOutbound () { _getOrCreateOutbound () {
if (!this._outbound) { if (!this._outbound) {
this._outbound = this._client.createVoiceStream() if (!this._client) {
this._outbound = DropStream.obj()
this.emit('started_talking')
return this._outbound
}
this._outbound = new Resampler({
unsafe: true,
type: Resampler.Type.SINC_FASTEST,
ratio: 48000 / audioContext.sampleRate
})
const buffer2Float32Array = new Transform({
transform (data, _, callback) {
callback(null, new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
},
readableObjectMode: true
})
this._outbound
.pipe(chunker(4 * 480))
.pipe(buffer2Float32Array)
.pipe(this._client.createVoiceStream())
this.emit('started_talking') this.emit('started_talking')
} }
return this._outbound return this._outbound
@ -74,24 +98,76 @@ export class PushToTalkVoiceHandler extends VoiceHandler {
} }
} }
/**
 * Voice handler that only forwards audio while voice activity is detected.
 *
 * Emits 'level' with every update reported by the detector. While no voice is
 * detected, incoming chunks are kept in a short backlog so that the start of
 * an utterance is not lost once the detector fires.
 *
 * The detector needs the user media stream obtained by initVoice(). If that
 * stream is not available yet when the handler is constructed, detector
 * creation is deferred until the first chunk is written instead of handing
 * a null stream to the detector.
 */
export class VADVoiceHandler extends VoiceHandler {
  /**
   * @param client Passed through to VoiceHandler (may be null, e.g. for a
   *               handler that is only used to test the detection level).
   * @param level  Used as both minNoiseLevel and maxNoiseLevel of the detector.
   */
  constructor (client, level) {
    super(client)
    this._vadLevel = level
    this._vad = null
    this._active = false
    // Need to keep a backlog of the last ~150ms (dependent on sample rate)
    // because VAD will activate with ~125ms delay
    this._backlog = []
    this._backlogLength = 0
    this._backlogLengthMin = 1024 * 6 * 4 // vadBufferLen * (vadDelay + 1) * bytesPerSample
    this._ensureVad()
  }

  /**
   * Creates the voice activity detector if it does not exist yet and the
   * user media stream is available. Does nothing otherwise.
   */
  _ensureVad () {
    if (this._vad || !theUserMedia) {
      return
    }
    this._vad = vad(audioContext, theUserMedia, {
      onVoiceStart: () => {
        console.log('vad: start')
        this._active = true
      },
      onVoiceStop: () => {
        console.log('vad: stop')
        this._stopOutbound()
        this._active = false
      },
      onUpdate: (val) => {
        this._level = val
        this.emit('level', val)
      },
      // Skip noise calibration and pin the threshold to the configured level
      noiseCaptureDuration: 0,
      minNoiseLevel: this._vadLevel,
      maxNoiseLevel: this._vadLevel
    })
  }

  /**
   * Forwards the chunk (preceded by any backlog) to the outbound stream while
   * voice is active; otherwise appends it to the backlog.
   */
  _write (data, _, callback) {
    // The stream may have become available since construction
    this._ensureVad()
    if (this._active) {
      if (this._backlog.length > 0) {
        // Flush what was recorded before the detector reported activity
        for (const oldData of this._backlog) {
          this._getOrCreateOutbound().write(oldData)
        }
        this._backlog = []
        this._backlogLength = 0
      }
      this._getOrCreateOutbound().write(data, callback)
    } else {
      // Make sure we always keep the backlog filled if we're not (yet) talking
      this._backlog.push(data)
      this._backlogLength += data.length
      // Check if we can discard the oldest element without becoming too short
      if (this._backlogLength - this._backlog[0].length > this._backlogLengthMin) {
        this._backlogLength -= this._backlog.shift().length
      }
      callback()
    }
  }

  /**
   * Finishes the base handler first, then tears down the detector (if one
   * was ever created) before signalling completion.
   */
  _final (callback) {
    super._final(e => {
      if (this._vad) {
        this._vad.destroy()
      }
      callback(e)
    })
  }
}
var theUserMedia = null
export function initVoice (onData, onUserMediaError) { export function initVoice (onData, onUserMediaError) {
var resampler = new Resampler({
unsafe: true,
type: Resampler.Type.SINC_FASTEST,
ratio: 48000 / audioContext.sampleRate
})
resampler.pipe(chunker(4 * 480)).on('data', data => {
onData(data)
})
getUserMedia({ audio: true }, (err, userMedia) => { getUserMedia({ audio: true }, (err, userMedia) => {
if (err) { if (err) {
onUserMediaError(err) onUserMediaError(err)
} else { } else {
var micStream = new MicrophoneStream(userMedia, { objectMode: true }) theUserMedia = userMedia
var micStream = new MicrophoneStream(userMedia, { objectMode: true, bufferSize: 1024 })
micStream.on('data', data => { micStream.on('data', data => {
resampler.write(Buffer.from(data.getChannelData(0).buffer)) onData(Buffer.from(data.getChannelData(0).buffer))
}) })
} }
}) })

View file

@ -25,6 +25,7 @@
"brfs": "^1.4.3", "brfs": "^1.4.3",
"css-loader": "^0.26.0", "css-loader": "^0.26.0",
"dompurify": "^0.8.9", "dompurify": "^0.8.9",
"drop-stream": "^1.0.0",
"duplex-maker": "^1.0.0", "duplex-maker": "^1.0.0",
"extract-loader": "^0.1.0", "extract-loader": "^0.1.0",
"file-loader": "^0.9.0", "file-loader": "^0.9.0",
@ -39,10 +40,11 @@
"regexp-replace-loader": "0.0.1", "regexp-replace-loader": "0.0.1",
"stream-chunker": "^1.2.8", "stream-chunker": "^1.2.8",
"transform-loader": "^0.2.3", "transform-loader": "^0.2.3",
"voice-activity-detection": "johni0702/voice-activity-detection#9f8bd90",
"webpack": "^1.13.3", "webpack": "^1.13.3",
"webworkify-webpack-dropin": "^1.1.9", "webworkify-webpack-dropin": "^1.1.9",
"libsamplerate.js": "^1.0.0", "libsamplerate.js": "^1.0.0",
"mumble-client-codecs-browser": "^1.1.0", "mumble-client-codecs-browser": "^1.1.1",
"mumble-client-websocket": "^1.0.0", "mumble-client-websocket": "^1.0.0",
"mumble-client": "^1.1.0", "mumble-client": "^1.1.0",
"web-audio-buffer-queue": "^1.0.0" "web-audio-buffer-queue": "^1.0.0"

View file

@ -208,8 +208,10 @@ form {
border-bottom: 1px solid darkgray; border-bottom: 1px solid darkgray;
} }
.dialog-footer { .dialog-footer {
position: absolute;
bottom: 0px;
width: calc(100% - 20px);
margin: 10px; margin: 10px;
margin-bottom: 0px;
} }
.dialog-close { .dialog-close {
float: left; float: left;
@ -255,6 +257,14 @@ form {
} }
.settings-dialog table input { .settings-dialog table input {
width: 100%; width: 100%;
margin: 0px;
}
.settings-dialog .mic-volume-container {
height: 10px;
border: 3px solid black;
}
.settings-dialog .mic-volume {
height: 100%;
} }
.connect-dialog { .connect-dialog {
width: 300px; width: 300px;