Add voice activity detection
This commit is contained in:
parent
c49dabbfc4
commit
80f766379d
|
@ -11,7 +11,7 @@ Instead Websockets are used for all communications.
|
||||||
libopus, libcelt (0.7.1) and libsamplerate, compiled to JS via emscripten, are used for audio decoding.
|
libopus, libcelt (0.7.1) and libsamplerate, compiled to JS via emscripten, are used for audio decoding.
|
||||||
Therefore, at the moment only the Opus and CELT Alpha codecs are supported.
|
Therefore, at the moment only the Opus and CELT Alpha codecs are supported.
|
||||||
|
|
||||||
Quite a few features, most notably voice activity detection and all
|
Quite a few features, most notably all
|
||||||
administrative functionality, are still missing.
|
administrative functionality, are still missing.
|
||||||
|
|
||||||
### Installing
|
### Installing
|
||||||
|
|
|
@ -67,11 +67,23 @@
|
||||||
<td>
|
<td>
|
||||||
<select data-bind='value: voiceMode'>
|
<select data-bind='value: voiceMode'>
|
||||||
<option value="cont">Continuous</option>
|
<option value="cont">Continuous</option>
|
||||||
<option value="vad" disabled>Voice Activity</option>
|
<option value="vad">Voice Activity</option>
|
||||||
<option value="ptt">Push To Talk</option>
|
<option value="ptt">Push To Talk</option>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr data-bind="style: {visibility: voiceMode() == 'ptt' ? 'visible' : 'hidden'}">
|
<tr data-bind="visible: voiceMode() == 'vad'">
|
||||||
|
<td colspan="2">
|
||||||
|
<div class="mic-volume-container">
|
||||||
|
<div class="mic-volume" data-bind="style: {
|
||||||
|
width: testVadLevel()*100 + '%',
|
||||||
|
background: testVadActive() ? 'green' : 'red'
|
||||||
|
}"></div>
|
||||||
|
</div>
|
||||||
|
<input type="range" min="0" max="1" step="0.01"
|
||||||
|
data-bind="value: vadLevel">
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr data-bind="visible: voiceMode() == 'ptt'">
|
||||||
<td>PTT Key</td>
|
<td>PTT Key</td>
|
||||||
<td>
|
<td>
|
||||||
<input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">
|
<input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">
|
||||||
|
|
42
app/index.js
42
app/index.js
|
@ -9,7 +9,7 @@ import ko from 'knockout'
|
||||||
import _dompurify from 'dompurify'
|
import _dompurify from 'dompurify'
|
||||||
import keyboardjs from 'keyboardjs'
|
import keyboardjs from 'keyboardjs'
|
||||||
|
|
||||||
import { ContinuousVoiceHandler, PushToTalkVoiceHandler, initVoice } from './voice'
|
import { ContinuousVoiceHandler, PushToTalkVoiceHandler, VADVoiceHandler, initVoice } from './voice'
|
||||||
|
|
||||||
const dompurify = _dompurify(window)
|
const dompurify = _dompurify(window)
|
||||||
|
|
||||||
|
@ -58,11 +58,34 @@ class SettingsDialog {
|
||||||
this.voiceMode = ko.observable(settings.voiceMode)
|
this.voiceMode = ko.observable(settings.voiceMode)
|
||||||
this.pttKey = ko.observable(settings.pttKey)
|
this.pttKey = ko.observable(settings.pttKey)
|
||||||
this.pttKeyDisplay = ko.observable(settings.pttKey)
|
this.pttKeyDisplay = ko.observable(settings.pttKey)
|
||||||
|
this.vadLevel = ko.observable(settings.vadLevel)
|
||||||
|
this.testVadLevel = ko.observable(0)
|
||||||
|
this.testVadActive = ko.observable(false)
|
||||||
|
|
||||||
|
this._setupTestVad()
|
||||||
|
this.vadLevel.subscribe(() => this._setupTestVad())
|
||||||
|
}
|
||||||
|
|
||||||
|
_setupTestVad () {
|
||||||
|
if (this._testVad) {
|
||||||
|
this._testVad.end()
|
||||||
|
}
|
||||||
|
this._testVad = new VADVoiceHandler(null, this.vadLevel())
|
||||||
|
this._testVad.on('started_talking', () => this.testVadActive(true))
|
||||||
|
.on('stopped_talking', () => this.testVadActive(false))
|
||||||
|
.on('level', level => this.testVadLevel(level))
|
||||||
|
testVoiceHandler = this._testVad
|
||||||
}
|
}
|
||||||
|
|
||||||
applyTo (settings) {
|
applyTo (settings) {
|
||||||
settings.voiceMode = this.voiceMode()
|
settings.voiceMode = this.voiceMode()
|
||||||
settings.pttKey = this.pttKey()
|
settings.pttKey = this.pttKey()
|
||||||
|
settings.vadLevel = this.vadLevel()
|
||||||
|
}
|
||||||
|
|
||||||
|
end () {
|
||||||
|
this._testVad.end()
|
||||||
|
testVoiceHandler = null
|
||||||
}
|
}
|
||||||
|
|
||||||
recordPttKey () {
|
recordPttKey () {
|
||||||
|
@ -89,14 +112,16 @@ class SettingsDialog {
|
||||||
class Settings {
|
class Settings {
|
||||||
constructor () {
|
constructor () {
|
||||||
const load = key => window.localStorage.getItem('mumble.' + key)
|
const load = key => window.localStorage.getItem('mumble.' + key)
|
||||||
this.voiceMode = load('voiceMode') || 'cont'
|
this.voiceMode = load('voiceMode') || 'vad'
|
||||||
this.pttKey = load('pttKey') || 'ctrl + shift'
|
this.pttKey = load('pttKey') || 'ctrl + shift'
|
||||||
|
this.vadLevel = load('vadLevel') || 0.3
|
||||||
}
|
}
|
||||||
|
|
||||||
save () {
|
save () {
|
||||||
const save = (key, val) => window.localStorage.setItem('mumble.' + key, val)
|
const save = (key, val) => window.localStorage.setItem('mumble.' + key, val)
|
||||||
save('voiceMode', this.voiceMode)
|
save('voiceMode', this.voiceMode)
|
||||||
save('pttKey', this.pttKey)
|
save('pttKey', this.pttKey)
|
||||||
|
save('vadLevel', this.vadLevel)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -130,10 +155,13 @@ class GlobalBindings {
|
||||||
this._updateVoiceHandler()
|
this._updateVoiceHandler()
|
||||||
|
|
||||||
this.settings.save()
|
this.settings.save()
|
||||||
this.settingsDialog(null)
|
this.closeSettings()
|
||||||
}
|
}
|
||||||
|
|
||||||
this.closeSettings = () => {
|
this.closeSettings = () => {
|
||||||
|
if (this.settingsDialog()) {
|
||||||
|
this.settingsDialog().end()
|
||||||
|
}
|
||||||
this.settingsDialog(null)
|
this.settingsDialog(null)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -360,7 +388,7 @@ class GlobalBindings {
|
||||||
} else if (mode === 'ptt') {
|
} else if (mode === 'ptt') {
|
||||||
voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
|
voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
|
||||||
} else if (mode === 'vad') {
|
} else if (mode === 'vad') {
|
||||||
|
voiceHandler = new VADVoiceHandler(this.client, this.settings.vadLevel)
|
||||||
} else {
|
} else {
|
||||||
log('Unknown voice mode:', mode)
|
log('Unknown voice mode:', mode)
|
||||||
return
|
return
|
||||||
|
@ -586,15 +614,19 @@ function userToState () {
|
||||||
}
|
}
|
||||||
|
|
||||||
var voiceHandler
|
var voiceHandler
|
||||||
|
var testVoiceHandler
|
||||||
|
|
||||||
initVoice(data => {
|
initVoice(data => {
|
||||||
|
if (testVoiceHandler) {
|
||||||
|
testVoiceHandler.write(data)
|
||||||
|
}
|
||||||
if (!ui.client) {
|
if (!ui.client) {
|
||||||
if (voiceHandler) {
|
if (voiceHandler) {
|
||||||
voiceHandler.end()
|
voiceHandler.end()
|
||||||
}
|
}
|
||||||
voiceHandler = null
|
voiceHandler = null
|
||||||
} else if (voiceHandler) {
|
} else if (voiceHandler) {
|
||||||
voiceHandler.write(new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
|
voiceHandler.write(data)
|
||||||
}
|
}
|
||||||
}, err => {
|
}, err => {
|
||||||
log('Cannot initialize user media. Microphone will not work:', err)
|
log('Cannot initialize user media. Microphone will not work:', err)
|
||||||
|
|
104
app/voice.js
104
app/voice.js
|
@ -1,10 +1,12 @@
|
||||||
import { Writable } from 'stream'
|
import { Writable, Transform } from 'stream'
|
||||||
import MicrophoneStream from 'microphone-stream'
|
import MicrophoneStream from 'microphone-stream'
|
||||||
import audioContext from 'audio-context'
|
import audioContext from 'audio-context'
|
||||||
import chunker from 'stream-chunker'
|
import chunker from 'stream-chunker'
|
||||||
import Resampler from 'libsamplerate.js'
|
import Resampler from 'libsamplerate.js'
|
||||||
import getUserMedia from 'getusermedia'
|
import getUserMedia from 'getusermedia'
|
||||||
import keyboardjs from 'keyboardjs'
|
import keyboardjs from 'keyboardjs'
|
||||||
|
import vad from 'voice-activity-detection'
|
||||||
|
import DropStream from 'drop-stream'
|
||||||
|
|
||||||
class VoiceHandler extends Writable {
|
class VoiceHandler extends Writable {
|
||||||
constructor (client) {
|
constructor (client) {
|
||||||
|
@ -15,7 +17,29 @@ class VoiceHandler extends Writable {
|
||||||
|
|
||||||
_getOrCreateOutbound () {
|
_getOrCreateOutbound () {
|
||||||
if (!this._outbound) {
|
if (!this._outbound) {
|
||||||
this._outbound = this._client.createVoiceStream()
|
if (!this._client) {
|
||||||
|
this._outbound = DropStream.obj()
|
||||||
|
this.emit('started_talking')
|
||||||
|
return this._outbound
|
||||||
|
}
|
||||||
|
this._outbound = new Resampler({
|
||||||
|
unsafe: true,
|
||||||
|
type: Resampler.Type.SINC_FASTEST,
|
||||||
|
ratio: 48000 / audioContext.sampleRate
|
||||||
|
})
|
||||||
|
|
||||||
|
const buffer2Float32Array = new Transform({
|
||||||
|
transform (data, _, callback) {
|
||||||
|
callback(null, new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
|
||||||
|
},
|
||||||
|
readableObjectMode: true
|
||||||
|
})
|
||||||
|
|
||||||
|
this._outbound
|
||||||
|
.pipe(chunker(4 * 480))
|
||||||
|
.pipe(buffer2Float32Array)
|
||||||
|
.pipe(this._client.createVoiceStream())
|
||||||
|
|
||||||
this.emit('started_talking')
|
this.emit('started_talking')
|
||||||
}
|
}
|
||||||
return this._outbound
|
return this._outbound
|
||||||
|
@ -74,24 +98,76 @@ export class PushToTalkVoiceHandler extends VoiceHandler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export class VADVoiceHandler extends VoiceHandler {
|
||||||
|
constructor (client, level) {
|
||||||
|
super(client)
|
||||||
|
const self = this
|
||||||
|
this._vad = vad(audioContext, theUserMedia, {
|
||||||
|
onVoiceStart () {
|
||||||
|
console.log('vad: start')
|
||||||
|
self._active = true
|
||||||
|
},
|
||||||
|
onVoiceStop () {
|
||||||
|
console.log('vad: stop')
|
||||||
|
self._stopOutbound()
|
||||||
|
self._active = false
|
||||||
|
},
|
||||||
|
onUpdate (val) {
|
||||||
|
self._level = val
|
||||||
|
self.emit('level', val)
|
||||||
|
},
|
||||||
|
noiseCaptureDuration: 0,
|
||||||
|
minNoiseLevel: level,
|
||||||
|
maxNoiseLevel: level
|
||||||
|
})
|
||||||
|
// Need to keep a backlog of the last ~150ms (dependent on sample rate)
|
||||||
|
// because VAD will activate with ~125ms delay
|
||||||
|
this._backlog = []
|
||||||
|
this._backlogLength = 0
|
||||||
|
this._backlogLengthMin = 1024 * 6 * 4 // vadBufferLen * (vadDelay + 1) * bytesPerSample
|
||||||
|
}
|
||||||
|
|
||||||
|
_write (data, _, callback) {
|
||||||
|
if (this._active) {
|
||||||
|
if (this._backlog.length > 0) {
|
||||||
|
for (let oldData of this._backlog) {
|
||||||
|
this._getOrCreateOutbound().write(oldData)
|
||||||
|
}
|
||||||
|
this._backlog = []
|
||||||
|
this._backlogLength = 0
|
||||||
|
}
|
||||||
|
this._getOrCreateOutbound().write(data, callback)
|
||||||
|
} else {
|
||||||
|
// Make sure we always keep the backlog filled if we're not (yet) talking
|
||||||
|
this._backlog.push(data)
|
||||||
|
this._backlogLength += data.length
|
||||||
|
// Check if we can discard the oldest element without becoming too short
|
||||||
|
if (this._backlogLength - this._backlog[0].length > this._backlogLengthMin) {
|
||||||
|
this._backlogLength -= this._backlog.shift().length
|
||||||
|
}
|
||||||
|
callback()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_final (callback) {
|
||||||
|
super._final(e => {
|
||||||
|
this._vad.destroy()
|
||||||
|
callback(e)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var theUserMedia = null
|
||||||
|
|
||||||
export function initVoice (onData, onUserMediaError) {
|
export function initVoice (onData, onUserMediaError) {
|
||||||
var resampler = new Resampler({
|
|
||||||
unsafe: true,
|
|
||||||
type: Resampler.Type.SINC_FASTEST,
|
|
||||||
ratio: 48000 / audioContext.sampleRate
|
|
||||||
})
|
|
||||||
|
|
||||||
resampler.pipe(chunker(4 * 480)).on('data', data => {
|
|
||||||
onData(data)
|
|
||||||
})
|
|
||||||
|
|
||||||
getUserMedia({ audio: true }, (err, userMedia) => {
|
getUserMedia({ audio: true }, (err, userMedia) => {
|
||||||
if (err) {
|
if (err) {
|
||||||
onUserMediaError(err)
|
onUserMediaError(err)
|
||||||
} else {
|
} else {
|
||||||
var micStream = new MicrophoneStream(userMedia, { objectMode: true })
|
theUserMedia = userMedia
|
||||||
|
var micStream = new MicrophoneStream(userMedia, { objectMode: true, bufferSize: 1024 })
|
||||||
micStream.on('data', data => {
|
micStream.on('data', data => {
|
||||||
resampler.write(Buffer.from(data.getChannelData(0).buffer))
|
onData(Buffer.from(data.getChannelData(0).buffer))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
"brfs": "^1.4.3",
|
"brfs": "^1.4.3",
|
||||||
"css-loader": "^0.26.0",
|
"css-loader": "^0.26.0",
|
||||||
"dompurify": "^0.8.9",
|
"dompurify": "^0.8.9",
|
||||||
|
"drop-stream": "^1.0.0",
|
||||||
"duplex-maker": "^1.0.0",
|
"duplex-maker": "^1.0.0",
|
||||||
"extract-loader": "^0.1.0",
|
"extract-loader": "^0.1.0",
|
||||||
"file-loader": "^0.9.0",
|
"file-loader": "^0.9.0",
|
||||||
|
@ -39,10 +40,11 @@
|
||||||
"regexp-replace-loader": "0.0.1",
|
"regexp-replace-loader": "0.0.1",
|
||||||
"stream-chunker": "^1.2.8",
|
"stream-chunker": "^1.2.8",
|
||||||
"transform-loader": "^0.2.3",
|
"transform-loader": "^0.2.3",
|
||||||
|
"voice-activity-detection": "johni0702/voice-activity-detection#9f8bd90",
|
||||||
"webpack": "^1.13.3",
|
"webpack": "^1.13.3",
|
||||||
"webworkify-webpack-dropin": "^1.1.9",
|
"webworkify-webpack-dropin": "^1.1.9",
|
||||||
"libsamplerate.js": "^1.0.0",
|
"libsamplerate.js": "^1.0.0",
|
||||||
"mumble-client-codecs-browser": "^1.1.0",
|
"mumble-client-codecs-browser": "^1.1.1",
|
||||||
"mumble-client-websocket": "^1.0.0",
|
"mumble-client-websocket": "^1.0.0",
|
||||||
"mumble-client": "^1.1.0",
|
"mumble-client": "^1.1.0",
|
||||||
"web-audio-buffer-queue": "^1.0.0"
|
"web-audio-buffer-queue": "^1.0.0"
|
||||||
|
|
|
@ -208,8 +208,10 @@ form {
|
||||||
border-bottom: 1px solid darkgray;
|
border-bottom: 1px solid darkgray;
|
||||||
}
|
}
|
||||||
.dialog-footer {
|
.dialog-footer {
|
||||||
|
position: absolute;
|
||||||
|
bottom: 0px;
|
||||||
|
width: calc(100% - 20px);
|
||||||
margin: 10px;
|
margin: 10px;
|
||||||
margin-bottom: 0px;
|
|
||||||
}
|
}
|
||||||
.dialog-close {
|
.dialog-close {
|
||||||
float: left;
|
float: left;
|
||||||
|
@ -255,6 +257,14 @@ form {
|
||||||
}
|
}
|
||||||
.settings-dialog table input {
|
.settings-dialog table input {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
|
margin: 0px;
|
||||||
|
}
|
||||||
|
.settings-dialog .mic-volume-container {
|
||||||
|
height: 10px;
|
||||||
|
border: 3px solid black;
|
||||||
|
}
|
||||||
|
.settings-dialog .mic-volume {
|
||||||
|
height: 100%;
|
||||||
}
|
}
|
||||||
.connect-dialog {
|
.connect-dialog {
|
||||||
width: 300px;
|
width: 300px;
|
||||||
|
|
Loading…
Reference in a new issue