Add voice activity detection

2017-09-20 15:16:49 +02:00 · 2017-09-20 15:16:49 +02:00 · 80f766379d
commit 80f766379d
parent c49dabbfc4
6 changed files with 156 additions and 24 deletions
--- a/README.md
+++ b/README.md
@ -11,7 +11,7 @@ Instead Websockets are used for all communications.
 libopus, libcelt (0.7.1) and libsamplerate, compiled to JS via emscripten, are used for audio decoding.
 Therefore, at the moment only the Opus and CELT Alpha codecs are supported.

-Quite a few features, most noticeably voice activity detection and all
+Quite a few features, most noticeably all
 administrative functionality, are still missing.

 ### Installing
--- a/app/index.html
+++ b/app/index.html
@ -67,11 +67,23 @@
                  <td>
                    <select data-bind='value: voiceMode'>
                      <option value="cont">Continuous</option>
-                      <option value="vad" disabled>Voice Activity</option>
+                      <option value="vad">Voice Activity</option>
                      <option value="ptt">Push To Talk</option>
                  </td>
                </tr>
-                <tr data-bind="style: {visibility: voiceMode() == 'ptt' ? 'visible' : 'hidden'}">
+                <tr data-bind="visible: voiceMode() == 'vad'">
+                  <td colspan="2">
+                    <div class="mic-volume-container">
+                      <div class="mic-volume" data-bind="style: {
+                          width: testVadLevel()*100 + '%',
+                          background: testVadActive() ? 'green' : 'red'
+                        }"></div>
+                    </div>
+                    <input type="range" min="0" max="1" step="0.01"
+                           data-bind="value: vadLevel">
+                  </td>
+                </tr>
+                <tr data-bind="visible: voiceMode() == 'ptt'">
                  <td>PTT Key</td>
                  <td>
                    <input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">
--- a/app/index.js
+++ b/app/index.js
@ -9,7 +9,7 @@ import ko from 'knockout'
 import _dompurify from 'dompurify'
 import keyboardjs from 'keyboardjs'

-import { ContinuousVoiceHandler, PushToTalkVoiceHandler, initVoice } from './voice'
+import { ContinuousVoiceHandler, PushToTalkVoiceHandler, VADVoiceHandler, initVoice } from './voice'

 const dompurify = _dompurify(window)

@ -58,11 +58,34 @@ class SettingsDialog {
    this.voiceMode = ko.observable(settings.voiceMode)
    this.pttKey = ko.observable(settings.pttKey)
    this.pttKeyDisplay = ko.observable(settings.pttKey)
+    this.vadLevel = ko.observable(settings.vadLevel)
+    this.testVadLevel = ko.observable(0)
+    this.testVadActive = ko.observable(false)
+
+    this._setupTestVad()
+    this.vadLevel.subscribe(() => this._setupTestVad())
+  }
+
+  _setupTestVad () { // (Re)creates the client-less VAD handler whose events update the dialog's test observables
+    if (this._testVad) {
+      this._testVad.end() // end the previous test handler before replacing it
+    }
+    this._testVad = new VADVoiceHandler(null, this.vadLevel()) // null client: VoiceHandler then uses a DropStream as outbound instead of a voice stream
+    this._testVad.on('started_talking', () => this.testVadActive(true))
+                 .on('stopped_talking', () => this.testVadActive(false))
+                 .on('level', level => this.testVadLevel(level))
+    testVoiceHandler = this._testVad // module-level variable; the initVoice data callback writes microphone data to it
  }

  applyTo (settings) { // Copies the dialog's current observable values onto the given settings object
    settings.voiceMode = this.voiceMode()
    settings.pttKey = this.pttKey()
+    settings.vadLevel = this.vadLevel()
+  }
+
+  end () { // Called by closeSettings before the dialog observable is cleared
+    this._testVad.end() // ends the test handler created by _setupTestVad
+    testVoiceHandler = null // the initVoice data callback skips the test handler while this is null
  }

  recordPttKey () {
@ -89,14 +112,16 @@ class SettingsDialog {
 class Settings { // User settings, stored in window.localStorage under keys prefixed with 'mumble.'
  constructor () {
    const load = key => window.localStorage.getItem('mumble.' + key) // getItem yields null for a missing key, so the || fallbacks below apply
-    this.voiceMode = load('voiceMode') || 'cont'
+    this.voiceMode = load('voiceMode') || 'vad'
    this.pttKey = load('pttKey') || 'ctrl + shift'
+    this.vadLevel = load('vadLevel') || 0.3 // NOTE(review): a stored value comes back as a string, the fallback is a number
  }

  save () {
    const save = (key, val) => window.localStorage.setItem('mumble.' + key, val) // writes one setting under the 'mumble.' prefix
    save('voiceMode', this.voiceMode)
    save('pttKey', this.pttKey)
+    save('vadLevel', this.vadLevel)
  }
 }

@ -130,10 +155,13 @@ class GlobalBindings {
      this._updateVoiceHandler()

      this.settings.save()
-      this.settingsDialog(null)
+      this.closeSettings()
    }

    this.closeSettings = () => {
+      if (this.settingsDialog()) {
+        this.settingsDialog().end()
+      }
      this.settingsDialog(null)
    }

@ -360,7 +388,7 @@ class GlobalBindings {
      } else if (mode === 'ptt') {
        voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
      } else if (mode === 'vad') {
-
+        voiceHandler = new VADVoiceHandler(this.client, this.settings.vadLevel)
      } else {
        log('Unknown voice mode:', mode)
        return
@ -586,15 +614,19 @@ function userToState () {
 }

 var voiceHandler
+var testVoiceHandler

 initVoice(data => {
+  if (testVoiceHandler) {
+    testVoiceHandler.write(data)
+  }
  if (!ui.client) {
    if (voiceHandler) {
      voiceHandler.end()
    }
    voiceHandler = null
  } else if (voiceHandler) {
-    voiceHandler.write(new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
+    voiceHandler.write(data)
  }
 }, err => {
  log('Cannot initialize user media. Microphone will not work:', err)
--- a/app/voice.js
+++ b/app/voice.js
@ -1,10 +1,12 @@
-import { Writable } from 'stream'
+import { Writable, Transform } from 'stream'
 import MicrophoneStream from 'microphone-stream'
 import audioContext from 'audio-context'
 import chunker from 'stream-chunker'
 import Resampler from 'libsamplerate.js'
 import getUserMedia from 'getusermedia'
 import keyboardjs from 'keyboardjs'
+import vad from 'voice-activity-detection'
+import DropStream from 'drop-stream'

 class VoiceHandler extends Writable {
  constructor (client) {
@ -15,7 +17,29 @@ class VoiceHandler extends Writable {

  _getOrCreateOutbound () {
    if (!this._outbound) {
-      this._outbound = this._client.createVoiceStream()
+      if (!this._client) {
+        this._outbound = DropStream.obj()
+        this.emit('started_talking')
+        return this._outbound
+      }
+      this._outbound = new Resampler({
+        unsafe: true,
+        type: Resampler.Type.SINC_FASTEST,
+        ratio: 48000 / audioContext.sampleRate
+      })
+
+      const buffer2Float32Array = new Transform({
+        transform (data, _, callback) {
+          callback(null, new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
+        },
+        readableObjectMode: true
+      })
+
+      this._outbound
+        .pipe(chunker(4 * 480))
+        .pipe(buffer2Float32Array)
+        .pipe(this._client.createVoiceStream())
+
      this.emit('started_talking')
    }
    return this._outbound
@ -74,24 +98,76 @@ export class PushToTalkVoiceHandler extends VoiceHandler {
  }
 }

+export class VADVoiceHandler extends VoiceHandler { // Voice handler that forwards audio only while the detector callbacks mark it active
+  constructor (client, level) { // level is handed to the detector as both minNoiseLevel and maxNoiseLevel
+    super(client)
+    const self = this // the detector callbacks below are method shorthand with their own `this`, so capture the handler
+    this._vad = vad(audioContext, theUserMedia, { // theUserMedia is the module-level variable assigned in initVoice once getUserMedia succeeds
+      onVoiceStart () {
+        console.log('vad: start')
+        self._active = true // _write flushes the backlog and starts forwarding on the next chunk
+      },
+      onVoiceStop () {
+        console.log('vad: stop')
+        self._stopOutbound() // defined outside this view; presumably closes the current outbound stream - confirm
+        self._active = false // subsequent chunks go to the backlog again
+      },
+      onUpdate (val) {
+        self._level = val
+        self.emit('level', val) // the settings dialog subscribes to this event for its level meter
+      },
+      noiseCaptureDuration: 0,
+      minNoiseLevel: level, // min and max are identical; presumably this pins the detector's threshold to `level` - confirm against the library
+      maxNoiseLevel: level
+    })
+    // Need to keep a backlog of the last ~150ms (dependent on sample rate)
+    // because VAD will activate with ~125ms delay
+    this._backlog = [] // chunks received while not active, oldest first
+    this._backlogLength = 0 // sum of .length over all chunks currently in _backlog
+    this._backlogLengthMin = 1024 * 6 * 4 // vadBufferLen * (vadDelay + 1) * bytesPerSample
+  }
+
+  _write (data, _, callback) { // Writable hook: forward while active, otherwise buffer into the backlog
+    if (this._active) {
+      if (this._backlog.length > 0) {
+        for (let oldData of this._backlog) {
+          this._getOrCreateOutbound().write(oldData) // replay buffered chunks first, in arrival order
+        }
+        this._backlog = []
+        this._backlogLength = 0
+      }
+      this._getOrCreateOutbound().write(data, callback) // only this write carries the callback; the backlog writes above do not
+    } else {
+      // Make sure we always keep the backlog filled if we're not (yet) talking
+      this._backlog.push(data)
+      this._backlogLength += data.length
+      // Check if we can discard the oldest element without becoming too short
+      if (this._backlogLength - this._backlog[0].length > this._backlogLengthMin) {
+        this._backlogLength -= this._backlog.shift().length // drops at most one chunk per write
+      }
+      callback()
+    }
+  }
+
+  _final (callback) { // destroys the detector after the base class's _final has called back
+    super._final(e => {
+      this._vad.destroy()
+      callback(e) // forward whatever the base class reported
+    })
+  }
+}
+
+var theUserMedia = null // assigned in initVoice when getUserMedia succeeds; read by the VADVoiceHandler constructor
+
 export function initVoice (onData, onUserMediaError) {
-  var resampler = new Resampler({
-    unsafe: true,
-    type: Resampler.Type.SINC_FASTEST,
-    ratio: 48000 / audioContext.sampleRate
-  })
-
-  resampler.pipe(chunker(4 * 480)).on('data', data => {
-    onData(data)
-  })
-
  getUserMedia({ audio: true }, (err, userMedia) => {
    if (err) {
      onUserMediaError(err)
    } else {
-      var micStream = new MicrophoneStream(userMedia, { objectMode: true })
+      theUserMedia = userMedia
+      var micStream = new MicrophoneStream(userMedia, { objectMode: true, bufferSize: 1024 })
      micStream.on('data', data => {
-        resampler.write(Buffer.from(data.getChannelData(0).buffer))
+        onData(Buffer.from(data.getChannelData(0).buffer))
      })
    }
  })
--- a/package.json
+++ b/package.json
@ -25,6 +25,7 @@
    "brfs": "^1.4.3",
    "css-loader": "^0.26.0",
    "dompurify": "^0.8.9",
+    "drop-stream": "^1.0.0",
    "duplex-maker": "^1.0.0",
    "extract-loader": "^0.1.0",
    "file-loader": "^0.9.0",
@ -39,10 +40,11 @@
    "regexp-replace-loader": "0.0.1",
    "stream-chunker": "^1.2.8",
    "transform-loader": "^0.2.3",
+    "voice-activity-detection": "johni0702/voice-activity-detection#9f8bd90",
    "webpack": "^1.13.3",
    "webworkify-webpack-dropin": "^1.1.9",
    "libsamplerate.js": "^1.0.0",
-    "mumble-client-codecs-browser": "^1.1.0",
+    "mumble-client-codecs-browser": "^1.1.1",
    "mumble-client-websocket": "^1.0.0",
    "mumble-client": "^1.1.0",
    "web-audio-buffer-queue": "^1.0.0"
--- a/themes/MetroMumbleLight/main.css
+++ b/themes/MetroMumbleLight/main.css
@ -208,8 +208,10 @@ form {
  border-bottom: 1px solid darkgray;
 }
 .dialog-footer {
+  position: absolute;
+  bottom: 0px;
+  width: calc(100% - 20px);
  margin: 10px;
-  margin-bottom: 0px;
 }
 .dialog-close {
  float: left;
@ -255,6 +257,14 @@ form {
 }
 .settings-dialog table input {
  width: 100%;
+  margin: 0px;
+}
+.settings-dialog .mic-volume-container {
+  height: 10px;
+  border: 3px solid black;
+}
+.settings-dialog .mic-volume {
+  height: 100%;
 }
 .connect-dialog {
  width: 300px;