From 80f766379df13719240a6bf234f3c88bf41d41f2 Mon Sep 17 00:00:00 2001
From: Jonas Herzig <me@johni0702.de>
Date: Wed, 20 Sep 2017 15:16:49 +0200
Subject: [PATCH] Add voice activity detection

---
 README.md                        |   2 +-
 app/index.html                   |  16 ++++-
 app/index.js                     |  42 +++++++++++--
 app/voice.js                     | 104 ++++++++++++++++++++++++++-----
 package.json                     |   4 +-
 themes/MetroMumbleLight/main.css |  12 +++-
 6 files changed, 156 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
index 31aec7d..91bc5e3 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Instead Websockets are used for all communications.
 libopus, libcelt (0.7.1) and libsamplerate, compiled to JS via emscripten, are used for audio decoding.
 Therefore, at the moment only the Opus and CELT Alpha codecs are supported.
 
-Quite a few features, most noticeably voice activity detection and all
+Quite a few features, most notably all
 administrative functionallity, are still missing.
 
 ### Installing
diff --git a/app/index.html b/app/index.html
index 18641ae..4f47c83 100644
--- a/app/index.html
+++ b/app/index.html
@@ -67,11 +67,23 @@
                   <td>
                     <select data-bind='value: voiceMode'>
                       <option value="cont">Continuous</option>
-                      <option value="vad" disabled>Voice Activity</option>
+                      <option value="vad">Voice Activity</option>
                       <option value="ptt">Push To Talk</option>
                   </td>
                 </tr>
-                <tr data-bind="style: {visibility: voiceMode() == 'ptt' ? 'visible' : 'hidden'}">
+                <tr data-bind="visible: voiceMode() == 'vad'">
+                  <td colspan="2">
+                    <div class="mic-volume-container">
+                      <div class="mic-volume" data-bind="style: {
+                          width: testVadLevel()*100 + '%',
+                          background: testVadActive() ? 'green' : 'red'
+                        }"></div>
+                    </div>
+                    <input type="range" min="0" max="1" step="0.01"
+                           data-bind="value: vadLevel">
+                  </td>
+                </tr>
+                <tr data-bind="visible: voiceMode() == 'ptt'">
                   <td>PTT Key</td>
                   <td>
                     <input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">
diff --git a/app/index.js b/app/index.js
index 53717e4..ee860a9 100644
--- a/app/index.js
+++ b/app/index.js
@@ -9,7 +9,7 @@ import ko from 'knockout'
 import _dompurify from 'dompurify'
 import keyboardjs from 'keyboardjs'
 
-import { ContinuousVoiceHandler, PushToTalkVoiceHandler, initVoice } from './voice'
+import { ContinuousVoiceHandler, PushToTalkVoiceHandler, VADVoiceHandler, initVoice } from './voice'
 
 const dompurify = _dompurify(window)
 
@@ -58,11 +58,34 @@ class SettingsDialog {
     this.voiceMode = ko.observable(settings.voiceMode)
     this.pttKey = ko.observable(settings.pttKey)
     this.pttKeyDisplay = ko.observable(settings.pttKey)
+    this.vadLevel = ko.observable(settings.vadLevel)
+    this.testVadLevel = ko.observable(0)
+    this.testVadActive = ko.observable(false)
+
+    this._setupTestVad()
+    this.vadLevel.subscribe(() => this._setupTestVad())
+  }
+
+  // Replaces the test VAD handler with one using the current vadLevel
+  _setupTestVad () {
+    if (this._testVad) this._testVad.end()
+    const preview = new VADVoiceHandler(null, this.vadLevel())
+    preview.on('started_talking', () => this.testVadActive(true))
+    preview.on('stopped_talking', () => this.testVadActive(false))
+    preview.on('level', value => this.testVadLevel(value))
+    this._testVad = preview
+    testVoiceHandler = preview
+  }
 
   applyTo (settings) {
     settings.voiceMode = this.voiceMode()
     settings.pttKey = this.pttKey()
+    settings.vadLevel = this.vadLevel()
+  }
+
+  end () { // Shuts down the test VAD handler; closeSettings calls this on the open dialog
+    this._testVad.end()
+    testVoiceHandler = null // the initVoice data callback then no longer writes to it
+  }
 
   recordPttKey () {
@@ -89,14 +112,16 @@ class SettingsDialog {
 class Settings {
   constructor () {
     const load = key => window.localStorage.getItem('mumble.' + key)
-    this.voiceMode = load('voiceMode') || 'cont'
+    this.voiceMode = load('voiceMode') || 'vad'
     this.pttKey = load('pttKey') || 'ctrl + shift'
+    this.vadLevel = load('vadLevel') || 0.3
   }
 
   save () {
     const save = (key, val) => window.localStorage.setItem('mumble.' + key, val)
     save('voiceMode', this.voiceMode)
     save('pttKey', this.pttKey)
+    save('vadLevel', this.vadLevel)
   }
 }
 
@@ -130,10 +155,13 @@ class GlobalBindings {
       this._updateVoiceHandler()
 
       this.settings.save()
-      this.settingsDialog(null)
+      this.closeSettings()
     }
 
     this.closeSettings = () => {
+      if (this.settingsDialog()) {
+        this.settingsDialog().end()
+      }
       this.settingsDialog(null)
     }
 
@@ -360,7 +388,7 @@ class GlobalBindings {
       } else if (mode === 'ptt') {
         voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
       } else if (mode === 'vad') {
-
+        voiceHandler = new VADVoiceHandler(this.client, this.settings.vadLevel)
       } else {
         log('Unknown voice mode:', mode)
         return
@@ -586,15 +614,19 @@ function userToState () {
 }
 
 var voiceHandler
+var testVoiceHandler
 
 initVoice(data => {
+  if (testVoiceHandler) {
+    testVoiceHandler.write(data)
+  }
   if (!ui.client) {
     if (voiceHandler) {
       voiceHandler.end()
     }
     voiceHandler = null
   } else if (voiceHandler) {
-    voiceHandler.write(new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
+    voiceHandler.write(data)
   }
 }, err => {
   log('Cannot initialize user media. Microphone will not work:', err)
diff --git a/app/voice.js b/app/voice.js
index 73ac001..ea48c20 100644
--- a/app/voice.js
+++ b/app/voice.js
@@ -1,10 +1,12 @@
-import { Writable } from 'stream'
+import { Writable, Transform } from 'stream'
 import MicrophoneStream from 'microphone-stream'
 import audioContext from 'audio-context'
 import chunker from 'stream-chunker'
 import Resampler from 'libsamplerate.js'
 import getUserMedia from 'getusermedia'
 import keyboardjs from 'keyboardjs'
+import vad from 'voice-activity-detection'
+import DropStream from 'drop-stream'
 
 class VoiceHandler extends Writable {
   constructor (client) {
@@ -15,7 +17,29 @@ class VoiceHandler extends Writable {
 
   _getOrCreateOutbound () {
     if (!this._outbound) {
-      this._outbound = this._client.createVoiceStream()
+      if (!this._client) {
+        this._outbound = DropStream.obj()
+        this.emit('started_talking')
+        return this._outbound
+      }
+      this._outbound = new Resampler({
+        unsafe: true,
+        type: Resampler.Type.SINC_FASTEST,
+        ratio: 48000 / audioContext.sampleRate
+      })
+
+      const buffer2Float32Array = new Transform({
+        transform (data, _, callback) {
+          callback(null, new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
+        },
+        readableObjectMode: true
+      })
+
+      this._outbound
+        .pipe(chunker(4 * 480))
+        .pipe(buffer2Float32Array)
+        .pipe(this._client.createVoiceStream())
+
       this.emit('started_talking')
     }
     return this._outbound
@@ -74,24 +98,99 @@ export class PushToTalkVoiceHandler extends VoiceHandler {
   }
 }
 
+// Voice handler which only transmits while voice activity is detected.
+// Emits 'level' with every value the detector passes to onUpdate.
+export class VADVoiceHandler extends VoiceHandler {
+  // client: handed to VoiceHandler; the settings dialog passes null for testing.
+  // level: used as both minNoiseLevel and maxNoiseLevel of the detector.
+  constructor (client, level) {
+    super(client)
+    this._vadLevel = level
+    // Need to keep a backlog of the last ~150ms (dependent on sample rate)
+    // because VAD will activate with ~125ms delay
+    this._backlog = []
+    this._backlogLength = 0
+    this._backlogLengthMin = 1024 * 6 * 4 // vadBufferLen * (vadDelay + 1) * bytesPerSample
+    this._ensureVad()
+  }
+
+  // Creates the detector unless it already exists or there is no microphone
+  // stream yet. theUserMedia stays null until getUserMedia succeeds, so the
+  // creation is retried from _write instead of being attempted with null.
+  // NOTE(review): vad() is expected to throw on a null stream - confirm.
+  _ensureVad () {
+    if (this._vad || !theUserMedia) {
+      return
+    }
+    this._vad = vad(audioContext, theUserMedia, {
+      onVoiceStart: () => {
+        console.log('vad: start')
+        this._active = true
+      },
+      onVoiceStop: () => {
+        console.log('vad: stop')
+        this._stopOutbound()
+        this._active = false
+      },
+      onUpdate: val => {
+        this._level = val
+        this.emit('level', val)
+      },
+      noiseCaptureDuration: 0,
+      minNoiseLevel: this._vadLevel,
+      maxNoiseLevel: this._vadLevel
+    })
+  }
+
+  // While inactive, data is kept in the backlog; once the detector reports
+  // voice, the backlog is sent first, followed by the current data.
+  _write (data, _, callback) {
+    // Covers handlers created before the microphone stream was available
+    this._ensureVad()
+    if (this._active) {
+      if (this._backlog.length > 0) {
+        for (const oldData of this._backlog) {
+          this._getOrCreateOutbound().write(oldData)
+        }
+        this._backlog = []
+        this._backlogLength = 0
+      }
+      this._getOrCreateOutbound().write(data, callback)
+    } else {
+      // Make sure we always keep the backlog filled if we're not (yet) talking
+      this._backlog.push(data)
+      this._backlogLength += data.length
+      // Check if we can discard the oldest element without becoming too short
+      if (this._backlogLength - this._backlog[0].length > this._backlogLengthMin) {
+        this._backlogLength -= this._backlog.shift().length
+      }
+      callback()
+    }
+  }
+
+  // Destroys the detector (if one was ever created) after the base class
+  // has finished, then reports completion.
+  _final (callback) {
+    super._final(e => {
+      if (this._vad) {
+        this._vad.destroy()
+      }
+      callback(e)
+    })
+  }
+}
+
+var theUserMedia = null
+
 export function initVoice (onData, onUserMediaError) {
-  var resampler = new Resampler({
-    unsafe: true,
-    type: Resampler.Type.SINC_FASTEST,
-    ratio: 48000 / audioContext.sampleRate
-  })
-
-  resampler.pipe(chunker(4 * 480)).on('data', data => {
-    onData(data)
-  })
-
   getUserMedia({ audio: true }, (err, userMedia) => {
     if (err) {
       onUserMediaError(err)
     } else {
-      var micStream = new MicrophoneStream(userMedia, { objectMode: true })
+      theUserMedia = userMedia
+      var micStream = new MicrophoneStream(userMedia, { objectMode: true, bufferSize: 1024 })
       micStream.on('data', data => {
-        resampler.write(Buffer.from(data.getChannelData(0).buffer))
+        onData(Buffer.from(data.getChannelData(0).buffer))
       })
     }
   })
diff --git a/package.json b/package.json
index e6721ff..53ecd8a 100644
--- a/package.json
+++ b/package.json
@@ -25,6 +25,7 @@
     "brfs": "^1.4.3",
     "css-loader": "^0.26.0",
     "dompurify": "^0.8.9",
+    "drop-stream": "^1.0.0",
     "duplex-maker": "^1.0.0",
     "extract-loader": "^0.1.0",
     "file-loader": "^0.9.0",
@@ -39,10 +40,11 @@
     "regexp-replace-loader": "0.0.1",
     "stream-chunker": "^1.2.8",
     "transform-loader": "^0.2.3",
+    "voice-activity-detection": "johni0702/voice-activity-detection#9f8bd90",
     "webpack": "^1.13.3",
     "webworkify-webpack-dropin": "^1.1.9",
     "libsamplerate.js": "^1.0.0",
-    "mumble-client-codecs-browser": "^1.1.0",
+    "mumble-client-codecs-browser": "^1.1.1",
     "mumble-client-websocket": "^1.0.0",
     "mumble-client": "^1.1.0",
     "web-audio-buffer-queue": "^1.0.0"
diff --git a/themes/MetroMumbleLight/main.css b/themes/MetroMumbleLight/main.css
index bcff5f4..7ece095 100644
--- a/themes/MetroMumbleLight/main.css
+++ b/themes/MetroMumbleLight/main.css
@@ -208,8 +208,10 @@ form {
   border-bottom: 1px solid darkgray;
 }
 .dialog-footer {
+  position: absolute;
+  bottom: 0px;
+  width: calc(100% - 20px);
   margin: 10px;
-  margin-bottom: 0px;
 }
 .dialog-close {
   float: left;
@@ -255,6 +257,14 @@ form {
 }
 .settings-dialog table input {
   width: 100%;
+  margin: 0px;
+}
+.settings-dialog .mic-volume-container {
+  height: 10px;
+  border: 3px solid black;
+}
+.settings-dialog .mic-volume {
+  height: 100%;
 }
 .connect-dialog {
   width: 300px;