Add voice activity detection

2017-09-20 15:16:49 +02:00 · 2017-09-20 15:16:49 +02:00 · 80f766379d
commit 80f766379d
parent c49dabbfc4
6 changed files with 156 additions and 24 deletions
--- a/app/index.html
+++ b/app/index.html
@ -67,11 +67,23 @@
                  <td>
                    <select data-bind='value: voiceMode'>
                      <option value="cont">Continuous</option>
-                      <option value="vad" disabled>Voice Activity</option>
+                      <option value="vad">Voice Activity</option>
                      <option value="ptt">Push To Talk</option>
                  </td>
                </tr>
-                <tr data-bind="style: {visibility: voiceMode() == 'ptt' ? 'visible' : 'hidden'}">
+                <!-- Voice-activity settings row, shown only while voiceMode() is 'vad'.
+                     The bar's width follows testVadLevel() (scaled to a percentage) and it is
+                     green while testVadActive() is true, red otherwise; the slider (0 to 1 in
+                     0.01 steps) is bound to vadLevel. -->
+                <tr data-bind="visible: voiceMode() == 'vad'">
+                  <td colspan="2">
+                    <div class="mic-volume-container">
+                      <div class="mic-volume" data-bind="style: {
+                          width: testVadLevel()*100 + '%',
+                          background: testVadActive() ? 'green' : 'red'
+                        }"></div>
+                    </div>
+                    <input type="range" min="0" max="1" step="0.01"
+                           data-bind="value: vadLevel">
+                  </td>
+                </tr>
+                <tr data-bind="visible: voiceMode() == 'ptt'">
                  <td>PTT Key</td>
                  <td>
                    <input type="button" data-bind="value: pttKeyDisplay, click: recordPttKey">
--- a/app/index.js
+++ b/app/index.js
@ -9,7 +9,7 @@ import ko from 'knockout'
 import _dompurify from 'dompurify'
 import keyboardjs from 'keyboardjs'

-import { ContinuousVoiceHandler, PushToTalkVoiceHandler, initVoice } from './voice'
+import { ContinuousVoiceHandler, PushToTalkVoiceHandler, VADVoiceHandler, initVoice } from './voice'

 const dompurify = _dompurify(window)

@ -58,11 +58,34 @@ class SettingsDialog {
    this.voiceMode = ko.observable(settings.voiceMode)
    this.pttKey = ko.observable(settings.pttKey)
    this.pttKeyDisplay = ko.observable(settings.pttKey)
+    this.vadLevel = ko.observable(settings.vadLevel)
+    this.testVadLevel = ko.observable(0)
+    this.testVadActive = ko.observable(false)
+
+    this._setupTestVad()
+    this.vadLevel.subscribe(() => this._setupTestVad())
+  }
+
+  /**
+   * (Re)creates the dialog's test VAD handler for the current vadLevel.
+   *
+   * Any previous test handler is ended first. The new handler is built
+   * without a client and mirrors its 'started_talking', 'stopped_talking'
+   * and 'level' events into the testVadActive / testVadLevel observables.
+   * It is also published through the module-level testVoiceHandler variable.
+   */
+  _setupTestVad () {
+    const previous = this._testVad
+    if (previous) {
+      previous.end()
+    }
+
+    const handler = new VADVoiceHandler(null, this.vadLevel())
+    handler.on('started_talking', () => this.testVadActive(true))
+    handler.on('stopped_talking', () => this.testVadActive(false))
+    handler.on('level', level => this.testVadLevel(level))
+
+    this._testVad = handler
+    testVoiceHandler = handler
  }

  applyTo (settings) {
+    // Copy the current value of each dialog observable onto the given settings object.
    settings.voiceMode = this.voiceMode()
    settings.pttKey = this.pttKey()
+    settings.vadLevel = this.vadLevel()
+  }
+
+  /**
+   * Ends the dialog's test VAD handler and clears the module-level
+   * testVoiceHandler reference.
+   */
+  end () {
+    const handler = this._testVad
+    handler.end()
+    testVoiceHandler = null
  }

  recordPttKey () {
@ -89,14 +112,16 @@ class SettingsDialog {
 class Settings {
+  // User settings kept in window.localStorage under keys prefixed with 'mumble.'.
  constructor () {
+    // Each setting falls back to its default when the stored value is missing or empty.
    const load = key => window.localStorage.getItem('mumble.' + key)
-    this.voiceMode = load('voiceMode') || 'cont'
+    this.voiceMode = load('voiceMode') || 'vad'
    this.pttKey = load('pttKey') || 'ctrl + shift'
+    // NOTE(review): localStorage hands values back as strings, so a stored vadLevel is a
+    // string while the default below is a number - confirm consumers accept both.
+    this.vadLevel = load('vadLevel') || 0.3
  }

  save () {
+    // Writes every setting back using the same 'mumble.' key prefix the constructor reads.
    const save = (key, val) => window.localStorage.setItem('mumble.' + key, val)
    save('voiceMode', this.voiceMode)
    save('pttKey', this.pttKey)
+    save('vadLevel', this.vadLevel)
  }
 }

@ -130,10 +155,13 @@ class GlobalBindings {
      this._updateVoiceHandler()

      this.settings.save()
-      this.settingsDialog(null)
+      this.closeSettings()
    }

    this.closeSettings = () => {
+      // If a dialog is currently open, call its end() before the observable is cleared.
+      if (this.settingsDialog()) {
+        this.settingsDialog().end()
+      }
      this.settingsDialog(null)
    }

@ -360,7 +388,7 @@ class GlobalBindings {
      } else if (mode === 'ptt') {
        voiceHandler = new PushToTalkVoiceHandler(this.client, this.settings.pttKey)
      } else if (mode === 'vad') {
-
+        voiceHandler = new VADVoiceHandler(this.client, this.settings.vadLevel)
      } else {
        log('Unknown voice mode:', mode)
        return
@ -586,15 +614,19 @@ function userToState () {
 }

 var voiceHandler
+var testVoiceHandler

 initVoice(data => {
+  if (testVoiceHandler) {
+    testVoiceHandler.write(data)
+  }
  if (!ui.client) {
    if (voiceHandler) {
      voiceHandler.end()
    }
    voiceHandler = null
  } else if (voiceHandler) {
-    voiceHandler.write(new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
+    voiceHandler.write(data)
  }
 }, err => {
  log('Cannot initialize user media. Microphone will not work:', err)
--- a/app/voice.js
+++ b/app/voice.js
@ -1,10 +1,12 @@
-import { Writable } from 'stream'
+import { Writable, Transform } from 'stream'
 import MicrophoneStream from 'microphone-stream'
 import audioContext from 'audio-context'
 import chunker from 'stream-chunker'
 import Resampler from 'libsamplerate.js'
 import getUserMedia from 'getusermedia'
 import keyboardjs from 'keyboardjs'
+import vad from 'voice-activity-detection'
+import DropStream from 'drop-stream'

 class VoiceHandler extends Writable {
  constructor (client) {
@ -15,7 +17,29 @@ class VoiceHandler extends Writable {

  _getOrCreateOutbound () {
    if (!this._outbound) {
-      this._outbound = this._client.createVoiceStream()
+      if (!this._client) {
+        this._outbound = DropStream.obj()
+        this.emit('started_talking')
+        return this._outbound
+      }
+      this._outbound = new Resampler({
+        unsafe: true,
+        type: Resampler.Type.SINC_FASTEST,
+        ratio: 48000 / audioContext.sampleRate
+      })
+
+      const buffer2Float32Array = new Transform({
+        transform (data, _, callback) {
+          callback(null, new Float32Array(data.buffer, data.byteOffset, data.byteLength / 4))
+        },
+        readableObjectMode: true
+      })
+
+      this._outbound
+        .pipe(chunker(4 * 480))
+        .pipe(buffer2Float32Array)
+        .pipe(this._client.createVoiceStream())
+
      this.emit('started_talking')
    }
    return this._outbound
@ -74,24 +98,76 @@ export class PushToTalkVoiceHandler extends VoiceHandler {
  }
 }

+/**
+ * Voice handler that forwards microphone data only while the voice activity
+ * detector reports speech.
+ *
+ * While the detector is idle, incoming chunks are collected in a backlog;
+ * once it reports voice, the backlog is written out ahead of the live data.
+ * Every detector update is re-emitted as a 'level' event.
+ *
+ * NOTE(review): theUserMedia is null until initVoice has obtained the
+ * microphone - confirm this class is never constructed before that.
+ */
+export class VADVoiceHandler extends VoiceHandler {
+  /**
+   * @param client forwarded unchanged to the VoiceHandler constructor
+   * @param level  passed to the detector as both minNoiseLevel and maxNoiseLevel
+   */
+  constructor (client, level) {
+    super(client)
+    this._vad = vad(audioContext, theUserMedia, {
+      onVoiceStart: () => {
+        console.log('vad: start')
+        this._active = true
+      },
+      onVoiceStop: () => {
+        console.log('vad: stop')
+        this._stopOutbound()
+        this._active = false
+      },
+      onUpdate: (val) => {
+        this._level = val
+        this.emit('level', val)
+      },
+      noiseCaptureDuration: 0,
+      minNoiseLevel: level,
+      maxNoiseLevel: level
+    })
+    // The detector reacts with a delay (~125ms), so roughly the last ~150ms
+    // of audio (depending on sample rate) is retained while idle and sent
+    // first once talking starts.
+    this._backlog = []
+    this._backlogLength = 0
+    this._backlogLengthMin = 1024 * 6 * 4 // vadBufferLen * (vadDelay + 1) * bytesPerSample
+  }
+
+  _write (data, _, callback) {
+    if (!this._active) {
+      // Not talking (yet): keep the backlog topped up.
+      this._backlog.push(data)
+      this._backlogLength += data.length
+      // Drop the oldest chunk only if the remainder still exceeds the minimum.
+      const oldestLength = this._backlog[0].length
+      if (this._backlogLength - oldestLength > this._backlogLengthMin) {
+        this._backlog.shift()
+        this._backlogLength -= oldestLength
+      }
+      callback()
+      return
+    }
+
+    if (this._backlog.length > 0) {
+      // Flush everything captured before the detector fired.
+      this._backlog.forEach(chunk => {
+        this._getOrCreateOutbound().write(chunk)
+      })
+      this._backlog = []
+      this._backlogLength = 0
+    }
+    this._getOrCreateOutbound().write(data, callback)
+  }
+
+  _final (callback) {
+    // Let the base class finish first, then tear down the detector.
+    const destroyVadAndFinish = err => {
+      this._vad.destroy()
+      callback(err)
+    }
+    super._final(destroyVadAndFinish)
+  }
+}
+
+// Microphone stream stored by initVoice once getUserMedia succeeds; null until then.
+// VADVoiceHandler hands it to vad() when it is constructed.
+var theUserMedia = null
+
 export function initVoice (onData, onUserMediaError) {
-  var resampler = new Resampler({
-    unsafe: true,
-    type: Resampler.Type.SINC_FASTEST,
-    ratio: 48000 / audioContext.sampleRate
-  })
-
-  resampler.pipe(chunker(4 * 480)).on('data', data => {
-    onData(data)
-  })
-
  getUserMedia({ audio: true }, (err, userMedia) => {
    if (err) {
      onUserMediaError(err)
    } else {
-      var micStream = new MicrophoneStream(userMedia, { objectMode: true })
+      theUserMedia = userMedia
+      var micStream = new MicrophoneStream(userMedia, { objectMode: true, bufferSize: 1024 })
      micStream.on('data', data => {
-        resampler.write(Buffer.from(data.getChannelData(0).buffer))
+        onData(Buffer.from(data.getChannelData(0).buffer))
      })
    }
  })