# Voice Interaction (#30)

## ♻️ Current situation & Problem
Currently, users can interact with the chat assistant only via the
keyboard. Adding voice-based interaction increases the accessibility of
the chat interface and allows for contactless communication.

## ⚙️ Release Notes
Added voice recognition built on Apple's `SFSpeechRecognizer`, following
Apple's [Recognizing Speech in Live
Audio](https://developer.apple.com/documentation/speech/recognizing_speech_in_live_audio)
example. A microphone button is added next to the send-message icon that
allows the user to start and stop recording their voice. While recording,
the message text field is populated in real time with the recognized text
(the recognizer is set to the system language). Tapping the send button
sends the recognized text; the user can also say "send" to send the
message. The button indicates an active recording by turning red and
playing a pulsing animation.

Voice interaction is integrated into `MessageInputView.swift` in a
condensed form that avoids `while` loops.
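
For adopters of the new module, here is a minimal sketch of how a SwiftUI
view can consume the `SpeechRecognizer` stream introduced in this PR (the
example view and its properties are illustrative; only the
`SpeechRecognizer` API itself comes from this change):

```swift
import SpeziSpeechRecognizer
import SwiftUI

struct VoiceInputExample: View {
    @StateObject private var speechRecognizer = SpeechRecognizer()
    @State private var message = ""

    var body: some View {
        Button(speechRecognizer.isRecording ? "Stop Recording" : "Record") {
            if speechRecognizer.isRecording {
                speechRecognizer.stop()
            } else {
                Task {
                    do {
                        // Each partial result replaces the current message text.
                        for try await result in speechRecognizer.start() {
                            message = result.bestTranscription.formattedString
                        }
                    } catch {
                        // The stream finishes with an error if recognition fails.
                        print("Speech recognition failed: \(error)")
                    }
                }
            }
        }
        .disabled(!speechRecognizer.isAvailable)
    }
}
```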

## 📝 Code of Conduct & Contributing Guidelines 

By creating this pull request, you agree to follow our [Code
of
Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md)
and [Contributing
Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md):
- [x] I agree to follow the [Code of
Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md)
and [Contributing
Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md).

---------

Co-authored-by: Paul Schmiedmayer <PSchmiedmayer@users.noreply.github.com>
AdritRao and PSchmiedmayer authored Oct 10, 2023
1 parent 747a409 commit 76e8d78
Showing 7 changed files with 255 additions and 38 deletions.
13 changes: 3 additions & 10 deletions .github/workflows/build-and-test.yml
```diff
@@ -20,20 +20,13 @@ jobs:
     name: Build and Test Swift Package
     uses: StanfordSpezi/.github/.github/workflows/xcodebuild-or-fastlane.yml@v2
     with:
-      artifactname: SpeziML.xcresult
+      artifactname: SpeziML-Package.xcresult
       runsonlabels: '["macOS", "self-hosted"]'
-      scheme: SpeziML
-  build:
-    name: Build Swift Package on Xcode 14
-    uses: StanfordSpezi/.github/.github/workflows/xcodebuild-or-fastlane.yml@v2
-    with:
-      runsonlabels: '["macos-13"]'
-      scheme: SpeziML
+      scheme: SpeziML-Package
   buildandtestuitests:
     name: Build and Test UI Tests
     uses: StanfordSpezi/.github/.github/workflows/xcodebuild-or-fastlane.yml@v2
     with:
-      xcodeversion: latest
       artifactname: TestApp.xcresult
       runsonlabels: '["macOS", "self-hosted"]'
       path: 'Tests/UITests'
@@ -43,4 +36,4 @@ jobs:
     needs: [buildandtest, buildandtestuitests]
     uses: StanfordSpezi/.github/.github/workflows/create-and-upload-coverage-report.yml@v2
     with:
-      coveragereports: SpeziML.xcresult TestApp.xcresult
+      coveragereports: SpeziML-Package.xcresult TestApp.xcresult
```
9 changes: 7 additions & 2 deletions Package.swift
```diff
@@ -1,4 +1,4 @@
-// swift-tools-version:5.7
+// swift-tools-version:5.9
 
 //
 // This source file is part of the Stanford Spezi open source project
@@ -18,7 +18,8 @@ let package = Package(
         .iOS(.v16)
     ],
     products: [
-        .library(name: "SpeziOpenAI", targets: ["SpeziOpenAI"])
+        .library(name: "SpeziOpenAI", targets: ["SpeziOpenAI"]),
+        .library(name: "SpeziSpeechRecognizer", targets: ["SpeziSpeechRecognizer"])
     ],
     dependencies: [
         .package(url: "https://github.com/MacPaw/OpenAI", .upToNextMinor(from: "0.2.3")),
@@ -30,13 +31,17 @@ let package = Package(
         .target(
             name: "SpeziOpenAI",
             dependencies: [
+                .target(name: "SpeziSpeechRecognizer"),
                 .product(name: "OpenAI", package: "OpenAI"),
                 .product(name: "Spezi", package: "Spezi"),
                 .product(name: "SpeziLocalStorage", package: "SpeziStorage"),
                 .product(name: "SpeziSecureStorage", package: "SpeziStorage"),
                 .product(name: "SpeziOnboarding", package: "SpeziOnboarding")
             ]
         ),
+        .target(
+            name: "SpeziSpeechRecognizer"
+        ),
         .testTarget(
             name: "SpeziOpenAITests",
             dependencies: [
```
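
With `SpeziSpeechRecognizer` exposed as its own library product, a
downstream package can depend on the recognizer without pulling in
`SpeziOpenAI`. A minimal sketch of the consuming side, assuming the
dependency is declared under the package name `SpeziML`; the target name
is hypothetical:

```swift
// In the consuming package's Package.swift; "MyVoiceFeature" is a placeholder.
.target(
    name: "MyVoiceFeature",
    dependencies: [
        .product(name: "SpeziSpeechRecognizer", package: "SpeziML")
    ]
)
```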
106 changes: 82 additions & 24 deletions Sources/SpeziOpenAI/MessageInputView.swift
```diff
@@ -6,17 +6,21 @@
 // SPDX-License-Identifier: MIT
 //
 
+import AVFoundation
 import OpenAI
+import Speech
+import SpeziSpeechRecognizer
 import SwiftUI
 
 
 /// Displays a textfield to append a message to a chat.
 public struct MessageInputView: View {
-    let messagePlaceholder: String
+    private let messagePlaceholder: String
+    @StateObject private var speechRecognizer = SpeechRecognizer()
 
-    @Binding var chat: [Chat]
-    @State var message: String = ""
-    @State var messageViewHeight: CGFloat = 0
+    @Binding private var chat: [Chat]
+    @State private var message: String = ""
+    @State private var messageViewHeight: CGFloat = 0
 
 
     public var body: some View {
@@ -33,32 +37,21 @@ public struct MessageInputView: View {
                     RoundedRectangle(cornerRadius: 20)
                         .fill(.white.opacity(0.2))
                 }
-                .padding(.trailing, -30)
+                .padding(.trailing, -42)
             }
             .lineLimit(1...5)
-            Button(
-                action: {
-                    chat.append(Chat(role: .user, content: message))
-                    message = ""
-                },
-                label: {
-                    Image(systemName: "arrow.up.circle.fill")
-                        .accessibilityLabel(String(localized: "SEND_MESSAGE", bundle: .module))
-                        .font(.title)
-                        .padding(.horizontal, -14)
-                        .foregroundColor(
-                            message.isEmpty ? Color(.systemGray5) : .accentColor
-                        )
+            Group {
+                if speechRecognizer.isAvailable && (message.isEmpty || speechRecognizer.isRecording) {
+                    microphoneButton
+                } else {
+                    sendButton
+                        .disabled(message.isEmpty)
+                }
             }
-            )
-            .padding(.trailing, -38)
-            .padding(.bottom, 3)
-            .disabled(message.isEmpty)
+            .frame(minWidth: 33)
         }
-        .padding(.trailing, 23)
+        .padding(.horizontal, 16)
         .padding(.vertical, 6)
-        .background(.white.opacity(0.4))
+        .background(.thinMaterial)
         .background {
             GeometryReader { proxy in
@@ -74,6 +67,46 @@ public struct MessageInputView: View {
         .messageInputViewHeight(messageViewHeight)
     }
 
+    private var sendButton: some View {
+        Button(
+            action: {
+                sendMessageButtonPressed()
+            },
+            label: {
+                Image(systemName: "arrow.up.circle.fill")
+                    .accessibilityLabel(String(localized: "SEND_MESSAGE", bundle: .module))
+                    .font(.title)
+                    .foregroundColor(
+                        message.isEmpty ? Color(.systemGray5) : .accentColor
+                    )
+            }
+        )
+        .offset(x: -2, y: -3)
+    }
+
+    private var microphoneButton: some View {
+        Button(
+            action: {
+                microphoneButtonPressed()
+            },
+            label: {
+                Image(systemName: "mic.fill")
+                    .accessibilityLabel(String(localized: "MICROPHONE_BUTTON", bundle: .module))
+                    .font(.title2)
+                    .foregroundColor(
+                        speechRecognizer.isRecording ? .red : Color(.systemGray2)
+                    )
+                    .scaleEffect(speechRecognizer.isRecording ? 1.2 : 1.0)
+                    .opacity(speechRecognizer.isRecording ? 0.7 : 1.0)
+                    .animation(
+                        speechRecognizer.isRecording ? .easeInOut(duration: 0.5).repeatForever(autoreverses: true) : .default,
+                        value: speechRecognizer.isRecording
+                    )
+            }
+        )
+        .offset(x: -4, y: -6)
+    }
+
 
     /// - Parameters:
     ///   - chat: The chat that should be appended to.
@@ -85,6 +118,31 @@ public struct MessageInputView: View {
         self._chat = chat
         self.messagePlaceholder = messagePlaceholder ?? "Message"
     }
+
+
+    private func sendMessageButtonPressed() {
+        speechRecognizer.stop()
+        chat.append(Chat(role: .user, content: message))
+        message = ""
+    }
+
+    private func microphoneButtonPressed() {
+        if speechRecognizer.isRecording {
+            speechRecognizer.stop()
+        } else {
+            Task {
+                do {
+                    for try await result in speechRecognizer.start() {
+                        if result.bestTranscription.formattedString.contains("send") {
+                            sendMessageButtonPressed()
+                        } else {
+                            message = result.bestTranscription.formattedString
+                        }
+                    }
+                }
+            }
+        }
+    }
 }
```
1 change: 1 addition & 0 deletions Sources/SpeziOpenAI/Resources/en.lproj/Localizable.strings
```diff
@@ -23,4 +23,5 @@
 
 // MARK: Message Views
 "MESSAGE_INPUT_TEXTFIELD" = "Message Input Textfield";
+"MICROPHONE_BUTTON" = "Record Message";
 "SEND_MESSAGE" = "Send Message";
```
144 changes: 144 additions & 0 deletions Sources/SpeziSpeechRecognizer/SpeechRecognizer.swift
New file:

```swift
//
// This source file is part of the Stanford Spezi open source project
//
// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md)
//
// SPDX-License-Identifier: MIT
//

import Speech


/// Encapsulates the functionality of the `SFSpeechRecognizer`.
///
/// It provides methods to start and stop voice recognition, and publishes the state of recognition and its availability.
public class SpeechRecognizer: NSObject, ObservableObject, SFSpeechRecognizerDelegate {
    private let speechRecognizer: SFSpeechRecognizer?
    private let audioEngine: AVAudioEngine?

    /// Indicates whether the speech recognition is currently in progress.
    @Published public private(set) var isRecording = false
    /// Indicates the availability of the speech recognition service.
    @Published public private(set) var isAvailable: Bool

    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?


    /// Initializes a new instance of `SpeechRecognizer`.
    ///
    /// - Parameter locale: The locale for the speech recognition. Defaults to the current locale.
    public init(locale: Locale = .current) {
        if let speechRecognizer = SFSpeechRecognizer(locale: locale) {
            self.speechRecognizer = speechRecognizer
            self.isAvailable = speechRecognizer.isAvailable
        } else {
            self.speechRecognizer = nil
            self.isAvailable = false
        }

        self.audioEngine = AVAudioEngine()

        super.init()

        speechRecognizer?.delegate = self
    }


    /// Starts the speech recognition process.
    ///
    /// - Returns: An asynchronous stream of speech recognition results.
    public func start() -> AsyncThrowingStream<SFSpeechRecognitionResult, Error> { // swiftlint:disable:this function_body_length
        // We allow a larger function and closure length as the function provides a clear, encapsulated functionality and the closure is mainly
        // the function wrapped in a continuation.
        AsyncThrowingStream { continuation in // swiftlint:disable:this closure_body_length
            guard !isRecording else {
                print("You already have a recording session in progress; please cancel it using `stop` before starting a new session.")
                stop()
                continuation.finish()
                return
            }

            guard isAvailable, let audioEngine, let speechRecognizer else {
                print("The speech recognizer is not available.")
                stop()
                continuation.finish()
                return
            }

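            // Configure the shared audio session for recording; .duckOthers lowers the volume of other audio while the microphone is active.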
            do {
                let audioSession = AVAudioSession.sharedInstance()
                try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
                try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
            } catch {
                print("Error setting up the audio session: \(error.localizedDescription)")
                stop()
                continuation.finish(throwing: error)
            }

            let inputNode = audioEngine.inputNode

            let recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
            recognitionRequest.shouldReportPartialResults = true
            self.recognitionRequest = recognitionRequest

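            // Bridge the recognizer's callback-based API into the async stream: errors finish the stream; results are yielded to the caller.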
            recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
                if let error {
                    continuation.finish(throwing: error)
                }

                guard self.isRecording, let result else {
                    self.stop()
                    return
                }

                continuation.yield(result)
            }

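            // Install a tap on the microphone input node so that captured audio buffers are appended to the recognition request.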
            let recordingFormat = inputNode.outputFormat(forBus: 0)
            inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
                self.recognitionRequest?.append(buffer)
            }

            audioEngine.prepare()
            do {
                isRecording = true
                try audioEngine.start()
            } catch {
                print("Error starting the audio engine: \(error.localizedDescription)")
                stop()
                continuation.finish(throwing: error)
            }

            continuation.onTermination = { @Sendable _ in
                self.stop()
            }
        }
    }


    /// Stops the current speech recognition session.
    public func stop() {
        guard isAvailable && isRecording else {
            return
        }

        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)

        recognitionRequest?.endAudio()
        recognitionRequest = nil

        recognitionTask?.cancel()
        recognitionTask = nil

        isRecording = false
    }


    @_documentation(visibility: internal)
    public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
        guard self.speechRecognizer == speechRecognizer else {
            return
        }

        self.isAvailable = available
    }
}
```
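
Note for adopters: speech recognition requires user authorization at
runtime, so apps embedding this module need the
`NSSpeechRecognitionUsageDescription` and `NSMicrophoneUsageDescription`
keys in their `Info.plist`.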
4 changes: 2 additions & 2 deletions Tests/UITests/TestAppUITests/TestAppUITests.swift
```diff
@@ -75,12 +75,12 @@ class TestAppUITests: XCTestCase {
 
         XCTAssert(app.staticTexts["User Message!"].waitForExistence(timeout: 2))
         XCTAssert(app.staticTexts["Assistant Message!"].waitForExistence(timeout: 2))
-        XCTAssert(app.buttons["Send Message"].waitForExistence(timeout: 2))
+        XCTAssert(app.buttons["Record Message"].waitForExistence(timeout: 2))
 
         XCTAssertFalse(app.staticTexts["System Message!"].waitForExistence(timeout: 2))
         XCTAssertFalse(app.staticTexts["Function Message!"].waitForExistence(timeout: 2))
 
-        XCTAssertFalse(app.buttons["Send Message"].isEnabled)
+        XCTAssert(app.buttons["Record Message"].isEnabled)
         try app.textViews["Message Input Textfield"].enter(value: "New Message!", dismissKeyboard: false)
         XCTAssert(app.buttons["Send Message"].isEnabled)
```
