Here is what I use to create my recognition request:
/// Starts a live speech-recognition session fed by the audio engine's input node.
///
/// Any task still held in `recognitionTask` is cancelled first. The method then marks
/// the recognizer as active, notifies the delegate, configures the shared audio session
/// for recording, creates a buffer-based recognition request with partial results
/// enabled, and installs a tap on the input node that appends captured buffers to the
/// current request.
///
/// If configuring the audio session or starting the audio engine throws, everything
/// set up so far is torn down, `isRecognizing` is reset, the delegate is told that
/// recognition finished, and the error is rethrown to the caller.
///
/// - Throws: Errors from configuring/activating `AVAudioSession`, or from
///   `audioEngine.start()`.
func recordSpeech() throws {
    // Cancel the previous task if it's running.
    if let previousTask = recognitionTask {
        previousTask.cancel()
        recognitionTask = nil
    }

    isRecognizing = true
    delegate?.recognitionStarted(sender: self)

    do {
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(AVAudioSessionCategoryRecord)
        try audioSession.setMode(AVAudioSessionModeMeasurement)
        try audioSession.setActive(true, with: .notifyOthersOnDeactivation)

        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()

        guard let inputNode = audioEngine.inputNode else {
            print("there was an error in audioEngine.inputNode")
            fatalError("Audio engine has no input node")
        }
        guard let recognitionRequest = recognitionRequest else {
            fatalError("Unable to create a SFSpeechAudioBufferRecognitionRequest object")
        }

        // Configure request so that results are returned before audio recording is finished
        recognitionRequest.shouldReportPartialResults = true

        // A recognition task represents a speech recognition session.
        // We keep a reference to the task so that it can be cancelled.
        recognitionTask = recognizer.recognitionTask(with: recognitionRequest) { result, error in
            func finalizeResult() {
                // A late callback from a task that was cancelled by a newer call to
                // recordSpeech() must not tear down the newer session's engine and tap.
                guard self.recognitionRequest === recognitionRequest else {
                    return
                }
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)
                self.recognitionRequest = nil
                self.recognitionTask = nil
            }

            guard error == nil else {
                finalizeResult()
                return
            }
            // Neither an error nor a result: nothing to process. (The previous
            // force-unwrap of `result?.isFinal` crashed in this case.)
            guard let result = result else {
                return
            }

            if result.isFinal {
                finalizeResult()
            } else if self.isRecognizing {
                // process partial result
                self.processRecognition(result: result)
            }
        }

        let recordingFormat = inputNode.outputFormat(forBus: 0)
        // Only one tap may be installed on a bus. A tap from an earlier session can
        // still be present if that session has not finalized yet, so drop it first.
        inputNode.removeTap(onBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()
        do {
            try audioEngine.start()
        } catch let error as NSError {
            print("audio engine start error=\(error)")
            // Propagate instead of swallowing: without a running engine no audio
            // ever reaches the request, so the session cannot produce results.
            throw error
        }
    } catch {
        // Roll back the partially started session so state and delegate stay consistent.
        audioEngine.stop()
        audioEngine.inputNode?.removeTap(onBus: 0)
        recognitionTask?.cancel()
        recognitionTask = nil
        recognitionRequest = nil
        isRecognizing = false
        delegate?.recognitionFinished()
        throw error
    }
}
To cancel or stop this at any point I use these methods:
/// Ends the current recording: clears `isRecognizing`, stops the audio engine,
/// signals end-of-audio on the active recognition request (if there is one),
/// and tells the delegate that recognition finished.
@objc func stopRecording() {
    self.isRecognizing = false
    self.audioEngine.stop()

    // Mark the end of the audio input for the current request, if any.
    if let activeRequest = self.recognitionRequest {
        activeRequest.endAudio()
    }

    if let listener = self.delegate {
        listener.recognitionFinished()
    }
}
/// Aborts the current recording: clears `isRecognizing`, stops the audio engine,
/// cancels the active recognition task (if there is one), and tells the delegate
/// that recognition finished.
func cancelRecording() {
    self.isRecognizing = false
    self.audioEngine.stop()

    // Cancel rather than end the audio, in contrast to stopRecording().
    if let activeTask = self.recognitionTask {
        activeTask.cancel()
    }

    if let listener = self.delegate {
        listener.recognitionFinished()
    }
}
I would set up a button to trigger speech recognition and tie it to recordSpeech(). Then set up a second button and tie it to stopRecording(). When the user stops the request, result?.isFinal will be true, and you know that is the final text from the first input. The user could then use speech input again for the second set of speech.
Most of my code came from the 2016 WWDC session on Speech Recognition which you can find here:
Transcript
Video
audioEngine.stop()
and recognitionRequest?.endAudio()
in the function that is called when the user stops recognition. – Dermatitis