My experimental code below (compileSdk 34, minSdk 33) works well as long as text-to-speech and speech recognition are considered separately:
package com.example.speechandspeak;

import android.Manifest;
import android.content.Intent;
import android.content.pm.PackageManager;
import android.os.Bundle;
import android.speech.RecognitionListener;
import android.speech.RecognizerIntent;
import android.speech.SpeechRecognizer;
import android.speech.tts.TextToSpeech;
import android.util.Log;
import android.widget.Toast;
import androidx.appcompat.app.AppCompatActivity;
import androidx.core.app.ActivityCompat;
import androidx.core.content.ContextCompat;
import java.util.ArrayList;
import java.util.Locale;

public class MainActivity extends AppCompatActivity implements RecognitionListener {

    private TextToSpeech tts;
    private SpeechRecognizer speechRecognizer;

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // Initialize TextToSpeech; speakText() is called once the engine is ready
        tts = new TextToSpeech(this, new TextToSpeech.OnInitListener() {
            @Override
            public void onInit(int status) {
                if (status == TextToSpeech.SUCCESS) {
                    int ttsLang = tts.setLanguage(Locale.US);
                    if (ttsLang == TextToSpeech.LANG_MISSING_DATA || ttsLang == TextToSpeech.LANG_NOT_SUPPORTED) {
                        Toast.makeText(MainActivity.this, "Language is not supported!", Toast.LENGTH_SHORT).show();
                    } else {
                        speakText();
                    }
                } else {
                    Toast.makeText(MainActivity.this, "TTS initialization failed!", Toast.LENGTH_SHORT).show();
                }
            }
        });

        // Request the microphone permission if it has not been granted yet
        if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) != PackageManager.PERMISSION_GRANTED) {
            ActivityCompat.requestPermissions(this, new String[]{Manifest.permission.RECORD_AUDIO}, 1);
        }

        // Initialize SpeechRecognizer
        speechRecognizer = SpeechRecognizer.createSpeechRecognizer(this);
        speechRecognizer.setRecognitionListener(this);
    }

    private void speakText() {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
        String message = getString(R.string.tts_message);
        tts.speak(message, TextToSpeech.QUEUE_FLUSH, null, null);

        // speak() is asynchronous, so recognition starts while the TTS output
        // is still playing (deliberately: this is the intended test case)
        Intent speechRecognizerIntent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);
        speechRecognizer.startListening(speechRecognizerIntent);
    }

    @Override
    public void onReadyForSpeech(Bundle params) {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onBeginningOfSpeech() {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onRmsChanged(float rmsdB) {
        //Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onBufferReceived(byte[] bytes) {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onEndOfSpeech() {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onError(int i) {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName() + "(" + i + ")");
    }

    @Override
    public void onEvent(int eventType, Bundle params) {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onPartialResults(Bundle partialResults) {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
    }

    @Override
    public void onResults(Bundle results) {
        Log.d(this.getLocalClassName(), "Entered: " + Thread.currentThread().getStackTrace()[2].getMethodName());
        ArrayList<String> matches = results.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION);
        if (matches != null && !matches.isEmpty()) {
            Log.v(this.getLocalClassName(), "Matches: " + matches);
            String text = matches.get(0).toLowerCase(Locale.US); // lowercase for case-insensitive comparison
            if (text.equals("stop")) {
                if (tts.isSpeaking()) {
                    tts.stop();
                    speechRecognizer.stopListening(); // Stop listening for further commands
                    Toast.makeText(MainActivity.this, "Stopped!", Toast.LENGTH_SHORT).show();
                }
            }
        }
    }

    @Override
    protected void onDestroy() {
        super.onDestroy();
        if (tts != null) {
            tts.stop();
            tts.shutdown();
        }
        if (speechRecognizer != null) {
            speechRecognizer.destroy(); // release the recognizer's resources
        }
    }
}
That is, the sample sentence ("This is a demonstration of Speech Recognition and Text-to-Speech. Say stop! to quit.") is played back clearly, and the speech recognizer starts and works properly, as the Logcat debug messages show:
12:40:09.125 MainActivity D Entered: speakText
12:40:09.297 D Entered: onReadyForSpeech
12:40:09.761 D Entered: onBeginningOfSpeech
12:40:13.365 D Entered: onEndOfSpeech
12:40:13.394 D Entered: onResults
12:40:13.396 V Matches: [demonstration of speech recognition and text to speech say stop to quit]
12:40:14.524 ProfileInstaller D Installing profile for com.example.speechandspeak
However, because speechRecognizer starts listening practically simultaneously with the TTS playback (deliberately: this is the intended test case), onResults() only recognizes the TTS output ("demonstration of speech recognition and text to speech say stop to quit") and ignores whatever I say during that time.
I would like it, instead, to focus on my voice and ignore the TTS output.
How do I accomplish that?
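One obvious workaround would be to serialize the two steps, i.e. defer startListening() until the TTS playback completes, via an UtteranceProgressListener. A minimal sketch of that fallback is below (the utterance ID "tts_prompt" is arbitrary, and it requires import android.speech.tts.UtteranceProgressListener;). Note that this is exactly what I want to avoid, since I need to catch "stop" while the TTS is still speaking:

// Sequential fallback: start recognition only after the TTS utterance finishes.
tts.setOnUtteranceProgressListener(new UtteranceProgressListener() {
    @Override
    public void onStart(String utteranceId) {
    }

    @Override
    public void onDone(String utteranceId) {
        // onDone() runs on a background thread; SpeechRecognizer must be
        // used from the main thread, so post back to it.
        runOnUiThread(() -> {
            Intent intent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);
            speechRecognizer.startListening(intent);
        });
    }

    @Override
    public void onError(String utteranceId) {
    }
});
tts.speak(message, TextToSpeech.QUEUE_FLUSH, null, "tts_prompt"); // non-null ID so the callbacks fire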
I know that Android 13+ smartphones are capable of listening through their own TTS output, because when I say "Hey Google" while TTS is playing out loud, my speech is recognized, the TTS output pauses, and Google Assistant starts listening...
How does it accomplish that?
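My working assumption (not something the documentation confirms) is acoustic echo cancellation: the hotword pipeline subtracts whatever the device itself is playing from the microphone signal. Android does expose an AcousticEchoCanceler effect, but only for an audio session you own, and SpeechRecognizer does not expose its session. So a sketch like the following (using android.media.AudioRecord and android.media.audiofx.AcousticEchoCanceler) would only help if you captured the audio yourself and fed it to an on-device recognizer:

// Hedged sketch: capture mic audio with AEC enabled, assuming you run your own
// recognizer on the raw buffers; SpeechRecognizer's session is not accessible.
int sampleRate = 16000;
int bufferSize = AudioRecord.getMinBufferSize(sampleRate,
        AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
AudioRecord recorder = new AudioRecord(
        MediaRecorder.AudioSource.VOICE_COMMUNICATION, // source tuned for echo cancellation
        sampleRate, AudioFormat.CHANNEL_IN_MONO,
        AudioFormat.ENCODING_PCM_16BIT, bufferSize);
if (AcousticEchoCanceler.isAvailable()) {
    AcousticEchoCanceler aec = AcousticEchoCanceler.create(recorder.getAudioSessionId());
    if (aec != null) {
        aec.setEnabled(true); // subtract the device's own playback from the mic signal
    }
}
recorder.startRecording();
// ... feed recorder.read(...) buffers to an on-device recognizer of your choice ...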
Google Assistant remains a closed API... It appears that VoiceInteractionService is what you are looking for. – Berkeleianism
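Regarding the VoiceInteractionService suggestion in the comment above: it is a real framework API (android.service.voice.VoiceInteractionService), but the system only binds to it after the user selects the app as the device assistant, so it is a much heavier commitment than SpeechRecognizer. A bare-bones, purely illustrative skeleton (the class name is mine; the manifest declaration and assistant selection are still required):

import android.service.voice.VoiceInteractionService;

// Illustrative skeleton only. The service must be declared in the manifest
// with android:permission="android.permission.BIND_VOICE_INTERACTION", and the
// user must pick this app as the assistant before the system binds to it.
public class MyVoiceInteractionService extends VoiceInteractionService {
    @Override
    public void onReady() {
        super.onReady();
        // The system has bound to this service; hotword detection and
        // voice sessions would be wired up from here.
    }
}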