diff options
author | Volker Hilsheimer <volker.hilsheimer@qt.io> | 2023-01-18 15:04:41 +0100 |
---|---|---|
committer | Volker Hilsheimer <volker.hilsheimer@qt.io> | 2023-02-19 19:36:36 +0100 |
commit | c03afcc297bf250baff8d0693e4db0c8cc77eeed (patch) | |
tree | 257c98299d9c94a3c998b13a5ef19a7d5acd1b78 | |
parent | ea5c48e518789c3387ed9c9d21978eda122e9782 (diff) |
Add QTextToSpeech::synthesize to produce PCM data rather than audio
The function starts the synthesis as an asynchronous process, and
emits a signal 'synthesized()' (or calls a functor) with a chunk of
PCM data as a QByteArray, and the QAudioFormat in which the data is
encoded.
This requires a dependency on Qt Multimedia for Qt Speech on all
platforms; so far it has been required only by the flite and winrt
backends.
Implemented for all engines, except the speechd and macos engines,
where it's not possible — these engines don't provide access to the data.
The test case verifies that the implementation is asynchronous, and
that it produces a reasonable amount of data. Since this involves
timer-based measurements, values need to be compared with some
appropriate margins.
The QML documentation of this API is omitted on purpose; the
QAudioFormat type is not available in QML, and we don't want to
encourage users to operate on raw bytes from QML anyway.
[ChangeLog][QtTextToSpeech][QTextToSpeech] Added the ability to
produce PCM data as a QByteArray. The QtTextToSpeech module now
depends on QtMultimedia on all platforms.
Fixes: QTBUG-109837
Change-Id: I308a3e18998827089c0f75789b720f1bd36e3c46
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
36 files changed, 920 insertions, 55 deletions
diff --git a/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java b/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java index a8640ef..d14add6 100644 --- a/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java +++ b/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java @@ -5,6 +5,7 @@ package org.qtproject.qt.android.speech; import android.content.ContentResolver; import android.content.Context; +import android.media.AudioFormat; import android.provider.Settings; import android.provider.Settings.SettingNotFoundException; import android.speech.tts.TextToSpeech; @@ -15,6 +16,7 @@ import android.os.Build; import android.os.Bundle; import android.util.Log; import java.lang.Float; +import java.io.File; import java.util.Locale; import java.util.List; import java.util.ArrayList; @@ -23,12 +25,16 @@ import java.util.Set; public class QtTextToSpeech { private static final String UTTERANCE_ID = "UtteranceId"; + private static final String SYNTHESIZE_ID = "SynthesizeId"; // Native callback functions native public void notifyError(long id, long reason); native public void notifyReady(long id); native public void notifySpeaking(long id); native public void notifyRangeStart(long id, int start, int end, int frame); + native public void notifyBeginSynthesis(long id, int sampleRateInHz, int audioFormat, int channelCount); + native public void notifyAudioAvailable(long id, byte[] bytes); + native public void notifyEndSynthesis(long id); private TextToSpeech mTts; private final long mId; @@ -62,6 +68,8 @@ public class QtTextToSpeech Log.d(utteranceTAG, "onDone"); if (utteranceId.equals(UTTERANCE_ID)) { notifyReady(mId); + } else if (utteranceId.equals(SYNTHESIZE_ID)) { + notifyEndSynthesis(mId); } } @@ -96,6 +104,36 @@ public class QtTextToSpeech notifyRangeStart(mId, start, end, frame); } } + + @Override + public void onBeginSynthesis(String utteranceId, int 
sampleRateInHz, int audioFormat, int channelCount) { + Log.d(utteranceTAG, "onBeginSynthesis"); + if (utteranceId.equals(SYNTHESIZE_ID)) { + switch (audioFormat) { + case AudioFormat.ENCODING_PCM_8BIT: + audioFormat = 1; // QAudioFormat::UInt8 + break; + case AudioFormat.ENCODING_PCM_16BIT: + audioFormat = 2; // QAudioFormat::Int16; + break; + case AudioFormat.ENCODING_PCM_FLOAT: + audioFormat = 4; // QAudioFormat::Float; + break; + default: + audioFormat = 0; // QAudioFormat::Unknown; + } + + notifyBeginSynthesis(mId, sampleRateInHz, audioFormat, channelCount); + } + } + + @Override + public void onAudioAvailable(String utteranceId, byte[] bytes) { + Log.d(utteranceTAG, "onAudioAvailable"); + if (utteranceId.equals(SYNTHESIZE_ID)) { + notifyAudioAvailable(mId, bytes); + } + } }; QtTextToSpeech(final Context context, final long id, String engine) { @@ -139,6 +177,22 @@ public class QtTextToSpeech notifyError(mId, 3); // QTextToSpeech::ErrorReason::Input } + public int synthesize(String text) + { + Log.d(TAG, "TTS synthesize(): " + text); + int result = -1; + + Bundle params = new Bundle(); + params.putFloat(TextToSpeech.Engine.KEY_PARAM_VOLUME, mVolume); + File file = new File("/dev/null"); + result = mTts.synthesizeToFile(text, params, file, SYNTHESIZE_ID); + + Log.d(TAG, "TTS synthesize() result: " + Integer.toString(result)); + if (result == TextToSpeech.ERROR) + notifyError(mId, 3); // QTextToSpeech::ErrorReason::Input + return -1; + } + public void stop() { Log.d(TAG, "Stopping TTS"); diff --git a/src/plugins/tts/android/src/CMakeLists.txt b/src/plugins/tts/android/src/CMakeLists.txt index 6ba9172..538a435 100644 --- a/src/plugins/tts/android/src/CMakeLists.txt +++ b/src/plugins/tts/android/src/CMakeLists.txt @@ -11,6 +11,7 @@ qt_internal_add_plugin(QTextToSpeechEngineAndroid Qt::Core Qt::CorePrivate Qt::TextToSpeech + Qt::Multimedia ) add_dependencies(QTextToSpeechEngineAndroid QtAndroidTextToSpeech) diff --git 
a/src/plugins/tts/android/src/android_plugin.json b/src/plugins/tts/android/src/android_plugin.json index 099e0a8..d478826 100644 --- a/src/plugins/tts/android/src/android_plugin.json +++ b/src/plugins/tts/android/src/android_plugin.json @@ -5,6 +5,7 @@ "Priority": 100, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/android/src/qtexttospeech_android.cpp b/src/plugins/tts/android/src/qtexttospeech_android.cpp index b627385..3c2eae8 100644 --- a/src/plugins/tts/android/src/qtexttospeech_android.cpp +++ b/src/plugins/tts/android/src/qtexttospeech_android.cpp @@ -67,6 +67,58 @@ static void notifyRangeStart(JNIEnv *env, jobject thiz, jlong id, jint start, ji } Q_DECLARE_JNI_NATIVE_METHOD(notifyRangeStart) +static void notifyBeginSynthesis(JNIEnv *env, jobject thiz, jlong id, int sampleRateInHz, int audioFormat, int channelCount) +{ + Q_UNUSED(env); + Q_UNUSED(thiz); + + QTextToSpeechEngineAndroid *const tts = (*textToSpeechMap)[id]; + if (!tts) + return; + + QAudioFormat format; + format.setSampleRate(sampleRateInHz); + format.setSampleFormat(QAudioFormat::SampleFormat(audioFormat)); + format.setChannelCount(channelCount); + + QMetaObject::invokeMethod(tts, "processNotifyBeginSynthesis", Qt::AutoConnection, + Q_ARG(QAudioFormat, format)); +} +Q_DECLARE_JNI_NATIVE_METHOD(notifyBeginSynthesis) + +static void notifyAudioAvailable(JNIEnv *env, jobject thiz, jlong id, jbyteArray bytes) +{ + Q_UNUSED(thiz); + + QTextToSpeechEngineAndroid *const tts = (*textToSpeechMap)[id]; + if (!tts) + return; + + const auto sz = env->GetArrayLength(bytes); + QByteArray byteArray(sz, Qt::Initialization::Uninitialized); + env->GetByteArrayRegion(bytes, 0, sz, reinterpret_cast<jbyte *>(byteArray.data())); + + QMetaObject::invokeMethod(tts, "processNotifyAudioAvailable", Qt::AutoConnection, + Q_ARG(QByteArray, byteArray)); +} +Q_DECLARE_JNI_NATIVE_METHOD(notifyAudioAvailable) + +static void notifyEndSynthesis(JNIEnv 
*env, jobject thiz, jlong id) +{ + Q_UNUSED(env); + Q_UNUSED(thiz); + + QTextToSpeechEngineAndroid *const tts = (*textToSpeechMap)[id]; + if (!tts) + return; + + // Queued so that pending processNotifyAudioAvailable + // invocations get processed first. + QMetaObject::invokeMethod(tts, "processNotifyReady", Qt::QueuedConnection); +} +Q_DECLARE_JNI_NATIVE_METHOD(notifyEndSynthesis) + + Q_DECL_EXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void */*reserved*/) { static bool initialized = false; @@ -95,6 +147,9 @@ Q_DECL_EXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void */*reserved*/) Q_JNI_NATIVE_METHOD(notifyReady), Q_JNI_NATIVE_METHOD(notifySpeaking), Q_JNI_NATIVE_METHOD(notifyRangeStart), + Q_JNI_NATIVE_METHOD(notifyBeginSynthesis), + Q_JNI_NATIVE_METHOD(notifyAudioAvailable), + Q_JNI_NATIVE_METHOD(notifyEndSynthesis), })) { return JNI_ERR; } @@ -134,6 +189,28 @@ void QTextToSpeechEngineAndroid::say(const QString &text) m_speech.callMethod<void>("say", QJniObject::fromString(m_text).object<jstring>()); } +void QTextToSpeechEngineAndroid::synthesize(const QString &text) +{ + if (text.isEmpty()) + return; + + m_errorReason = QTextToSpeech::ErrorReason::NoError; + m_text = text; + m_speech.callMethod<int>("synthesize", QJniObject::fromString(m_text).object<jstring>()); +} + +void QTextToSpeechEngineAndroid::processNotifyBeginSynthesis(const QAudioFormat &format) +{ + m_format = format; + setState(QTextToSpeech::Synthesizing); +} + +void QTextToSpeechEngineAndroid::processNotifyAudioAvailable(const QByteArray &bytes) +{ + Q_ASSERT(m_format.isValid()); + emit synthesized(m_format, bytes); +} + QTextToSpeech::State QTextToSpeechEngineAndroid::state() const { return m_state; @@ -179,6 +256,8 @@ void QTextToSpeechEngineAndroid::setError(QTextToSpeech::ErrorReason reason, con void QTextToSpeechEngineAndroid::processNotifyReady() { + if (m_state == QTextToSpeech::Synthesizing) + m_format = {}; if (m_state != QTextToSpeech::Paused) setState(QTextToSpeech::Ready); } @@ -232,6 +311,7 
@@ void QTextToSpeechEngineAndroid::stop(QTextToSpeech::BoundaryHint boundaryHint) m_speech.callMethod<void>("stop"); setState(QTextToSpeech::Ready); + m_format = {}; } void QTextToSpeechEngineAndroid::pause(QTextToSpeech::BoundaryHint boundaryHint) diff --git a/src/plugins/tts/android/src/qtexttospeech_android.h b/src/plugins/tts/android/src/qtexttospeech_android.h index 9569a8f..c86ffb1 100644 --- a/src/plugins/tts/android/src/qtexttospeech_android.h +++ b/src/plugins/tts/android/src/qtexttospeech_android.h @@ -26,6 +26,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -48,6 +49,8 @@ public Q_SLOTS: void processNotifyError(int reason); void processNotifySpeaking(); void processNotifyRangeStart(int start, int end, int frame); + void processNotifyBeginSynthesis(const QAudioFormat &format); + void processNotifyAudioAvailable(const QByteArray &bytes); private: void setState(QTextToSpeech::State state); @@ -59,6 +62,7 @@ private: QTextToSpeech::ErrorReason m_errorReason = QTextToSpeech::ErrorReason::Initialization; QString m_errorString; QString m_text; + QAudioFormat m_format; }; Q_DECLARE_JNI_CLASS(QtTextToSpeech, "org/qtproject/qt/android/speech/QtTextToSpeech") diff --git a/src/plugins/tts/darwin/CMakeLists.txt b/src/plugins/tts/darwin/CMakeLists.txt index 9c2cf2f..c16e8f0 100644 --- a/src/plugins/tts/darwin/CMakeLists.txt +++ b/src/plugins/tts/darwin/CMakeLists.txt @@ -13,5 +13,6 @@ qt_internal_add_plugin(QTextToSpeechDarwinPlugin Qt::Core Qt::Gui Qt::TextToSpeech + Qt::Multimedia ${FWAVFoundation} ) diff --git a/src/plugins/tts/darwin/darwin_plugin.json b/src/plugins/tts/darwin/darwin_plugin.json index 52e0087..98b428a 100644 --- 
a/src/plugins/tts/darwin/darwin_plugin.json +++ b/src/plugins/tts/darwin/darwin_plugin.json @@ -5,6 +5,7 @@ "Priority": 100, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/darwin/qtexttospeech_darwin.h b/src/plugins/tts/darwin/qtexttospeech_darwin.h index ab6a713..cb07732 100644 --- a/src/plugins/tts/darwin/qtexttospeech_darwin.h +++ b/src/plugins/tts/darwin/qtexttospeech_darwin.h @@ -10,6 +10,7 @@ Q_FORWARD_DECLARE_OBJC_CLASS(AVSpeechSynthesizer); Q_FORWARD_DECLARE_OBJC_CLASS(AVSpeechSynthesisVoice); +Q_FORWARD_DECLARE_OBJC_CLASS(AVSpeechUtterance); QT_BEGIN_NAMESPACE @@ -24,6 +25,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -48,12 +50,14 @@ private: AVSpeechSynthesisVoice *fromQVoice(const QVoice &voice) const; QVoice toQVoice(AVSpeechSynthesisVoice *avVoice) const; void setError(QTextToSpeech::ErrorReason reason, const QString &string); + AVSpeechUtterance *prepareUtterance(const QString &text); AVSpeechSynthesizer *m_speechSynthesizer; QVoice m_voice; QTextToSpeech::State m_state = QTextToSpeech::Error; QTextToSpeech::ErrorReason m_errorReason = QTextToSpeech::ErrorReason::Initialization; QString m_errorString; + QAudioFormat m_format; double m_pitch = 0.0; double m_actualPitch = 1.0; diff --git a/src/plugins/tts/darwin/qtexttospeech_darwin.mm b/src/plugins/tts/darwin/qtexttospeech_darwin.mm index ca24165..8446687 100644 --- a/src/plugins/tts/darwin/qtexttospeech_darwin.mm +++ b/src/plugins/tts/darwin/qtexttospeech_darwin.mm @@ -6,6 +6,7 @@ #include "qtexttospeech_darwin.h" #include <QtCore/QCoreApplication> +#include <QtMultimedia/QAudioFormat> @interface 
QDarwinSpeechSynthesizerDelegate : NSObject <AVSpeechSynthesizerDelegate> @end @@ -112,10 +113,8 @@ QTextToSpeechEngineDarwin::~QTextToSpeechEngineDarwin() [m_speechSynthesizer release]; } -void QTextToSpeechEngineDarwin::say(const QString &text) +AVSpeechUtterance *QTextToSpeechEngineDarwin::prepareUtterance(const QString &text) { - stop(QTextToSpeech::BoundaryHint::Default); - // Qt pitch: [-1.0, 1.0], 0 is normal // AVF range: [0.5, 2.0], 1.0 is normal const double desiredPitch = 1.0 + (m_pitch >= 0 ? m_pitch : (m_pitch * 0.5)); @@ -158,9 +157,79 @@ void QTextToSpeechEngineDarwin::say(const QString &text) utterance.volume = m_volume; utterance.voice = fromQVoice(m_voice); + return utterance; +} + +void QTextToSpeechEngineDarwin::say(const QString &text) +{ + stop(QTextToSpeech::BoundaryHint::Default); + + AVSpeechUtterance *utterance = prepareUtterance(text); [m_speechSynthesizer speakUtterance:utterance]; } +void QTextToSpeechEngineDarwin::synthesize(const QString &text) +{ + AVSpeechUtterance *utterance = prepareUtterance(text); + m_format = {}; + + const auto bufferCallback = ^(AVAudioBuffer *buffer){ + setState(QTextToSpeech::Synthesizing); + + if (!m_format.isValid()) { + const AVAudioFormat *format = buffer.format; + if (format.channelCount == 1) + m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); + else + m_format.setChannelCount(format.channelCount); + m_format.setSampleRate(format.sampleRate); + m_format.setSampleFormat([&format]{ + switch (format.commonFormat) { + case AVAudioPCMFormatFloat32: + return QAudioFormat::Float; + case AVAudioPCMFormatInt16: + return QAudioFormat::Int16; + case AVAudioPCMFormatInt32: + return QAudioFormat::Int32; + case AVAudioPCMFormatFloat64: + return QAudioFormat::Unknown; + case AVAudioOtherFormat: { + const id bitKey = format.settings[@"AVLinearPCMBitDepthKey"]; + const id isFloatKey = format.settings[@"AVLinearPCMIsFloatKey"]; + if ([isFloatKey isEqual:@(YES)]) { + if ([bitKey isEqual:@(32)]) + return 
QAudioFormat::Float; + } else if ([bitKey isEqual:@(8)]) { + return QAudioFormat::UInt8; + } else if ([bitKey isEqual:@(16)]) { + return QAudioFormat::Int16; + } else if ([bitKey isEqual:@(32)]) { + return QAudioFormat::Int32; + } + break; + } + default: + break; + } + return QAudioFormat::Unknown; + }()); + if (!m_format.isValid()) + qWarning() << "Audio arrived with invalid format:" << format.settings; + } + + const AudioBufferList *bufferList = buffer.audioBufferList; + for (UInt32 i = 0; i < bufferList->mNumberBuffers; ++i) { + const AudioBuffer &buffer = bufferList->mBuffers[i]; + // we expect all buffers to have the same number of channels + if (int(buffer.mNumberChannels) != m_format.channelCount()) + continue; + emit synthesized(m_format, QByteArray::fromRawData(static_cast<const char *>(buffer.mData), buffer.mDataByteSize)); + } + }; + [m_speechSynthesizer writeUtterance:utterance + toBufferCallback:bufferCallback]; +} + void QTextToSpeechEngineDarwin::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); @@ -321,6 +390,8 @@ void QTextToSpeechEngineDarwin::setState(QTextToSpeech::State state) return; m_state = state; + if (m_state == QTextToSpeech::Ready) + m_format = {}; emit stateChanged(m_state); } diff --git a/src/plugins/tts/flite/flite_plugin.json b/src/plugins/tts/flite/flite_plugin.json index 902f5d3..b9fd6ef 100644 --- a/src/plugins/tts/flite/flite_plugin.json +++ b/src/plugins/tts/flite/flite_plugin.json @@ -5,6 +5,7 @@ "Priority": 50, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/flite/qtexttospeech_flite.cpp b/src/plugins/tts/flite/qtexttospeech_flite.cpp index 76c28ed..5925aea 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite.cpp +++ b/src/plugins/tts/flite/qtexttospeech_flite.cpp @@ -33,6 +33,8 @@ QTextToSpeechEngineFlite::QTextToSpeechEngineFlite(const QVariantMap ¶meters &QTextToSpeechEngineFlite::setError); connect(m_processor.get(), 
&QTextToSpeechProcessorFlite::sayingWord, this, &QTextToSpeechEngine::sayingWord); + connect(m_processor.get(), &QTextToSpeechProcessorFlite::synthesized, this, + &QTextToSpeechEngine::synthesized); // Read voices from processor before moving it to a separate thread const QList<QTextToSpeechProcessorFlite::VoiceInfo> voices = m_processor->voices(); @@ -83,6 +85,13 @@ void QTextToSpeechEngineFlite::say(const QString &text) Q_ARG(double, rate()), Q_ARG(double, volume())); } +void QTextToSpeechEngineFlite::synthesize(const QString &text) +{ + QMetaObject::invokeMethod(m_processor.get(), "synthesize", Qt::QueuedConnection, Q_ARG(QString, text), + Q_ARG(int, voiceData(voice()).toInt()), Q_ARG(double, pitch()), + Q_ARG(double, rate()), Q_ARG(double, volume())); +} + void QTextToSpeechEngineFlite::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git a/src/plugins/tts/flite/qtexttospeech_flite.h b/src/plugins/tts/flite/qtexttospeech_flite.h index a9f8522..5e1c094 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite.h +++ b/src/plugins/tts/flite/qtexttospeech_flite.h @@ -27,6 +27,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp b/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp index b45a8d8..3c0d974 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp +++ b/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp @@ -41,8 +41,8 @@ void QTextToSpeechProcessorFlite::startTokenTimer() m_tokenTimer.start(qMax(token.startTime - playedTime, 0), Qt::PreciseTimer, this); } -int QTextToSpeechProcessorFlite::fliteOutputCb(const cst_wave *w, int start, int size, - int 
last, cst_audio_streaming_info *asi) +int QTextToSpeechProcessorFlite::audioOutputCb(const cst_wave *w, int start, int size, + int last, cst_audio_streaming_info *asi) { QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); if (processor) { @@ -72,13 +72,13 @@ int QTextToSpeechProcessorFlite::fliteOutputCb(const cst_wave *w, int start, int } asi->item = item_next(asi->item); } - return processor->fliteOutput(w, start, size, last, asi); + return processor->audioOutput(w, start, size, last, asi); } return CST_AUDIO_STREAM_STOP; } -int QTextToSpeechProcessorFlite::fliteOutput(const cst_wave *w, int start, int size, - int last, cst_audio_streaming_info *asi) +int QTextToSpeechProcessorFlite::audioOutput(const cst_wave *w, int start, int size, + int last, cst_audio_streaming_info *asi) { Q_UNUSED(asi); Q_ASSERT(QThread::currentThread() == thread()); @@ -87,14 +87,19 @@ int QTextToSpeechProcessorFlite::fliteOutput(const cst_wave *w, int start, int s if (start == 0 && !initAudio(w->sample_rate, w->num_channels)) return CST_AUDIO_STREAM_STOP; - int bytesToWrite = size * sizeof(short); - QString errorString; - if (!audioOutput((const char *)(&w->samples[start]), bytesToWrite, errorString)) { - setError(QTextToSpeech::ErrorReason::Playback, errorString); + const qsizetype bytesToWrite = size * sizeof(short); + + if (!m_audioBuffer->write(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite)) { + setError(QTextToSpeech::ErrorReason::Playback, + QCoreApplication::translate("QTextToSpeech", "Audio streaming error.")); stop(); return CST_AUDIO_STREAM_STOP; } + // Stats for debugging + ++numberChunks; + totalBytes += bytesToWrite; + if (last == 1) { qCDebug(lcSpeechTtsFlite) << "last data chunk written"; m_audioBuffer->close(); @@ -102,6 +107,41 @@ int QTextToSpeechProcessorFlite::fliteOutput(const cst_wave *w, int start, int s return CST_AUDIO_STREAM_CONT; } +int QTextToSpeechProcessorFlite::dataOutputCb(const cst_wave 
*w, int start, int size, + int last, cst_audio_streaming_info *asi) +{ + QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); + if (processor) + return processor->dataOutput(w, start, size, last, asi); + return CST_AUDIO_STREAM_STOP; +} + +int QTextToSpeechProcessorFlite::dataOutput(const cst_wave *w, int start, int size, + int last, cst_audio_streaming_info *) +{ + if (start == 0) + emit stateChanged(QTextToSpeech::Synthesizing); + + QAudioFormat format; + if (w->num_channels == 1) + format.setChannelConfig(QAudioFormat::ChannelConfigMono); + else + format.setChannelCount(w->num_channels); + format.setSampleRate(w->sample_rate); + format.setSampleFormat(QAudioFormat::Int16); + + if (!format.isValid()) + return CST_AUDIO_STREAM_STOP; + + const qsizetype bytesToWrite = size * format.bytesPerSample(); + emit synthesized(format, QByteArray(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite)); + + if (last == 1) + emit stateChanged(QTextToSpeech::Ready); + + return CST_AUDIO_STREAM_CONT; +} + void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event) { if (event->timerId() != m_tokenTimer.timerId()) { @@ -121,22 +161,7 @@ void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event) startTokenTimer(); } -bool QTextToSpeechProcessorFlite::audioOutput(const char *data, qint64 dataSize, QString &errorString) -{ - // Send data - if (!m_audioBuffer->write(data, dataSize)) { - errorString = QCoreApplication::translate("QTextToSpeech", "Audio streaming error."); - return false; - } - - // Stats for debugging - ++numberChunks; - totalBytes += dataSize; - - return true; -} - -void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate) +void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler) { qCDebug(lcSpeechTtsFlite) << "processText() begin"; if (!checkVoice(voiceId)) @@ -150,7 
+175,7 @@ void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, const VoiceInfo &voiceInfo = m_voices.at(voiceId); cst_voice *voice = voiceInfo.vox; cst_audio_streaming_info *asi = new_audio_streaming_info(); - asi->asc = QTextToSpeechProcessorFlite::fliteOutputCb; + asi->asc = outputHandler; asi->userdata = (void *)this; feat_set(voice->features, "streaming_info", audio_streaming_info_val(asi)); setRateForVoice(voice, rate); @@ -493,7 +518,19 @@ void QTextToSpeechProcessorFlite::say(const QString &text, int voiceId, double p return; m_volume = volume; - processText(text, voiceId, pitch, rate); + processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::audioOutputCb); +} + +void QTextToSpeechProcessorFlite::synthesize(const QString &text, int voiceId, double pitch, double rate, double volume) +{ + if (text.isEmpty()) + return; + + if (!checkVoice(voiceId)) + return; + + m_volume = volume; + processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::dataOutputCb); } QT_END_NAMESPACE diff --git a/src/plugins/tts/flite/qtexttospeech_flite_processor.h b/src/plugins/tts/flite/qtexttospeech_flite_processor.h index 8a54b44..0da6987 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite_processor.h +++ b/src/plugins/tts/flite/qtexttospeech_flite_processor.h @@ -44,6 +44,7 @@ public: }; Q_INVOKABLE void say(const QString &text, int voiceId, double pitch, double rate, double volume); + Q_INVOKABLE void synthesize(const QString &text, int voiceId, double pitch, double rate, double volume); Q_INVOKABLE void pause(); Q_INVOKABLE void resume(); Q_INVOKABLE void stop(); @@ -52,16 +53,18 @@ public: static constexpr QTextToSpeech::State audioStateToTts(QAudio::State audioState); private: - // Process a single text - void processText(const QString &text, int voiceId, double pitch, double rate); - // Flite callbacks - static int fliteOutputCb(const cst_wave *w, int start, int size, + static int audioOutputCb(const cst_wave *w, int 
start, int size, + int last, cst_audio_streaming_info *asi); + static int dataOutputCb(const cst_wave *w, int start, int size, int last, cst_audio_streaming_info *asi); - int fliteOutput(const cst_wave *w, int start, int size, - int last, cst_audio_streaming_info *asi); - bool audioOutput(const char *data, qint64 dataSize, QString &errorString); + using OutputHandler = decltype(QTextToSpeechProcessorFlite::audioOutputCb); + // Process a single text + void processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler); + int audioOutput(const cst_wave *w, int start, int size, int last, cst_audio_streaming_info *asi); + int dataOutput(const cst_wave *w, int start, int size, int last, cst_audio_streaming_info *asi); + void setRateForVoice(cst_voice *voice, float rate); void setPitchForVoice(cst_voice *voice, float pitch); @@ -85,6 +88,7 @@ Q_SIGNALS: void errorOccurred(QTextToSpeech::ErrorReason error, const QString &errorString); void stateChanged(QTextToSpeech::State); void sayingWord(qsizetype begin, qsizetype length); + void synthesized(const QAudioFormat &format, const QByteArray &array); protected: void timerEvent(QTimerEvent *event) override; diff --git a/src/plugins/tts/macos/CMakeLists.txt b/src/plugins/tts/macos/CMakeLists.txt index ccf66d6..3e7cd70 100644 --- a/src/plugins/tts/macos/CMakeLists.txt +++ b/src/plugins/tts/macos/CMakeLists.txt @@ -12,4 +12,5 @@ qt_internal_add_plugin(QTextToSpeechMacOSPlugin Qt::Core Qt::Gui Qt::TextToSpeech + Qt::Multimedia ) diff --git a/src/plugins/tts/macos/qtexttospeech_macos.h b/src/plugins/tts/macos/qtexttospeech_macos.h index 231351b..6b707c0 100644 --- a/src/plugins/tts/macos/qtexttospeech_macos.h +++ b/src/plugins/tts/macos/qtexttospeech_macos.h @@ -30,6 +30,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void 
stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/macos/qtexttospeech_macos.mm b/src/plugins/tts/macos/qtexttospeech_macos.mm index b08296a..bedb414 100644 --- a/src/plugins/tts/macos/qtexttospeech_macos.mm +++ b/src/plugins/tts/macos/qtexttospeech_macos.mm @@ -150,6 +150,11 @@ void QTextToSpeechEngineMacOS::say(const QString &text) speaking(); } +void QTextToSpeechEngineMacOS::synthesize(const QString &) +{ + setError(QTextToSpeech::ErrorReason::Configuration, tr("Synthesize not supported")); +} + void QTextToSpeechEngineMacOS::stop(QTextToSpeech::BoundaryHint boundaryHint) { if (speechSynthesizer.isSpeaking || m_state == QTextToSpeech::Paused) { diff --git a/src/plugins/tts/mock/mock_plugin.json b/src/plugins/tts/mock/mock_plugin.json index 43c0d9e..7785016 100644 --- a/src/plugins/tts/mock/mock_plugin.json +++ b/src/plugins/tts/mock/mock_plugin.json @@ -5,6 +5,7 @@ "Priority": -1, "Capabilities": [ "Speak", + "Synthesize", "WordByWordProgress" ] } diff --git a/src/plugins/tts/mock/qtexttospeech_mock.cpp b/src/plugins/tts/mock/qtexttospeech_mock.cpp index d3b8967..1118f50 100644 --- a/src/plugins/tts/mock/qtexttospeech_mock.cpp +++ b/src/plugins/tts/mock/qtexttospeech_mock.cpp @@ -69,6 +69,19 @@ void QTextToSpeechEngineMock::say(const QString &text) emit stateChanged(m_state); } +void QTextToSpeechEngineMock::synthesize(const QString &text) +{ + m_text = text; + m_currentIndex = 0; + m_timer.start(wordTime(), Qt::PreciseTimer, this); + m_state = QTextToSpeech::Synthesizing; + emit stateChanged(m_state); + + m_format.setSampleRate(22050); + m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); + m_format.setSampleFormat(QAudioFormat::Int16); +} + void QTextToSpeechEngineMock::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); @@ -112,7 +125,7 @@ void QTextToSpeechEngineMock::timerEvent(QTimerEvent *e) return; } - 
Q_ASSERT(m_state == QTextToSpeech::Speaking); + Q_ASSERT(m_state == QTextToSpeech::Speaking || m_state == QTextToSpeech::Synthesizing); Q_ASSERT(m_text.length()); // Find start of next word, skipping punctuations. This is good enough for testing. @@ -124,6 +137,8 @@ void QTextToSpeechEngineMock::timerEvent(QTimerEvent *e) sayingWord(m_currentIndex, nextSpace - m_currentIndex); m_currentIndex = nextSpace + match.captured().length(); + emit synthesized(m_format, QByteArray(m_format.bytesForDuration(wordTime() * 1000), 0)); + if (m_currentIndex >= m_text.length()) { // done speaking all words m_timer.stop(); diff --git a/src/plugins/tts/mock/qtexttospeech_mock.h b/src/plugins/tts/mock/qtexttospeech_mock.h index 0511416..dece5f4 100644 --- a/src/plugins/tts/mock/qtexttospeech_mock.h +++ b/src/plugins/tts/mock/qtexttospeech_mock.h @@ -21,6 +21,7 @@ public: QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -59,6 +60,7 @@ private: QString m_errorString; bool m_pauseRequested = false; qsizetype m_currentIndex = -1; + QAudioFormat m_format; }; QT_END_NAMESPACE diff --git a/src/plugins/tts/sapi/qtexttospeech_sapi.cpp b/src/plugins/tts/sapi/qtexttospeech_sapi.cpp index 00b2e84..0e92547 100644 --- a/src/plugins/tts/sapi/qtexttospeech_sapi.cpp +++ b/src/plugins/tts/sapi/qtexttospeech_sapi.cpp @@ -22,6 +22,7 @@ QT_BEGIN_NAMESPACE #ifdef Q_CC_MINGW // from sphelper.h static const GUID CLSD_SpVoice = {0x96749377, 0x3391, 0x11d2,{0x9e, 0xe3, 0x0, 0xc0, 0x4f, 0x79, 0x73, 0x96}}; +const GUID SPDFID_WaveFormatEx = {0xC31ADBAE, 0x527F, 0x4ff5,{0xA2, 0x30, 0xF6, 0x2B, 0xB6, 0x1F, 0xF7, 0x0C}}; static inline HRESULT SpGetTokenFromId(const WCHAR *pszTokenId, ISpObjectToken **cpToken, BOOL fCreateIfNotExist = FALSE) { @@ -54,7 +55,6 @@ inline void 
SpClearEvent(SPEVENT *pe) break; } } - #endif // Q_CC_MINGW QTextToSpeechEngineSapi::QTextToSpeechEngineSapi(const QVariantMap &, QObject *) @@ -117,6 +117,123 @@ void QTextToSpeechEngineSapi::say(const QString &text) QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure.")); } +void QTextToSpeechEngineSapi::synthesize(const QString &text) +{ + class OutputStream : public ISpStreamFormat + { + ULONG m_ref = 1; + qint64 m_pos = 0; + qint64 m_length = 0; + QTextToSpeechEngineSapi *m_engine = nullptr; + QAudioFormat m_format; + + public: + OutputStream(QTextToSpeechEngineSapi *engine) + : m_engine(engine) + { + m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); + m_format.setSampleRate(16000); + m_format.setSampleFormat(QAudioFormat::Int16); + } + virtual ~OutputStream() = default; + + // IUnknown + ULONG AddRef() override { return ++m_ref; } + ULONG Release() override { + if (!--m_ref) { + delete this; + return 0; + } + return m_ref; + } + + HRESULT QueryInterface(REFIID riid, VOID **ppvInterface) override + { + if (!ppvInterface) + return E_POINTER; + + if (riid == __uuidof(IUnknown)) { + *ppvInterface = static_cast<IUnknown*>(this); + } else if (riid == __uuidof(IStream)) { + *ppvInterface = static_cast<IStream *>(this); + } else if (riid == __uuidof(ISpStreamFormat)) { + *ppvInterface = static_cast<ISpStreamFormat *>(this); + } else { + *ppvInterface = nullptr; + return E_NOINTERFACE; + } + AddRef(); + return S_OK; + } + + // IStream + HRESULT Read(void *,ULONG,ULONG *) override { return E_NOTIMPL; } + HRESULT Write(const void *pv,ULONG cb,ULONG *pcbWritten) override + { + emit m_engine->synthesized(m_format, QByteArray(static_cast<const char *>(pv), cb)); + *pcbWritten = cb; + return S_OK; + } + HRESULT Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER *plibNewPosition) override + { + qint64 move = dlibMove.QuadPart; + switch (dwOrigin) { + case STREAM_SEEK_SET: + m_pos = move; + break; + case STREAM_SEEK_CUR: + m_pos += 
move; + break; + case STREAM_SEEK_END: + m_pos = m_length + move; + break; + } + (*plibNewPosition).QuadPart = m_pos; + return S_OK; + } + HRESULT SetSize(ULARGE_INTEGER) override { return E_NOTIMPL; } + HRESULT CopyTo(IStream *,ULARGE_INTEGER,ULARGE_INTEGER *,ULARGE_INTEGER *) override { return E_NOTIMPL; } + HRESULT Commit(DWORD) override { return E_NOTIMPL; } + HRESULT Revert(void) override { return E_NOTIMPL; } + HRESULT LockRegion(ULARGE_INTEGER,ULARGE_INTEGER,DWORD) override { return E_NOTIMPL; } + HRESULT UnlockRegion(ULARGE_INTEGER,ULARGE_INTEGER,DWORD) override { return E_NOTIMPL; } + HRESULT Stat(STATSTG *,DWORD) override { return E_NOTIMPL; } + HRESULT Clone(IStream **) override { return E_NOTIMPL; } + + // ISpStreamFormat + HRESULT GetFormat(GUID *pguidFormatId,WAVEFORMATEX **ppCoMemWaveFormatEx) override + { + *pguidFormatId = SPDFID_WaveFormatEx; + WAVEFORMATEX *format = static_cast<WAVEFORMATEX *>(CoTaskMemAlloc(sizeof(WAVEFORMATEX))); + format->wFormatTag = WAVE_FORMAT_PCM; + format->nChannels = m_format.channelCount(); + format->nSamplesPerSec = m_format.sampleRate(); + format->wBitsPerSample = m_format.bytesPerSample() * 8; + format->nBlockAlign = format->nChannels * format->wBitsPerSample / 8; + format->nAvgBytesPerSec = format->nSamplesPerSec * format->nBlockAlign; + format->cbSize = 0; // amount of extra format information + + *ppCoMemWaveFormatEx = format; + return S_OK; + } + }; + + if (text.isEmpty()) + return; + + currentText = text; + const QString prefix = u"<pitch absmiddle=\"%1\"/>"_qs.arg(m_pitch * 10); + textOffset = prefix.length(); + currentText.prepend(prefix); + + OutputStream *outputStream = new OutputStream(this); + m_voice->SetOutput(outputStream, false); + HRESULT hr = m_voice->Speak(currentText.toStdWString().data(), SPF_ASYNC, NULL); + if (!SUCCEEDED(hr)) + setError(QTextToSpeech::ErrorReason::Input, + QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure.")); +} + void 
QTextToSpeechEngineSapi::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git a/src/plugins/tts/sapi/qtexttospeech_sapi.h b/src/plugins/tts/sapi/qtexttospeech_sapi.h index 6b139d2..dd0cea5 100644 --- a/src/plugins/tts/sapi/qtexttospeech_sapi.h +++ b/src/plugins/tts/sapi/qtexttospeech_sapi.h @@ -29,6 +29,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -47,6 +48,7 @@ public: QString errorString() const override; HRESULT STDMETHODCALLTYPE NotifyCallback(WPARAM /*wParam*/, LPARAM /*lParam*/) override; + friend class OutputStream; private: bool isSpeaking() const; diff --git a/src/plugins/tts/sapi/sapi_plugin.json b/src/plugins/tts/sapi/sapi_plugin.json index c16b87f..95de0bf 100644 --- a/src/plugins/tts/sapi/sapi_plugin.json +++ b/src/plugins/tts/sapi/sapi_plugin.json @@ -5,6 +5,7 @@ "Priority": 50, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp index 3026d39..f93ea1c 100644 --- a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp +++ b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp @@ -151,6 +151,11 @@ void QTextToSpeechEngineSpeechd::say(const QString &text) QCoreApplication::translate("QTextToSpeech", "Text synthesizing failure.")); } +void QTextToSpeechEngineSpeechd::synthesize(const QString &) +{ + setError(QTextToSpeech::ErrorReason::Configuration, tr("Synthesize not supported")); +} + void QTextToSpeechEngineSpeechd::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git 
a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h index 1860e9d..b9aaf9f 100644 --- a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h +++ b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h @@ -28,6 +28,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt.cpp b/src/plugins/tts/winrt/qtexttospeech_winrt.cpp index b98ffe8..fdbaad0 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt.cpp +++ b/src/plugins/tts/winrt/qtexttospeech_winrt.cpp @@ -472,6 +472,51 @@ void QTextToSpeechEngineWinRT::say(const QString &text) }); } +void QTextToSpeechEngineWinRT::synthesize(const QString &text) +{ + Q_D(QTextToSpeechEngineWinRT); + + HRESULT hr = S_OK; + + HStringReference nativeText(reinterpret_cast<LPCWSTR>(text.utf16()), text.length()); + + ComPtr<IAsyncOperation<SpeechSynthesisStream*>> synthOperation; + hr = d->synth->SynthesizeTextToStreamAsync(nativeText.Get(), &synthOperation); + if (!SUCCEEDED(hr)) { + d->setError(QTextToSpeech::ErrorReason::Input, + QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure.")); + return; + } + + // The source will wait for the the data resulting out of the synthOperation, and emits + // streamReady when data is available. This starts a QAudioSink, which pulls the data. 
+ d->audioSource.Attach(new AudioSource(synthOperation)); + + connect(d->audioSource.Get(), &AudioSource::streamReady, this, [d, this](const QAudioFormat &format){ + if (d->state != QTextToSpeech::Synthesizing) { + d->state = QTextToSpeech::Synthesizing; + emit stateChanged(d->state); + } + }); + connect(d->audioSource.Get(), &AudioSource::readyRead, this, [d, this](){ + Q_ASSERT(d->state == QTextToSpeech::Synthesizing); + const QByteArray data = d->audioSource->read(d->audioSource->bytesAvailable()); + emit synthesized(d->audioSource->format(), data); + if (d->audioSource->atEnd()) + d->audioSource->close(); + }); + connect(d->audioSource.Get(), &AudioSource::aboutToClose, this, [d, this]{ + if (d->state != QTextToSpeech::Ready) { + d->state = QTextToSpeech::Ready; + emit stateChanged(d->state); + } + }); + connect(d->audioSource.Get(), &AudioSource::errorInStream, this, [d]{ + d->setError(QTextToSpeech::ErrorReason::Input, + QCoreApplication::translate("QTextToSpeech", "Error synthesizing audio stream.")); + }); +} + void QTextToSpeechEngineWinRT::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt.h b/src/plugins/tts/winrt/qtexttospeech_winrt.h index 39cdeab..045f8e0 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt.h +++ b/src/plugins/tts/winrt/qtexttospeech_winrt.h @@ -28,6 +28,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp index b66a78a..a680ffe 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp +++ 
b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp @@ -173,7 +173,7 @@ qint64 AudioSource::readData(char *data, qint64 maxlen) bool AudioSource::atEnd() const { // not done as long as QIODevice's buffer is not empty - if (!QIODevice::atEnd()) + if (!QIODevice::atEnd() && QIODevice::bytesAvailable()) return false; // If we get here, bytesAvailable() has returned 0, so our buffers are @@ -279,8 +279,8 @@ HRESULT AudioSource::Invoke(IAsyncOperation<SpeechSynthesisStream*> *operation, // we are buffered, but we don't want QIODevice to buffer as well open(QIODevice::ReadOnly|QIODevice::Unbuffered); - fetchMore(); emit streamReady(audioFormat); + fetchMore(); return S_OK; } diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h index 489b265..a9d5ebf 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h +++ b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h @@ -43,6 +43,8 @@ public: bool atEnd() const override; qint64 bytesAvailable() const override; + QAudioFormat format() const { return audioFormat; } + enum PauseState { NoPause, PauseRequested, diff --git a/src/plugins/tts/winrt/winrt_plugin.json b/src/plugins/tts/winrt/winrt_plugin.json index 5e4104b..f47e3f6 100644 --- a/src/plugins/tts/winrt/winrt_plugin.json +++ b/src/plugins/tts/winrt/winrt_plugin.json @@ -5,6 +5,7 @@ "Priority": 80, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/tts/CMakeLists.txt b/src/tts/CMakeLists.txt index 71e3d11..b8e92ea 100644 --- a/src/tts/CMakeLists.txt +++ b/src/tts/CMakeLists.txt @@ -16,6 +16,7 @@ qt_internal_add_module(TextToSpeech Qt::QmlIntegration PUBLIC_LIBRARIES Qt::Core + Qt::Multimedia PRIVATE_MODULE_INTERFACE Qt::CorePrivate ) diff --git a/src/tts/qtexttospeech.cpp b/src/tts/qtexttospeech.cpp index e6599b7..29d2fd3 100644 --- a/src/tts/qtexttospeech.cpp +++ b/src/tts/qtexttospeech.cpp @@ -70,9 +70,13 @@ void 
QTextToSpeechPrivate::setEngineProvider(const QString &engine, const QVaria // Connect signals directly from the engine to the public API signals if (m_engine) { - QObject::connect(m_engine, &QTextToSpeechEngine::stateChanged, q, &QTextToSpeech::stateChanged); + QObject::connect(m_engine, &QTextToSpeechEngine::stateChanged, + q, [this](QTextToSpeech::State newState){ + updateState(newState); + }); QObject::connect(m_engine, &QTextToSpeechEngine::errorOccurred, q, &QTextToSpeech::errorOccurred); QObject::connect(m_engine, &QTextToSpeechEngine::sayingWord, q, &QTextToSpeech::sayingWord); + QObject::connect(m_engine, &QTextToSpeechEngine::synthesized, q, &QTextToSpeech::synthesized); } } @@ -139,6 +143,19 @@ void QTextToSpeechPrivate::loadPluginMetadata(QMultiHash<QString, QCborMap> &lis } } +void QTextToSpeechPrivate::updateState(QTextToSpeech::State newState) +{ + Q_Q(QTextToSpeech); + if (newState == QTextToSpeech::Ready && m_slotObject) { + // If we are done synthesizing and the functor-overload was used, + // clear the temporary connection. + m_slotObject->destroyIfLastRef(); + m_slotObject = nullptr; + m_engine->disconnect(m_synthesizeConnection); + } + emit q->stateChanged(newState); +} + /*! \class QTextToSpeech \brief The QTextToSpeech class provides a convenient access to text-to-speech engines. @@ -209,11 +226,13 @@ void QTextToSpeechPrivate::loadPluginMetadata(QMultiHash<QString, QCborMap> &lis \brief This enum describes the current state of the text-to-speech engine. - \value Ready The synthesizer is ready to start a new text. This is - also the state after a text was finished. - \value Speaking Text is being spoken. - \value Paused The synthesis was paused and can be resumed with \l resume(). - \value Error An error has occurred. Details are given by \l errorReason(). + \value Ready The synthesizer is ready to start a new text. This is + also the state after a text was finished. + \value Speaking Text is being spoken. 
+ \value Synthesizing Text is being synthesized into PCM data. The synthesized() + signal will be emitted with chunks of data. + \value Paused The synthesis was paused and can be resumed with \l resume(). + \value Error An error has occurred. Details are given by \l errorReason(). \sa QTextToSpeech::ErrorReason errorReason() errorString() */ @@ -382,6 +401,8 @@ QString QTextToSpeech::engine() const \value Speak The engine can play audio output from text. \value WordByWordProgress The engine emits the sayingWord() signal for each word that gets spoken. + \value Synthesize The engine can \l{synthesize()}{synthesize} PCM + audio data from text. \sa engineCapabilities() */ @@ -585,7 +606,7 @@ QString QTextToSpeech::errorString() const */ /*! - Starts synthesizing the \a text. + Starts speaking the \a text. This function starts synthesizing the speech asynchronously, and reads the text to the default audio output device. @@ -599,7 +620,7 @@ QString QTextToSpeech::errorString() const set to \l Speaking once the reading starts. When the reading is done, \l state will be set to \l Ready. - \sa stop(), pause(), resume() + \sa stop(), pause(), resume(), synthesize() */ void QTextToSpeech::say(const QString &text) { @@ -609,6 +630,114 @@ void QTextToSpeech::say(const QString &text) } /*! + Synthesizes the \a text into raw audio data. + \since 6.6 + + This function synthesizes the speech asynchronously into raw audio data. + When data is available, the \l synthesized() signal is emitted with the + bytes, and the \l {QAudioFormat}{format} that the data is in. + + The \l state property is set to \l Synthesizing when the synthesis starts, + and to \l Ready once the synthesis is finished. While synthesizing, the + synthesized() signal might be emitted multiple times, possibly with + changing values for \c format. + + \sa say(), stop() +*/ +void QTextToSpeech::synthesize(const QString &text) +{ + Q_D(QTextToSpeech); + if (d->m_engine) + d->m_engine->synthesize(text); +} + +/*! 
+ \fn template<typename Functor> void QTextToSpeech::synthesize( + const QString &text, Functor functor) + \fn template<typename Functor> void QTextToSpeech::synthesize( + const QString &text, const QObject *context, Functor functor) + \since 6.6 + + Synthesizes the \a text into raw audio data. + + This function synthesizes the speech asynchronously into raw audio data. + When data is available, the \a functor will be called as + \c {functor(const QAudioFormat &format, const QByteArray &bytes)}, with + \c format describing the \l {QAudioFormat}{format} of the data in \c bytes. + + The \l state property is set to \l Synthesizing when the synthesis starts, + and to \l Ready once the synthesis is finished. While synthesizing, the + \a functor might be called multiple times, possibly with changing values + for \c format. + + The \a functor can be a callable, like a lambda or free function, with an + optional \a context object: + + \code + tts.synthesize("Hello world", [](const QAudioFormat &format, const QByteArray &bytes){ + // process data according to format + }); + \endcode + + or a slot in the \a context object: + + \code + struct PCMProcessor : QObject + { + void processData(const QAudioFormat &format, const QByteArray &bytes) + { + // process data according to format + } + } processor; + tts.synthesize("Hello world", &processor, &PCMProcessor::processData); + \endcode + + If \a context is destroyed, then the \a functor will no longer get called. + + \note This API requires that the engine has the + \l {QTextToSpeech::Capability::}{Synthesize} capability. + + \sa say(), stop() +*/ + +/*! + \internal + + Handles the engine's synthesized() signal to call \a slotObj on the \a context + object. The slot object and the temporary connection are stored and released + in updateState() when the state of the engine transitions back to Ready. 
+*/ +void QTextToSpeech::synthesizeImpl(const QString &text, + QtPrivate::QSlotObjectBase *slotObj, const QObject *context) +{ + Q_D(QTextToSpeech); + Q_ASSERT(slotObj); + d->m_slotObject = slotObj; + const auto receive = [d, context](const QAudioFormat &format, const QByteArray &bytes){ + Q_ASSERT(d->m_slotObject); + void *args[] = {nullptr, + const_cast<QAudioFormat *>(&format), + const_cast<QByteArray *>(&bytes)}; + d->m_slotObject->call(const_cast<QObject *>(context), args); + }; + d->m_synthesizeConnection = connect(d->m_engine, &QTextToSpeechEngine::synthesized, + context ? context : this, receive); + synthesize(text); +} + +/*! + \fn void QTextToSpeech::synthesized(const QAudioFormat &format, const QByteArray &data) + + This signal is emitted when pcm \a data is available. The data is encoded in \a format. + A single call to \l synthesize() might result in several emissions of this signal. + + \note This signal requires that the engine has the + \l {QTextToSpeech::Capability::}{Synthesize} capability. + + \sa synthesize() +*/ + +/*! \qmlmethod TextToSpeech::stop(BoundaryHint boundaryHint) Stops the current reading at \a boundaryHint. 
diff --git a/src/tts/qtexttospeech.h b/src/tts/qtexttospeech.h index e98d615..3d78e0a 100644 --- a/src/tts/qtexttospeech.h +++ b/src/tts/qtexttospeech.h @@ -16,6 +16,8 @@ QT_BEGIN_NAMESPACE +class QAudioFormat; + class QTextToSpeechPrivate; class Q_TEXTTOSPEECH_EXPORT QTextToSpeech : public QObject { @@ -29,12 +31,14 @@ class Q_TEXTTOSPEECH_EXPORT QTextToSpeech : public QObject Q_PROPERTY(QVoice voice READ voice WRITE setVoice NOTIFY voiceChanged) Q_PROPERTY(Capabilities engineCapabilities READ engineCapabilities NOTIFY engineChanged) Q_DECLARE_PRIVATE(QTextToSpeech) + public: enum State { Ready, Speaking, Paused, - Error + Error, + Synthesizing }; Q_ENUM(State) @@ -59,6 +63,7 @@ public: None = 0, Speak = 1 << 0, WordByWordProgress = 1 << 1, + Synthesize = 1 << 2, }; Q_DECLARE_FLAGS(Capabilities, Capability) Q_FLAG(Capabilities) @@ -89,8 +94,71 @@ public: Q_INVOKABLE static QStringList availableEngines(); +# ifdef Q_QDOC + template <typename Functor> + void synthesize(const QString &text, Functor functor); + template <typename Functor> + void synthesize(const QString &text, const QObject *context, Functor functor); +# else + template <typename Slot> // synthesize to a QObject member function + void synthesize(const QString &text, + const typename QtPrivate::FunctionPointer<Slot>::Object *receiver, Slot slot) + { + using CallbackSignature = QtPrivate::FunctionPointer<void (*)(QAudioFormat, QByteArray)>; + using SlotSignature = QtPrivate::FunctionPointer<Slot>; + + static_assert(int(SlotSignature::ArgumentCount) <= int(CallbackSignature::ArgumentCount), + "Slot requires more arguments than what can be provided."); + static_assert((QtPrivate::CheckCompatibleArguments<typename CallbackSignature::Arguments, + typename SlotSignature::Arguments>::value), + "Slot arguments are not compatible (must be QAudioFormat, QByteArray)"); + + auto slotObj = new QtPrivate::QSlotObject<Slot, typename SlotSignature::Arguments, void>(slot); + synthesizeImpl(text, slotObj, receiver); + } 
+ + // synthesize to a functor or function pointer (with context) + template <typename Func, std::enable_if_t< + !QtPrivate::FunctionPointer<Func>::IsPointerToMemberFunction + && !std::is_same<const char *, Func>::value, bool> = true> + void synthesize(const QString &text, const QObject *context, Func func) + { + using CallbackSignature = QtPrivate::FunctionPointer<void (*)(QAudioFormat, QByteArray)>; + constexpr int MatchingArgumentCount = QtPrivate::ComputeFunctorArgumentCount< + Func, CallbackSignature::Arguments>::Value; + + static_assert(MatchingArgumentCount == 0 + || MatchingArgumentCount == CallbackSignature::ArgumentCount, + "Functor arguments are not compatible (must be QAudioFormat, QByteArray)"); + + QtPrivate::QSlotObjectBase *slotObj = nullptr; + if constexpr (MatchingArgumentCount == CallbackSignature::ArgumentCount) { + slotObj = new QtPrivate::QFunctorSlotObject<Func, 2, + typename CallbackSignature::Arguments, void>(std::move(func)); + } else if constexpr (MatchingArgumentCount == 1) { + slotObj = new QtPrivate::QFunctorSlotObject<Func, 1, + typename CallbackSignature::Arguments, void>(std::move(func)); + } else { + slotObj = new QtPrivate::QFunctorSlotObject<Func, 0, + typename QtPrivate::List_Left<void, 0>::Value, void>(std::move(func)); + } + + synthesizeImpl(text, slotObj, context); + } + + // synthesize to a functor or function pointer (without context) + template <typename Func, std::enable_if_t< + !QtPrivate::FunctionPointer<Func>::IsPointerToMemberFunction + && !std::is_same<const char *, Func>::value, bool> = true> + void synthesize(const QString &text, Func func) + { + synthesize(text, nullptr, std::move(func)); + } +# endif // Q_QDOC + public Q_SLOTS: void say(const QString &text); + void synthesize(const QString &text); void stop(QTextToSpeech::BoundaryHint boundaryHint = QTextToSpeech::BoundaryHint::Default); void pause(QTextToSpeech::BoundaryHint boundaryHint = QTextToSpeech::BoundaryHint::Default); void resume(); @@ -113,8 +181,12 
@@ Q_SIGNALS: void voiceChanged(const QVoice &voice); void sayingWord(qsizetype start, qsizetype length); + void synthesized(const QAudioFormat &format, const QByteArray &data); private: + void synthesizeImpl(const QString &text, + QtPrivate::QSlotObjectBase *slotObj, const QObject *context); + Q_DISABLE_COPY(QTextToSpeech) }; Q_DECLARE_OPERATORS_FOR_FLAGS(QTextToSpeech::Capabilities) diff --git a/src/tts/qtexttospeech_p.h b/src/tts/qtexttospeech_p.h index cf5e657..6b6a93d 100644 --- a/src/tts/qtexttospeech_p.h +++ b/src/tts/qtexttospeech_p.h @@ -40,6 +40,7 @@ public: private: bool loadMeta(); void loadPlugin(); + void updateState(QTextToSpeech::State newState); static void loadPluginMetadata(QMultiHash<QString, QCborMap> &list); QTextToSpeech *q_ptr; QTextToSpeechPlugin *m_plugin = nullptr; @@ -47,6 +48,8 @@ private: QString m_providerName; QCborMap m_metaData; static QMutex m_mutex; + QMetaObject::Connection m_synthesizeConnection; + QtPrivate::QSlotObjectBase *m_slotObject = nullptr; }; QT_END_NAMESPACE diff --git a/src/tts/qtexttospeechengine.h b/src/tts/qtexttospeechengine.h index 9fdb87b..2fc1825 100644 --- a/src/tts/qtexttospeechengine.h +++ b/src/tts/qtexttospeechengine.h @@ -9,9 +9,12 @@ #include <QtCore/QObject> #include <QtCore/QLocale> #include <QtCore/QDir> +#include <QtMultimedia/QAudioFormat> QT_BEGIN_NAMESPACE +class QAudioFormat; + class Q_TEXTTOSPEECH_EXPORT QTextToSpeechEngine : public QObject { Q_OBJECT @@ -24,6 +27,9 @@ public: virtual QList<QVoice> availableVoices() const = 0; virtual void say(const QString &text) = 0; + virtual void synthesize(const QString &text) { + Q_UNUSED(text); + }; virtual void stop(QTextToSpeech::BoundaryHint boundaryHint) = 0; virtual void pause(QTextToSpeech::BoundaryHint boundaryHint) = 0; virtual void resume() = 0; @@ -52,6 +58,7 @@ Q_SIGNALS: void errorOccurred(QTextToSpeech::ErrorReason error, const QString &errorString); void sayingWord(qsizetype start, qsizetype length); + void synthesized(const QAudioFormat 
&format, const QByteArray &data); }; QT_END_NAMESPACE diff --git a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp index 09c4f22..fd355cf 100644 --- a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp +++ b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp @@ -6,6 +6,7 @@ #include <QTextToSpeech> #include <QSignalSpy> #include <QMediaDevices> +#include <QAudioFormat> #include <QAudioDevice> #include <QOperatingSystemVersion> #include <QRegularExpression> @@ -53,6 +54,12 @@ private slots: void sayingWordWithPause_data(); void sayingWordWithPause(); + void synthesize_data(); + void synthesize(); + + void synthesizeCallback_data(); + void synthesizeCallback(); + private: static bool hasDefaultAudioOutput() { @@ -74,6 +81,13 @@ private: } } } + + void onError(QTextToSpeech::ErrorReason error, const QString &errorString) { + errorReason = error; + qCritical() << "Error:" << errorString; + } + + QTextToSpeech::ErrorReason errorReason = QTextToSpeech::ErrorReason::NoError; }; void tst_QTextToSpeech::initTestCase_data() @@ -601,5 +615,175 @@ void tst_QTextToSpeech::sayingWordWithPause() debugHelper.dismiss(); } +void tst_QTextToSpeech::synthesize_data() +{ + QTest::addColumn<QString>("text"); + + QTest::addRow("text") << "Let's synthesize some text!"; +} + +void tst_QTextToSpeech::synthesize() +{ + QFETCH_GLOBAL(QString, engine); + if (engine != "mock" && !hasDefaultAudioOutput()) + QSKIP("No audio device present"); + if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10) + QSKIP("Only testing on recent Android versions"); + + QFETCH(QString, text); + + QTextToSpeech tts(engine); + if (!(tts.engineCapabilities() & QTextToSpeech::Capability::Synthesize)) + QSKIP("This engine doesn't support synthesize()"); + + connect(&tts, &QTextToSpeech::errorOccurred, this, &tst_QTextToSpeech::onError); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + selectWorkingVoice(&tts); + + QElapsedTimer 
speechTimer; + // We can't assume that synthesis isn't done before we can check, and that we only + // have a single change during an event loop cycle, so connect to the signal + // and keep track ourselves. + bool running = false; + bool finished = false; + qint64 speechTime = 0; + connect(&tts, &QTextToSpeech::stateChanged, [&running, &finished, &speechTimer, &speechTime](QTextToSpeech::State state) { + if (state == QTextToSpeech::Synthesizing || state == QTextToSpeech::Speaking) { + speechTimer.start(); + running = true; + finished = false; + } + if (running && state == QTextToSpeech::Ready) { + if (!speechTime) + speechTime = speechTimer.elapsed(); + finished = true; + } + }); + + // first, measure how long it takes to speak the text + tts.say(text); + QTRY_VERIFY(running); + QTRY_VERIFY(finished); + + running = false; + + QAudioFormat pcmFormat; + QByteArray pcmData; + + connect(&tts, &QTextToSpeech::synthesized, + this, [&pcmFormat, &pcmData](const QAudioFormat &format, const QByteArray &bytes) { + pcmFormat = format; + pcmData += bytes; + }); + + QElapsedTimer notBlockingTimer; + notBlockingTimer.start(); + tts.synthesize(text); + QCOMPARE_LT(notBlockingTimer.elapsed(), 250); + QTRY_VERIFY(running); + QTRY_VERIFY(finished); + + QVERIFY(pcmFormat.isValid()); + // bytesForDuration takes micro seconds, we measured in milliseconds. + const qint32 bytesExpected = pcmFormat.bytesForDuration(speechTime * 1000); + + // We should have as much data as the format requires for the time it took + // to play the speech, +/- 10% as we can't measure the exact audio duration. + QCOMPARE_GE(pcmData.size(), double(bytesExpected) * 0.9); + if (engine == "flite") // flite is very unreliable + QCOMPARE_LT(pcmData.size(), double(bytesExpected) * 1.5); + else + QCOMPARE_LT(pcmData.size(), double(bytesExpected) * 1.1); +} + +/*! 
+ API test for the functor variants of synthesize(), using only the mock + engine as the engine implementation is identical to the non-functor + version tested above. +*/ +void tst_QTextToSpeech::synthesizeCallback_data() +{ + QTest::addColumn<QString>("text"); + + QTest::addRow("one") << "test"; + QTest::addRow("several") << "this will produce more than one chunk."; +} + +void tst_QTextToSpeech::synthesizeCallback() +{ + QFETCH_GLOBAL(QString, engine); + if (engine != "mock") + QSKIP("Only testing with mock engine"); + + QTextToSpeech tts(engine); + QVERIFY(tts.engineCapabilities() & QTextToSpeech::Capability::Synthesize); + + QFETCH(QString, text); + + QAudioFormat expectedFormat; + QByteArray expectedBytes; + + // record a reference using the already tested synthesized() signal + auto connection = connect(&tts, &QTextToSpeech::synthesized, + [&expectedFormat, &expectedBytes](const QAudioFormat &format, const QByteArray &bytes){ + expectedFormat = format; + expectedBytes += bytes; + }); + tts.synthesize(text); + QTRY_VERIFY(expectedFormat.isValid()); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + tts.disconnect(connection); + + struct Processor : QObject { + void process(const QAudioFormat &format, const QByteArray &bytes) + { + m_format = format; + m_allBytes += bytes; + } + void audioFormatKnown(const QAudioFormat &format) + { + m_format = format; + } + void reset() + { + m_format = {}; + m_allBytes = {}; + } + QAudioFormat m_format; + QByteArray m_allBytes; + } processor; + + // Functor without context + tts.synthesize(text, [&processor](const QAudioFormat &format, const QByteArray &bytes){ + processor.m_format = format; + processor.m_allBytes += bytes; + }); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, expectedBytes); + processor.reset(); + // Functor with context + tts.synthesize(text, &tts, [&processor](const QAudioFormat &format, const QByteArray &bytes){ + 
processor.m_format = format; + processor.m_allBytes += bytes; + }); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, expectedBytes); + processor.reset(); + // PMF + tts.synthesize(text, &processor, &Processor::process); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, expectedBytes); + processor.reset(); + // PMF with no QByteArray argument - not very useful, but Qt allows it + tts.synthesize(text, &processor, &Processor::audioFormatKnown); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, QByteArray()); + processor.reset(); +} + QTEST_MAIN(tst_QTextToSpeech) #include "tst_qtexttospeech.moc" |