diff options
author | Volker Hilsheimer <volker.hilsheimer@qt.io> | 2023-01-18 15:04:41 +0100 |
---|---|---|
committer | Volker Hilsheimer <volker.hilsheimer@qt.io> | 2023-02-19 19:36:36 +0100 |
commit | c03afcc297bf250baff8d0693e4db0c8cc77eeed (patch) | |
tree | 257c98299d9c94a3c998b13a5ef19a7d5acd1b78 | |
parent | ea5c48e518789c3387ed9c9d21978eda122e9782 (diff) |
Add QTextToSpeech::synthesize to produce PCM data rather than audio
The function starts the synthesis as an asynchronous process, and
emits a signal 'synthesized()' (or calls a functor) with a chunk of
PCM data as a QByteArray, and the QAudioFormat in which the data is
encoded.
This requires a dependency on Qt Multimedia for Qt Speech on all
platforms; so far it has been required only by the flite and winrt
backends.
Implemented for all engines, except the speechd and macos engines,
where it's not possible — these engines don't provide access to the data.
The test case verifies that the implementation is asynchronous, and
that it produces a reasonable amount of data. Since this involves
timer-based measurements, values need to be compared with some
appropriate margins.
The QML documentation of this API is omitted on purpose; the
QAudioFormat type is not available in QML, and we don't want to
encourage users to operate on raw bytes from QML anyway.
[ChangeLog][QtTextToSpeech][QTextToSpeech] Added the ability to
produce PCM data as a QByteArray. The QtTextToSpeech module now
depends on QtMultimedia on all platforms.
Fixes: QTBUG-109837
Change-Id: I308a3e18998827089c0f75789b720f1bd36e3c46
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
36 files changed, 920 insertions, 55 deletions
diff --git a/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java b/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java index a8640ef..d14add6 100644 --- a/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java +++ b/src/plugins/tts/android/jar/src/org/qtproject/qt/android/speech/QtTextToSpeech.java @@ -5,6 +5,7 @@ package org.qtproject.qt.android.speech; import android.content.ContentResolver; import android.content.Context; +import android.media.AudioFormat; import android.provider.Settings; import android.provider.Settings.SettingNotFoundException; import android.speech.tts.TextToSpeech; @@ -15,6 +16,7 @@ import android.os.Build; import android.os.Bundle; import android.util.Log; import java.lang.Float; +import java.io.File; import java.util.Locale; import java.util.List; import java.util.ArrayList; @@ -23,12 +25,16 @@ import java.util.Set; public class QtTextToSpeech { private static final String UTTERANCE_ID = "UtteranceId"; + private static final String SYNTHESIZE_ID = "SynthesizeId"; // Native callback functions native public void notifyError(long id, long reason); native public void notifyReady(long id); native public void notifySpeaking(long id); native public void notifyRangeStart(long id, int start, int end, int frame); + native public void notifyBeginSynthesis(long id, int sampleRateInHz, int audioFormat, int channelCount); + native public void notifyAudioAvailable(long id, byte[] bytes); + native public void notifyEndSynthesis(long id); private TextToSpeech mTts; private final long mId; @@ -62,6 +68,8 @@ public class QtTextToSpeech Log.d(utteranceTAG, "onDone"); if (utteranceId.equals(UTTERANCE_ID)) { notifyReady(mId); + } else if (utteranceId.equals(SYNTHESIZE_ID)) { + notifyEndSynthesis(mId); } } @@ -96,6 +104,36 @@ public class QtTextToSpeech notifyRangeStart(mId, start, end, frame); } } + + @Override + public void onBeginSynthesis(String utteranceId, int 
sampleRateInHz, int audioFormat, int channelCount) { + Log.d(utteranceTAG, "onBeginSynthesis"); + if (utteranceId.equals(SYNTHESIZE_ID)) { + switch (audioFormat) { + case AudioFormat.ENCODING_PCM_8BIT: + audioFormat = 1; // QAudioFormat::UInt8 + break; + case AudioFormat.ENCODING_PCM_16BIT: + audioFormat = 2; // QAudioFormat::Int16; + break; + case AudioFormat.ENCODING_PCM_FLOAT: + audioFormat = 4; // QAudioFormat::Float; + break; + default: + audioFormat = 0; // QAudioFormat::Unknown; + } + + notifyBeginSynthesis(mId, sampleRateInHz, audioFormat, channelCount); + } + } + + @Override + public void onAudioAvailable(String utteranceId, byte[] bytes) { + Log.d(utteranceTAG, "onAudioAvailable"); + if (utteranceId.equals(SYNTHESIZE_ID)) { + notifyAudioAvailable(mId, bytes); + } + } }; QtTextToSpeech(final Context context, final long id, String engine) { @@ -139,6 +177,22 @@ public class QtTextToSpeech notifyError(mId, 3); // QTextToSpeech::ErrorReason::Input } + public int synthesize(String text) + { + Log.d(TAG, "TTS synthesize(): " + text); + int result = -1; + + Bundle params = new Bundle(); + params.putFloat(TextToSpeech.Engine.KEY_PARAM_VOLUME, mVolume); + File file = new File("/dev/null"); + result = mTts.synthesizeToFile(text, params, file, SYNTHESIZE_ID); + + Log.d(TAG, "TTS synthesize() result: " + Integer.toString(result)); + if (result == TextToSpeech.ERROR) + notifyError(mId, 3); // QTextToSpeech::ErrorReason::Input + return -1; + } + public void stop() { Log.d(TAG, "Stopping TTS"); diff --git a/src/plugins/tts/android/src/CMakeLists.txt b/src/plugins/tts/android/src/CMakeLists.txt index 6ba9172..538a435 100644 --- a/src/plugins/tts/android/src/CMakeLists.txt +++ b/src/plugins/tts/android/src/CMakeLists.txt @@ -11,6 +11,7 @@ qt_internal_add_plugin(QTextToSpeechEngineAndroid Qt::Core Qt::CorePrivate Qt::TextToSpeech + Qt::Multimedia ) add_dependencies(QTextToSpeechEngineAndroid QtAndroidTextToSpeech) diff --git 
a/src/plugins/tts/android/src/android_plugin.json b/src/plugins/tts/android/src/android_plugin.json index 099e0a8..d478826 100644 --- a/src/plugins/tts/android/src/android_plugin.json +++ b/src/plugins/tts/android/src/android_plugin.json @@ -5,6 +5,7 @@ "Priority": 100, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/android/src/qtexttospeech_android.cpp b/src/plugins/tts/android/src/qtexttospeech_android.cpp index b627385..3c2eae8 100644 --- a/src/plugins/tts/android/src/qtexttospeech_android.cpp +++ b/src/plugins/tts/android/src/qtexttospeech_android.cpp @@ -67,6 +67,58 @@ static void notifyRangeStart(JNIEnv *env, jobject thiz, jlong id, jint start, ji } Q_DECLARE_JNI_NATIVE_METHOD(notifyRangeStart) +static void notifyBeginSynthesis(JNIEnv *env, jobject thiz, jlong id, int sampleRateInHz, int audioFormat, int channelCount) +{ + Q_UNUSED(env); + Q_UNUSED(thiz); + + QTextToSpeechEngineAndroid *const tts = (*textToSpeechMap)[id]; + if (!tts) + return; + + QAudioFormat format; + format.setSampleRate(sampleRateInHz); + format.setSampleFormat(QAudioFormat::SampleFormat(audioFormat)); + format.setChannelCount(channelCount); + + QMetaObject::invokeMethod(tts, "processNotifyBeginSynthesis", Qt::AutoConnection, + Q_ARG(QAudioFormat, format)); +} +Q_DECLARE_JNI_NATIVE_METHOD(notifyBeginSynthesis) + +static void notifyAudioAvailable(JNIEnv *env, jobject thiz, jlong id, jbyteArray bytes) +{ + Q_UNUSED(thiz); + + QTextToSpeechEngineAndroid *const tts = (*textToSpeechMap)[id]; + if (!tts) + return; + + const auto sz = env->GetArrayLength(bytes); + QByteArray byteArray(sz, Qt::Initialization::Uninitialized); + env->GetByteArrayRegion(bytes, 0, sz, reinterpret_cast<jbyte *>(byteArray.data())); + + QMetaObject::invokeMethod(tts, "processNotifyAudioAvailable", Qt::AutoConnection, + Q_ARG(QByteArray, byteArray)); +} +Q_DECLARE_JNI_NATIVE_METHOD(notifyAudioAvailable) + +static void notifyEndSynthesis(JNIEnv 
*env, jobject thiz, jlong id) +{ + Q_UNUSED(env); + Q_UNUSED(thiz); + + QTextToSpeechEngineAndroid *const tts = (*textToSpeechMap)[id]; + if (!tts) + return; + + // Queued so that pending processNotifyAudioAvailable + // invocations get processed first. + QMetaObject::invokeMethod(tts, "processNotifyReady", Qt::QueuedConnection); +} +Q_DECLARE_JNI_NATIVE_METHOD(notifyEndSynthesis) + + Q_DECL_EXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void */*reserved*/) { static bool initialized = false; @@ -95,6 +147,9 @@ Q_DECL_EXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void */*reserved*/) Q_JNI_NATIVE_METHOD(notifyReady), Q_JNI_NATIVE_METHOD(notifySpeaking), Q_JNI_NATIVE_METHOD(notifyRangeStart), + Q_JNI_NATIVE_METHOD(notifyBeginSynthesis), + Q_JNI_NATIVE_METHOD(notifyAudioAvailable), + Q_JNI_NATIVE_METHOD(notifyEndSynthesis), })) { return JNI_ERR; } @@ -134,6 +189,28 @@ void QTextToSpeechEngineAndroid::say(const QString &text) m_speech.callMethod<void>("say", QJniObject::fromString(m_text).object<jstring>()); } +void QTextToSpeechEngineAndroid::synthesize(const QString &text) +{ + if (text.isEmpty()) + return; + + m_errorReason = QTextToSpeech::ErrorReason::NoError; + m_text = text; + m_speech.callMethod<int>("synthesize", QJniObject::fromString(m_text).object<jstring>()); +} + +void QTextToSpeechEngineAndroid::processNotifyBeginSynthesis(const QAudioFormat &format) +{ + m_format = format; + setState(QTextToSpeech::Synthesizing); +} + +void QTextToSpeechEngineAndroid::processNotifyAudioAvailable(const QByteArray &bytes) +{ + Q_ASSERT(m_format.isValid()); + emit synthesized(m_format, bytes); +} + QTextToSpeech::State QTextToSpeechEngineAndroid::state() const { return m_state; @@ -179,6 +256,8 @@ void QTextToSpeechEngineAndroid::setError(QTextToSpeech::ErrorReason reason, con void QTextToSpeechEngineAndroid::processNotifyReady() { + if (m_state == QTextToSpeech::Synthesizing) + m_format = {}; if (m_state != QTextToSpeech::Paused) setState(QTextToSpeech::Ready); } @@ -232,6 +311,7 
@@ void QTextToSpeechEngineAndroid::stop(QTextToSpeech::BoundaryHint boundaryHint) m_speech.callMethod<void>("stop"); setState(QTextToSpeech::Ready); + m_format = {}; } void QTextToSpeechEngineAndroid::pause(QTextToSpeech::BoundaryHint boundaryHint) diff --git a/src/plugins/tts/android/src/qtexttospeech_android.h b/src/plugins/tts/android/src/qtexttospeech_android.h index 9569a8f..c86ffb1 100644 --- a/src/plugins/tts/android/src/qtexttospeech_android.h +++ b/src/plugins/tts/android/src/qtexttospeech_android.h @@ -26,6 +26,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -48,6 +49,8 @@ public Q_SLOTS: void processNotifyError(int reason); void processNotifySpeaking(); void processNotifyRangeStart(int start, int end, int frame); + void processNotifyBeginSynthesis(const QAudioFormat &format); + void processNotifyAudioAvailable(const QByteArray &bytes); private: void setState(QTextToSpeech::State state); @@ -59,6 +62,7 @@ private: QTextToSpeech::ErrorReason m_errorReason = QTextToSpeech::ErrorReason::Initialization; QString m_errorString; QString m_text; + QAudioFormat m_format; }; Q_DECLARE_JNI_CLASS(QtTextToSpeech, "org/qtproject/qt/android/speech/QtTextToSpeech") diff --git a/src/plugins/tts/darwin/CMakeLists.txt b/src/plugins/tts/darwin/CMakeLists.txt index 9c2cf2f..c16e8f0 100644 --- a/src/plugins/tts/darwin/CMakeLists.txt +++ b/src/plugins/tts/darwin/CMakeLists.txt @@ -13,5 +13,6 @@ qt_internal_add_plugin(QTextToSpeechDarwinPlugin Qt::Core Qt::Gui Qt::TextToSpeech + Qt::Multimedia ${FWAVFoundation} ) diff --git a/src/plugins/tts/darwin/darwin_plugin.json b/src/plugins/tts/darwin/darwin_plugin.json index 52e0087..98b428a 100644 --- 
a/src/plugins/tts/darwin/darwin_plugin.json +++ b/src/plugins/tts/darwin/darwin_plugin.json @@ -5,6 +5,7 @@ "Priority": 100, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/darwin/qtexttospeech_darwin.h b/src/plugins/tts/darwin/qtexttospeech_darwin.h index ab6a713..cb07732 100644 --- a/src/plugins/tts/darwin/qtexttospeech_darwin.h +++ b/src/plugins/tts/darwin/qtexttospeech_darwin.h @@ -10,6 +10,7 @@ Q_FORWARD_DECLARE_OBJC_CLASS(AVSpeechSynthesizer); Q_FORWARD_DECLARE_OBJC_CLASS(AVSpeechSynthesisVoice); +Q_FORWARD_DECLARE_OBJC_CLASS(AVSpeechUtterance); QT_BEGIN_NAMESPACE @@ -24,6 +25,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -48,12 +50,14 @@ private: AVSpeechSynthesisVoice *fromQVoice(const QVoice &voice) const; QVoice toQVoice(AVSpeechSynthesisVoice *avVoice) const; void setError(QTextToSpeech::ErrorReason reason, const QString &string); + AVSpeechUtterance *prepareUtterance(const QString &text); AVSpeechSynthesizer *m_speechSynthesizer; QVoice m_voice; QTextToSpeech::State m_state = QTextToSpeech::Error; QTextToSpeech::ErrorReason m_errorReason = QTextToSpeech::ErrorReason::Initialization; QString m_errorString; + QAudioFormat m_format; double m_pitch = 0.0; double m_actualPitch = 1.0; diff --git a/src/plugins/tts/darwin/qtexttospeech_darwin.mm b/src/plugins/tts/darwin/qtexttospeech_darwin.mm index ca24165..8446687 100644 --- a/src/plugins/tts/darwin/qtexttospeech_darwin.mm +++ b/src/plugins/tts/darwin/qtexttospeech_darwin.mm @@ -6,6 +6,7 @@ #include "qtexttospeech_darwin.h" #include <QtCore/QCoreApplication> +#include <QtMultimedia/QAudioFormat> @interface 
QDarwinSpeechSynthesizerDelegate : NSObject <AVSpeechSynthesizerDelegate> @end @@ -112,10 +113,8 @@ QTextToSpeechEngineDarwin::~QTextToSpeechEngineDarwin() [m_speechSynthesizer release]; } -void QTextToSpeechEngineDarwin::say(const QString &text) +AVSpeechUtterance *QTextToSpeechEngineDarwin::prepareUtterance(const QString &text) { - stop(QTextToSpeech::BoundaryHint::Default); - // Qt pitch: [-1.0, 1.0], 0 is normal // AVF range: [0.5, 2.0], 1.0 is normal const double desiredPitch = 1.0 + (m_pitch >= 0 ? m_pitch : (m_pitch * 0.5)); @@ -158,9 +157,79 @@ void QTextToSpeechEngineDarwin::say(const QString &text) utterance.volume = m_volume; utterance.voice = fromQVoice(m_voice); + return utterance; +} + +void QTextToSpeechEngineDarwin::say(const QString &text) +{ + stop(QTextToSpeech::BoundaryHint::Default); + + AVSpeechUtterance *utterance = prepareUtterance(text); [m_speechSynthesizer speakUtterance:utterance]; } +void QTextToSpeechEngineDarwin::synthesize(const QString &text) +{ + AVSpeechUtterance *utterance = prepareUtterance(text); + m_format = {}; + + const auto bufferCallback = ^(AVAudioBuffer *buffer){ + setState(QTextToSpeech::Synthesizing); + + if (!m_format.isValid()) { + const AVAudioFormat *format = buffer.format; + if (format.channelCount == 1) + m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); + else + m_format.setChannelCount(format.channelCount); + m_format.setSampleRate(format.sampleRate); + m_format.setSampleFormat([&format]{ + switch (format.commonFormat) { + case AVAudioPCMFormatFloat32: + return QAudioFormat::Float; + case AVAudioPCMFormatInt16: + return QAudioFormat::Int16; + case AVAudioPCMFormatInt32: + return QAudioFormat::Int32; + case AVAudioPCMFormatFloat64: + return QAudioFormat::Unknown; + case AVAudioOtherFormat: { + const id bitKey = format.settings[@"AVLinearPCMBitDepthKey"]; + const id isFloatKey = format.settings[@"AVLinearPCMIsFloatKey"]; + if ([isFloatKey isEqual:@(YES)]) { + if ([bitKey isEqual:@(32)]) + return 
QAudioFormat::Float; + } else if ([bitKey isEqual:@(8)]) { + return QAudioFormat::UInt8; + } else if ([bitKey isEqual:@(16)]) { + return QAudioFormat::Int16; + } else if ([bitKey isEqual:@(32)]) { + return QAudioFormat::Int32; + } + break; + } + default: + break; + } + return QAudioFormat::Unknown; + }()); + if (!m_format.isValid()) + qWarning() << "Audio arrived with invalid format:" << format.settings; + } + + const AudioBufferList *bufferList = buffer.audioBufferList; + for (UInt32 i = 0; i < bufferList->mNumberBuffers; ++i) { + const AudioBuffer &buffer = bufferList->mBuffers[i]; + // we expect all buffers to have the same number of channels + if (int(buffer.mNumberChannels) != m_format.channelCount()) + continue; + emit synthesized(m_format, QByteArray::fromRawData(static_cast<const char *>(buffer.mData), buffer.mDataByteSize)); + } + }; + [m_speechSynthesizer writeUtterance:utterance + toBufferCallback:bufferCallback]; +} + void QTextToSpeechEngineDarwin::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); @@ -321,6 +390,8 @@ void QTextToSpeechEngineDarwin::setState(QTextToSpeech::State state) return; m_state = state; + if (m_state == QTextToSpeech::Ready) + m_format = {}; emit stateChanged(m_state); } diff --git a/src/plugins/tts/flite/flite_plugin.json b/src/plugins/tts/flite/flite_plugin.json index 902f5d3..b9fd6ef 100644 --- a/src/plugins/tts/flite/flite_plugin.json +++ b/src/plugins/tts/flite/flite_plugin.json @@ -5,6 +5,7 @@ "Priority": 50, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/flite/qtexttospeech_flite.cpp b/src/plugins/tts/flite/qtexttospeech_flite.cpp index 76c28ed..5925aea 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite.cpp +++ b/src/plugins/tts/flite/qtexttospeech_flite.cpp @@ -33,6 +33,8 @@ QTextToSpeechEngineFlite::QTextToSpeechEngineFlite(const QVariantMap ¶meters &QTextToSpeechEngineFlite::setError); connect(m_processor.get(), 
&QTextToSpeechProcessorFlite::sayingWord, this, &QTextToSpeechEngine::sayingWord); + connect(m_processor.get(), &QTextToSpeechProcessorFlite::synthesized, this, + &QTextToSpeechEngine::synthesized); // Read voices from processor before moving it to a separate thread const QList<QTextToSpeechProcessorFlite::VoiceInfo> voices = m_processor->voices(); @@ -83,6 +85,13 @@ void QTextToSpeechEngineFlite::say(const QString &text) Q_ARG(double, rate()), Q_ARG(double, volume())); } +void QTextToSpeechEngineFlite::synthesize(const QString &text) +{ + QMetaObject::invokeMethod(m_processor.get(), "synthesize", Qt::QueuedConnection, Q_ARG(QString, text), + Q_ARG(int, voiceData(voice()).toInt()), Q_ARG(double, pitch()), + Q_ARG(double, rate()), Q_ARG(double, volume())); +} + void QTextToSpeechEngineFlite::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git a/src/plugins/tts/flite/qtexttospeech_flite.h b/src/plugins/tts/flite/qtexttospeech_flite.h index a9f8522..5e1c094 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite.h +++ b/src/plugins/tts/flite/qtexttospeech_flite.h @@ -27,6 +27,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp b/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp index b45a8d8..3c0d974 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp +++ b/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp @@ -41,8 +41,8 @@ void QTextToSpeechProcessorFlite::startTokenTimer() m_tokenTimer.start(qMax(token.startTime - playedTime, 0), Qt::PreciseTimer, this); } -int QTextToSpeechProcessorFlite::fliteOutputCb(const cst_wave *w, int start, int size, - int 
last, cst_audio_streaming_info *asi) +int QTextToSpeechProcessorFlite::audioOutputCb(const cst_wave *w, int start, int size, + int last, cst_audio_streaming_info *asi) { QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); if (processor) { @@ -72,13 +72,13 @@ int QTextToSpeechProcessorFlite::fliteOutputCb(const cst_wave *w, int start, int } asi->item = item_next(asi->item); } - return processor->fliteOutput(w, start, size, last, asi); + return processor->audioOutput(w, start, size, last, asi); } return CST_AUDIO_STREAM_STOP; } -int QTextToSpeechProcessorFlite::fliteOutput(const cst_wave *w, int start, int size, - int last, cst_audio_streaming_info *asi) +int QTextToSpeechProcessorFlite::audioOutput(const cst_wave *w, int start, int size, + int last, cst_audio_streaming_info *asi) { Q_UNUSED(asi); Q_ASSERT(QThread::currentThread() == thread()); @@ -87,14 +87,19 @@ int QTextToSpeechProcessorFlite::fliteOutput(const cst_wave *w, int start, int s if (start == 0 && !initAudio(w->sample_rate, w->num_channels)) return CST_AUDIO_STREAM_STOP; - int bytesToWrite = size * sizeof(short); - QString errorString; - if (!audioOutput((const char *)(&w->samples[start]), bytesToWrite, errorString)) { - setError(QTextToSpeech::ErrorReason::Playback, errorString); + const qsizetype bytesToWrite = size * sizeof(short); + + if (!m_audioBuffer->write(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite)) { + setError(QTextToSpeech::ErrorReason::Playback, + QCoreApplication::translate("QTextToSpeech", "Audio streaming error.")); stop(); return CST_AUDIO_STREAM_STOP; } + // Stats for debugging + ++numberChunks; + totalBytes += bytesToWrite; + if (last == 1) { qCDebug(lcSpeechTtsFlite) << "last data chunk written"; m_audioBuffer->close(); @@ -102,6 +107,41 @@ int QTextToSpeechProcessorFlite::fliteOutput(const cst_wave *w, int start, int s return CST_AUDIO_STREAM_CONT; } +int QTextToSpeechProcessorFlite::dataOutputCb(const cst_wave 
*w, int start, int size, + int last, cst_audio_streaming_info *asi) +{ + QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); + if (processor) + return processor->dataOutput(w, start, size, last, asi); + return CST_AUDIO_STREAM_STOP; +} + +int QTextToSpeechProcessorFlite::dataOutput(const cst_wave *w, int start, int size, + int last, cst_audio_streaming_info *) +{ + if (start == 0) + emit stateChanged(QTextToSpeech::Synthesizing); + + QAudioFormat format; + if (w->num_channels == 1) + format.setChannelConfig(QAudioFormat::ChannelConfigMono); + else + format.setChannelCount(w->num_channels); + format.setSampleRate(w->sample_rate); + format.setSampleFormat(QAudioFormat::Int16); + + if (!format.isValid()) + return CST_AUDIO_STREAM_STOP; + + const qsizetype bytesToWrite = size * format.bytesPerSample(); + emit synthesized(format, QByteArray(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite)); + + if (last == 1) + emit stateChanged(QTextToSpeech::Ready); + + return CST_AUDIO_STREAM_CONT; +} + void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event) { if (event->timerId() != m_tokenTimer.timerId()) { @@ -121,22 +161,7 @@ void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event) startTokenTimer(); } -bool QTextToSpeechProcessorFlite::audioOutput(const char *data, qint64 dataSize, QString &errorString) -{ - // Send data - if (!m_audioBuffer->write(data, dataSize)) { - errorString = QCoreApplication::translate("QTextToSpeech", "Audio streaming error."); - return false; - } - - // Stats for debugging - ++numberChunks; - totalBytes += dataSize; - - return true; -} - -void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate) +void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler) { qCDebug(lcSpeechTtsFlite) << "processText() begin"; if (!checkVoice(voiceId)) @@ -150,7 
+175,7 @@ void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, const VoiceInfo &voiceInfo = m_voices.at(voiceId); cst_voice *voice = voiceInfo.vox; cst_audio_streaming_info *asi = new_audio_streaming_info(); - asi->asc = QTextToSpeechProcessorFlite::fliteOutputCb; + asi->asc = outputHandler; asi->userdata = (void *)this; feat_set(voice->features, "streaming_info", audio_streaming_info_val(asi)); setRateForVoice(voice, rate); @@ -493,7 +518,19 @@ void QTextToSpeechProcessorFlite::say(const QString &text, int voiceId, double p return; m_volume = volume; - processText(text, voiceId, pitch, rate); + processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::audioOutputCb); +} + +void QTextToSpeechProcessorFlite::synthesize(const QString &text, int voiceId, double pitch, double rate, double volume) +{ + if (text.isEmpty()) + return; + + if (!checkVoice(voiceId)) + return; + + m_volume = volume; + processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::dataOutputCb); } QT_END_NAMESPACE diff --git a/src/plugins/tts/flite/qtexttospeech_flite_processor.h b/src/plugins/tts/flite/qtexttospeech_flite_processor.h index 8a54b44..0da6987 100644 --- a/src/plugins/tts/flite/qtexttospeech_flite_processor.h +++ b/src/plugins/tts/flite/qtexttospeech_flite_processor.h @@ -44,6 +44,7 @@ public: }; Q_INVOKABLE void say(const QString &text, int voiceId, double pitch, double rate, double volume); + Q_INVOKABLE void synthesize(const QString &text, int voiceId, double pitch, double rate, double volume); Q_INVOKABLE void pause(); Q_INVOKABLE void resume(); Q_INVOKABLE void stop(); @@ -52,16 +53,18 @@ public: static constexpr QTextToSpeech::State audioStateToTts(QAudio::State audioState); private: - // Process a single text - void processText(const QString &text, int voiceId, double pitch, double rate); - // Flite callbacks - static int fliteOutputCb(const cst_wave *w, int start, int size, + static int audioOutputCb(const cst_wave *w, int 
start, int size, + int last, cst_audio_streaming_info *asi); + static int dataOutputCb(const cst_wave *w, int start, int size, int last, cst_audio_streaming_info *asi); - int fliteOutput(const cst_wave *w, int start, int size, - int last, cst_audio_streaming_info *asi); - bool audioOutput(const char *data, qint64 dataSize, QString &errorString); + using OutputHandler = decltype(QTextToSpeechProcessorFlite::audioOutputCb); + // Process a single text + void processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler); + int audioOutput(const cst_wave *w, int start, int size, int last, cst_audio_streaming_info *asi); + int dataOutput(const cst_wave *w, int start, int size, int last, cst_audio_streaming_info *asi); + void setRateForVoice(cst_voice *voice, float rate); void setPitchForVoice(cst_voice *voice, float pitch); @@ -85,6 +88,7 @@ Q_SIGNALS: void errorOccurred(QTextToSpeech::ErrorReason error, const QString &errorString); void stateChanged(QTextToSpeech::State); void sayingWord(qsizetype begin, qsizetype length); + void synthesized(const QAudioFormat &format, const QByteArray &array); protected: void timerEvent(QTimerEvent *event) override; diff --git a/src/plugins/tts/macos/CMakeLists.txt b/src/plugins/tts/macos/CMakeLists.txt index ccf66d6..3e7cd70 100644 --- a/src/plugins/tts/macos/CMakeLists.txt +++ b/src/plugins/tts/macos/CMakeLists.txt @@ -12,4 +12,5 @@ qt_internal_add_plugin(QTextToSpeechMacOSPlugin Qt::Core Qt::Gui Qt::TextToSpeech + Qt::Multimedia ) diff --git a/src/plugins/tts/macos/qtexttospeech_macos.h b/src/plugins/tts/macos/qtexttospeech_macos.h index 231351b..6b707c0 100644 --- a/src/plugins/tts/macos/qtexttospeech_macos.h +++ b/src/plugins/tts/macos/qtexttospeech_macos.h @@ -30,6 +30,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void 
stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/macos/qtexttospeech_macos.mm b/src/plugins/tts/macos/qtexttospeech_macos.mm index b08296a..bedb414 100644 --- a/src/plugins/tts/macos/qtexttospeech_macos.mm +++ b/src/plugins/tts/macos/qtexttospeech_macos.mm @@ -150,6 +150,11 @@ void QTextToSpeechEngineMacOS::say(const QString &text) speaking(); } +void QTextToSpeechEngineMacOS::synthesize(const QString &) +{ + setError(QTextToSpeech::ErrorReason::Configuration, tr("Synthesize not supported")); +} + void QTextToSpeechEngineMacOS::stop(QTextToSpeech::BoundaryHint boundaryHint) { if (speechSynthesizer.isSpeaking || m_state == QTextToSpeech::Paused) { diff --git a/src/plugins/tts/mock/mock_plugin.json b/src/plugins/tts/mock/mock_plugin.json index 43c0d9e..7785016 100644 --- a/src/plugins/tts/mock/mock_plugin.json +++ b/src/plugins/tts/mock/mock_plugin.json @@ -5,6 +5,7 @@ "Priority": -1, "Capabilities": [ "Speak", + "Synthesize", "WordByWordProgress" ] } diff --git a/src/plugins/tts/mock/qtexttospeech_mock.cpp b/src/plugins/tts/mock/qtexttospeech_mock.cpp index d3b8967..1118f50 100644 --- a/src/plugins/tts/mock/qtexttospeech_mock.cpp +++ b/src/plugins/tts/mock/qtexttospeech_mock.cpp @@ -69,6 +69,19 @@ void QTextToSpeechEngineMock::say(const QString &text) emit stateChanged(m_state); } +void QTextToSpeechEngineMock::synthesize(const QString &text) +{ + m_text = text; + m_currentIndex = 0; + m_timer.start(wordTime(), Qt::PreciseTimer, this); + m_state = QTextToSpeech::Synthesizing; + emit stateChanged(m_state); + + m_format.setSampleRate(22050); + m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); + m_format.setSampleFormat(QAudioFormat::Int16); +} + void QTextToSpeechEngineMock::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); @@ -112,7 +125,7 @@ void QTextToSpeechEngineMock::timerEvent(QTimerEvent *e) return; } - 
Q_ASSERT(m_state == QTextToSpeech::Speaking); + Q_ASSERT(m_state == QTextToSpeech::Speaking || m_state == QTextToSpeech::Synthesizing); Q_ASSERT(m_text.length()); // Find start of next word, skipping punctuations. This is good enough for testing. @@ -124,6 +137,8 @@ void QTextToSpeechEngineMock::timerEvent(QTimerEvent *e) sayingWord(m_currentIndex, nextSpace - m_currentIndex); m_currentIndex = nextSpace + match.captured().length(); + emit synthesized(m_format, QByteArray(m_format.bytesForDuration(wordTime() * 1000), 0)); + if (m_currentIndex >= m_text.length()) { // done speaking all words m_timer.stop(); diff --git a/src/plugins/tts/mock/qtexttospeech_mock.h b/src/plugins/tts/mock/qtexttospeech_mock.h index 0511416..dece5f4 100644 --- a/src/plugins/tts/mock/qtexttospeech_mock.h +++ b/src/plugins/tts/mock/qtexttospeech_mock.h @@ -21,6 +21,7 @@ public: QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -59,6 +60,7 @@ private: QString m_errorString; bool m_pauseRequested = false; qsizetype m_currentIndex = -1; + QAudioFormat m_format; }; QT_END_NAMESPACE diff --git a/src/plugins/tts/sapi/qtexttospeech_sapi.cpp b/src/plugins/tts/sapi/qtexttospeech_sapi.cpp index 00b2e84..0e92547 100644 --- a/src/plugins/tts/sapi/qtexttospeech_sapi.cpp +++ b/src/plugins/tts/sapi/qtexttospeech_sapi.cpp @@ -22,6 +22,7 @@ QT_BEGIN_NAMESPACE #ifdef Q_CC_MINGW // from sphelper.h static const GUID CLSD_SpVoice = {0x96749377, 0x3391, 0x11d2,{0x9e, 0xe3, 0x0, 0xc0, 0x4f, 0x79, 0x73, 0x96}}; +const GUID SPDFID_WaveFormatEx = {0xC31ADBAE, 0x527F, 0x4ff5,{0xA2, 0x30, 0xF6, 0x2B, 0xB6, 0x1F, 0xF7, 0x0C}}; static inline HRESULT SpGetTokenFromId(const WCHAR *pszTokenId, ISpObjectToken **cpToken, BOOL fCreateIfNotExist = FALSE) { @@ -54,7 +55,6 @@ inline void 
SpClearEvent(SPEVENT *pe) break; } } - #endif // Q_CC_MINGW QTextToSpeechEngineSapi::QTextToSpeechEngineSapi(const QVariantMap &, QObject *) @@ -117,6 +117,123 @@ void QTextToSpeechEngineSapi::say(const QString &text) QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure.")); } +void QTextToSpeechEngineSapi::synthesize(const QString &text) +{ + class OutputStream : public ISpStreamFormat + { + ULONG m_ref = 1; + qint64 m_pos = 0; + qint64 m_length = 0; + QTextToSpeechEngineSapi *m_engine = nullptr; + QAudioFormat m_format; + + public: + OutputStream(QTextToSpeechEngineSapi *engine) + : m_engine(engine) + { + m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); + m_format.setSampleRate(16000); + m_format.setSampleFormat(QAudioFormat::Int16); + } + virtual ~OutputStream() = default; + + // IUnknown + ULONG AddRef() override { return ++m_ref; } + ULONG Release() override { + if (!--m_ref) { + delete this; + return 0; + } + return m_ref; + } + + HRESULT QueryInterface(REFIID riid, VOID **ppvInterface) override + { + if (!ppvInterface) + return E_POINTER; + + if (riid == __uuidof(IUnknown)) { + *ppvInterface = static_cast<IUnknown*>(this); + } else if (riid == __uuidof(IStream)) { + *ppvInterface = static_cast<IStream *>(this); + } else if (riid == __uuidof(ISpStreamFormat)) { + *ppvInterface = static_cast<ISpStreamFormat *>(this); + } else { + *ppvInterface = nullptr; + return E_NOINTERFACE; + } + AddRef(); + return S_OK; + } + + // IStream + HRESULT Read(void *,ULONG,ULONG *) override { return E_NOTIMPL; } + HRESULT Write(const void *pv,ULONG cb,ULONG *pcbWritten) override + { + emit m_engine->synthesized(m_format, QByteArray(static_cast<const char *>(pv), cb)); + *pcbWritten = cb; + return S_OK; + } + HRESULT Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER *plibNewPosition) override + { + qint64 move = dlibMove.QuadPart; + switch (dwOrigin) { + case STREAM_SEEK_SET: + m_pos = move; + break; + case STREAM_SEEK_CUR: + m_pos += 
move; + break; + case STREAM_SEEK_END: + m_pos = m_length + move; + break; + } + (*plibNewPosition).QuadPart = m_pos; + return S_OK; + } + HRESULT SetSize(ULARGE_INTEGER) override { return E_NOTIMPL; } + HRESULT CopyTo(IStream *,ULARGE_INTEGER,ULARGE_INTEGER *,ULARGE_INTEGER *) override { return E_NOTIMPL; } + HRESULT Commit(DWORD) override { return E_NOTIMPL; } + HRESULT Revert(void) override { return E_NOTIMPL; } + HRESULT LockRegion(ULARGE_INTEGER,ULARGE_INTEGER,DWORD) override { return E_NOTIMPL; } + HRESULT UnlockRegion(ULARGE_INTEGER,ULARGE_INTEGER,DWORD) override { return E_NOTIMPL; } + HRESULT Stat(STATSTG *,DWORD) override { return E_NOTIMPL; } + HRESULT Clone(IStream **) override { return E_NOTIMPL; } + + // ISpStreamFormat + HRESULT GetFormat(GUID *pguidFormatId,WAVEFORMATEX **ppCoMemWaveFormatEx) override + { + *pguidFormatId = SPDFID_WaveFormatEx; + WAVEFORMATEX *format = static_cast<WAVEFORMATEX *>(CoTaskMemAlloc(sizeof(WAVEFORMATEX))); + format->wFormatTag = WAVE_FORMAT_PCM; + format->nChannels = m_format.channelCount(); + format->nSamplesPerSec = m_format.sampleRate(); + format->wBitsPerSample = m_format.bytesPerSample() * 8; + format->nBlockAlign = format->nChannels * format->wBitsPerSample / 8; + format->nAvgBytesPerSec = format->nSamplesPerSec * format->nBlockAlign; + format->cbSize = 0; // amount of extra format information + + *ppCoMemWaveFormatEx = format; + return S_OK; + } + }; + + if (text.isEmpty()) + return; + + currentText = text; + const QString prefix = u"<pitch absmiddle=\"%1\"/>"_qs.arg(m_pitch * 10); + textOffset = prefix.length(); + currentText.prepend(prefix); + + OutputStream *outputStream = new OutputStream(this); + m_voice->SetOutput(outputStream, false); + HRESULT hr = m_voice->Speak(currentText.toStdWString().data(), SPF_ASYNC, NULL); + if (!SUCCEEDED(hr)) + setError(QTextToSpeech::ErrorReason::Input, + QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure.")); +} + void 
QTextToSpeechEngineSapi::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git a/src/plugins/tts/sapi/qtexttospeech_sapi.h b/src/plugins/tts/sapi/qtexttospeech_sapi.h index 6b139d2..dd0cea5 100644 --- a/src/plugins/tts/sapi/qtexttospeech_sapi.h +++ b/src/plugins/tts/sapi/qtexttospeech_sapi.h @@ -29,6 +29,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; @@ -47,6 +48,7 @@ public: QString errorString() const override; HRESULT STDMETHODCALLTYPE NotifyCallback(WPARAM /*wParam*/, LPARAM /*lParam*/) override; + friend class OutputStream; private: bool isSpeaking() const; diff --git a/src/plugins/tts/sapi/sapi_plugin.json b/src/plugins/tts/sapi/sapi_plugin.json index c16b87f..95de0bf 100644 --- a/src/plugins/tts/sapi/sapi_plugin.json +++ b/src/plugins/tts/sapi/sapi_plugin.json @@ -5,6 +5,7 @@ "Priority": 50, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp index 3026d39..f93ea1c 100644 --- a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp +++ b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.cpp @@ -151,6 +151,11 @@ void QTextToSpeechEngineSpeechd::say(const QString &text) QCoreApplication::translate("QTextToSpeech", "Text synthesizing failure.")); } +void QTextToSpeechEngineSpeechd::synthesize(const QString &) +{ + setError(QTextToSpeech::ErrorReason::Configuration, tr("Synthesize not supported")); +} + void QTextToSpeechEngineSpeechd::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git 
a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h index 1860e9d..b9aaf9f 100644 --- a/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h +++ b/src/plugins/tts/speechdispatcher/qtexttospeech_speechd.h @@ -28,6 +28,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt.cpp b/src/plugins/tts/winrt/qtexttospeech_winrt.cpp index b98ffe8..fdbaad0 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt.cpp +++ b/src/plugins/tts/winrt/qtexttospeech_winrt.cpp @@ -472,6 +472,51 @@ void QTextToSpeechEngineWinRT::say(const QString &text) }); } +void QTextToSpeechEngineWinRT::synthesize(const QString &text) +{ + Q_D(QTextToSpeechEngineWinRT); + + HRESULT hr = S_OK; + + HStringReference nativeText(reinterpret_cast<LPCWSTR>(text.utf16()), text.length()); + + ComPtr<IAsyncOperation<SpeechSynthesisStream*>> synthOperation; + hr = d->synth->SynthesizeTextToStreamAsync(nativeText.Get(), &synthOperation); + if (!SUCCEEDED(hr)) { + d->setError(QTextToSpeech::ErrorReason::Input, + QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure.")); + return; + } + + // The source will wait for the the data resulting out of the synthOperation, and emits + // streamReady when data is available. This starts a QAudioSink, which pulls the data. 
+ d->audioSource.Attach(new AudioSource(synthOperation)); + + connect(d->audioSource.Get(), &AudioSource::streamReady, this, [d, this](const QAudioFormat &format){ + if (d->state != QTextToSpeech::Synthesizing) { + d->state = QTextToSpeech::Synthesizing; + emit stateChanged(d->state); + } + }); + connect(d->audioSource.Get(), &AudioSource::readyRead, this, [d, this](){ + Q_ASSERT(d->state == QTextToSpeech::Synthesizing); + const QByteArray data = d->audioSource->read(d->audioSource->bytesAvailable()); + emit synthesized(d->audioSource->format(), data); + if (d->audioSource->atEnd()) + d->audioSource->close(); + }); + connect(d->audioSource.Get(), &AudioSource::aboutToClose, this, [d, this]{ + if (d->state != QTextToSpeech::Ready) { + d->state = QTextToSpeech::Ready; + emit stateChanged(d->state); + } + }); + connect(d->audioSource.Get(), &AudioSource::errorInStream, this, [d]{ + d->setError(QTextToSpeech::ErrorReason::Input, + QCoreApplication::translate("QTextToSpeech", "Error synthesizing audio stream.")); + }); +} + void QTextToSpeechEngineWinRT::stop(QTextToSpeech::BoundaryHint boundaryHint) { Q_UNUSED(boundaryHint); diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt.h b/src/plugins/tts/winrt/qtexttospeech_winrt.h index 39cdeab..045f8e0 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt.h +++ b/src/plugins/tts/winrt/qtexttospeech_winrt.h @@ -28,6 +28,7 @@ public: QList<QLocale> availableLocales() const override; QList<QVoice> availableVoices() const override; void say(const QString &text) override; + void synthesize(const QString &text) override; void stop(QTextToSpeech::BoundaryHint boundaryHint) override; void pause(QTextToSpeech::BoundaryHint boundaryHint) override; void resume() override; diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp index b66a78a..a680ffe 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp +++ 
b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp @@ -173,7 +173,7 @@ qint64 AudioSource::readData(char *data, qint64 maxlen) bool AudioSource::atEnd() const { // not done as long as QIODevice's buffer is not empty - if (!QIODevice::atEnd()) + if (!QIODevice::atEnd() && QIODevice::bytesAvailable()) return false; // If we get here, bytesAvailable() has returned 0, so our buffers are @@ -279,8 +279,8 @@ HRESULT AudioSource::Invoke(IAsyncOperation<SpeechSynthesisStream*> *operation, // we are buffered, but we don't want QIODevice to buffer as well open(QIODevice::ReadOnly|QIODevice::Unbuffered); - fetchMore(); emit streamReady(audioFormat); + fetchMore(); return S_OK; } diff --git a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h index 489b265..a9d5ebf 100644 --- a/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h +++ b/src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.h @@ -43,6 +43,8 @@ public: bool atEnd() const override; qint64 bytesAvailable() const override; + QAudioFormat format() const { return audioFormat; } + enum PauseState { NoPause, PauseRequested, diff --git a/src/plugins/tts/winrt/winrt_plugin.json b/src/plugins/tts/winrt/winrt_plugin.json index 5e4104b..f47e3f6 100644 --- a/src/plugins/tts/winrt/winrt_plugin.json +++ b/src/plugins/tts/winrt/winrt_plugin.json @@ -5,6 +5,7 @@ "Priority": 80, "Capabilities": [ "Speak", - "WordByWordProgress" + "WordByWordProgress", + "Synthesize" ] } diff --git a/src/tts/CMakeLists.txt b/src/tts/CMakeLists.txt index 71e3d11..b8e92ea 100644 --- a/src/tts/CMakeLists.txt +++ b/src/tts/CMakeLists.txt @@ -16,6 +16,7 @@ qt_internal_add_module(TextToSpeech Qt::QmlIntegration PUBLIC_LIBRARIES Qt::Core + Qt::Multimedia PRIVATE_MODULE_INTERFACE Qt::CorePrivate ) diff --git a/src/tts/qtexttospeech.cpp b/src/tts/qtexttospeech.cpp index e6599b7..29d2fd3 100644 --- a/src/tts/qtexttospeech.cpp +++ b/src/tts/qtexttospeech.cpp @@ -70,9 +70,13 @@ void 
QTextToSpeechPrivate::setEngineProvider(const QString &engine, const QVaria // Connect signals directly from the engine to the public API signals if (m_engine) { - QObject::connect(m_engine, &QTextToSpeechEngine::stateChanged, q, &QTextToSpeech::stateChanged); + QObject::connect(m_engine, &QTextToSpeechEngine::stateChanged, + q, [this](QTextToSpeech::State newState){ + updateState(newState); + }); QObject::connect(m_engine, &QTextToSpeechEngine::errorOccurred, q, &QTextToSpeech::errorOccurred); QObject::connect(m_engine, &QTextToSpeechEngine::sayingWord, q, &QTextToSpeech::sayingWord); + QObject::connect(m_engine, &QTextToSpeechEngine::synthesized, q, &QTextToSpeech::synthesized); } } @@ -139,6 +143,19 @@ void QTextToSpeechPrivate::loadPluginMetadata(QMultiHash<QString, QCborMap> &lis } } +void QTextToSpeechPrivate::updateState(QTextToSpeech::State newState) +{ + Q_Q(QTextToSpeech); + if (newState == QTextToSpeech::Ready && m_slotObject) { + // If we are done synthesizing and the functor-overload was used, + // clear the temporary connection. + m_slotObject->destroyIfLastRef(); + m_slotObject = nullptr; + m_engine->disconnect(m_synthesizeConnection); + } + emit q->stateChanged(newState); +} + /*! \class QTextToSpeech \brief The QTextToSpeech class provides a convenient access to text-to-speech engines. @@ -209,11 +226,13 @@ void QTextToSpeechPrivate::loadPluginMetadata(QMultiHash<QString, QCborMap> &lis \brief This enum describes the current state of the text-to-speech engine. - \value Ready The synthesizer is ready to start a new text. This is - also the state after a text was finished. - \value Speaking Text is being spoken. - \value Paused The synthesis was paused and can be resumed with \l resume(). - \value Error An error has occurred. Details are given by \l errorReason(). + \value Ready The synthesizer is ready to start a new text. This is + also the state after a text was finished. + \value Speaking Text is being spoken. 
+ \value Synthesizing Text is being synthesized into PCM data. The synthesized() + signal will be emitted with chunks of data. + \value Paused The synthesis was paused and can be resumed with \l resume(). + \value Error An error has occurred. Details are given by \l errorReason(). \sa QTextToSpeech::ErrorReason errorReason() errorString() */ @@ -382,6 +401,8 @@ QString QTextToSpeech::engine() const \value Speak The engine can play audio output from text. \value WordByWordProgress The engine emits the sayingWord() signal for each word that gets spoken. + \value Synthesize The engine can \l{synthesize()}{synthesize} PCM + audio data from text. \sa engineCapabilities() */ @@ -585,7 +606,7 @@ QString QTextToSpeech::errorString() const */ /*! - Starts synthesizing the \a text. + Starts speaking the \a text. This function starts synthesizing the speech asynchronously, and reads the text to the default audio output device. @@ -599,7 +620,7 @@ QString QTextToSpeech::errorString() const set to \l Speaking once the reading starts. When the reading is done, \l state will be set to \l Ready. - \sa stop(), pause(), resume() + \sa stop(), pause(), resume(), synthesize() */ void QTextToSpeech::say(const QString &text) { @@ -609,6 +630,114 @@ void QTextToSpeech::say(const QString &text) } /*! + Synthesizes the \a text into raw audio data. + \since 6.6 + + This function synthesizes the speech asynchronously into raw audio data. + When data is available, the \l synthesized() signal is emitted with the + bytes, and the \l {QAudioFormat}{format} that the data is in. + + The \l state property is set to \l Synthesizing when the synthesis starts, + and to \l Ready once the synthesis is finished. While synthesizing, the + synthesized() signal might be emitted multiple times, possibly with + changing values for \c format. + + \sa say(), stop() +*/ +void QTextToSpeech::synthesize(const QString &text) +{ + Q_D(QTextToSpeech); + if (d->m_engine) + d->m_engine->synthesize(text); +} + +/*! 
+ \fn template<typename Functor> void QTextToSpeech::synthesize( + const QString &text, Functor functor) + \fn template<typename Functor> void QTextToSpeech::synthesize( + const QString &text, const QObject *context, Functor functor) + \since 6.6 + + Synthesizes the \a text into raw audio data. + + This function synthesizes the speech asynchronously into raw audio data. + When data is available, the \a functor will be called as + \c {functor(const QAudioFormat &format, const QByteArray &bytes)}, with + \c format describing the \l {QAudioFormat}{format} of the data in \c bytes. + + The \l state property is set to \l Synthesizing when the synthesis starts, + and to \l Ready once the synthesis is finished. While synthesizing, the + \a functor might be called multiple times, possibly with changing values + for \c format. + + The \a functor can be a callable, like a lambda or free function, with an + optional \a context object: + + \code + tts.synthesize("Hello world", [](const QAudioFormat &format, const QByteArray &bytes){ + // process data according to format + }); + \endcode + + or a slot in the \a context object: + + \code + struct PCMProcessor : QObject + { + void processData(const QAudioFormat &format, const QByteArray &bytes) + { + // process data according to format + } + } processor; + tts.synthesize("Hello world", &processor, &PCMProcessor::processData); + \endcode + + If \a context is destroyed, then the \a functor will no longer get called. + + \note This API requires that the engine has the + \l {QTextToSpeech::Capability::}{Synthesize} capability. + + \sa say(), stop() +*/ + +/*! + \internal + + Handles the engine's synthesized() signal to call \a slotObj on the \a context + object. The slot object and the temporary connection are stored and released + in updateState() when the state of the engine transitions back to Ready. 
+*/ +void QTextToSpeech::synthesizeImpl(const QString &text, + QtPrivate::QSlotObjectBase *slotObj, const QObject *context) +{ + Q_D(QTextToSpeech); + Q_ASSERT(slotObj); + d->m_slotObject = slotObj; + const auto receive = [d, context](const QAudioFormat &format, const QByteArray &bytes){ + Q_ASSERT(d->m_slotObject); + void *args[] = {nullptr, + const_cast<QAudioFormat *>(&format), + const_cast<QByteArray *>(&bytes)}; + d->m_slotObject->call(const_cast<QObject *>(context), args); + }; + d->m_synthesizeConnection = connect(d->m_engine, &QTextToSpeechEngine::synthesized, + context ? context : this, receive); + synthesize(text); +} + +/*! + \fn void QTextToSpeech::synthesized(const QAudioFormat &format, const QByteArray &data) + + This signal is emitted when pcm \a data is available. The data is encoded in \a format. + A single call to \l synthesize() might result in several emissions of this signal. + + \note This signal requires that the engine has the + \l {QTextToSpeech::Capability::}{Synthesize} capability. + + \sa synthesize() +*/ + +/*! \qmlmethod TextToSpeech::stop(BoundaryHint boundaryHint) Stops the current reading at \a boundaryHint. 
diff --git a/src/tts/qtexttospeech.h b/src/tts/qtexttospeech.h index e98d615..3d78e0a 100644 --- a/src/tts/qtexttospeech.h +++ b/src/tts/qtexttospeech.h @@ -16,6 +16,8 @@ QT_BEGIN_NAMESPACE +class QAudioFormat; + class QTextToSpeechPrivate; class Q_TEXTTOSPEECH_EXPORT QTextToSpeech : public QObject { @@ -29,12 +31,14 @@ class Q_TEXTTOSPEECH_EXPORT QTextToSpeech : public QObject Q_PROPERTY(QVoice voice READ voice WRITE setVoice NOTIFY voiceChanged) Q_PROPERTY(Capabilities engineCapabilities READ engineCapabilities NOTIFY engineChanged) Q_DECLARE_PRIVATE(QTextToSpeech) + public: enum State { Ready, Speaking, Paused, - Error + Error, + Synthesizing }; Q_ENUM(State) @@ -59,6 +63,7 @@ public: None = 0, Speak = 1 << 0, WordByWordProgress = 1 << 1, + Synthesize = 1 << 2, }; Q_DECLARE_FLAGS(Capabilities, Capability) Q_FLAG(Capabilities) @@ -89,8 +94,71 @@ public: Q_INVOKABLE static QStringList availableEngines(); +# ifdef Q_QDOC + template <typename Functor> + void synthesize(const QString &text, Functor functor); + template <typename Functor> + void synthesize(const QString &text, const QObject *context, Functor functor); +# else + template <typename Slot> // synthesize to a QObject member function + void synthesize(const QString &text, + const typename QtPrivate::FunctionPointer<Slot>::Object *receiver, Slot slot) + { + using CallbackSignature = QtPrivate::FunctionPointer<void (*)(QAudioFormat, QByteArray)>; + using SlotSignature = QtPrivate::FunctionPointer<Slot>; + + static_assert(int(SlotSignature::ArgumentCount) <= int(CallbackSignature::ArgumentCount), + "Slot requires more arguments than what can be provided."); + static_assert((QtPrivate::CheckCompatibleArguments<typename CallbackSignature::Arguments, + typename SlotSignature::Arguments>::value), + "Slot arguments are not compatible (must be QAudioFormat, QByteArray)"); + + auto slotObj = new QtPrivate::QSlotObject<Slot, typename SlotSignature::Arguments, void>(slot); + synthesizeImpl(text, slotObj, receiver); + } 
+ + // synthesize to a functor or function pointer (with context) + template <typename Func, std::enable_if_t< + !QtPrivate::FunctionPointer<Func>::IsPointerToMemberFunction + && !std::is_same<const char *, Func>::value, bool> = true> + void synthesize(const QString &text, const QObject *context, Func func) + { + using CallbackSignature = QtPrivate::FunctionPointer<void (*)(QAudioFormat, QByteArray)>; + constexpr int MatchingArgumentCount = QtPrivate::ComputeFunctorArgumentCount< + Func, CallbackSignature::Arguments>::Value; + + static_assert(MatchingArgumentCount == 0 + || MatchingArgumentCount == CallbackSignature::ArgumentCount, + "Functor arguments are not compatible (must be QAudioFormat, QByteArray)"); + + QtPrivate::QSlotObjectBase *slotObj = nullptr; + if constexpr (MatchingArgumentCount == CallbackSignature::ArgumentCount) { + slotObj = new QtPrivate::QFunctorSlotObject<Func, 2, + typename CallbackSignature::Arguments, void>(std::move(func)); + } else if constexpr (MatchingArgumentCount == 1) { + slotObj = new QtPrivate::QFunctorSlotObject<Func, 1, + typename CallbackSignature::Arguments, void>(std::move(func)); + } else { + slotObj = new QtPrivate::QFunctorSlotObject<Func, 0, + typename QtPrivate::List_Left<void, 0>::Value, void>(std::move(func)); + } + + synthesizeImpl(text, slotObj, context); + } + + // synthesize to a functor or function pointer (without context) + template <typename Func, std::enable_if_t< + !QtPrivate::FunctionPointer<Func>::IsPointerToMemberFunction + && !std::is_same<const char *, Func>::value, bool> = true> + void synthesize(const QString &text, Func func) + { + synthesize(text, nullptr, std::move(func)); + } +# endif // Q_QDOC + public Q_SLOTS: void say(const QString &text); + void synthesize(const QString &text); void stop(QTextToSpeech::BoundaryHint boundaryHint = QTextToSpeech::BoundaryHint::Default); void pause(QTextToSpeech::BoundaryHint boundaryHint = QTextToSpeech::BoundaryHint::Default); void resume(); @@ -113,8 +181,12 
@@ Q_SIGNALS: void voiceChanged(const QVoice &voice); void sayingWord(qsizetype start, qsizetype length); + void synthesized(const QAudioFormat &format, const QByteArray &data); private: + void synthesizeImpl(const QString &text, + QtPrivate::QSlotObjectBase *slotObj, const QObject *context); + Q_DISABLE_COPY(QTextToSpeech) }; Q_DECLARE_OPERATORS_FOR_FLAGS(QTextToSpeech::Capabilities) diff --git a/src/tts/qtexttospeech_p.h b/src/tts/qtexttospeech_p.h index cf5e657..6b6a93d 100644 --- a/src/tts/qtexttospeech_p.h +++ b/src/tts/qtexttospeech_p.h @@ -40,6 +40,7 @@ public: private: bool loadMeta(); void loadPlugin(); + void updateState(QTextToSpeech::State newState); static void loadPluginMetadata(QMultiHash<QString, QCborMap> &list); QTextToSpeech *q_ptr; QTextToSpeechPlugin *m_plugin = nullptr; @@ -47,6 +48,8 @@ private: QString m_providerName; QCborMap m_metaData; static QMutex m_mutex; + QMetaObject::Connection m_synthesizeConnection; + QtPrivate::QSlotObjectBase *m_slotObject = nullptr; }; QT_END_NAMESPACE diff --git a/src/tts/qtexttospeechengine.h b/src/tts/qtexttospeechengine.h index 9fdb87b..2fc1825 100644 --- a/src/tts/qtexttospeechengine.h +++ b/src/tts/qtexttospeechengine.h @@ -9,9 +9,12 @@ #include <QtCore/QObject> #include <QtCore/QLocale> #include <QtCore/QDir> +#include <QtMultimedia/QAudioFormat> QT_BEGIN_NAMESPACE +class QAudioFormat; + class Q_TEXTTOSPEECH_EXPORT QTextToSpeechEngine : public QObject { Q_OBJECT @@ -24,6 +27,9 @@ public: virtual QList<QVoice> availableVoices() const = 0; virtual void say(const QString &text) = 0; + virtual void synthesize(const QString &text) { + Q_UNUSED(text); + }; virtual void stop(QTextToSpeech::BoundaryHint boundaryHint) = 0; virtual void pause(QTextToSpeech::BoundaryHint boundaryHint) = 0; virtual void resume() = 0; @@ -52,6 +58,7 @@ Q_SIGNALS: void errorOccurred(QTextToSpeech::ErrorReason error, const QString &errorString); void sayingWord(qsizetype start, qsizetype length); + void synthesized(const QAudioFormat 
&format, const QByteArray &data); }; QT_END_NAMESPACE diff --git a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp index 09c4f22..fd355cf 100644 --- a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp +++ b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp @@ -6,6 +6,7 @@ #include <QTextToSpeech> #include <QSignalSpy> #include <QMediaDevices> +#include <QAudioFormat> #include <QAudioDevice> #include <QOperatingSystemVersion> #include <QRegularExpression> @@ -53,6 +54,12 @@ private slots: void sayingWordWithPause_data(); void sayingWordWithPause(); + void synthesize_data(); + void synthesize(); + + void synthesizeCallback_data(); + void synthesizeCallback(); + private: static bool hasDefaultAudioOutput() { @@ -74,6 +81,13 @@ private: } } } + + void onError(QTextToSpeech::ErrorReason error, const QString &errorString) { + errorReason = error; + qCritical() << "Error:" << errorString; + } + + QTextToSpeech::ErrorReason errorReason = QTextToSpeech::ErrorReason::NoError; }; void tst_QTextToSpeech::initTestCase_data() @@ -601,5 +615,175 @@ void tst_QTextToSpeech::sayingWordWithPause() debugHelper.dismiss(); } +void tst_QTextToSpeech::synthesize_data() +{ + QTest::addColumn<QString>("text"); + + QTest::addRow("text") << "Let's synthesize some text!"; +} + +void tst_QTextToSpeech::synthesize() +{ + QFETCH_GLOBAL(QString, engine); + if (engine != "mock" && !hasDefaultAudioOutput()) + QSKIP("No audio device present"); + if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10) + QSKIP("Only testing on recent Android versions"); + + QFETCH(QString, text); + + QTextToSpeech tts(engine); + if (!(tts.engineCapabilities() & QTextToSpeech::Capability::Synthesize)) + QSKIP("This engine doesn't support synthesize()"); + + connect(&tts, &QTextToSpeech::errorOccurred, this, &tst_QTextToSpeech::onError); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + selectWorkingVoice(&tts); + + QElapsedTimer 
speechTimer; + // We can't assume that synthesis isn't done before we can check, and that we only + // have a single change during an event loop cycle, so connect to the signal + // and keep track ourselves. + bool running = false; + bool finished = false; + qint64 speechTime = 0; + connect(&tts, &QTextToSpeech::stateChanged, [&running, &finished, &speechTimer, &speechTime](QTextToSpeech::State state) { + if (state == QTextToSpeech::Synthesizing || state == QTextToSpeech::Speaking) { + speechTimer.start(); + running = true; + finished = false; + } + if (running && state == QTextToSpeech::Ready) { + if (!speechTime) + speechTime = speechTimer.elapsed(); + finished = true; + } + }); + + // first, measure how long it takes to speak the text + tts.say(text); + QTRY_VERIFY(running); + QTRY_VERIFY(finished); + + running = false; + + QAudioFormat pcmFormat; + QByteArray pcmData; + + connect(&tts, &QTextToSpeech::synthesized, + this, [&pcmFormat, &pcmData](const QAudioFormat &format, const QByteArray &bytes) { + pcmFormat = format; + pcmData += bytes; + }); + + QElapsedTimer notBlockingTimer; + notBlockingTimer.start(); + tts.synthesize(text); + QCOMPARE_LT(notBlockingTimer.elapsed(), 250); + QTRY_VERIFY(running); + QTRY_VERIFY(finished); + + QVERIFY(pcmFormat.isValid()); + // bytesForDuration takes micro seconds, we measured in milliseconds. + const qint32 bytesExpected = pcmFormat.bytesForDuration(speechTime * 1000); + + // We should have as much data as the format requires for the time it took + // to play the speech, +/- 10% as we can't measure the exact audio duration. + QCOMPARE_GE(pcmData.size(), double(bytesExpected) * 0.9); + if (engine == "flite") // flite is very unreliable + QCOMPARE_LT(pcmData.size(), double(bytesExpected) * 1.5); + else + QCOMPARE_LT(pcmData.size(), double(bytesExpected) * 1.1); +} + +/*! 
+ API test for the functor variants of synthesize(), using only the mock + engine as the engine implementation is identical to the non-functor + version tested above. +*/ +void tst_QTextToSpeech::synthesizeCallback_data() +{ + QTest::addColumn<QString>("text"); + + QTest::addRow("one") << "test"; + QTest::addRow("several") << "this will produce more than one chunk."; +} + +void tst_QTextToSpeech::synthesizeCallback() +{ + QFETCH_GLOBAL(QString, engine); + if (engine != "mock") + QSKIP("Only testing with mock engine"); + + QTextToSpeech tts(engine); + QVERIFY(tts.engineCapabilities() & QTextToSpeech::Capability::Synthesize); + + QFETCH(QString, text); + + QAudioFormat expectedFormat; + QByteArray expectedBytes; + + // record a reference using the already tested synthesized() signal + auto connection = connect(&tts, &QTextToSpeech::synthesized, + [&expectedFormat, &expectedBytes](const QAudioFormat &format, const QByteArray &bytes){ + expectedFormat = format; + expectedBytes += bytes; + }); + tts.synthesize(text); + QTRY_VERIFY(expectedFormat.isValid()); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + tts.disconnect(connection); + + struct Processor : QObject { + void process(const QAudioFormat &format, const QByteArray &bytes) + { + m_format = format; + m_allBytes += bytes; + } + void audioFormatKnown(const QAudioFormat &format) + { + m_format = format; + } + void reset() + { + m_format = {}; + m_allBytes = {}; + } + QAudioFormat m_format; + QByteArray m_allBytes; + } processor; + + // Functor without context + tts.synthesize(text, [&processor](const QAudioFormat &format, const QByteArray &bytes){ + processor.m_format = format; + processor.m_allBytes += bytes; + }); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, expectedBytes); + processor.reset(); + // Functor with context + tts.synthesize(text, &tts, [&processor](const QAudioFormat &format, const QByteArray &bytes){ + 
processor.m_format = format; + processor.m_allBytes += bytes; + }); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, expectedBytes); + processor.reset(); + // PMF + tts.synthesize(text, &processor, &Processor::process); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, expectedBytes); + processor.reset(); + // PMF with no QByteArray argument - not very useful, but Qt allows it + tts.synthesize(text, &processor, &Processor::audioFormatKnown); + QTRY_COMPARE(processor.m_format, expectedFormat); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QCOMPARE(processor.m_allBytes, QByteArray()); + processor.reset(); +} + QTEST_MAIN(tst_QTextToSpeech) #include "tst_qtexttospeech.moc" |