Add QTextToSpeech::synthesize to produce PCM data rather than audio

The function starts the synthesis as an asynchronous process, and emits a signal 'synthesized()' (or calls a functor) with a chunk of PCM data as a QByteArray, and the QAudioFormat in which the data is encoded. This requires a dependency to Qt Multimedia for Qt Speech for all platforms; it has so far been required only with flite and winrt backends. Implemented for all engines, except speechd and macos engines where it's not possible - these engines don't provide access to the data. The test case verifies that the implementation is asynchronous, and that it produces a reasonable amount of data. Since this involves timer-based measurements, values need to be compared with some appropriate margins. The QML documentation of this API is omitted on purpose; the QAudioFormat type is not available in QML, and we don't want to encourage users to operate on raw bytes from QML anyway. [ChangeLog][QtTextToSpeech][QTextToSpeech] Added the ability to produce PCM data as a QByteArray. The QtTextToSpeech module now depends on QtMultimedia on all platforms. Fixes: QTBUG-109837 Change-Id: I308a3e18998827089c0f75789b720f1bd36e3c46 Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org> Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
author: Volker Hilsheimer <volker.hilsheimer@qt.io> 2023-01-18 15:04:41 +0100
committer: Volker Hilsheimer <volker.hilsheimer@qt.io> 2023-02-19 19:36:36 +0100
commit: c03afcc297bf250baff8d0693e4db0c8cc77eeed (patch)
tree: 257c98299d9c94a3c998b13a5ef19a7d5acd1b78 /tests
parent: ea5c48e518789c3387ed9c9d21978eda122e9782 (diff)
1 files changed, 184 insertions, 0 deletions
diff --git a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp
index 09c4f22..fd355cf 100644
--- a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp
+++ b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp
@@ -6,6 +6,7 @@
 #include <QTextToSpeech>
 #include <QSignalSpy>
 #include <QMediaDevices>
+#include <QAudioFormat>
 #include <QAudioDevice>
 #include <QOperatingSystemVersion>
 #include <QRegularExpression>
@@ -53,6 +54,12 @@ private slots:
     void sayingWordWithPause_data();
     void sayingWordWithPause();
 
+    void synthesize_data();
+    void synthesize();
+
+    void synthesizeCallback_data();
+    void synthesizeCallback();
+
 private:
     static bool hasDefaultAudioOutput()
     {
@@ -74,6 +81,13 @@ private:
             }
         }
     }
+
+    void onError(QTextToSpeech::ErrorReason error, const QString &errorString) {
+        errorReason = error;
+        qCritical() << "Error:" << errorString;
+    }
+
+    QTextToSpeech::ErrorReason errorReason = QTextToSpeech::ErrorReason::NoError;
 };
 
 void tst_QTextToSpeech::initTestCase_data()
@@ -601,5 +615,175 @@ void tst_QTextToSpeech::sayingWordWithPause()
     debugHelper.dismiss();
 }
 
+void tst_QTextToSpeech::synthesize_data()
+{
+    QTest::addColumn<QString>("text");
+
+    QTest::addRow("text") << "Let's synthesize some text!";
+}
+
+void tst_QTextToSpeech::synthesize()
+{
+    QFETCH_GLOBAL(QString, engine);
+    if (engine != "mock" && !hasDefaultAudioOutput())
+        QSKIP("No audio device present");
+    if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10)
+        QSKIP("Only testing on recent Android versions");
+
+    QFETCH(QString, text);
+
+    QTextToSpeech tts(engine);
+    if (!(tts.engineCapabilities() & QTextToSpeech::Capability::Synthesize))
+        QSKIP("This engine doesn't support synthesize()");
+
+    connect(&tts, &QTextToSpeech::errorOccurred, this, &tst_QTextToSpeech::onError);
+    QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+    selectWorkingVoice(&tts);
+
+    QElapsedTimer speechTimer;
+    // We can't assume that synthesis isn't done before we can check, and that we only
+    // have a single change during an event loop cycle, so connect to the signal
+    // and keep track ourselves.
+    bool running = false;
+    bool finished = false;
+    qint64 speechTime = 0;
+    connect(&tts, &QTextToSpeech::stateChanged, [&running, &finished, &speechTimer, &speechTime](QTextToSpeech::State state) {
+        if (state == QTextToSpeech::Synthesizing || state == QTextToSpeech::Speaking) {
+            speechTimer.start();
+            running = true;
+            finished = false;
+        }
+        if (running && state == QTextToSpeech::Ready) {
+            if (!speechTime)
+                speechTime = speechTimer.elapsed();
+            finished = true;
+        }
+    });
+
+    // first, measure how long it takes to speak the text
+    tts.say(text);
+    QTRY_VERIFY(running);
+    QTRY_VERIFY(finished);
+
+    running = false;
+
+    QAudioFormat pcmFormat;
+    QByteArray pcmData;
+
+    connect(&tts, &QTextToSpeech::synthesized,
+            this, [&pcmFormat, &pcmData](const QAudioFormat &format, const QByteArray &bytes) {
+        pcmFormat = format;
+        pcmData += bytes;
+    });
+
+    QElapsedTimer notBlockingTimer;
+    notBlockingTimer.start();
+    tts.synthesize(text);
+    QCOMPARE_LT(notBlockingTimer.elapsed(), 250);
+    QTRY_VERIFY(running);
+    QTRY_VERIFY(finished);
+
+    QVERIFY(pcmFormat.isValid());
+    // bytesForDuration takes micro seconds, we measured in milliseconds.
+    const qint32 bytesExpected = pcmFormat.bytesForDuration(speechTime * 1000);
+
+    // We should have as much data as the format requires for the time it took
+    // to play the speech, +/- 10% as we can't measure the exact audio duration.
+    QCOMPARE_GE(pcmData.size(), double(bytesExpected) * 0.9);
+    if (engine == "flite") // flite is very unreliable
+        QCOMPARE_LT(pcmData.size(), double(bytesExpected) * 1.5);
+    else
+        QCOMPARE_LT(pcmData.size(), double(bytesExpected) * 1.1);
+}
+
+/*!
+    API test for the functor variants of synthesize(), using only the mock
+    engine as the engine implementation is identical to the non-functor
+    version tested above.
+*/
+void tst_QTextToSpeech::synthesizeCallback_data()
+{
+    QTest::addColumn<QString>("text");
+
+    QTest::addRow("one") << "test";
+    QTest::addRow("several") << "this will produce more than one chunk.";
+}
+
+void tst_QTextToSpeech::synthesizeCallback()
+{
+    QFETCH_GLOBAL(QString, engine);
+    if (engine != "mock")
+        QSKIP("Only testing with mock engine");
+
+    QTextToSpeech tts(engine);
+    QVERIFY(tts.engineCapabilities() & QTextToSpeech::Capability::Synthesize);
+
+    QFETCH(QString, text);
+
+    QAudioFormat expectedFormat;
+    QByteArray expectedBytes;
+
+    // record a reference using the already tested synthesized() signal
+    auto connection = connect(&tts, &QTextToSpeech::synthesized,
+            [&expectedFormat, &expectedBytes](const QAudioFormat &format, const QByteArray &bytes){
+        expectedFormat = format;
+        expectedBytes += bytes;
+    });
+    tts.synthesize(text);
+    QTRY_VERIFY(expectedFormat.isValid());
+    QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+    tts.disconnect(connection);
+
+    struct Processor : QObject {
+        void process(const QAudioFormat &format, const QByteArray &bytes)
+        {
+            m_format = format;
+            m_allBytes += bytes;
+        }
+        void audioFormatKnown(const QAudioFormat &format)
+        {
+            m_format = format;
+        }
+        void reset()
+        {
+            m_format = {};
+            m_allBytes = {};
+        }
+        QAudioFormat m_format;
+        QByteArray m_allBytes;
+    } processor;
+
+    // Functor without context
+    tts.synthesize(text, [&processor](const QAudioFormat &format, const QByteArray &bytes){
+        processor.m_format = format;
+        processor.m_allBytes += bytes;
+    });
+    QTRY_COMPARE(processor.m_format, expectedFormat);
+    QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+    QCOMPARE(processor.m_allBytes, expectedBytes);
+    processor.reset();
+    // Functor with context
+    tts.synthesize(text, &tts, [&processor](const QAudioFormat &format, const QByteArray &bytes){
+        processor.m_format = format;
+        processor.m_allBytes += bytes;
+    });
+    QTRY_COMPARE(processor.m_format, expectedFormat);
+    QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+    QCOMPARE(processor.m_allBytes, expectedBytes);
+    processor.reset();
+    // PMF
+    tts.synthesize(text, &processor, &Processor::process);
+    QTRY_COMPARE(processor.m_format, expectedFormat);
+    QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+    QCOMPARE(processor.m_allBytes, expectedBytes);
+    processor.reset();
+    // PMF with no QByteArray argument - not very useful, but Qt allows it
+    tts.synthesize(text, &processor, &Processor::audioFormatKnown);
+    QTRY_COMPARE(processor.m_format, expectedFormat);
+    QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+    QCOMPARE(processor.m_allBytes, QByteArray());
+    processor.reset();
+}
+
 QTEST_MAIN(tst_QTextToSpeech)
 #include "tst_qtexttospeech.moc"
author	Volker Hilsheimer <volker.hilsheimer@qt.io>	2023-01-18 15:04:41 +0100
committer	Volker Hilsheimer <volker.hilsheimer@qt.io>	2023-02-19 19:36:36 +0100
commit	c03afcc297bf250baff8d0693e4db0c8cc77eeed (patch)
tree	257c98299d9c94a3c998b13a5ef19a7d5acd1b78 /tests
parent	ea5c48e518789c3387ed9c9d21978eda122e9782 (diff)