diff options
author | Volker Hilsheimer <volker.hilsheimer@qt.io> | 2022-04-09 13:27:39 +0200 |
---|---|---|
committer | Volker Hilsheimer <volker.hilsheimer@qt.io> | 2023-02-18 13:47:13 +0100 |
commit | ea5c48e518789c3387ed9c9d21978eda122e9782 (patch) | |
tree | 1962292820c408f60a0d4a047dc818c671f3ab25 /tests/auto | |
parent | d90b30934beb053f5380b66e9bf089e15efa4b51 (diff) |
Emit information about speech progress
This is useful information in a UI that wants to visualize the progress
by highlighting the words and sentences as they get read. For this to
work, we ideally can emit data through a signal for each word that
allows an application to map the progress information to the text that was
previously passed into QTextToSpeech::say, i.e. the index and length
of the word within that text.
Implement this for all engines where we can, and add a test that
verifies that we get correct information:
On the macos and darwin backends, the delegate gets called for each
word about to be spoken, with index and length of the content relative
to the text. We don't get access to more detailed information, like
the length of the stream in seconds or samples, or the current playback
state.
Android provides an equivalent listener callback that tells us which
slice of the text is about to be spoken.
In the WinRT backend, we can ask the speech synthesizer to generate
track data for the generated audio, which gives us access to each
sentence and word, with the start time for each. Since we play the PCM
data ourselves, we don't get called with progress updates, but we can
use the track information to run a timer that iterates over the
boundaries with each tick. This has a risk of getting out of sync with
the actual playback, but we can try to compensate for that.
We can use a similar strategy on flite, where the symbol tree provides
start times for each token. So we can use a timer, and follow the
progress through the input text for each token.
On speechd we don't have reliable access to anything; it theoretically
supports reporting of embedded <mark> tags when the input is SSML. So
for now, speechd cannot support this functionality.
Add highlighting of the spoken word to the Qt Quick example.
Change-Id: I36ff208b2f0112c9eb261864515ba20c4bf55f25
Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Diffstat (limited to 'tests/auto')
-rw-r--r-- | tests/auto/qtexttospeech/BLACKLIST | 2 | ||||
-rw-r--r-- | tests/auto/qtexttospeech/tst_qtexttospeech.cpp | 132 |
2 files changed, 134 insertions, 0 deletions
diff --git a/tests/auto/qtexttospeech/BLACKLIST b/tests/auto/qtexttospeech/BLACKLIST index 0bcef4c..716d6db 100644 --- a/tests/auto/qtexttospeech/BLACKLIST +++ b/tests/auto/qtexttospeech/BLACKLIST @@ -1,2 +1,4 @@ [pauseResume:darwin] macos-13 ci # QTBUG-108205 +[sayingWordWithPause:darwin] +macos-13 ci # QTBUG-108205 diff --git a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp index b5907aa..09c4f22 100644 --- a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp +++ b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp @@ -8,6 +8,7 @@ #include <QMediaDevices> #include <QAudioDevice> #include <QOperatingSystemVersion> +#include <QRegularExpression> #include <qttexttospeech-config.h> #if QT_CONFIG(speechd) @@ -46,6 +47,12 @@ private slots: void sayWithVoices(); void sayWithRates(); + void sayingWord_data(); + void sayingWord(); + + void sayingWordWithPause_data(); + void sayingWordWithPause(); + private: static bool hasDefaultAudioOutput() { @@ -469,5 +476,130 @@ void tst_QTextToSpeech::sayWithRates() logger.dismiss(); } +void tst_QTextToSpeech::sayingWord_data() +{ + QTest::addColumn<QString>("text"); + + QTest::addRow("one word") << "supercalifragilisticexpialidocious"; + QTest::addRow("sentence") << "this is one word."; + QTest::addRow("punctuation") << "this, if you want: a word!"; + QTest::addRow("two sentences") << "First word. 
Second word."; +} + +void tst_QTextToSpeech::sayingWord() +{ + QFETCH_GLOBAL(QString, engine); + if (engine != "mock" && !hasDefaultAudioOutput()) + QSKIP("No audio device present"); + if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10) + QSKIP("Only testing on recent Android versions"); + + QFETCH(QString, text); + + const QStringList expectedWords = text.split(QRegularExpression("\\W"), Qt::SkipEmptyParts); + + QTextToSpeech tts(engine); + if (!(tts.engineCapabilities() & QTextToSpeech::Capability::WordByWordProgress)) + QSKIP("This engine doesn't support word-by-word progress"); + + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + selectWorkingVoice(&tts); + + QElapsedTimer timer; + QStringList words; + QList<qint64> times; + connect(&tts, &QTextToSpeech::sayingWord, [&words, &times, &timer, text](qsizetype start, qsizetype length) { + words << text.sliced(start, length); + times << timer.elapsed(); + }); + + timer.start(); + tts.say(text); + auto debugHelper = qScopeGuard([&]{ + qWarning() << "Recorded words:" << words; + qWarning() << "Expected words:" << expectedWords; + }); + QTRY_COMPARE(tts.state(), QTextToSpeech::Speaking); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + qint64 totalTime = timer.elapsed(); + + QCOMPARE(words, expectedWords); + + // Makes sure that the last word is reported late. Engines need to warm up, + // and some test data has a "slow" word at the end, but empirically, + // 40% into the total time is reliable and still makes sure that the signal + // doesn't get emitted with all words immediately. 
+ if (words.count() > 1) + QCOMPARE_GE(times.last(), totalTime * 0.4); + + debugHelper.dismiss(); +} + +void tst_QTextToSpeech::sayingWordWithPause_data() +{ + QTest::addColumn<QStringList>("words"); + QTest::addColumn<int>("pauseAt"); + + const QStringList words{"this", "is", "a", "sentence", "with", "words"}; + QTest::addRow("pause1") << words << 1; + QTest::addRow("pause4") << words << 4; +} + +void tst_QTextToSpeech::sayingWordWithPause() +{ + QFETCH_GLOBAL(QString, engine); + if (engine != "mock" && !hasDefaultAudioOutput()) + QSKIP("No audio device present"); + if (engine == "macos") + QSKIP("macos engine's pause support is faulty"); + if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10) + QSKIP("Only testing on recent Android versions"); + + QFETCH(QStringList, words); + QFETCH(int, pauseAt); + + const QString text = words.join(u' '); + + QTextToSpeech tts(engine); + + if (!(tts.engineCapabilities() & QTextToSpeech::Capability::WordByWordProgress)) + QSKIP("This engine doesn't support word-by-word progress"); + + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + selectWorkingVoice(&tts); + + QStringList spokenWords; + connect(&tts, &QTextToSpeech::sayingWord, [&](qsizetype start, qsizetype length) { + spokenWords << text.sliced(start, length); + if (spokenWords.size() == pauseAt) + tts.pause(QTextToSpeech::BoundaryHint::Word); + }); + + auto debugHelper = qScopeGuard([&]{ + qWarning() << "Spoken words:" << spokenWords; + }); + + tts.say(text); + QTRY_COMPARE(tts.state(), QTextToSpeech::Paused); + + // the engine might still signal us about the next word + QCOMPARE_LE(spokenWords.size(), words.size()); + // wait and verify that no more words are reported + QTest::qWait(500); + QCOMPARE_LE(spokenWords.size(), pauseAt + 1); + + // Resume, and make sure that all words are reported. 
+ // We might get some words reported twice, depending on how + // the engine supports word boundaries when pausing, and how + // much of the text it repeats when resuming. + tts.resume(); + QTRY_COMPARE(tts.state(), QTextToSpeech::Ready); + QTRY_COMPARE_GE(spokenWords.size(), words.size()); + for (const auto &word : words) + QVERIFY(spokenWords.contains(word)); + + debugHelper.dismiss(); +} + QTEST_MAIN(tst_QTextToSpeech) #include "tst_qtexttospeech.moc" |