summaryrefslogtreecommitdiffstats
path: root/tests
diff options
context:
space:
mode:
authorVolker Hilsheimer <volker.hilsheimer@qt.io>2022-04-09 13:27:39 +0200
committerVolker Hilsheimer <volker.hilsheimer@qt.io>2023-02-18 13:47:13 +0100
commitea5c48e518789c3387ed9c9d21978eda122e9782 (patch)
tree1962292820c408f60a0d4a047dc818c671f3ab25 /tests
parentd90b30934beb053f5380b66e9bf089e15efa4b51 (diff)
Emit information about speech progress
This is useful information in a UI that wants to visualize the progress by highlighting the words and sentences as they get read. For this to work, we ideally can emit data through a signal for each word that allows an application to map the progress information to the text that was previously passed into QTextToSpeech::say, i.e. the index and length of the word within that text. Implement this for all engines where we can, and add a test that verifies that we get correct information: On the macos and darwin backends, the delegate gets called for each word about to be spoken, with index and length of the content relative to the text. We don't get access to more detailed information, like the length of the stream in seconds or samples, or the current playback state. Android provides an equivalent listener callback that tells us which slice of the text is about to be spoken. In the WinRT backend, we can ask the speech synthesizer to generate track data for the generated audio, which gives us access to each sentence and word, with the start time for each. Since we play the PCM data ourselves, we don't get called with progress updates, but we can use the track information to run a timer that iterates over the boundaries with each tick. This has a risk of getting out of sync with the actual playback though, but we can try to compensate for that. We can use a similar strategy on flite, where the symbol tree provides start times for each token. So we can use a timer, and follow the progress through the input text for each token. On speechd we don't have reliable access to anything; it theoretically supports reporting of embedded <mark> tags when the input is SSML. So for now, speechd cannot support this functionality. Add highlighting of the spoken word to the Qt Quick example. Change-Id: I36ff208b2f0112c9eb261864515ba20c4bf55f25 Reviewed-by: Axel Spoerl <axel.spoerl@qt.io> Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Diffstat (limited to 'tests')
-rw-r--r--tests/auto/qtexttospeech/BLACKLIST2
-rw-r--r--tests/auto/qtexttospeech/tst_qtexttospeech.cpp132
2 files changed, 134 insertions, 0 deletions
diff --git a/tests/auto/qtexttospeech/BLACKLIST b/tests/auto/qtexttospeech/BLACKLIST
index 0bcef4c..716d6db 100644
--- a/tests/auto/qtexttospeech/BLACKLIST
+++ b/tests/auto/qtexttospeech/BLACKLIST
@@ -1,2 +1,4 @@
[pauseResume:darwin]
macos-13 ci # QTBUG-108205
+[sayingWordWithPause:darwin]
+macos-13 ci # QTBUG-108205
diff --git a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp
index b5907aa..09c4f22 100644
--- a/tests/auto/qtexttospeech/tst_qtexttospeech.cpp
+++ b/tests/auto/qtexttospeech/tst_qtexttospeech.cpp
@@ -8,6 +8,7 @@
#include <QMediaDevices>
#include <QAudioDevice>
#include <QOperatingSystemVersion>
+#include <QRegularExpression>
#include <qttexttospeech-config.h>
#if QT_CONFIG(speechd)
@@ -46,6 +47,12 @@ private slots:
void sayWithVoices();
void sayWithRates();
+ void sayingWord_data();
+ void sayingWord();
+
+ void sayingWordWithPause_data();
+ void sayingWordWithPause();
+
private:
static bool hasDefaultAudioOutput()
{
@@ -469,5 +476,130 @@ void tst_QTextToSpeech::sayWithRates()
logger.dismiss();
}
+void tst_QTextToSpeech::sayingWord_data()
+{
+ QTest::addColumn<QString>("text");
+
+ QTest::addRow("one word") << "supercalifragilisticexpialidocious";
+ QTest::addRow("sentence") << "this is one word.";
+ QTest::addRow("punctuation") << "this, if you want: a word!";
+ QTest::addRow("two sentences") << "First word. Second word.";
+}
+
+void tst_QTextToSpeech::sayingWord()
+{
+ QFETCH_GLOBAL(QString, engine);
+ if (engine != "mock" && !hasDefaultAudioOutput())
+ QSKIP("No audio device present");
+ if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10)
+ QSKIP("Only testing on recent Android versions");
+
+ QFETCH(QString, text);
+
+ const QStringList expectedWords = text.split(QRegularExpression("\\W"), Qt::SkipEmptyParts);
+
+ QTextToSpeech tts(engine);
+ if (!(tts.engineCapabilities() & QTextToSpeech::Capability::WordByWordProgress))
+ QSKIP("This engine doesn't support word-by-word progress");
+
+ QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+ selectWorkingVoice(&tts);
+
+ QElapsedTimer timer;
+ QStringList words;
+ QList<qint64> times;
+ connect(&tts, &QTextToSpeech::sayingWord, [&words, &times, &timer, text](qsizetype start, qsizetype length) {
+ words << text.sliced(start, length);
+ times << timer.elapsed();
+ });
+
+ timer.start();
+ tts.say(text);
+ auto debugHelper = qScopeGuard([&]{
+ qWarning() << "Recorded words:" << words;
+ qWarning() << "Expected words:" << expectedWords;
+ });
+ QTRY_COMPARE(tts.state(), QTextToSpeech::Speaking);
+ QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+ qint64 totalTime = timer.elapsed();
+
+ QCOMPARE(words, expectedWords);
+
+ // Makes sure that the last word is reported late. Engines need to warm up,
+ // and some test data has a "slow" word at the end, but empirically,
+ // 40% into the total time is reliable and still makes sure that the signal
+ // doesn't get emitted with all words immediately.
+ if (words.count() > 1)
+ QCOMPARE_GE(times.last(), totalTime * 0.4);
+
+ debugHelper.dismiss();
+}
+
+void tst_QTextToSpeech::sayingWordWithPause_data()
+{
+ QTest::addColumn<QStringList>("words");
+ QTest::addColumn<int>("pauseAt");
+
+ const QStringList words{"this", "is", "a", "sentence", "with", "words"};
+ QTest::addRow("pause1") << words << 1;
+ QTest::addRow("pause4") << words << 4;
+}
+
+void tst_QTextToSpeech::sayingWordWithPause()
+{
+ QFETCH_GLOBAL(QString, engine);
+ if (engine != "mock" && !hasDefaultAudioOutput())
+ QSKIP("No audio device present");
+ if (engine == "macos")
+ QSKIP("macos engine's pause support is faulty");
+ if (engine == "android" && QOperatingSystemVersion::current() < QOperatingSystemVersion::Android10)
+ QSKIP("Only testing on recent Android versions");
+
+ QFETCH(QStringList, words);
+ QFETCH(int, pauseAt);
+
+ const QString text = words.join(u' ');
+
+ QTextToSpeech tts(engine);
+
+ if (!(tts.engineCapabilities() & QTextToSpeech::Capability::WordByWordProgress))
+ QSKIP("This engine doesn't support word-by-word progress");
+
+ QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+ selectWorkingVoice(&tts);
+
+ QStringList spokenWords;
+ connect(&tts, &QTextToSpeech::sayingWord, [&](qsizetype start, qsizetype length) {
+ spokenWords << text.sliced(start, length);
+ if (spokenWords.size() == pauseAt)
+ tts.pause(QTextToSpeech::BoundaryHint::Word);
+ });
+
+ auto debugHelper = qScopeGuard([&]{
+ qWarning() << "Spoken words:" << spokenWords;
+ });
+
+ tts.say(text);
+ QTRY_COMPARE(tts.state(), QTextToSpeech::Paused);
+
+ // the engine might still signal us about the next word
+ QCOMPARE_LE(spokenWords.size(), words.size());
+ // wait and verify that no more words are reported
+ QTest::qWait(500);
+ QCOMPARE_LE(spokenWords.size(), pauseAt + 1);
+
+ // Resume, and make sure that all words are reported.
+ // We might get some words reported twice, depending on how
+ // the engine supports word boundaries when pausing, and how
+ // much of the text it repeats when resuming.
+ tts.resume();
+ QTRY_COMPARE(tts.state(), QTextToSpeech::Ready);
+ QTRY_COMPARE_GE(spokenWords.size(), words.size());
+ for (const auto &word : words)
+ QVERIFY(spokenWords.contains(word));
+
+ debugHelper.dismiss();
+}
+
QTEST_MAIN(tst_QTextToSpeech)
#include "tst_qtexttospeech.moc"