src/plugins/tts/winrt/qtexttospeech_winrt_audiosource.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440

// Copyright (C) 2022 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only

#include "qtexttospeech_winrt_audiosource.h"

#include <QtCore/QDebug>
#include <QtCore/QTimer>

#include <QtCore/private/qfunctions_winrt_p.h>
#include <QtCore/private/qsystemerror_p.h>

#include <robuffer.h>
#include <winrt/base.h>
#include <QtCore/private/qfactorycacheregistration_p.h>
#include <windows.foundation.h>
#include <windows.foundation.collections.h>
#include <windows.media.core.h>
#include <windows.media.speechsynthesis.h>
#include <windows.storage.streams.h>

#include <wrl.h>

using namespace ABI::Windows::Foundation;
using namespace ABI::Windows::Foundation::Collections;
using namespace ABI::Windows::Media::Core;
using namespace ABI::Windows::Media::SpeechSynthesis;
using namespace ABI::Windows::Storage::Streams;
using namespace Microsoft::WRL;
using namespace Microsoft::WRL::Wrappers;

QT_BEGIN_NAMESPACE

/*
    AudioSource implements a sequential QIODevice for a stream of synthesized speech.
    It also implements the handler interfaces for responding to the asynchronous
    operations of the synthesized speech stream being available, and reading data from
    the stream via a COM buffer.

    Whenever the QIODevice has read all the data from the COM buffer, more data is
    requested. When the data is available (the BytesReadyHandler's handler's Invoke
    implementation is called), readyRead is emitted.

    The AudioSource directly controls a QAudioSink. As soon as the data stream is
    available, the source calls QAudioSink::start. Pause/resume are delegated to the
    sink; closing the source stops the sink.
*/
AudioSource::AudioSource(ComPtr<IAsyncOperation<SpeechSynthesisStream*>> synthOperation)
    : synthOperation(synthOperation)
{
    // The stream is announced as 16 kHz, mono, signed 16-bit PCM. Set the
    // format up before registering the completion handler, so that the object
    // is fully initialized by the time the handler can be invoked (an async
    // operation that has already finished may call back from put_Completed).
    audioFormat.setSampleFormat(QAudioFormat::Int16);
    audioFormat.setSampleRate(16000);
    audioFormat.setChannelConfig(QAudioFormat::ChannelConfigMono);

    // Without an operation there is nothing to wait for; the device simply
    // never opens.
    if (!synthOperation)
        return;

    // Invoke(IAsyncOperation<SpeechSynthesisStream*> *, AsyncStatus) is called
    // once synthesis has finished.
    synthOperation->put_Completed(this);
}

/*
    Closes the source explicitly. close() is virtual and ~QIODevice calls it
    as well, but by then the call no longer reaches the AudioSource override,
    so the override has to be run from here.
*/
AudioSource::~AudioSource()
{
    // presumably the COM reference count; nobody may hold on to us anymore
    Q_ASSERT(ref == 0);
    AudioSource::close();
}

/*
    Cancel any incomplete asynchronous operations, and stop the
    sink before closing the QIODevice.
*/
void AudioSource::close()
{
    ComPtr<IAsyncInfo> asyncInfo;
    AsyncStatus status = AsyncStatus::Completed;
    if (synthOperation) {
        if (HRESULT hr = synthOperation.As(&asyncInfo); SUCCEEDED(hr)) {
            asyncInfo->get_Status(&status);
            if (status != AsyncStatus::Completed)
                asyncInfo->Cancel();
        }
    }
    if (readOperation) {
        if (HRESULT hr = readOperation.As(&asyncInfo); SUCCEEDED(hr)) {
            asyncInfo->get_Status(&status);
            if (status != AsyncStatus::Completed)
                asyncInfo->Cancel();
        }
    }
    QIODevice::close();
}

/*
    Returns the number of unread bytes in the COM buffer plus whatever the
    QIODevice base implementation reports.
*/
qint64 AudioSource::bytesAvailable() const
{
    const qint64 unreadInComBuffer = bytesInBuffer();
    const qint64 reportedByBase = QIODevice::bytesAvailable();
    return unreadInComBuffer + reportedByBase;
}

/*
    Copies up to maxlen bytes from the COM buffer into data and returns the
    number of bytes copied. Returns -1 if nothing is buffered and atEnd()
    reports true, and 0 if there is nothing to hand out right now.

    On the first read, a 44 byte RIFF header at the start of the buffer is
    skipped. While a pause is requested, the data is cut off inside the first
    sufficiently long run of silent samples; once paused, no data is returned
    so that the sink goes idle.

    If the read consumes everything that is left in the COM buffer, fetchMore
    is scheduled to start a new asynchronous read operation.
*/
qint64 AudioSource::readData(char *data, qint64 maxlen)
{
    // this may happen as per the documentation
    if (!maxlen)
        return 0;

    Q_ASSERT(bufferByteAccess);

    // not const: shrinks together with maxlen if the RIFF header is skipped
    qint64 available = bytesInBuffer();
    maxlen = qMin(available, maxlen);

    if (!maxlen && atEnd())
        return -1;

    byte *pbyte = nullptr;
    bufferByteAccess->Buffer(&pbyte);
    pbyte += m_bufferOffset;

    // Check and skip the RIFF header if present to prevent an audible
    // click at the start of playback.
    if (!m_riffHeaderChecked) {
        m_riffHeaderChecked = true;
        static const int WaveHeaderLength = 44;
        const char *descriptor = reinterpret_cast<const char*>(pbyte);
        if (maxlen >= WaveHeaderLength && !qstrncmp(descriptor, "RIFF", 4)) {
            pbyte += WaveHeaderLength;
            m_bufferOffset += WaveHeaderLength;
            maxlen -= WaveHeaderLength;
            // The header is consumed from the buffer as well. Without this
            // adjustment the "buffer emptied" check at the end compares
            // against a stale count, and a read that drains the first buffer
            // would never schedule fetchMore.
            available -= WaveHeaderLength;
        }
    }

    switch (m_pause) {
    case NoPause:
        break;
    case PauseRequested: {
        Q_ASSERT(audioFormat.sampleFormat() == QAudioFormat::Int16);
        // we are dealing with artificially created sound, so we don't have
        // to find a large enough window with overall low energy; we can just
        // look for a series (e.g. 1/20th of a second) of samples with value 0.
        const int silenceDuration = audioFormat.sampleRate() / 20;
        const short *sample = reinterpret_cast<const short*>(pbyte);
        const qsizetype sampleCount = maxlen / sizeof(short);
        qint64 silenceCount = 0;
        for (qint64 index = 0; index < sampleCount; ++index) {
            if (!sample[index]) {
                ++silenceCount;
            } else if (silenceCount > silenceDuration) {
                // long enough silence found, only provide the data until we are in the
                // silence. If the silence is at the beginning of our buffer, start from
                // there, otherwise play a bit of silence now.
                if (index != silenceCount)
                    silenceCount /= 2;

                maxlen = (index - silenceCount) * qint64(sizeof(short));
                // The next attempt to pull data will return nothing, and the audio sink
                // will move to idle state.
                m_pause = Paused;
                break; // leaves the sample loop
            } else {
                silenceCount = 0;
            }
        }
        break;
    }
    case Paused:
        // starve the sink so that it goes idle
        maxlen = 0;
        break;
    }

    if (!maxlen)
        return 0;

    memcpy(data, pbyte, maxlen);

    // We emptied the buffer, so schedule fetching more. The offset is left
    // alone in that case; it is reset when the read operation completes.
    if (available <= maxlen)
        QTimer::singleShot(0, this, &AudioSource::fetchMore);
    else
        m_bufferOffset += maxlen;

    return maxlen;
}

bool AudioSource::atEnd() const
{
    // not done as long as QIODevice's buffer is not empty
    if (!QIODevice::atEnd() && QIODevice::bytesAvailable())
        return false;

    // If we get here, bytesAvailable() has returned 0, so our buffers are
    // exhaused. Try to see if we are waiting for readOperation to finish.
    AsyncStatus status = AsyncStatus::Completed;
    if (readOperation) {
        ComPtr<IAsyncInfo> asyncInfo;
        if (HRESULT hr = readOperation.As(&asyncInfo); SUCCEEDED(hr))
            asyncInfo->get_Status(&status);
    }
    if (status == AsyncStatus::Started)
        return false;

    // ... or if there is more in the stream
    UINT64 ioPos = 0;
    UINT64 ioSize = 0;
    if (randomAccessStream) {
        randomAccessStream->get_Size(&ioSize);
        randomAccessStream->get_Position(&ioPos);
    }
    return ioPos >= ioSize;
}

/*
    Hands out this object as IUnknown, StreamReadyHandler or BytesReadyHandler
    and calls AddRef for the caller. Returns E_POINTER if ppvInterface is
    null, and E_NOINTERFACE (with *ppvInterface set to nullptr) for any other
    interface id.
*/
HRESULT AudioSource::QueryInterface(REFIID riid, VOID **ppvInterface)
{
    if (!ppvInterface)
        return E_POINTER;

    void *requested = nullptr;
    if (riid == __uuidof(IUnknown)) {
        // IUnknown is reached through the StreamReadyHandler base
        StreamReadyHandler *streamHandler = this;
        requested = static_cast<IUnknown*>(streamHandler);
    } else if (riid == __uuidof(StreamReadyHandler)) {
        requested = static_cast<StreamReadyHandler *>(this);
    } else if (riid == __uuidof(BytesReadyHandler)) {
        requested = static_cast<BytesReadyHandler *>(this);
    }

    *ppvInterface = requested;
    if (!requested)
        return E_NOINTERFACE;

    AddRef();
    return S_OK;
}

/*
    Completion handler for synthesising the stream.

    Is called as soon as synthesized pcm data is available, at which point we can
    open the QIODevice, fetch data, and start the audio sink.

    If the operation ended with an error, the error string is set and
    errorInStream is emitted. Any status other than Completed releases the
    operation and returns E_FAIL. Otherwise the stream interfaces and the COM
    read buffer are set up, the boundaries are populated, the device is opened,
    streamReady is emitted and the first read is started. Returns S_OK on
    success, or the failing HRESULT of the setup step that went wrong.
*/
HRESULT AudioSource::Invoke(IAsyncOperation<SpeechSynthesisStream*> *operation, AsyncStatus status)
{
    Q_ASSERT(operation == synthOperation.Get());
    ComPtr<IAsyncInfo> asyncInfo;
    synthOperation.As(&asyncInfo);

    if (status == AsyncStatus::Error) {
        QString errorString;
        if (asyncInfo) {
            // initialized, so that a failing get_ErrorCode does not leave us
            // with an indeterminate value
            HRESULT errorCode = E_FAIL;
            asyncInfo->get_ErrorCode(&errorCode);
            if (errorCode == 0x80131537) // Windows gives us only an Unknown error
                errorString = QStringLiteral("Error when synthesizing: Input format error");
            else
                errorString = QSystemError(errorCode, QSystemError::NativeError).toString();
        } else {
            errorString = QStringLiteral("Error when synthesizing: no information available");
        }
        setErrorString(errorString);
        emit errorInStream();
    }
    if (status != AsyncStatus::Completed) {
        if (asyncInfo)
            asyncInfo->Close();
        synthOperation.Reset();

        return E_FAIL;
    }

    ComPtr<ISpeechSynthesisStream> speechStream;
    HRESULT hr = operation->GetResults(&speechStream);
    RETURN_HR_IF_FAILED("Could not access stream.");

    hr = speechStream.As(&inputStream);
    RETURN_HR_IF_FAILED("Could not cast to inputStream.");
    // optional; atEnd() copes with a missing random access stream
    inputStream.As(&randomAccessStream);

    ComPtr<IBufferFactory> bufferFactory;
    hr = RoGetActivationFactory(HString::MakeReference(RuntimeClass_Windows_Storage_Streams_Buffer).Get(),
                                IID_PPV_ARGS(&bufferFactory));
    RETURN_HR_IF_FAILED("Could not create buffer factory.");
    // use the same buffer size as default read chunk size
    hr = bufferFactory->Create(16384, m_buffer.GetAddressOf());
    // m_buffer is dereferenced right below, so a failure must not go unnoticed
    RETURN_HR_IF_FAILED("Could not create buffer.");

    hr = m_buffer->QueryInterface(IID_PPV_ARGS(&bufferByteAccess));
    RETURN_HR_IF_FAILED("Could not access buffer.");

    populateBoundaries();

    // release our reference to the speech stream operation
    if (asyncInfo)
        asyncInfo->Close();
    synthOperation.Reset();

    // we are buffered, but we don't want QIODevice to buffer as well
    open(QIODevice::ReadOnly|QIODevice::Unbuffered);
    emit streamReady(audioFormat);
    fetchMore();
    return S_OK;
}

/*
    Collects the word and sentence boundaries of the synthesized speech.

    Walks the timed metadata tracks of the input stream, picks the tracks
    named "SpeechWord" and "SpeechSentence", and appends one Boundary per
    speech cue (type, text, start and end position in the input, start time
    in microseconds) to boundaries. The list is sorted at the end using
    Boundary's ordering. Does nothing if the stream provides no metadata.
*/
void AudioSource::populateBoundaries()
{
    ComPtr<ITimedMetadataTrackProvider> metaData;
    if (!SUCCEEDED(inputStream.As(&metaData)))
        return;

    ComPtr<IVectorView<TimedMetadataTrack*>> metaDataTracks;
    if (!SUCCEEDED(metaData->get_TimedMetadataTracks(&metaDataTracks)) || !metaDataTracks)
        return;
    quint32 trackCount = 0;
    metaDataTracks->get_Size(&trackCount);

    for (quint32 i = 0; i < trackCount; ++i) {
        ComPtr<ITimedMetadataTrack> metaDataTrack;
        HRESULT hr = metaDataTracks->GetAt(i, &metaDataTrack);
        Q_ASSERT_SUCCEEDED(hr);

        // The track id tells us which kind of boundary the cues describe
        const auto boundaryType = [&metaDataTrack]{
            ComPtr<IMediaTrack> mediaTrack;
            HRESULT hr = metaDataTrack.As(&mediaTrack);
            Q_ASSERT_SUCCEEDED(hr);

            if (HString hstr; SUCCEEDED(mediaTrack->get_Id(hstr.GetAddressOf()))) {
                const QString trackName = QString::fromWCharArray(hstr.GetRawBuffer(0));
                if (trackName == QStringLiteral("SpeechWord"))
                    return Boundary::Word;
                if (trackName == QStringLiteral("SpeechSentence"))
                    return Boundary::Sentence;
            }
            return Boundary::Unknown;
        }();
        if (boundaryType == Boundary::Unknown)
            continue;

        ComPtr<IVectorView<IMediaCue*>> cues;
        if (!SUCCEEDED(metaDataTrack->get_Cues(&cues)) || !cues)
            continue;

        quint32 cueCount = 0;
        cues->get_Size(&cueCount);
        boundaries.reserve(boundaries.size() + cueCount);

        for (quint32 j = 0; j < cueCount; ++j) {
            ComPtr<IMediaCue> cue;
            hr = cues->GetAt(j, &cue);
            Q_ASSERT_SUCCEEDED(hr);

            ComPtr<ISpeechCue> speechCue;
            if (!SUCCEEDED(cue.As(&speechCue))) {
                qWarning("Invalid cue");
                break;
            }

            QString text;
            if (HString hstr; SUCCEEDED(speechCue->get_Text(hstr.GetAddressOf())))
                text = QString::fromWCharArray(hstr.GetRawBuffer(0));

            // The positions are returned as reference objects that we own and
            // that may be null; ComPtr releases them, and -1 is kept if there
            // is no value.
            int startIndex = -1;
            if (ComPtr<IReference<int>> refInt;
                SUCCEEDED(speechCue->get_StartPositionInInput(&refInt)) && refInt) {
                refInt->get_Value(&startIndex);
            }
            int endIndex = -1;
            if (ComPtr<IReference<int>> refInt;
                SUCCEEDED(speechCue->get_EndPositionInInput(&refInt)) && refInt) {
                refInt->get_Value(&endIndex);
            }

            // A time period expressed in 100-nanosecond units. the Duration property is always 0 for speech.
            TimeSpan startTime = {};
            cue->get_StartTime(&startTime);
            const qint64 usec = startTime.Duration / 10; // QAudioSink APIs operate on microseconds
            boundaries.append(Boundary{boundaryType, text, startIndex, endIndex, usec});
        }
    }
    std::sort(boundaries.begin(), boundaries.end());
}

/*
    Completion handler for reading from the stream.

    Takes over the buffer returned by the read operation, resets the read
    offset, releases the operation, and emits readyRead so that the sink pulls
    more data (unless a pause is active or no data arrived).

    Returns E_FAIL if the operation did not complete, the failing HRESULT if
    the result buffer cannot be accessed, and S_OK otherwise.
*/
HRESULT AudioSource::Invoke(IAsyncOperationWithProgress<IBuffer*, unsigned int> *read,
                            AsyncStatus status)
{
    if (status != AsyncStatus::Completed)
        return E_FAIL;

    // there should never be multiple read operations
    Q_ASSERT(readOperation.Get() == read);

    HRESULT hr = read->GetResults(&m_buffer);
    RETURN_HR_IF_FAILED("Could not access buffer.");
    m_bufferOffset = 0;

    // The operation has delivered its result; release it so that fetchMore
    // can start the next one.
    ComPtr<IAsyncInfo> asyncInfo;
    if (SUCCEEDED(readOperation.As(&asyncInfo)))
        asyncInfo->Close();
    readOperation.Reset();

    // The operation may hand back a different buffer than the one passed to
    // ReadAsync, so the byte access interface has to follow m_buffer. The
    // member is only replaced once the query has succeeded.
    if (m_buffer) {
        decltype(bufferByteAccess) byteAccess;
        hr = m_buffer->QueryInterface(IID_PPV_ARGS(&byteAccess));
        RETURN_HR_IF_FAILED("Could not access buffer.");
        bufferByteAccess = byteAccess;
    }

    // inform the sink that more data has arrived
    if (m_pause == NoPause && bytesInBuffer())
        emit readyRead();

    return S_OK;
}

/*
    Returns the number of bytes in the COM buffer that have not been read
    yet, i.e. the buffer length minus the current read offset. Returns 0 if
    there is no buffer, if the length cannot be queried, or if the offset is
    beyond the reported length.
*/
qint64 AudioSource::bytesInBuffer() const
{
    if (!m_buffer)
        return 0;

    UINT32 bytes = 0;
    if (!SUCCEEDED(m_buffer->get_Length(&bytes)))
        return 0;

    // Calculate in qint64 and clamp, so that callers never see a negative or
    // wrapped-around count.
    const qint64 remaining = qint64(bytes) - qint64(m_bufferOffset);
    return qMax(qint64(0), remaining);
}

/*
    Starts an asynchronous read operation. There can only be one such
    operation pending at any given time, so fetchMore must only be called
    if the buffer provided by a previous operation is exhausted.

    Returns true if the read operation was started and this object was
    registered as its completion handler; returns false if an operation is
    already pending or if starting the read failed.
*/
bool AudioSource::fetchMore()
{
    Q_ASSERT(m_buffer);

    if (readOperation) {
        qWarning () << "Fetching more while a read operation is already pending";
        return false;
    }

    // read as much as the buffer can hold
    UINT32 capacity = 0;
    if (!SUCCEEDED(m_buffer->get_Capacity(&capacity)))
        return false;

    InputStreamOptions streamOptions = {};
    HRESULT hr = inputStream->ReadAsync(m_buffer.Get(), capacity, streamOptions, readOperation.GetAddressOf());
    if (!SUCCEEDED(hr)) {
        readOperation.Reset();
        return false;
    }

    // Hold our own reference while registering: the completion handler
    // resets readOperation, and it may run before put_Completed returns.
    const auto pendingRead = readOperation;
    hr = pendingRead->put_Completed(this);
    if (!SUCCEEDED(hr)) {
        // No completion will be delivered for this operation; drop it so
        // that it does not block all further fetches.
        readOperation.Reset();
        return false;
    }
    return true;
}

QT_END_NAMESPACE