// Source path: root/src/multimedia/video/qvideoframeconversionhelper_sse2.cpp
// Blob: b7049d806986b23db916b089283f6ec2ec3eb2c9
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include "qvideoframeconversionhelper_p.h"

#ifdef QT_COMPILER_SUPPORTS_SSE2

QT_BEGIN_NAMESPACE

namespace  {

template<int a, int r, int b, int g>
void convert_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output)
{
    FETCH_INFO_PACKED(frame)
    MERGE_LOOPS(width, height, stride, 4)
    quint32 *argb = reinterpret_cast<quint32*>(output);

    const __m128i zero = _mm_setzero_si128();
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
    const uchar shuffle = _MM_SHUFFLE(a, r, b, g);
#else
    const uchar shuffle = _MM_SHUFFLE(3-a, 3-r, 3-b, 3-g);
#endif

    using Pixel = const ArgbPixel<a, r, g, b>;

    for (int y = 0; y < height; ++y) {
        auto *pixel = reinterpret_cast<const Pixel *>(src);

        int x = 0;
        QT_MEDIA_ALIGN(16, argb, x, width) {
            *argb = pixel->convert();
            ++pixel;
            ++argb;
        }

        for (; x < width - 3; x += 4) {
            __m128i pixelData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pixel));
            pixel += 4;
            __m128i lowPixels = _mm_unpacklo_epi8(pixelData, zero);
            __m128i highPixels = _mm_unpackhi_epi8(pixelData, zero);
            lowPixels = _mm_shufflelo_epi16(_mm_shufflehi_epi16(lowPixels, shuffle), shuffle);
            highPixels = _mm_shufflelo_epi16(_mm_shufflehi_epi16(highPixels, shuffle), shuffle);
            pixelData = _mm_packus_epi16(lowPixels, highPixels);
            _mm_store_si128(reinterpret_cast<__m128i*>(argb), pixelData);
            argb += 4;
        }

        // leftovers
        for (; x < width; ++x) {
            *argb = pixel->convert();
            ++pixel;
            ++argb;
        }

        src += stride;
    }
}

}

// SSE2 entry point for ARGB8888 input: forwards to convert_to_ARGB32_sse2
// with the channel positions 0, 1, 2, 3.
void QT_FASTCALL qt_convert_ARGB8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output)
{
    // Position of each channel within a source pixel, following the
    // A-R-G-B order spelled out by the format name.
    constexpr int alphaAt = 0;
    constexpr int redAt = 1;
    constexpr int greenAt = 2;
    constexpr int blueAt = 3;

    convert_to_ARGB32_sse2<alphaAt, redAt, greenAt, blueAt>(frame, output);
}

// SSE2 entry point for ABGR8888 input: forwards to convert_to_ARGB32_sse2
// with the channel positions 0, 3, 2, 1.
void QT_FASTCALL qt_convert_ABGR8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output)
{
    // Position of each channel within a source pixel, following the
    // A-B-G-R order spelled out by the format name.
    constexpr int alphaAt = 0;
    constexpr int blueAt = 1;
    constexpr int greenAt = 2;
    constexpr int redAt = 3;

    convert_to_ARGB32_sse2<alphaAt, redAt, greenAt, blueAt>(frame, output);
}

// SSE2 entry point for RGBA8888 input: forwards to convert_to_ARGB32_sse2
// with the channel positions 3, 0, 1, 2.
void QT_FASTCALL qt_convert_RGBA8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output)
{
    // Position of each channel within a source pixel, following the
    // R-G-B-A order spelled out by the format name.
    constexpr int redAt = 0;
    constexpr int greenAt = 1;
    constexpr int blueAt = 2;
    constexpr int alphaAt = 3;

    convert_to_ARGB32_sse2<alphaAt, redAt, greenAt, blueAt>(frame, output);
}

// SSE2 entry point for BGRA8888 input: forwards to convert_to_ARGB32_sse2
// with the channel positions 3, 2, 1, 0.
void QT_FASTCALL qt_convert_BGRA8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output)
{
    // Position of each channel within a source pixel, following the
    // B-G-R-A order spelled out by the format name.
    constexpr int blueAt = 0;
    constexpr int greenAt = 1;
    constexpr int redAt = 2;
    constexpr int alphaAt = 3;

    convert_to_ARGB32_sse2<alphaAt, redAt, greenAt, blueAt>(frame, output);
}

// Copies size 32-bit values from src to dst, OR-ing mask into every value.
//
// dst:  destination buffer with room for size values; it is written with
//       aligned 16-byte stores once a scalar head loop has brought it to a
//       16-byte boundary.
// src:  source buffer holding size values; it is read with unaligned loads.
// size: number of 32-bit values to process; any value, including 0, is valid.
// mask: bits that are set in every value written to dst.
void QT_FASTCALL qt_copy_pixels_with_mask_sse2(uint32_t *dst, const uint32_t *src, size_t size, uint32_t mask)
{
    const auto mask128 = _mm_set_epi32(mask, mask, mask, mask);

    constexpr size_t valuesPerVector = 4;                   // 4 x 32 bit = 128 bit
    constexpr size_t valuesPerBatch = 4 * valuesPerVector;  // unrolled four times

    size_t x = 0;

    // Scalar head: advance until dst is 16-byte aligned (or the input ends).
    QT_MEDIA_ALIGN(16, dst, x, size)
        *(dst++) = *(src++) | mask;

    // x and size are unsigned, so the bounds below are written as
    // "x + N <= size". The equivalent-looking "x < size - (N - 1)" wraps
    // around to a huge value whenever size < N - 1, which would let the
    // vector loops run past the end of both buffers for short inputs.

    // Unrolled body: four vectors per iteration. All loads are issued before
    // the stores.
    for (; x + valuesPerBatch <= size; x += valuesPerBatch) {
        const auto *in = reinterpret_cast<const __m128i *>(src);
        auto *out = reinterpret_cast<__m128i *>(dst);

        const auto srcData0 = _mm_loadu_si128(in);
        const auto srcData1 = _mm_loadu_si128(in + 1);
        const auto srcData2 = _mm_loadu_si128(in + 2);
        const auto srcData3 = _mm_loadu_si128(in + 3);

        _mm_store_si128(out, _mm_or_si128(srcData0, mask128));
        _mm_store_si128(out + 1, _mm_or_si128(srcData1, mask128));
        _mm_store_si128(out + 2, _mm_or_si128(srcData2, mask128));
        _mm_store_si128(out + 3, _mm_or_si128(srcData3, mask128));

        src += valuesPerBatch;
        dst += valuesPerBatch;
    }

    // Remaining whole vectors, one at a time.
    for (; x + valuesPerVector <= size; x += valuesPerVector) {
        const auto srcData = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));

        _mm_store_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(srcData, mask128));

        src += valuesPerVector;
        dst += valuesPerVector;
    }

    // Scalar tail: fewer than four values are left.
    for (; x < size; ++x)
        *(dst++) = *(src++) | mask;
}

QT_END_NAMESPACE

#endif