author    | Jana Grill <janagrill@google.com>       | 2021-02-17 12:35:20 +0000
committer | Michael Brüning <michael.bruning@qt.io> | 2021-03-29 09:43:07 +0000
commit    | ed936c8da203893cb0cf518d4e112021675adb93
tree      | a8c80e50ecec120c8bd3acc420ac44e3482a105b
parent    | 212a1e9728ba63249bd1ee878a00f06914bc28b4
[Backport] Security bug 1062941
Manual backport (library update) of the patch originally reviewed on
https://chromium-review.googlesource.com/c/chromium/src/+/2692542:
Roll src/third_party/libyuv/ 6866adbec..1d3f901aa (17 commits)
https://chromium.googlesource.com/libyuv/libyuv.git/+log/6866adbec5af..1d3f901aa016
$ git log 6866adbec..1d3f901aa --date=short --no-merges --format='%ad %ae %s'
2020-12-25 fbarchard Scale bug fix with msan when scaling up in height and down in width with box filter.
2020-12-22 fbarchard Test Box filter scale plane with 1 dimension growing and the other reducing
2020-12-03 eshr NV12 Copy, include scale_uv.h
2020-11-18 thakis Stop setting mac_xcode_version in DEPS
2020-11-06 libyuv-ci-autoroll-builder Roll chromium_revision 5aaa70b53c..64c8c30faa (822628:824854)
2020-11-03 fbarchard Scale by even factor low level row function
2020-10-30 libyuv-ci-autoroll-builder Roll chromium_revision df9aecfc0b..5aaa70b53c (820568:822628)
2020-10-28 fbarchard PlaneScale, UVScale and ARGBScale test 3x and 4x down sample.
2020-10-27 fbarchard MJPGToNV12 added and build files sorted
2020-10-24 libyuv-ci-autoroll-builder Roll chromium_revision e812106b13..df9aecfc0b (817907:820568)
2020-10-16 libyuv-ci-autoroll-builder Roll chromium_revision 4892423355..e812106b13 (815587:817907)
2020-10-13 fbarchard UVScale down use AVX2 and Neon for aarch32
2020-10-13 fbarchard UVScale down by 4 use SSSE3/NEON
2020-10-12 fbarchard 2x down sample for UV planes ported to SSSE3 / NEON
2020-10-09 libyuv-ci-autoroll-builder Roll chromium_revision ccec2ad009..4892423355 (811963:815587)
2020-10-02 fbarchard I420ToARGB prototype added to convert_from.h
2020-10-01 fbarchard scale neon adjust PRFM instruction to co-issue with math
Created with:
roll-dep src/third_party/libyuv
(cherry picked from commit 1a60856f34aa15def686168c3b392dc37a120c51)
Bug: chromium:1158178, chromium:1062941, libyuv:875, b/176195584
Change-Id: Iecf360198a90acabcbd71e57791634f5e3e861c3
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Eugene Zemtsov <eugene@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#839493}
Commit-Queue: Jana Grill <janagrill@chromium.org>
Reviewed-by: Victor-Gabriel Savu <vsavu@google.com>
Cr-Commit-Position: refs/branch-heads/4240@{#1545}
Cr-Branched-From: f297677702651916bbf65e59c0d4bbd4ce57d1ee-refs/heads/master@{#800218}
Reviewed-by: Jüri Valdmann <juri.valdmann@qt.io>
77 files changed, 19870 insertions, 13219 deletions
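[Editor's note] The roll itself is the one-line pin bump in the chromium/DEPS hunk below; roll-dep (a depot_tools utility) automates that rewrite and generates the commit message above. As a rough, illustrative sketch only -- not the actual depot_tools implementation -- of what such a pin bump amounts to, assuming the two-line "'<path>': ... + '@' + '<sha>'," entry layout used in this DEPS file:

    # Illustrative sketch of a DEPS pin bump (hypothetical helper, not roll-dep itself).
    import re

    def bump_pin(deps_path, dep, new_rev):
        with open(deps_path) as f:
            lines = f.readlines()
        for i, line in enumerate(lines):
            if "'%s':" % dep in line:
                # The pinned revision is the 40-hex sha on the following line.
                lines[i + 1] = re.sub(r"[0-9a-f]{40}", new_rev, lines[i + 1], count=1)
                break
        else:
            raise ValueError('%s is not pinned in %s' % (dep, deps_path))
        with open(deps_path, 'w') as f:
            f.writelines(lines)

    bump_pin('DEPS', 'src/third_party/libyuv',
             '1d3f901aa016d42b5bc0148be2ef6c0fd56f3b81')

The real tool also resolves the target revision from the upstream remote and emits the "Roll ... (N commits)" log seen above; the sketch covers only the textual edit.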
diff --git a/chromium/DEPS b/chromium/DEPS
index 0fa8d0b841a..7a447ea68d4 100644
--- a/chromium/DEPS
+++ b/chromium/DEPS
@@ -1161,7 +1161,7 @@ deps = {
     Var('chromium_git') + '/webm/libwebm.git' + '@' + '51ca718c3adf0ddedacd7df25fe45f67dc5a9ce1',

   'src/third_party/libyuv':
-    Var('chromium_git') + '/libyuv/libyuv.git' + '@' + '6afd9becdf58822b1da6770598d8597c583ccfad', # from r1714
+    Var('chromium_git') + '/libyuv/libyuv.git' + '@' + '1d3f901aa016d42b5bc0148be2ef6c0fd56f3b81', # from r1768

   'src/third_party/lighttpd': {
     'url': Var('chromium_git') + '/chromium/deps/lighttpd.git' + '@' + Var('lighttpd_revision'),
diff --git a/chromium/third_party/libyuv/Android.bp b/chromium/third_party/libyuv/Android.bp
index ea76f9435be..d0b23432628 100644
--- a/chromium/third_party/libyuv/Android.bp
+++ b/chromium/third_party/libyuv/Android.bp
@@ -9,17 +9,20 @@ cc_library {
         "source/compare.cc",
         "source/compare_common.cc",
         "source/compare_gcc.cc",
-        "source/compare_neon.cc",
-        "source/compare_neon64.cc",
         "source/compare_mmi.cc",
         "source/compare_msa.cc",
+        "source/compare_neon.cc",
+        "source/compare_neon64.cc",
         "source/convert.cc",
         "source/convert_argb.cc",
         "source/convert_from.cc",
         "source/convert_from_argb.cc",
+        "source/convert_jpeg.cc",
         "source/convert_to_argb.cc",
         "source/convert_to_i420.cc",
         "source/cpu_id.cc",
+        "source/mjpeg_decoder.cc",
+        "source/mjpeg_validate.cc",
         "source/planar_functions.cc",
         "source/rotate.cc",
         "source/rotate_any.cc",
@@ -46,10 +49,8 @@ cc_library {
         "source/scale_msa.cc",
         "source/scale_neon.cc",
         "source/scale_neon64.cc",
+        "source/scale_uv.cc",
         "source/video_common.cc",
-        "source/convert_jpeg.cc",
-        "source/mjpeg_decoder.cc",
-        "source/mjpeg_validate.cc",
     ],

     cflags: [
@@ -79,7 +80,6 @@ cc_test {
     shared_libs: ["libjpeg"],
     cflags: ["-Wall", "-Werror"],
     srcs: [
-        "unit_test/unit_test.cc",
         "unit_test/basictypes_test.cc",
         "unit_test/color_test.cc",
         "unit_test/compare_test.cc",
@@ -92,6 +92,8 @@ cc_test {
         "unit_test/rotate_test.cc",
         "unit_test/scale_argb_test.cc",
         "unit_test/scale_test.cc",
+        "unit_test/scale_uv_test.cc",
+        "unit_test/unit_test.cc",
         "unit_test/video_common_test.cc",
     ],
 }
@@ -106,6 +108,15 @@ cc_test {
 }

 cc_test {
+    name: "i444tonv12_eg",
+    gtest: false,
+    srcs: [
+        "util/i444tonv12_eg.cc",
+    ],
+    static_libs: ["libyuv"],
+}
+
+cc_test {
     name: "cpuid",
     gtest: false,
     srcs: [
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
index 0975d64a386..2ceb49281be 100644
--- a/chromium/third_party/libyuv/Android.mk
+++ b/chromium/third_party/libyuv/Android.mk
@@ -13,6 +13,7 @@ LOCAL_SRC_FILES := \
     source/compare_msa.cc \
     source/compare_neon.cc \
     source/compare_neon64.cc \
+    source/compare_win.cc \
     source/convert.cc \
     source/convert_argb.cc \
     source/convert_from.cc \
@@ -30,6 +31,7 @@ LOCAL_SRC_FILES := \
     source/rotate_msa.cc \
     source/rotate_neon.cc \
     source/rotate_neon64.cc \
+    source/rotate_win.cc \
     source/row_any.cc \
     source/row_common.cc \
     source/row_gcc.cc \
@@ -37,6 +39,7 @@ LOCAL_SRC_FILES := \
     source/row_msa.cc \
     source/row_neon.cc \
     source/row_neon64.cc \
+    source/row_win.cc \
     source/scale.cc \
     source/scale_any.cc \
     source/scale_argb.cc \
@@ -46,6 +49,8 @@ LOCAL_SRC_FILES := \
     source/scale_msa.cc \
     source/scale_neon.cc \
     source/scale_neon64.cc \
+    source/scale_uv.cc \
+    source/scale_win.cc \
     source/video_common.cc

 common_CFLAGS := -Wall -fexceptions
@@ -85,7 +90,6 @@ LOCAL_MODULE_TAGS := tests
 LOCAL_CPP_EXTENSION := .cc
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
 LOCAL_SRC_FILES := \
-    unit_test/unit_test.cc \
     unit_test/basictypes_test.cc \
     unit_test/color_test.cc \
     unit_test/compare_test.cc \
@@ -98,6 +102,8 @@ LOCAL_SRC_FILES := \
     unit_test/rotate_test.cc \
     unit_test/scale_argb_test.cc \
     unit_test/scale_test.cc \
+    unit_test/scale_uv_test.cc \
+    unit_test/unit_test.cc \
     unit_test/video_common_test.cc

 LOCAL_MODULE := libyuv_unittest
diff --git a/chromium/third_party/libyuv/BUILD.gn b/chromium/third_party/libyuv/BUILD.gn
index 1e982916c4f..d733e71de8d 100644
--- a/chromium/third_party/libyuv/BUILD.gn
+++ b/chromium/third_party/libyuv/BUILD.gn
@@ -6,7 +6,6 @@
 # in the file PATENTS. All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.

-import("//build/config/features.gni")
 import("//testing/test.gni")
 import("libyuv.gni")

@@ -33,13 +32,12 @@ config("libyuv_config") {
 # This target is built when no specific target is specified on the command line.
 group("default") {
   testonly = true
-  deps = [
-    ":libyuv",
-  ]
+  deps = [ ":libyuv" ]
   if (libyuv_include_tests) {
     deps += [
       ":compare",
       ":cpuid",
+      ":i444tonv12_eg",
       ":libyuv_unittest",
       ":psnr",
       ":yuvconvert",
@@ -51,15 +49,11 @@ group("libyuv") {
   all_dependent_configs = [ ":libyuv_config" ]
   deps = []

-  if (is_win && target_cpu == "x64" && !use_qt) {
+  if (is_win && target_cpu == "x64") {
     # Compile with clang in order to get inline assembly
-    public_deps = [
-      ":libyuv_internal(//build/toolchain/win:win_clang_x64)",
-    ]
+    public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ]
   } else {
-    public_deps = [
-      ":libyuv_internal",
-    ]
+    public_deps = [ ":libyuv_internal" ]
   }

   if (libyuv_use_neon) {
@@ -74,7 +68,7 @@ group("libyuv") {
     deps += [ ":libyuv_mmi" ]
   }

-  if (!is_ios) {
+  if (!is_ios && !libyuv_disable_jpeg) {
     # Make sure that clients of libyuv link with libjpeg. This can't go in
     # libyuv_internal because in Windows x64 builds that will generate a clang
     # build of libjpeg, and we don't want two copies.
@@ -104,6 +98,7 @@ static_library("libyuv_internal") {
     "include/libyuv/scale.h",
     "include/libyuv/scale_argb.h",
     "include/libyuv/scale_row.h",
+    "include/libyuv/scale_uv.h",
     "include/libyuv/version.h",
     "include/libyuv/video_common.h",

@@ -138,6 +133,7 @@ static_library("libyuv_internal") {
     "source/scale_argb.cc",
     "source/scale_common.cc",
     "source/scale_gcc.cc",
+    "source/scale_uv.cc",
     "source/scale_win.cc",
     "source/video_common.cc",
   ]
@@ -151,7 +147,7 @@ static_library("libyuv_internal") {
     configs += [ "//build/config/gcc:symbol_visibility_default" ]
   }

-  if (!is_ios) {
+  if (!is_ios && !libyuv_disable_jpeg) {
     defines += [ "HAVE_JPEG" ]

     # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
@@ -177,6 +173,9 @@ static_library("libyuv_internal") {
       "-ffp-contract=fast",  # Enable fma vectorization for NEON.
     ]
   }
+  if (!libyuv_use_mmi) {
+    defines += [ "LIBYUV_DISABLE_MMI" ]
+  }
 }

 if (libyuv_use_neon) {
@@ -193,9 +192,7 @@ if (libyuv_use_neon) {
       "source/scale_neon64.cc",
     ]

-    deps = [
-      ":libyuv_internal",
-    ]
+    deps = [ ":libyuv_internal" ]

     public_configs = [ ":libyuv_config" ]

@@ -226,9 +223,7 @@ if (libyuv_use_msa) {
       "source/scale_msa.cc",
     ]

-    deps = [
-      ":libyuv_internal",
-    ]
+    deps = [ ":libyuv_internal" ]

     public_configs = [ ":libyuv_config" ]
 }
@@ -244,9 +239,7 @@ if (libyuv_use_mmi) {
       "source/scale_mmi.cc",
     ]

-    deps = [
-      ":libyuv_internal",
-    ]
+    deps = [ ":libyuv_internal" ]

     public_configs = [ ":libyuv_config" ]
 }
@@ -276,8 +269,6 @@ if (libyuv_include_tests) {
     testonly = true

     sources = [
-      # sources
-      # headers
       "unit_test/basictypes_test.cc",
       "unit_test/color_test.cc",
       "unit_test/compare_test.cc",
@@ -290,6 +281,7 @@ if (libyuv_include_tests) {
       "unit_test/rotate_test.cc",
       "unit_test/scale_argb_test.cc",
       "unit_test/scale_test.cc",
+      "unit_test/scale_uv_test.cc",
       "unit_test/unit_test.cc",
       "unit_test/unit_test.h",
       "unit_test/video_common_test.cc",
@@ -308,12 +300,10 @@ if (libyuv_include_tests) {
     configs += [ ":libyuv_unittest_warnings_config" ]

-    public_deps = [
-      "//testing/gtest",
-    ]
+    public_deps = [ "//testing/gtest" ]
     public_configs = [ ":libyuv_unittest_config" ]

-    if (is_linux) {
+    if (is_linux || is_chromeos) {
       cflags = [ "-fexceptions" ]
     }
     if (is_ios) {
@@ -350,10 +340,8 @@ if (libyuv_include_tests) {
       # sources
       "util/compare.cc",
     ]
-    deps = [
-      ":libyuv",
-    ]
-    if (is_linux) {
+    deps = [ ":libyuv" ]
+    if (is_linux || is_chromeos) {
       cflags = [ "-fexceptions" ]
     }
   }
@@ -363,10 +351,8 @@ if (libyuv_include_tests) {
       # sources
       "util/yuvconvert.cc",
     ]
-    deps = [
-      ":libyuv",
-    ]
-    if (is_linux) {
+    deps = [ ":libyuv" ]
+    if (is_linux || is_chromeos) {
       cflags = [ "-fexceptions" ]
     }
   }
@@ -378,22 +364,28 @@ if (libyuv_include_tests) {
       "util/psnr_main.cc",
       "util/ssim.cc",
     ]
-    deps = [
-      ":libyuv",
-    ]
+    deps = [ ":libyuv" ]
     if (!is_ios && !libyuv_disable_jpeg) {
       defines = [ "HAVE_JPEG" ]
     }
   }

-  executable("cpuid") {
+  executable("i444tonv12_eg") {
     sources = [
       # sources
-      "util/cpuid.c",
+      "util/i444tonv12_eg.cc",
     ]
     deps = [
       ":libyuv",
     ]
   }
+
+  executable("cpuid") {
+    sources = [
+      # sources
+      "util/cpuid.c",
+    ]
+    deps = [ ":libyuv" ]
+  }
 }
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index 628d1102210..de185434500 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -1,23 +1,24 @@
 vars = {
   'chromium_git': 'https://chromium.googlesource.com',
-  'chromium_revision': '30ccb0cb5047ead00aff3ec3dad9b66eb0b5ee42',
+  'chromium_revision': '64c8c30faaf969c15c028131dfcd0819208039c1',
+  'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94',
 }

 deps = {
   'src/build':
-    Var('chromium_git') + '/chromium/src/build' + '@' + '9112428bb9714439e07f0b66b75e0f06b83303fd',
+    Var('chromium_git') + '/chromium/src/build' + '@' + '2d2f9f2b85592bb9af5753ef300c055e6feb709f',
   'src/buildtools':
-    Var('chromium_git') + '/chromium/src/buildtools' + '@' + 'cd73d2159864031f52f660ae236bacc0fae108ec',
+    Var('chromium_git') + '/chromium/src/buildtools' + '@' + '6302c1175607a436e18947a5abe9df2209e845fc',
   'src/testing':
-    Var('chromium_git') + '/chromium/src/testing' + '@' + '3d7e946f5b926dee498ffdcb89733e860f59093c',
+    Var('chromium_git') + '/chromium/src/testing' + '@' + '40b44171056045ed1f85ca0b57485e46c03d7867',
   'src/third_party':
-    Var('chromium_git') + '/chromium/src/third_party' + '@' + '2ce10380bd45bea2f65337365eb47b97b84467e0',
+    Var('chromium_git') + '/chromium/src/third_party' + '@' + '24ccdf9b7553446791983bf357261c5e0a4314a0',

   'src/buildtools/linux64': {
     'packages': [
       {
         'package': 'gn/gn/linux-amd64',
-        'version': 'git_revision:972ed755f8e6d31cae9ba15fcd08136ae1a7886f',
+        'version': Var('gn_version'),
       }
     ],
     'dep_type': 'cipd',
@@ -27,7 +28,7 @@ deps = {
     'packages': [
       {
         'package': 'gn/gn/mac-amd64',
-        'version': 'git_revision:972ed755f8e6d31cae9ba15fcd08136ae1a7886f',
+        'version': Var('gn_version'),
       }
     ],
     'dep_type': 'cipd',
@@ -37,7 +38,7 @@ deps = {
     'packages': [
       {
         'package': 'gn/gn/windows-amd64',
-        'version': 'git_revision:972ed755f8e6d31cae9ba15fcd08136ae1a7886f',
+        'version': Var('gn_version'),
       }
     ],
     'dep_type': 'cipd',
@@ -47,30 +48,34 @@ deps = {
   'src/buildtools/clang_format/script':
     Var('chromium_git') + '/chromium/llvm-project/cfe/tools/clang-format.git' + '@' + '96636aa0e9f047f17447f2d45a094d0b59ed7917',
   'src/buildtools/third_party/libc++/trunk':
-    Var('chromium_git') + '/chromium/llvm-project/libcxx.git' + '@' + '5938e0582bac570a41edb3d6a2217c299adc1bc6',
+    Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + 'd9040c75cfea5928c804ab7c235fed06a63f743a',
   'src/buildtools/third_party/libc++abi/trunk':
-    Var('chromium_git') + '/chromium/llvm-project/libcxxabi.git' + '@' + '0d529660e32d77d9111912d73f2c74fc5fa2a858',
+    Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '196ba1aaa8ac285d94f4ea8d9836390a45360533',
   'src/buildtools/third_party/libunwind/trunk':
-    Var('chromium_git') + '/external/llvm.org/libunwind.git' + '@' + '69d9b84cca8354117b9fe9705a4430d789ee599b',
+    Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'd999d54f4bca789543a2eb6c995af2d9b5a1f3ed',
   'src/third_party/catapult':
-    Var('chromium_git') + '/catapult.git' + '@' + 'c5f9c068212598f72a2f07356be1d7452ce9861b',
+    Var('chromium_git') + '/catapult.git' + '@' + 'ccc9dd2835f5a7c5c82ae3c1a2fbc2fe2fd9dfd1',
   'src/third_party/colorama/src':
     Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
+  'src/third_party/depot_tools':
+    Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '91bb7506bd20ed22b8787e7a8b9975cc07e97175',
   'src/third_party/freetype/src':
-    Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '3de1b8d0b0983cf49a187a4227e7950395a3b08f',
+    Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '26e2a89598d69c7aba76c83f6a1fcf1db17574ab',
   'src/third_party/googletest/src':
-    Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '33a0d4f6d76a0ed6061e612848532cba82d42870',
+    Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '4fe018038f87675c083d0cfb6a6b57c274fb1753',
   'src/third_party/harfbuzz-ng/src':
-    Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '170b5dd856b1ba8f26e79863fe0c64a52eb68951',
+    Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'c39ab82c90479341dcf28eaa8174af6f08c0d7ae',
   'src/third_party/libjpeg_turbo':
-    Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'baa5dc24258bf9af873b9105e8988c558c425a17',
+    Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'd5148db386ceb4a608058320071cbed890bd6ad2',
+  'src/third_party/nasm':
+    Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '19f3fad68da99277b2882939d3b2fa4c4b8d51d9',
   'src/third_party/yasm/source/patched-yasm':
     Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
   'src/tools':
-    Var('chromium_git') + '/chromium/src/tools' + '@' + 'a9a3a3075b876faf176385672c9e08c96aa66847',
+    Var('chromium_git') + '/chromium/src/tools' + '@' + '1bb7c085e67a0fc8c63511af83299d1632f5a3f3',
   'src/tools/swarming_client':
-    Var('chromium_git') + '/infra/luci/client-py.git' + '@' + '96f125709acfd0b48fc1e5dae7d6ea42291726ac',
+    Var('chromium_git') + '/infra/luci/client-py.git' + '@' + 'd46ea7635f2911208268170512cb611412488fd8',

   # libyuv-only dependencies (not present in Chromium).
   'src/third_party/gflags':
@@ -81,7 +86,7 @@ deps = {
     Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',

   'src/third_party/lss': {
-    'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '8048ece6c16c91acfe0d36d1d3cc0890ab6e945c',
+    'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '29f7c7e018f4ce706a709f0b0afbf8bacf869480',
     'condition': 'checkout_android or checkout_linux',
   },

@@ -97,20 +102,20 @@ deps = {
     'dep_type': 'cipd',
   },
   'src/third_party/auto/src': {
-    'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82',
+    'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'f40317ae215863102cf87fe0679ad66f4b19454e',
     'condition': 'checkout_android',
   },
   'src/third_party/boringssl/src':
-    'https://boringssl.googlesource.com/boringssl.git' + '@' + '05cd93068b0a553afc48f69acbceae10c6a17593',
+    'https://boringssl.googlesource.com/boringssl.git' + '@' + '1607f54fed72c6589d560254626909a64124f091',
   'src/base': {
-    'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'a8b47403c0775fc4d6c3bb468ef9c5555d9cafda',
+    'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e096814b0448fba1095c6e7be7c7a0b5d7264251',
     'condition': 'checkout_android',
   },
   'src/third_party/bazel': {
     'packages': [
       {
         'package': 'chromium/third_party/bazel',
-        'version': '1794576f65a721eb0af320a0701e48d31f1b2415',
+        'version': 'VjMsf48QUWw8n7XtJP2AuSjIGmbQeYdWdwyxVvIRLmAC',
       },
     ],
     'condition': 'checkout_android',
@@ -127,7 +132,7 @@ deps = {
     'dep_type': 'cipd',
   },
   'src/third_party/android_ndk': {
-    'url': Var('chromium_git') + '/android_ndk.git' + '@' + '89e8db0cdf323af8bc24de875d7d2a43a66bf10e',
+    'url': Var('chromium_git') + '/android_ndk.git' + '@' + '27c0a8d090c666a50e40fceb4ee5b40b1a2d3f87',
     'condition': 'checkout_android',
   },
   'src/third_party/android_support_test_runner': {
@@ -143,12 +148,16 @@ deps = {
   'src/third_party/android_sdk/public': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_sdk/public/build-tools',
-        'version': '5DL7LQQjVMLClXLzLgmGysccPGsGcjJdvH9z5-uetiIC',
+        'package': 'chromium/third_party/android_sdk/public/build-tools/30.0.1',
+        'version': '8LZujEmLjSh0g3JciDA3cslSptxKs9HOa_iUPXkOeYQC',
+      },
+      {
+        'package': 'chromium/third_party/android_sdk/public/cmdline-tools',
+        'version': 'ijpIFSitwBfaEdO9VXBGPqDHUVzPimXy_whw3aHTN9oC',
       },
       {
         'package': 'chromium/third_party/android_sdk/public/emulator',
-        'version': 'xhyuoquVvBTcJelgRjMKZeoBVSQRjB7pLVJPt5C9saIC',
+        'version': 'A4EvXZUIuQho0QRDJopMUpgyp6NA3aiDQjGKPUKbowMC',
       },
       {
         'package': 'chromium/third_party/android_sdk/public/extras',
@@ -160,23 +169,15 @@ deps = {
       },
       {
         'package': 'chromium/third_party/android_sdk/public/platform-tools',
-        'version': 'MSnxgXN7IurL-MQs1RrTkSFSb8Xd1UtZjLArI8Ty1FgC',
-      },
-      {
-        'package': 'chromium/third_party/android_sdk/public/platforms',
-        'version': 'Kg2t9p0YnQk8bldUv4VA3o156uPXLUfIFAmVZ-Gm5ewC',
-      },
-      {
-        'package': 'chromium/third_party/android_sdk/public/sources',
-        'version': 'K9uEn3JvNELEVjjVK_GQD3ZQD3rqAnJSxCWxjmUmRkgC',
+        'version': '8tF0AOj7Dwlv4j7_nfkhxWB0jzrvWWYjEIpirt8FIWYC',
       },
       {
-        'package': 'chromium/third_party/android_sdk/public/tools',
-        'version': 'wYcRQC2WHsw2dKWs4EA7fw9Qsyzu1ds1_fRjKmGxe5QC',
+        'package': 'chromium/third_party/android_sdk/public/platforms/android-30',
+        'version': 'YMUu9EHNZ__2Xcxl-KsaSf-dI5TMt_P62IseUVsxktMC',
       },
       {
-        'package': 'chromium/third_party/android_sdk/public/tools-lint',
-        'version': '89hXqZYzCum3delB5RV7J_QyWkaRodqdtQS0s3LMh3wC',
+        'package': 'chromium/third_party/android_sdk/public/sources/android-29',
+        'version': '4gxhM8E62bvZpQs7Q3d0DinQaW0RLCIefhXrQBFkNy8C',
       },
     ],
     'condition': 'checkout_android',
@@ -256,7 +257,7 @@ deps = {
   },

   'src/third_party/icu': {
-    'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + '53f6b233a41ec982d8445996247093f7aaf41639',
+    'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c2a4cae149aae7fd30c4cbe3cf1b30df03b386f1',
   },
   'src/third_party/icu4j': {
     'packages': [
@@ -278,6 +279,20 @@ deps = {
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },
+  'src/third_party/jdk': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/jdk',
+        'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC',
+      },
+      {
+        'package': 'chromium/third_party/jdk/extras',
+        'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
   'src/third_party/jsr-305/src': {
     'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
     'condition': 'checkout_android',
@@ -286,6 +301,10 @@ deps = {
     'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
     'condition': 'checkout_android',
   },
+  'src/third_party/libunwindstack': {
+    'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '11659d420a71e7323b379ea8781f07c6f384bc7e',
+    'condition': 'checkout_android',
+  },
   'src/third_party/mockito/src': {
     'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac',
     'condition': 'checkout_android',
@@ -314,7 +333,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/r8',
-        'version': '6xVKWv-ssICwyU5FC1osaRpeZio2kM4Tko33I_SIK-EC',
+        'version': 'N9LppKV-9lFkp7JQtmcLHhm7xHqFv0SPa6aDPtgNCdwC',
       },
     ],
     'condition': 'checkout_android',
@@ -331,7 +350,7 @@ deps = {
     'dep_type': 'cipd',
   },
   'src/third_party/requests/src': {
-    'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
+    'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'refs/tags/v2.23.0',
     'condition': 'checkout_android',
   },
   'src/third_party/robolectric': {
@@ -345,7 +364,7 @@ deps = {
     'dep_type': 'cipd',
   },
   'src/third_party/robolectric/robolectric': {
-    'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '3d6dcabf5521e028c8efc2778ab6bd8c7b6d923c',
+    'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '2f3e0a3ac450a17dbf2e7d4eaab3a1f14dda50e6',
     'condition': 'checkout_android',
   },
   'src/third_party/sqlite4java': {
@@ -358,6 +377,20 @@ deps = {
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },
+  'src/third_party/turbine': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/turbine',
+        'version': 'O_jNDJ4VdwYKBSDbd2BJ3mknaTFoVkvE7Po8XIiKy8sC',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+  'src/third_party/turbine/src': {
+    'url': Var('chromium_git') + '/external/github.com/google/turbine.git' + '@' + '0f2a5024fe4a9bb745bcd5ac7c655cebe11649bc',
+    'condition': 'checkout_android',
+  },
   'src/third_party/ub-uiautomator/lib': {
     'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
     'condition': 'checkout_android',
@@ -375,7 +408,7 @@ deps = {

   # iOS deps:
   'src/ios': {
-    'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '9101b264f63f88c7dad85e3d19b9d1989c91dc35',
+    'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '60ef55beac67e3c0eda1c35ab7944c786b377313',
     'condition': 'checkout_ios'
   },

@@ -387,7 +420,7 @@ deps = {
   },

   # === ANDROID_DEPS Generated Code Start ===
-  # Generated by //tools/android/roll/android_deps/fetch_all.py
+  # Generated by //third_party/android_deps/fetch_all.py
   'src/third_party/android_deps/libs/android_arch_core_common': {
     'packages': [
       {
@@ -399,6 +432,39 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/android_arch_core_runtime': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime',
+        'version': 'version:1.1.1-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
+        'version': 'version:1.1.1-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8',
+        'version': 'version:1.1.1-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': {
     'packages': [
       {
@@ -421,10 +487,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/android_arch_core_runtime': {
+  'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime',
+        'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
         'version': 'version:1.1.1-cr0',
       },
     ],
@@ -443,65 +509,307 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': {
+  'src/third_party/android_deps/libs/androidx_activity_activity': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/androidx_activity_activity',
+        'version': 'version:1.1.0-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_print': {
+  'src/third_party/android_deps/libs/androidx_annotation_annotation': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_print',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation',
+        'version': 'version:1.1.0-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
+  'src/third_party/android_deps/libs/androidx_annotation_annotation_experimental': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
-        'version': 'version:1.1.1-cr0',
+        'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation_experimental',
+        'version': 'version:1.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': {
+  'src/third_party/android_deps/libs/androidx_appcompat_appcompat': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8',
-        'version': 'version:1.1.1-cr0',
+        'package': 'chromium/third_party/android_deps/libs/androidx_appcompat_appcompat',
+        'version': 'version:1.2.0-beta01-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
+  'src/third_party/android_deps/libs/androidx_appcompat_appcompat_resources': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
-        'version': 'version:1.1.1-cr0',
+        'package': 'chromium/third_party/android_deps/libs/androidx_appcompat_appcompat_resources',
+        'version': 'version:1.2.0-beta01-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/androidx_annotation_annotation': {
+  'src/third_party/android_deps/libs/androidx_arch_core_core_common': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation',
+        'package': 'chromium/third_party/android_deps/libs/androidx_arch_core_core_common',
+        'version': 'version:2.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_arch_core_core_runtime': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_arch_core_core_runtime',
+        'version': 'version:2.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_asynclayoutinflater_asynclayoutinflater': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_asynclayoutinflater_asynclayoutinflater',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_cardview_cardview': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_cardview_cardview',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_collection_collection': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_collection_collection',
+        'version': 'version:1.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_concurrent_concurrent_futures': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_concurrent_concurrent_futures',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_coordinatorlayout_coordinatorlayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_coordinatorlayout_coordinatorlayout',
+        'version': 'version:1.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_core_core': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_core_core',
+        'version': 'version:1.3.0-beta01-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_cursoradapter_cursoradapter': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_cursoradapter_cursoradapter',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_customview_customview': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_customview_customview',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_documentfile_documentfile': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_documentfile_documentfile',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_drawerlayout_drawerlayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_drawerlayout_drawerlayout',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_exifinterface_exifinterface': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_exifinterface_exifinterface',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_fragment_fragment': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_fragment_fragment',
+        'version': 'version:1.2.5-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_gridlayout_gridlayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_gridlayout_gridlayout',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_interpolator_interpolator': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_interpolator_interpolator',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_leanback_leanback': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_leanback_leanback',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_leanback_leanback_preference': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_leanback_leanback_preference',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_legacy_legacy_preference_v14': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_preference_v14',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_legacy_legacy_support_core_ui': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_core_ui',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_legacy_legacy_support_core_utils': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_core_utils',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_legacy_legacy_support_v13': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_v13',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_legacy_legacy_support_v4': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_legacy_legacy_support_v4',
         'version': 'version:1.0.0-cr0',
       },
     ],
@@ -513,6 +821,28 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common',
+        'version': 'version:2.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common_java8': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common_java8',
+        'version': 'version:2.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata',
         'version': 'version:2.0.0-cr0',
       },
     ],
@@ -520,11 +850,264 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata_core': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_livedata_core',
+        'version': 'version:2.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_runtime': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_runtime',
+        'version': 'version:2.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel',
+        'version': 'version:2.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel_savedstate': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_viewmodel_savedstate',
+        'version': 'version:2.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_loader_loader': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_loader_loader',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_localbroadcastmanager_localbroadcastmanager': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_localbroadcastmanager_localbroadcastmanager',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_media_media': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_media_media',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_mediarouter_mediarouter': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_mediarouter_mediarouter',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_multidex_multidex': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_multidex_multidex',
+        'version': 'version:2.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_palette_palette': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_palette_palette',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_preference_preference': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_preference_preference',
+        'version': 'version:1.1.1-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_print_print': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_print_print',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_recyclerview_recyclerview': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_recyclerview_recyclerview',
+        'version': 'version:1.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_savedstate_savedstate': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_savedstate_savedstate',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_slice_slice_builders': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_slice_slice_builders',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_slice_slice_core': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_slice_slice_core',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_slidingpanelayout_slidingpanelayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_slidingpanelayout_slidingpanelayout',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_swiperefreshlayout_swiperefreshlayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_swiperefreshlayout_swiperefreshlayout',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/androidx_test_core': {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/androidx_test_core',
-        'version': 'version:1.0.0-cr0',
+        'version': 'version:1.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_test_espresso_espresso_contrib': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_contrib',
+        'version': 'version:3.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_test_espresso_espresso_core': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_core',
+        'version': 'version:3.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_test_espresso_espresso_idling_resource': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_idling_resource',
+        'version': 'version:3.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_test_espresso_espresso_intents': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_intents',
+        'version': 'version:3.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_test_espresso_espresso_web': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_test_espresso_espresso_web',
+        'version': 'version:3.2.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -535,7 +1118,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/androidx_test_ext_junit',
-        'version': 'version:1.0.0-cr0',
+        'version': 'version:1.1.1-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -546,7 +1129,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/androidx_test_monitor',
-        'version': 'version:1.1.0-cr0',
+        'version': 'version:1.2.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -557,7 +1140,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/androidx_test_rules',
-        'version': 'version:1.1.0-cr0',
+        'version': 'version:1.2.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -568,6 +1151,72 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/androidx_test_runner',
+        'version': 'version:1.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_test_uiautomator_uiautomator': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_test_uiautomator_uiautomator',
+        'version': 'version:2.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_transition_transition': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_transition_transition',
+        'version': 'version:1.2.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_tvprovider_tvprovider': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_tvprovider_tvprovider',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable',
+        'version': 'version:1.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable_animated': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_vectordrawable_vectordrawable_animated',
+        'version': 'version:1.1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_versionedparcelable_versionedparcelable': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_versionedparcelable_versionedparcelable',
         'version': 'version:1.1.0-cr0',
       },
     ],
@@ -575,6 +1224,72 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/androidx_viewpager2_viewpager2': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_viewpager2_viewpager2',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_viewpager_viewpager': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_viewpager_viewpager',
+        'version': 'version:1.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_webkit_webkit': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_webkit_webkit',
+        'version': 'version:1.3.0-rc01-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/androidx_window_window': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/androidx_window_window',
+        'version': 'version:1.0.0-alpha01-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent',
+        'version': 'version:3.1-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/classworlds_classworlds': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds',
+        'version': 'version:1.1-alpha-2-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
     'packages': [
       {
@@ -608,10 +1323,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': {
+  'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -619,10 +1334,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_customview': {
+  'src/third_party/android_deps/libs/com_android_support_collections': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_customview',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_collections',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -630,10 +1345,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_drawerlayout': {
+  'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -641,10 +1356,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': {
+  'src/third_party/android_deps/libs/com_android_support_cursoradapter': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -652,10 +1367,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_loader': {
+  'src/third_party/android_deps/libs/com_android_support_customview': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_loader',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_customview',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -663,10 +1378,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
+  'src/third_party/android_deps/libs/com_android_support_design': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -674,10 +1389,21 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_design': {
+  'src/third_party/android_deps/libs/com_android_support_documentfile': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_android_support_drawerlayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -696,6 +1422,17 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/com_android_support_interpolator': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/com_android_support_leanback_v17': {
     'packages': [
       {
@@ -707,6 +1444,28 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/com_android_support_loader': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_loader',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/com_android_support_mediarouter_v7': {
     'packages': [
       {
@@ -773,6 +1532,17 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/com_android_support_print': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_print',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
     'packages': [
       {
@@ -784,6 +1554,17 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/com_android_support_support_annotations': {
     'packages': [
       {
@@ -883,6 +1664,17 @@ deps = {
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout',
+        'version': 'version:28.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
   'src/third_party/android_deps/libs/com_android_support_transition': {
     'packages': [
       {
@@ -894,10 +1686,10 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': {
+  'src/third_party/android_deps/libs/com_android_support_versionedparcelable': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout',
+        'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable',
         'version': 'version:28.0.0-cr0',
       },
     ],
@@ -916,67 +1708,77 @@ deps = {
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_collections': {
+  'src/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_core': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_collections',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_core',
+        'version': 'version:1.0.0-beta08-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_versionedparcelable': {
+  'src/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_processor': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/com_android_tools_build_jetifier_jetifier_processor',
+        'version': 'version:1.0.0-beta08-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_cursoradapter': {
+  'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs',
+        'version': 'version:1.0.10-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_interpolator': {
+  'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration',
+        'version': 'version:1.0.10-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

-  'src/third_party/android_deps/libs/com_android_support_documentfile': {
+  'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': {
     'packages': [
       {
-        'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile',
-        'version': 'version:28.0.0-cr0',
+        'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine',
+        'version': 'version:2.8.0-cr0',
       },
     ],
     'condition': 'checkout_android',
     'dep_type': 'cipd',
   },

+  'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms',
+        'version': 'version:1.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
   'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -987,7 +1789,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.1.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -998,7 +1800,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1009,7 +1811,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1020,7 +1822,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1031,7 +1833,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
-        'version': 'version:16.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1042,7 +1844,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
-        'version': 'version:16.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1053,7 +1855,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1064,7 +1866,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:18.1.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1075,7 +1877,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1086,7 +1888,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1097,7 +1899,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1108,7 +1910,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
-        'version': 'version:16.0.0-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1119,7 +1921,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1130,7 +1932,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1141,7 +1943,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1152,7 +1954,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1163,7 +1965,7 @@ deps = {
     'packages': [
      {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:17.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1174,7 +1976,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:18.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1185,7 +1987,73 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
-        'version': 'version:15.0.1-cr0',
+        'version': 'version:18.0.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_android_material_material': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material',
+        'version': 'version:1.2.0-alpha06-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_auto_auto_common': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common',
+        'version': 'version:0.10-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_auto_service_auto_service': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service',
+        'version': 'version:1.0-rc6-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations',
+        'version': 'version:1.0-rc6-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations',
+        'version': 'version:1.7-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_code_findbugs_jFormatString': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring',
+        'version': 'version:3.0.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1196,7 +2064,18 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305',
-        'version': 'version:1.3.9-cr0',
+        'version': 'version:3.0.2-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_code_gson_gson': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson',
+        'version': 'version:2.8.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1207,7 +2086,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger',
-        'version': 'version:2.17-cr0',
+        'version': 'version:2.26-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1218,7 +2097,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler',
-        'version': 'version:2.17-cr0',
+        'version': 'version:2.26-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1229,7 +2108,7 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers',
-        'version': 'version:2.17-cr0',
+        'version': 'version:2.26-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1240,7 +2119,18 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi',
-        'version': 'version:2.17-cr0',
+        'version': 'version:2.26-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': {
+    'packages': [
+      {
+        'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation',
+        'version': 'version:2.4.0-cr0',
       },
     ],
     'condition': 'checkout_android',
@@ -1251,7 +2141,51 @@ deps = {
     'packages': [
       {
         'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations',
-        'version': 'version:2.1.3-cr0',
+        'version': 'version:2.4.0-cr0',
+      },
+    ],
+    'condition': 'checkout_android',
+    'dep_type': 'cipd',
+  },
+
+  'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': {
+    'packages': [
+      {
'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api', + 'version': 'version:2.4.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core', + 'version': 'version:2.4.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations', + 'version': 'version:2.4.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_errorprone_javac': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac', + 'version': 'version:9+181-r4173-1-cr0', }, ], 'condition': 'checkout_android', @@ -1280,11 +2214,33 @@ deps = { 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_failureaccess': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess', + 'version': 'version:1.0.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/com_google_guava_guava': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava', - 'version': 'version:25.0-jre-cr0', + 'version': 'version:27.1-jre-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture', + 'version': 'version:1.0-cr0', }, ], 'condition': 'checkout_android', @@ -1302,11 +2258,33 @@ deps = { 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_lite': { + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java', + 'version': 'version:3.4.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite', + 'version': 'version:3.13.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_lite', - 'version': 'version:3.0.1-cr0', + 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils', + 'version': 'version:1.3.0-cr0', }, ], 'condition': 'checkout_android', @@ -1317,7 +2295,40 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet', - 'version': 'version:1.11.0-cr0', + 'version': 'version:1.11.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_squareup_javawriter': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter', + 'version': 
'version:2.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/commons_cli_commons_cli': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/commons_cli_commons_cli', + 'version': 'version:1.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', + 'version': 'version:1.3.2-cr0', }, ], 'condition': 'checkout_android', @@ -1346,6 +2357,39 @@ deps = { 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/nekohtml_nekohtml': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml', + 'version': 'version:1.9.6.2-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/nekohtml_xercesMinimal': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal', + 'version': 'version:1.9.6.2-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', + 'version': 'version:0.2-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': { 'packages': [ { @@ -1357,11 +2401,220 @@ deps = { 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_apache_ant_ant': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant', + 'version': 'version:1.8.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher', + 'version': 'version:1.8.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks', + 'version': 'version:2.1.3-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_model': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model', + 'version': 
'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_project': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings', + 'version': 'version:2.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file', + 'version': 'version:1.0-beta-6-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight', + 'version': 'version:1.0-beta-6-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared', + 'version': 'version:1.0-beta-6-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api', + 'version': 'version:1.0-beta-6-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', + 'version': 'version:1.2.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', - 'version': 'version:2.3.0-cr0', + 'version': 'version:2.5.3-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { + 'packages': [ + { + 'package': 
'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', + 'version': 'version:2.10.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_checkerframework_dataflow_shaded': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_shaded', + 'version': 'version:3.1.2-cr0', }, ], 'condition': 'checkout_android', @@ -1372,7 +2625,315 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', - 'version': 'version:1.14-cr0', + 'version': 'version:1.17-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default', + 'version': 'version:1.0-alpha-9-stable-1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation', + 'version': 'version:1.11-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils', + 'version': 'version:1.5.15-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_jdom_jdom2': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_jdom_jdom2', + 'version': 'version:2.0.6-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_jetbrains_annotations': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations', + 'version': 'version:13.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib', + 'version': 'version:1.3.50-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common', + 'version': 'version:1.3.50-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', + 'version': 'version:0.1.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_ow2_asm_asm': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm', + 'version': 'version:7.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis', + 'version': 'version:7.0-cr0', 
+ }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons', + 'version': 'version:7.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree', + 'version': 'version:7.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util', + 'version': 'version:7.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_pcollections_pcollections': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections', + 'version': 'version:2.1.2-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_annotations': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_junit': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_pluginapi': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_resources': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_robolectric': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_sandbox': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_shadowapi': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_shadows_framework': { + 'packages': [ + { + 'package': 
'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_shadows_multidex': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_multidex', + 'version': 'version:4.3.1-cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_utils': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector', + 'version': 'version:4.3.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_threeten_threeten_extra': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_threeten_threeten_extra', + 'version': 'version:1.5.0-cr0', }, ], 'condition': 'checkout_android', @@ -1458,17 +3019,32 @@ hooks = [ 'name': 'mac_toolchain', 'pattern': '.', 'action': ['python', 'src/build/mac_toolchain.py'], + 'condition': 'checkout_mac', }, - # Pull binutils for linux, enabled debug fission for faster linking / - # debugging when used with clang on Ubuntu Precise. - # https://code.google.com/p/chromium/issues/detail?id=352046 + # Pull the msan libraries on linux. { - 'name': 'binutils', - 'pattern': 'src/third_party/binutils', - 'action': [ - 'python', - 'src/third_party/binutils/download.py', - ], + 'name': 'msan_chained_origins', + 'pattern': '.', + 'condition': 'checkout_linux', + 'action': [ 'python', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-trusty.tgz.sha1', + ], + }, + { + 'name': 'msan_no_origins', + 'pattern': '.', + 'condition': 'checkout_linux', + 'action': [ 'python', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-trusty.tgz.sha1', + ], }, { # Pull clang if needed or requested via GYP_DEFINES. 
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
index f4fb657ce76..755c220be4d 100644
--- a/chromium/third_party/libyuv/OWNERS
+++ b/chromium/third_party/libyuv/OWNERS
@@ -1,10 +1,12 @@
+mbonadei@chromium.org
 fbarchard@chromium.org
 magjed@chromium.org
+pbos@chromium.org

-per-file *.gn=phoglund@chromium.org
+per-file *.gn=mbonadei@chromium.org
 per-file .gitignore=*
 per-file AUTHORS=*
 per-file DEPS=*
-per-file PRESUBMIT.py=phoglund@chromium.org
+per-file PRESUBMIT.py=mbonadei@chromium.org

 # COMPONENT: Internals>Images>Codecs
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index b5814b2c5fa..4a7e30b087c 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1741
+Version: 1768
 License: BSD
 License File: LICENSE
diff --git a/chromium/third_party/libyuv/build_overrides/build.gni b/chromium/third_party/libyuv/build_overrides/build.gni
index 6d8319b965e..a83860a8eb8 100644
--- a/chromium/third_party/libyuv/build_overrides/build.gni
+++ b/chromium/third_party/libyuv/build_overrides/build.gni
@@ -44,3 +44,13 @@ if (host_os == "mac") {
           "hermetic toolchain if the minimum OS version is not met.")
   use_system_xcode = _result == 0
 }
+
+declare_args() {
+  # Tracing support requires //third_party/perfetto.
+  enable_base_tracing = false
+  use_perfetto_client_library = false
+
+  # Allows googletest to pretty-print various absl types.
+  # Defined here rather than in gtest.gni to match chromium.
+  gtest_enable_absl_printers = true
+}
diff --git a/chromium/third_party/libyuv/docs/formats.md b/chromium/third_party/libyuv/docs/formats.md
index 260dd73186a..a29ed5c3043 100644
--- a/chromium/third_party/libyuv/docs/formats.md
+++ b/chromium/third_party/libyuv/docs/formats.md
@@ -50,7 +50,7 @@ The following is extracted from video_common.h as a complete list of formats sup
   // 10 bit lsb

   // 1 Secondary YUV format: row biplanar.
-  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),  // deprecated.

   // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel.
 NV12 has a half width and half height chroma channel, and therefore is a 420 subsampling.
 NV16 is 16 bits per pixel, with half width and full height. aka 422.
 NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
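The formats.md note above, together with the I444ToNV12 prototype this roll adds to convert.h further down, implies a simple calling pattern. A minimal sketch, not part of the patch; the wrapper name and buffer handling are invented for illustration:

#include <cstdint>
#include <vector>
#include "libyuv/convert.h"  // declares the new I444ToNV12

// Hypothetical wrapper: convert full-resolution I444 planes into NV12.
// Buffer sizes follow the NV12 layout described above: a full-size Y
// plane plus one half-width, half-height interleaved UV plane.
bool ConvertI444ToNV12(const uint8_t* src_y, const uint8_t* src_u,
                       const uint8_t* src_v, int width, int height,
                       std::vector<uint8_t>* nv12_y,
                       std::vector<uint8_t>* nv12_uv) {
  const int uv_width = (width + 1) / 2;    // 2x2 chroma subsampling,
  const int uv_height = (height + 1) / 2;  // rounded up for odd sizes
  nv12_y->resize(static_cast<size_t>(width) * height);
  nv12_uv->resize(static_cast<size_t>(uv_width) * 2 * uv_height);
  // I444 planes are all full resolution, so every source stride is width.
  return libyuv::I444ToNV12(src_y, width, src_u, width, src_v, width,
                            nv12_y->data(), width,
                            nv12_uv->data(), uv_width * 2,
                            width, height) == 0;
}

Per the note above, most NV12 functions also accept a NULL destination Y pointer when only the UV plane is wanted; whether this particular entry point honors that is not spelled out in the hunk.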
diff --git a/chromium/third_party/libyuv/docs/getting_started.md b/chromium/third_party/libyuv/docs/getting_started.md index 4426b606e07..3e339712e19 100644 --- a/chromium/third_party/libyuv/docs/getting_started.md +++ b/chromium/third_party/libyuv/docs/getting_started.md @@ -190,7 +190,7 @@ mips make V=1 -f linux.mk make V=1 -f linux.mk clean - make V=1 -f linux.mk CXX=clang++ + make V=1 -f linux.mk CXX=clang++ CC=clang ## Building the library with cmake diff --git a/chromium/third_party/libyuv/include/libyuv.h b/chromium/third_party/libyuv/include/libyuv.h index aeffd5ef7a4..a06e1233abb 100644 --- a/chromium/third_party/libyuv/include/libyuv.h +++ b/chromium/third_party/libyuv/include/libyuv.h @@ -26,6 +26,7 @@ #include "libyuv/scale.h" #include "libyuv/scale_argb.h" #include "libyuv/scale_row.h" +#include "libyuv/scale_uv.h" #include "libyuv/version.h" #include "libyuv/video_common.h" diff --git a/chromium/third_party/libyuv/include/libyuv/convert.h b/chromium/third_party/libyuv/include/libyuv/convert.h index f571142fab3..026b153cefe 100644 --- a/chromium/third_party/libyuv/include/libyuv/convert.h +++ b/chromium/third_party/libyuv/include/libyuv/convert.h @@ -42,6 +42,21 @@ int I444ToI420(const uint8_t* src_y, int width, int height); +// Convert I444 to NV12. +LIBYUV_API +int I444ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert I444 to NV21. LIBYUV_API int I444ToNV21(const uint8_t* src_y, @@ -248,19 +263,6 @@ int AYUVToNV21(const uint8_t* src_ayuv, int width, int height); -// Convert M420 to I420. -LIBYUV_API -int M420ToI420(const uint8_t* src_m420, - int src_stride_m420, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - // Convert Android420 to I420. LIBYUV_API int Android420ToI420(const uint8_t* src_y, @@ -418,7 +420,15 @@ int RGB24ToJ400(const uint8_t* src_rgb24, int width, int height); -#ifdef HAVE_JPEG +// RGB big endian (rgb in memory) to J400. +LIBYUV_API +int RAWToJ400(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); + // src_width/height provided by capture. // dst_width/height for clipping determine final size. LIBYUV_API @@ -448,13 +458,25 @@ int MJPGToNV21(const uint8_t* sample, int dst_width, int dst_height); +// JPEG to NV12 +LIBYUV_API +int MJPGToNV12(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int src_width, + int src_height, + int dst_width, + int dst_height); + // Query size of MJPG in pixels. LIBYUV_API int MJPGSize(const uint8_t* sample, size_t sample_size, int* width, int* height); -#endif // Convert camera sample to I420 with cropping, rotation and vertical flip. // "src_size" is needed to parse MJPG. diff --git a/chromium/third_party/libyuv/include/libyuv/convert_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_argb.h index bf776348f35..715a3dad97d 100644 --- a/chromium/third_party/libyuv/include/libyuv/convert_argb.h +++ b/chromium/third_party/libyuv/include/libyuv/convert_argb.h @@ -15,16 +15,41 @@ #include "libyuv/rotate.h" // For enum RotationMode. -// TODO(fbarchard): This set of functions should exactly match convert.h -// TODO(fbarchard): Add tests. 
Create random content of right size and convert -// with C vs Opt and or to I420 and compare. -// TODO(fbarchard): Some of these functions lack parameter setting. - #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// Conversion matrix for YUV to RGB +LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601 +LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg +LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709 +LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020 + +// Conversion matrix for YVU to BGR +LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 +LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg +LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 +LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 + +// Macros for end swapped destination Matrix conversions. +// Swap UV and pass mirrored kYvuJPEGConstants matrix. +// TODO(fbarchard): Add macro for each Matrix function. +#define kYuvI601ConstantsVU kYvuI601Constants +#define kYuvJPEGConstantsVU kYvuJPEGConstants +#define kYuvH709ConstantsVU kYvuH709Constants +#define kYuv2020ConstantsVU kYvu2020Constants +#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ + NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) +#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ + NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) +#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ + NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ + NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) + // Alias. #define ARGBToARGB ARGBCopy @@ -657,15 +682,6 @@ int NV21ToRAW(const uint8_t* src_y, int width, int height); -// Convert M420 to ARGB. -LIBYUV_API -int M420ToARGB(const uint8_t* src_m420, - int src_stride_m420, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - // Convert YUY2 to ARGB. LIBYUV_API int YUY2ToARGB(const uint8_t* src_yuy2, @@ -956,7 +972,6 @@ int AR30ToAB30(const uint8_t* src_ar30, int width, int height); -#ifdef HAVE_JPEG // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API @@ -968,7 +983,6 @@ int MJPGToARGB(const uint8_t* sample, int src_height, int dst_width, int dst_height); -#endif // Convert Android420 to ARGB. LIBYUV_API @@ -998,6 +1012,561 @@ int Android420ToABGR(const uint8_t* src_y, int width, int height); +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I422 to RGBA. 
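The macro block at the top of this convert_argb.h hunk encodes a useful identity: a conversion to an end-swapped destination (e.g. ABGR instead of ARGB, RAW instead of RGB24) equals the opposite NV variant fed the mirrored YVU constants. Spelled out without the macro, under the assumption that the substitution is all the macro does (the function name here is invented):

#include <cstdint>
#include "libyuv/convert_argb.h"

// NV12 source to ABGR output with BT.709 coefficients: call the NV21
// converter, which reads the interleaved plane as VU, and pass the
// mirrored kYvuH709Constants -- what NV12ToABGRMatrix expands to.
int Nv12ToAbgr709(const uint8_t* src_y, int src_stride_y,
                  const uint8_t* src_uv, int src_stride_uv,
                  uint8_t* dst_abgr, int dst_stride_abgr,
                  int width, int height) {
  return libyuv::NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
                                  dst_abgr, dst_stride_abgr,
                                  &libyuv::kYvuH709Constants,
                                  width, height);
}

Reading UV as VU and mirroring the matrix cancel out on the chroma side while the byte order of the RGB output flips, which is why no dedicated ABGR row code is needed.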
+LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +LIBYUV_API +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +LIBYUV_API +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int J420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int J420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +LIBYUV_API +int J420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +LIBYUV_API +int H420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// Convert I420 To RGB565 with 4x4 dither matrix 
(16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. + +LIBYUV_API +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); + +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); + +LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert I420 to ARGB with matrix. +LIBYUV_API +int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I422 to ARGB with matrix. +LIBYUV_API +int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I444 to ARGB with matrix. +LIBYUV_API +int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// multiply 10 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// multiply 10 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert 10 bit YUV to ARGB with matrix. +LIBYUV_API +int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert 10 bit 422 YUV to ARGB with matrix. 
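The I420ToRGB565Dither declaration above takes a 16-byte matrix, row-major with the first byte at the upper left and values 0 to 7 recommended. For instance, a standard 4x4 Bayer ordered-dither pattern scaled into that range; a sketch, not from the patch, with an invented wrapper name:

#include <cstdint>
#include "libyuv/convert_argb.h"

// 4x4 Bayer ordered-dither matrix, scaled to the recommended 0..7 range.
// First byte is the upper-left cell; rows are laid out contiguously.
static const uint8_t kBayerDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};

// Invented helper: dithered I420 -> RGB565, to reduce banding that the
// 5/6/5-bit quantization would otherwise produce in smooth gradients.
int I420ToRgb565Dithered(const uint8_t* y, int y_stride, const uint8_t* u,
                         int u_stride, const uint8_t* v, int v_stride,
                         uint8_t* rgb565, int rgb565_stride, int width,
                         int height) {
  return libyuv::I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
                                    rgb565, rgb565_stride, kBayerDither4x4,
                                    width, height);
}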
+LIBYUV_API +int I210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I420 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert NV12 to ARGB with matrix. +LIBYUV_API +int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert NV21 to ARGB with matrix. +LIBYUV_API +int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert NV12 to RGB565 with matrix. +LIBYUV_API +int NV12ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert NV12 to RGB24 with matrix. +LIBYUV_API +int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert NV21 to RGB24 with matrix. +LIBYUV_API +int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert Android420 to ARGB with matrix. +LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I422 to RGBA with matrix. +LIBYUV_API +int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I422 to RGBA with matrix. +LIBYUV_API +int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I420 to RGB24 with matrix. +LIBYUV_API +int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I420 to RGB565 with specified color matrix. 
+LIBYUV_API +int I420ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I420 to AR30 with matrix. +LIBYUV_API +int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert camera sample to ARGB with cropping, rotation and vertical flip. // "sample_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libyuv/include/libyuv/convert_from.h index afc43939a4c..5140ed4f3e9 100644 --- a/chromium/third_party/libyuv/include/libyuv/convert_from.h +++ b/chromium/third_party/libyuv/include/libyuv/convert_from.h @@ -132,6 +132,10 @@ int I420ToUYVY(const uint8_t* src_y, int width, int height); +// The following are from convert_argb.h +// DEPRECATED: The prototypes will be removed in future. Use convert_argb.h + +// Convert I420 to ARGB. LIBYUV_API int I420ToARGB(const uint8_t* src_y, int src_stride_y, @@ -144,18 +148,7 @@ int I420ToARGB(const uint8_t* src_y, int width, int height); -LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - +// Convert I420 to ABGR. 
LIBYUV_API int I420ToABGR(const uint8_t* src_y, int src_stride_y, @@ -168,205 +161,6 @@ int I420ToABGR(const uint8_t* src_y, int width, int height); -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int J420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int J420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). -// Values in dither matrix from 0 to 7 recommended. -// The order of the dither matrix is first byte is upper left. 
- -LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height); - -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height); - -LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height); - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h index 057182448d6..d992363cebb 100644 --- a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb, int width, int height); +// Aliases +#define ABGRToRGB24 ARGBToRAW +#define ABGRToRAW ARGBToRGB24 + // Convert ARGB To RGB24. LIBYUV_API int ARGBToRGB24(const uint8_t* src_argb, @@ -281,17 +285,6 @@ int ABGRToNV21(const uint8_t* src_abgr, int width, int height); -// Convert ARGB To NV21. -LIBYUV_API -int ARGBToNV21(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - // Convert ARGB To YUY2. LIBYUV_API int ARGBToYUY2(const uint8_t* src_argb, diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h index b01cd25c574..3e27cc107dc 100644 --- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h +++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h @@ -71,6 +71,8 @@ static __inline int TestCpuFlag(int test_flag) { // Internal function for parsing /proc/cpuinfo. LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); +LIBYUV_API +int MipsCpuCaps(const char* cpuinfo_name); // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. diff --git a/chromium/third_party/libyuv/include/libyuv/macros_msa.h b/chromium/third_party/libyuv/include/libyuv/macros_msa.h index 29997ce11fd..4e232b66bfe 100644 --- a/chromium/third_party/libyuv/include/libyuv/macros_msa.h +++ b/chromium/third_party/libyuv/include/libyuv/macros_msa.h @@ -140,6 +140,9 @@ #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ #define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) +#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ +#define LD_UH(...) 
LD_H(const v8u16, __VA_ARGS__) + #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h index 5299fe2c0ef..8d868b95425 100644 --- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h +++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h @@ -105,6 +105,19 @@ void MergeUVPlane(const uint8_t* src_u, int width, int height); +// Scale U and V to half width and height and merge into interleaved UV plane. +// width and height are source size, allowing odd sizes. +// Use for converting I444 or I422 to NV12. +LIBYUV_API +void HalfMergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Swap U and V channels in interleaved UV plane. LIBYUV_API void SwapUVPlane(const uint8_t* src_uv, @@ -187,6 +200,16 @@ int I444Copy(const uint8_t* src_y, int width, int height); +// Copy NV12. Supports inverting. +int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, + int src_stride_uv, uint8_t* dst_y, int dst_stride_y, + uint8_t* dst_uv, int dst_stride_uv, int width, int height); + +// Copy NV21. Supports inverting. +int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, + int src_stride_vu, uint8_t* dst_y, int dst_stride_y, + uint8_t* dst_vu, int dst_stride_vu, int width, int height); + // Convert YUY2 to I422. LIBYUV_API int YUY2ToI422(const uint8_t* src_yuy2, @@ -302,6 +325,22 @@ int I400Mirror(const uint8_t* src_y, int height); // Alias +#define NV12ToNV12Mirror NV12Mirror + +// NV12 mirror. +LIBYUV_API +int NV12Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Alias #define ARGBToARGBMirror ARGBMirror // ARGB mirror. @@ -313,56 +352,35 @@ int ARGBMirror(const uint8_t* src_argb, int width, int height); -// Convert NV12 to RGB565. -LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); +// Alias +#define RGB24ToRGB24Mirror RGB24Mirror -// I422ToARGB is in convert_argb.h -// Convert I422 to BGRA. +// RGB24 mirror. LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); +int RGB24Mirror(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); -// Convert I422 to ABGR. +// Mirror a plane of data. LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); -// Convert I422 to RGBA. +// Mirror a plane of UV data. 
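HalfMergeUVPlane, declared near the top of this planar_functions.h hunk, is the building block for 444-to-NV12 conversion: it downsamples U and V and writes the interleaved UV plane in one pass. Combined with CopyPlane for the luma it assembles a complete NV12 frame. A sketch assuming width and height are the full image dimensions (the "source size" in the comment) and an invented function name:

#include <cstdint>
#include "libyuv/planar_functions.h"

// Illustrative I444 -> NV12 assembly from the two primitives: the Y plane
// is copied unchanged, while U and V are halved in both dimensions and
// interleaved into a single UV plane.
void AssembleNv12FromI444(const uint8_t* src_y, int src_stride_y,
                          const uint8_t* src_u, int src_stride_u,
                          const uint8_t* src_v, int src_stride_v,
                          uint8_t* dst_y, int dst_stride_y,
                          uint8_t* dst_uv, int dst_stride_uv,
                          int width, int height) {
  libyuv::CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  libyuv::HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v,
                           dst_uv, dst_stride_uv, width, height);
}

The I444ToNV12 entry point added to convert.h earlier in this patch is plausibly a thin wrapper over exactly this pair, though the hunks here show only the prototypes.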
LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); +void MirrorUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); // Alias #define RGB24ToRAW RAWToRGB24 @@ -743,6 +761,19 @@ int ARGBBlur(const uint8_t* src_argb, int height, int radius); +// Gaussian 5x5 blur a float plane. +// Coefficients of 1, 4, 6, 4, 1. +// Each destination pixel is a blur of the 5x5 +// pixels from the source. +// Source edges are clamped. +LIBYUV_API +int GaussPlane_F32(const float* src, + int src_stride, + float* dst, + int dst_stride, + int width, + int height); + // Multiply ARGB image by ARGB value. LIBYUV_API int ARGBShade(const uint8_t* src_argb, diff --git a/chromium/third_party/libyuv/include/libyuv/rotate.h b/chromium/third_party/libyuv/include/libyuv/rotate.h index c64e0216d53..308882242cb 100644 --- a/chromium/third_party/libyuv/include/libyuv/rotate.h +++ b/chromium/third_party/libyuv/include/libyuv/rotate.h @@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src, int width, int height); +// Rotations for when U and V are interleaved. +// These functions take one input pointer and +// split the data into two buffers while +// rotating them. Deprecated. LIBYUV_API void RotateUV90(const uint8_t* src, int src_stride, @@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src, int width, int height); -// Rotations for when U and V are interleaved. -// These functions take one input pointer and -// split the data into two buffers while -// rotating them. Deprecated. LIBYUV_API void RotateUV180(const uint8_t* src, int src_stride, diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h index b721858f1df..a27788c1f69 100644 --- a/chromium/third_party/libyuv/include/libyuv/row.h +++ b/chromium/third_party/libyuv/include/libyuv/row.h @@ -98,7 +98,6 @@ extern "C" { #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 #define HAS_HALFFLOATROW_SSE2 -#define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 @@ -112,7 +111,7 @@ extern "C" { #define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORUVROW_SSSE3 +#define HAS_MIRRORSPLITUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB24ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 @@ -123,6 +122,8 @@ extern "C" { #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 +#define HAS_RAWTOYJROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 @@ -194,11 +195,12 @@ extern "C" { #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 // #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast -#define HAS_I400TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -269,12 +271,16 @@ extern "C" { #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 -// I210 is for H010. 2 = 422. I for 601 vs H for 709. 
+#define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 +#define HAS_MIRRORUVROW_AVX2 +#define HAS_MIRRORUVROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 +#define HAS_RGB24MIRRORROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 @@ -293,8 +299,10 @@ extern "C" { #define HAS_ARGBTORGB24ROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_HALFMERGEUVROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I400TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 @@ -338,7 +346,6 @@ extern "C" { #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON -#define HAS_RGBATOYJROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON @@ -348,6 +355,7 @@ extern "C" { #define HAS_BYTETOFLOATROW_NEON #define HAS_COPYROW_NEON #define HAS_HALFFLOATROW_NEON +#define HAS_HALFMERGEUVROW_NEON #define HAS_I400TOARGBROW_NEON #define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON @@ -363,6 +371,7 @@ extern "C" { #define HAS_MERGEUVROW_NEON #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON +#define HAS_MIRRORSPLITUVROW_NEON #define HAS_NV12TOARGBROW_NEON #define HAS_NV12TORGB24ROW_NEON #define HAS_NV12TORGB565ROW_NEON @@ -370,17 +379,20 @@ extern "C" { #define HAS_NV21TORGB24ROW_NEON #define HAS_NV21TOYUV24ROW_NEON #define HAS_RAWTOARGBROW_NEON -#define HAS_RAWTORGBAROW_NEON #define HAS_RAWTORGB24ROW_NEON +#define HAS_RAWTORGBAROW_NEON #define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYJROW_NEON #define HAS_RAWTOYROW_NEON #define HAS_RGB24TOARGBROW_NEON #define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYJROW_NEON #define HAS_RGB24TOYROW_NEON #define HAS_RGB565TOARGBROW_NEON #define HAS_RGB565TOUVROW_NEON #define HAS_RGB565TOYROW_NEON #define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYJROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON #define HAS_SPLITRGBROW_NEON @@ -402,6 +414,7 @@ extern "C" { #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON +#define HAS_RGB24MIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON @@ -419,6 +432,9 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #define HAS_SCALESUMSAMPLES_NEON +#define HAS_GAUSSROW_F32_NEON +#define HAS_GAUSSCOL_F32_NEON + #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_ABGRTOUVROW_MSA @@ -470,6 +486,7 @@ extern "C" { #define HAS_MERGEUVROW_MSA #define HAS_MIRRORROW_MSA #define HAS_MIRRORUVROW_MSA +#define HAS_MIRRORSPLITUVROW_MSA #define HAS_NV12TOARGBROW_MSA #define HAS_NV12TORGB565ROW_MSA #define HAS_NV21TOARGBROW_MSA @@ -552,7 +569,7 @@ extern "C" { #define HAS_MERGERGBROW_MMI #define HAS_MERGEUVROW_MMI #define HAS_MIRRORROW_MMI -#define HAS_MIRRORUVROW_MMI +#define HAS_MIRRORSPLITUVROW_MMI #define HAS_RAWTOARGBROW_MMI #define HAS_RAWTORGB24ROW_MMI #define HAS_RAWTOUVROW_MMI @@ -601,6 +618,7 @@ extern "C" { #endif typedef __declspec(align(16)) int16_t vec16[8]; typedef __declspec(align(16)) int32_t vec32[4]; +typedef __declspec(align(16)) float vecf32[4]; typedef __declspec(align(16)) int8_t vec8[16]; typedef __declspec(align(16)) uint16_t uvec16[8]; typedef 
__declspec(align(16)) uint32_t uvec32[4]; @@ -620,6 +638,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32]; #endif typedef int16_t __attribute__((vector_size(16))) vec16; typedef int32_t __attribute__((vector_size(16))) vec32; +typedef float __attribute__((vector_size(16))) vecf32; typedef int8_t __attribute__((vector_size(16))) vec8; typedef uint16_t __attribute__((vector_size(16))) uvec16; typedef uint32_t __attribute__((vector_size(16))) uvec32; @@ -634,6 +653,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8; #define SIMD_ALIGNED(var) var typedef int16_t vec16[8]; typedef int32_t vec32[4]; +typedef float vecf32[4]; typedef int8_t vec8[16]; typedef uint16_t uvec16[8]; typedef uint32_t uvec32[4]; @@ -674,6 +694,7 @@ struct YuvConstants { int16_t kUVBiasG[16]; int16_t kUVBiasR[16]; int16_t kYToRgb[16]; + int16_t kYBiasToRgb[16]; }; // Offsets into YuvConstants structure @@ -684,19 +705,9 @@ struct YuvConstants { #define KUVBIASG 128 #define KUVBIASR 160 #define KYTORGB 192 -#endif - -// Conversion matrix for YUV to RGB -extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601 -extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg -extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709 -extern const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants); // BT.2020 +#define KYBIASTORGB 224 -// Conversion matrix for YVU to BGR -extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601 -extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg -extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 -extern const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants); // BT.2020 +#endif #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) @@ -965,7 +976,11 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1134,7 +1149,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, @@ -1165,7 +1182,9 @@ void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* 
dst_y, int width); void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -1175,8 +1194,14 @@ void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1184,7 +1209,9 @@ void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1542,27 +1569,36 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_MSA(const 
uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); +void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorSplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorSplitUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorSplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -1582,6 +1618,16 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -1672,6 +1718,34 @@ void MergeUVRow_Any_MMI(const uint8_t* y_buf, uint8_t* dst_ptr, int width); +void HalfMergeUVRow_C(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width); + +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width); + +void HalfMergeUVRow_SSSE3(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width); + +void HalfMergeUVRow_AVX2(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width); + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -2728,23 +2802,50 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width); -void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width); -void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width); -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); -void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_C(const uint8_t* src_y, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* 
yuvconstants, + int width); +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_MSA(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_MMI(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, int width); void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); // ARGB preattenuated alpha blend. void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, @@ -4256,6 +4357,25 @@ void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr, const struct YuvConstants* yuvconstants, int width); +void GaussRow_F32_NEON(const float* src, float* dst, int width); +void GaussRow_F32_C(const float* src, float* dst, int width); + +void GaussCol_F32_NEON(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width); + +void GaussCol_F32_C(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h index 23ba1634f78..add5a9eb622 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale.h +++ b/chromium/third_party/libyuv/include/libyuv/scale.h @@ -145,6 +145,31 @@ int I444Scale_16(const uint16_t* src_y, int dst_height, enum FilterMode filtering); +// Scales an NV12 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// kFilterBox is not supported for the UV channel and will be treated as +// bilinear. +// Returns 0 if successful. + +LIBYUV_API +int NV12Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering); + #ifdef __cplusplus // Legacy API. Deprecated. 
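The NV12Scale() declaration and its doc comment above give enough to sketch a typical call; presumably it pairs the existing Y-plane scaler with the new UVScale machinery added elsewhere in this patch. A minimal usage sketch follows — the frame dimensions and buffer names are illustrative, not from the patch.

#include <stdint.h>
#include <vector>
#include "libyuv/scale.h"

// Sketch: downscale an NV12 frame from 1920x1080 to 1280x720.
// Strides equal the plane widths here, i.e. rows are contiguous.
bool ScaleNV12Example(const uint8_t* src_y, const uint8_t* src_uv) {
  const int kSrcW = 1920, kSrcH = 1080, kDstW = 1280, kDstH = 720;
  std::vector<uint8_t> dst_y(kDstW * kDstH);
  // UV plane: ((w+1)/2) pairs of 2 bytes per row, (h+1)/2 rows.
  std::vector<uint8_t> dst_uv(((kDstW + 1) / 2) * 2 * ((kDstH + 1) / 2));
  // Per the comment above, kFilterBox would be treated as bilinear for UV,
  // so bilinear is requested explicitly. Returns 0 on success.
  return libyuv::NV12Scale(src_y, kSrcW, src_uv, kSrcW, kSrcW, kSrcH,
                           dst_y.data(), kDstW, dst_uv.data(), kDstW,
                           kDstW, kDstH, libyuv::kFilterBilinear) == 0;
}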
LIBYUV_API diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h index dd20718a82d..a386d499895 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale_row.h +++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h @@ -72,6 +72,22 @@ extern "C" { #define HAS_SCALEROWDOWN4_SSSE3 #endif +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_SCALEUVROWDOWN2BOX_SSSE3 +#endif + +// The following are available for gcc/clang x86 platforms, but +// require clang 3.4 or gcc 4.7. +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_SCALEUVROWDOWN2BOX_AVX2 +#endif + // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. @@ -96,6 +112,8 @@ extern "C" { #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEUVROWDOWN2BOX_NEON +#define HAS_SCALEUVROWDOWNEVEN_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -376,6 +394,53 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb, int dst_width, int x32, int dx); +void ScaleUVRowDown2_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx); +void ScaleUVCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx); +void ScaleUVColsUp2_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int, + int); +void ScaleUVFilterCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx); +void ScaleUVFilterCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx); // Specialized scalers for x86. 
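The ScaleUVRowDown2*_C prototypes above are the scalar fallbacks behind the new SSSE3/AVX2/NEON kernels this roll adds. The C body is not part of this diff; the sketch below shows what the 2x2 box variant plausibly computes, treating each interleaved UV pair as one 2-byte pixel and taking a rounded average over two input rows.

#include <stddef.h>
#include <stdint.h>

// Each output UV pair is the rounded average of a 2x2 block of input UV
// pairs; src_stride (in bytes) selects the second input row.
void ScaleUVRowDown2BoxSketch(const uint8_t* src_uv, ptrdiff_t src_stride,
                              uint8_t* dst_uv, int dst_width) {
  const uint8_t* s = src_uv;               // top row
  const uint8_t* t = src_uv + src_stride;  // bottom row
  for (int x = 0; x < dst_width; ++x) {
    dst_uv[0] = (s[0] + s[2] + t[0] + t[2] + 2) >> 2;  // U
    dst_uv[1] = (s[1] + s[3] + t[1] + t[3] + 2) >> 2;  // V
    s += 4;
    t += 4;
    dst_uv += 2;
  }
}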
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, @@ -782,6 +847,192 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +// UV Row functions +void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleUVRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + 
ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); + // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. diff --git a/chromium/third_party/libyuv/include/libyuv/scale_uv.h b/chromium/third_party/libyuv/include/libyuv/scale_uv.h new file mode 100644 index 00000000000..1b6327aaed1 --- /dev/null +++ b/chromium/third_party/libyuv/include/libyuv/scale_uv.h @@ -0,0 +1,38 @@ +/* + * Copyright 2020 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_UV_H_ +#define INCLUDE_LIBYUV_SCALE_UV_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int UVScale(const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_UV_H_ diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h index 4c446ba3d4d..efaac73e3ab 100644 --- a/chromium/third_party/libyuv/include/libyuv/version.h +++ b/chromium/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1741 +#define LIBYUV_VERSION 1768 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/chromium/third_party/libyuv/include/libyuv/video_common.h b/chromium/third_party/libyuv/include/libyuv/video_common.h index 666eb34393d..b9823d71d09 100644 --- a/chromium/third_party/libyuv/include/libyuv/video_common.h +++ b/chromium/third_party/libyuv/include/libyuv/video_common.h @@ -62,7 +62,7 @@ enum FourCC { FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 FOURCC_I210 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 422 - // 1 Secondary YUV format: row biplanar. + // 1 Secondary YUV format: row biplanar. deprecated. FOURCC_M420 = FOURCC('M', '4', '2', '0'), // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc @@ -86,10 +86,14 @@ enum FourCC { FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. 
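The new scale_uv.h header above exposes UVScale() for interleaved UV planes directly. A minimal usage sketch, with illustrative sizes: widths and heights are in UV pairs, strides in bytes (2 bytes per pair).

#include <stdint.h>
#include <vector>
#include "libyuv/scale_uv.h"

// Sketch: halve the chroma of a 640x480 NV12 frame (320x240 UV pairs)
// down to 160x120 pairs.
void HalveUVExample(const uint8_t* src_uv) {
  const int kSrcW = 320, kSrcH = 240, kDstW = 160, kDstH = 120;
  std::vector<uint8_t> dst_uv(kDstW * 2 * kDstH);
  libyuv::UVScale(src_uv, kSrcW * 2, kSrcW, kSrcH,
                  dst_uv.data(), kDstW * 2, kDstW, kDstH,
                  libyuv::kFilterBilinear);
}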
- FOURCC_J420 = FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J422 = FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J444 = FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J400 = FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J420 = + FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J422 = + FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J444 = + FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J400 = + FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc @@ -144,7 +148,7 @@ enum FourCCBpp { FOURCC_BPP_NV12 = 12, FOURCC_BPP_YUY2 = 16, FOURCC_BPP_UYVY = 16, - FOURCC_BPP_M420 = 12, + FOURCC_BPP_M420 = 12, // deprecated FOURCC_BPP_Q420 = 12, FOURCC_BPP_ARGB = 32, FOURCC_BPP_BGRA = 32, diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk index e9a26a79b20..3e93b710d49 100644 --- a/chromium/third_party/libyuv/linux.mk +++ b/chromium/third_party/libyuv/linux.mk @@ -15,13 +15,13 @@ LOCAL_OBJ_FILES := \ source/compare_gcc.o \ source/compare_mmi.o \ source/compare_msa.o \ - source/compare_neon64.o \ source/compare_neon.o \ + source/compare_neon64.o \ source/compare_win.o \ - source/convert_argb.o \ source/convert.o \ - source/convert_from_argb.o \ + source/convert_argb.o \ source/convert_from.o \ + source/convert_from_argb.o \ source/convert_jpeg.o \ source/convert_to_argb.o \ source/convert_to_i420.o \ @@ -29,33 +29,34 @@ LOCAL_OBJ_FILES := \ source/mjpeg_decoder.o \ source/mjpeg_validate.o \ source/planar_functions.o \ + source/rotate.o \ source/rotate_any.o \ source/rotate_argb.o \ - source/rotate.o \ source/rotate_common.o \ source/rotate_gcc.o \ source/rotate_mmi.o \ source/rotate_msa.o \ - source/rotate_neon64.o \ source/rotate_neon.o \ + source/rotate_neon64.o \ source/rotate_win.o \ source/row_any.o \ source/row_common.o \ source/row_gcc.o \ source/row_mmi.o \ source/row_msa.o \ - source/row_neon64.o \ source/row_neon.o \ + source/row_neon64.o \ source/row_win.o \ + source/scale.o \ source/scale_any.o \ source/scale_argb.o \ - source/scale.o \ source/scale_common.o \ source/scale_gcc.o \ source/scale_mmi.o \ source/scale_msa.o \ - source/scale_neon64.o \ source/scale_neon.o \ + source/scale_neon64.o \ + source/scale_uv.o \ source/scale_win.o \ source/video_common.o @@ -65,7 +66,7 @@ LOCAL_OBJ_FILES := \ .c.o: $(CC) -c $(CFLAGS) $*.c -o $*.o -all: libyuv.a yuvconvert cpuid psnr +all: libyuv.a i444tonv12_eg yuvconvert cpuid psnr libyuv.a: $(LOCAL_OBJ_FILES) $(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES) @@ -78,6 +79,10 @@ yuvconvert: util/yuvconvert.cc libyuv.a psnr: util/psnr.cc $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc +# A simple conversion example. +i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a + $(CC) $(CFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a + # A C test utility that uses libyuv conversion from C. 
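linux.mk above (and Android.bp earlier in the patch) now builds util/i444tonv12_eg.cc as "a simple conversion example". The utility's actual contents are not shown in this diff; the core of such an example would plausibly be a single I444ToNV12() call, assuming the declaration lands in libyuv/convert.h alongside the I444ToNV21() it wraps.

#include <stdint.h>
#include <vector>
#include "libyuv/convert.h"

// Hypothetical core of an I444 -> NV12 example: all three I444 planes are
// width x height; NV12 output is a full-size Y plane plus a half-size
// interleaved UV plane.
int ConvertI444ToNV12(const uint8_t* src_y, const uint8_t* src_u,
                      const uint8_t* src_v, int width, int height,
                      std::vector<uint8_t>* dst_y,
                      std::vector<uint8_t>* dst_uv) {
  const int uv_stride = ((width + 1) / 2) * 2;
  dst_y->resize(width * height);
  dst_uv->resize(uv_stride * ((height + 1) / 2));
  return libyuv::I444ToNV12(src_y, width, src_u, width, src_v, width,
                            dst_y->data(), width,
                            dst_uv->data(), uv_stride, width, height);
}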
# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0 # CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk @@ -85,4 +90,4 @@ cpuid: util/cpuid.c libyuv.a $(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a clean: - /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr + /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert cpuid psnr diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc index 7f4828104fe..e93aba1b53e 100644 --- a/chromium/third_party/libyuv/source/compare.cc +++ b/chromium/third_party/libyuv/source/compare.cc @@ -149,16 +149,16 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a, HammingDistance = HammingDistance_AVX2; } #endif -#if defined(HAS_HAMMINGDISTANCE_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - HammingDistance = HammingDistance_MSA; - } -#endif #if defined(HAS_HAMMINGDISTANCE_MMI) if (TestCpuFlag(kCpuHasMMI)) { HammingDistance = HammingDistance_MMI; } #endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : diff) @@ -211,16 +211,16 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, SumSquareError = SumSquareError_AVX2; } #endif -#if defined(HAS_SUMSQUAREERROR_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SumSquareError = SumSquareError_MSA; - } -#endif #if defined(HAS_SUMSQUAREERROR_MMI) if (TestCpuFlag(kCpuHasMMI)) { SumSquareError = SumSquareError_MMI; } #endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : sse) #endif diff --git a/chromium/third_party/libyuv/source/compare_gcc.cc b/chromium/third_party/libyuv/source/compare_gcc.cc index 676527c1b1b..6700f9697e0 100644 --- a/chromium/third_party/libyuv/source/compare_gcc.cc +++ b/chromium/third_party/libyuv/source/compare_gcc.cc @@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, uint64_t diff = 0u; asm volatile( - "xor %3,%3 \n" - "xor %%r8,%%r8 \n" - "xor %%r9,%%r9 \n" - "xor %%r10,%%r10 \n" + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" // Process 32 bytes per loop. LABELALIGN "1: \n" - "mov (%0),%%rcx \n" - "mov 0x8(%0),%%rdx \n" - "xor (%1),%%rcx \n" - "xor 0x8(%1),%%rdx \n" - "popcnt %%rcx,%%rcx \n" - "popcnt %%rdx,%%rdx \n" - "mov 0x10(%0),%%rsi \n" - "mov 0x18(%0),%%rdi \n" - "xor 0x10(%1),%%rsi \n" - "xor 0x18(%1),%%rdi \n" - "popcnt %%rsi,%%rsi \n" - "popcnt %%rdi,%%rdi \n" - "add $0x20,%0 \n" - "add $0x20,%1 \n" - "add %%rcx,%3 \n" - "add %%rdx,%%r8 \n" - "add %%rsi,%%r9 \n" - "add %%rdi,%%r10 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" - "add %%r8, %3 \n" - "add %%r9, %3 \n" - "add %%r10, %3 \n" + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, // Process 16 bytes per loop. 
LABELALIGN "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, uint32_t diff = 0u; asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, asm volatile( "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" - "vpxor %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm1,%%ymm1,%%ymm1 \n" - "sub %0,%1 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "vmovdqa (%0),%%ymm4 \n" - "vmovdqa 0x20(%0), %%ymm5 \n" - "vpxor (%0,%1), %%ymm4, %%ymm4 \n" - "vpand %%ymm2,%%ymm4,%%ymm6 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - 
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" - "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" - "add $0x40,%0 \n" - "vpand %%ymm2,%%ymm4,%%ymm5 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" - "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" - "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" - "sub $0x40,%2 \n" - "jg 1b \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" - "vpermq $0xb1,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xaa,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw 
%%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" : "+r"(src), // %0 "+r"(count), // %1 "+rm"(seed), // %2 diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc index 2a2181e0cb3..afdd6012164 100644 --- a/chromium/third_party/libyuv/source/compare_neon.cc +++ b/chromium/third_party/libyuv/source/compare_neon.cc @@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, uint32_t diff; asm volatile( - "vmov.u16 q4, #0 \n" // accumulator + "vmov.u16 q4, #0 \n" // accumulator "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : @@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! 
\n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); diff --git a/chromium/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libyuv/source/compare_neon64.cc index 6e8f672ab73..70fb9b9143f 100644 --- a/chromium/third_party/libyuv/source/compare_neon64.cc +++ b/chromium/third_party/libyuv/source/compare_neon64.cc @@ -27,22 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; asm volatile( - "movi v4.8h, #0 \n" + "movi v4.8h, #0 \n" "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : : "cc", "v0", "v1", "v2", "v3", "v4"); @@ -54,28 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); diff --git 
a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc index 614fa48241e..98258b9bc93 100644 --- a/chromium/third_party/libyuv/source/convert.cc +++ b/chromium/third_party/libyuv/source/convert.cc @@ -320,14 +320,6 @@ int I422ToNV21(const uint8_t* src_y, } } #endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_MSA; - } - } -#endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow = MergeUVRow_Any_MMI; @@ -336,6 +328,14 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -360,14 +360,6 @@ int I422ToNV21(const uint8_t* src_y, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -376,6 +368,14 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); @@ -426,9 +426,8 @@ int I444ToI420(const uint8_t* src_y, dst_v, dst_stride_v, width, height, width, height); } -// TODO(fbarchard): Implement row conversion. LIBYUV_API -int I444ToNV21(const uint8_t* src_y, +int I444ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, @@ -436,16 +435,17 @@ int I444ToNV21(const uint8_t* src_y, int src_stride_v, uint8_t* dst_y, int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, + uint8_t* dst_uv, + int dst_stride_uv, int width, int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { + return -1; + } // Negative height means invert the image. 
if (height < 0) { height = -height; - halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (height - 1) * src_stride_u; src_v = src_v + (height - 1) * src_stride_v; @@ -453,19 +453,32 @@ int I444ToNV21(const uint8_t* src_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - // Allocate u and v buffers - align_buffer_64(plane_u, halfwidth * halfheight * 2); - uint8_t* plane_v = plane_u + halfwidth * halfheight; - - I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, - height); - MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, - halfwidth, halfheight); - free_aligned_buffer_64(plane_u); + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, + dst_stride_uv, width, height); return 0; } +LIBYUV_API +int I444ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, + width, height); +} + // I400 is greyscale typically used in MJPG LIBYUV_API int I400ToI420(const uint8_t* src_y, @@ -527,70 +540,21 @@ int I400ToNV21(const uint8_t* src_y, return 0; } -static void CopyPlane2(const uint8_t* src, - int src_stride_0, - int src_stride_1, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int y; - void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height - 1; y += 2) { - CopyRow(src, dst, width); - CopyRow(src + src_stride_0, dst + dst_stride, width); - src += src_stride_0 + src_stride_1; - dst += dst_stride * 2; - } - if (height & 1) { - CopyRow(src, dst, width); - } -} - -// Support converting from FOURCC_M420 -// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for -// easy conversion to I420. -// M420 format description: -// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. -// Chroma is half width / half height. (420) -// src_stride_m420 is row planar. Normally this will be the width in pixels. -// The UV plane is half width, but 2 values, so src_stride_m420 applies to -// this as well as the two Y planes. -static int X420ToI420(const uint8_t* src_y, - int src_stride_y0, - int src_stride_y1, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { +// Convert NV12 to I420. +// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm. 
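As the rewritten convert.cc above shows, I444ToNV21() is now just I444ToNV12() with the U and V sources exchanged, since NV21 stores chroma as VU rather than UV; the same pointer-swap idiom reappears below for NV21ToI420(). For illustration, the two calls in this sketch produce identical output (the wrapper function and parameter names are hypothetical).

#include <stdint.h>
#include "libyuv/convert.h"

// The commented-out call is interchangeable with the one below it: NV21's
// VU ordering falls out of feeding V where the NV12 path expects U.
int ToNV21(const uint8_t* y, int ys, const uint8_t* u, int us,
           const uint8_t* v, int vs, uint8_t* dy, int dys,
           uint8_t* dvu, int dvus, int w, int h) {
  // return libyuv::I444ToNV12(y, ys, v, vs, u, us, dy, dys, dvu, dvus, w, h);
  return libyuv::I444ToNV21(y, ys, u, us, v, vs, dy, dys, dvu, dvus, w, h);
}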
+LIBYUV_API +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -600,21 +564,16 @@ static int X420ToI420(const uint8_t* src_y, if (height < 0) { height = -height; halfheight = (height + 1) >> 1; - if (dst_y) { - dst_y = dst_y + (height - 1) * dst_stride_y; - } - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; } // Coalesce rows. - if (src_stride_y0 == width && src_stride_y1 == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; - src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + src_stride_y = dst_stride_y = 0; } // Coalesce rows. if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && @@ -625,12 +584,7 @@ static int X420ToI420(const uint8_t* src_y, } if (dst_y) { - if (src_stride_y0 == src_stride_y1) { - CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); - } else { - CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, - width, height); - } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } // Split UV plane - NV12 / NV21 @@ -640,25 +594,6 @@ static int X420ToI420(const uint8_t* src_y, return 0; } -// Convert NV12 to I420. -LIBYUV_API -int NV12ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, width, height); -} - // Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API int NV21ToI420(const uint8_t* src_y, @@ -673,26 +608,8 @@ int NV21ToI420(const uint8_t* src_y, int dst_stride_v, int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, - dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, - dst_stride_u, width, height); -} - -// Convert M420 to I420. 
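NV12ToI420 is now a direct implementation rather than a wrapper over the removed X420ToI420, and it keeps the row-coalescing trick used throughout libyuv: when source and destination strides both equal the row width, the rows are packed back to back, so the whole plane can be handled as a single row of width * height bytes. A standalone sketch of that idea, independent of the library:

#include <stdint.h>
#include <string.h>

// Copies one plane, collapsing the loop to a single long "row" when both
// strides show there is no padding between rows.
void CopyPlaneCoalesced(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride,
                        int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;  // the plane is one contiguous block
    height = 1;
    src_stride = 0;
    dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, static_cast<size_t>(width));
    src += src_stride;
    dst += dst_stride;
  }
}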
-LIBYUV_API -int M420ToI420(const uint8_t* src_m420, - int src_stride_m420, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, width, height); } @@ -750,17 +667,7 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - YUY2ToUVRow = YUY2ToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - YUY2ToUVRow = YUY2ToUVRow_MSA; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) +#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { YUY2ToYRow = YUY2ToYRow_Any_MMI; YUY2ToUVRow = YUY2ToUVRow_Any_MMI; @@ -772,6 +679,16 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -843,6 +760,16 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + UYVYToYRow = UYVYToYRow_Any_MMI; + UYVYToUVRow = UYVYToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_MMI; + UYVYToUVRow = UYVYToUVRow_MMI; + } + } +#endif #if defined(HAS_UYVYTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { UYVYToYRow = UYVYToYRow_Any_MSA; @@ -853,16 +780,6 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_UYVYTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUVRow = UYVYToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUVRow = UYVYToUVRow_MMI; - } - } -#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -1081,35 +998,27 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; + ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if 
(IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif @@ -1183,35 +1092,25 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif -#if defined(HAS_BGRATOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - BGRAToYRow = BGRAToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - BGRAToYRow = BGRAToYRow_MSA; - } - } -#endif -#if defined(HAS_BGRATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - BGRAToUVRow = BGRAToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_MSA; - } - } -#endif -#if defined(HAS_BGRATOYROW_MMI) +#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { BGRAToYRow = BGRAToYRow_Any_MMI; + BGRAToUVRow = BGRAToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { BGRAToYRow = BGRAToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MMI; + } } #endif -#if defined(HAS_BGRATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BGRAToUVRow = BGRAToUVRow_Any_MMI; +#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + BGRAToUVRow = BGRAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_MMI; + BGRAToYRow = BGRAToYRow_MSA; + BGRAToUVRow = BGRAToUVRow_MSA; } } #endif @@ -1269,6 +1168,16 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; @@ -1285,35 +1194,25 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) +#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ABGRToYRow = ABGRToYRow_Any_MMI; + ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } } #endif -#if defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToUVRow = ABGRToUVRow_Any_MMI; +#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; + ABGRToYRow = ABGRToYRow_MSA; + ABGRToUVRow = ABGRToUVRow_MSA; } } #endif @@ -1387,35 +1286,25 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif -#if defined(HAS_RGBATOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYRow = RGBAToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYRow = RGBAToYRow_MSA; - } - } -#endif -#if defined(HAS_RGBATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToUVRow = RGBAToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_MSA; - } - } -#endif -#if defined(HAS_RGBATOYROW_MMI) +#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGBAToYRow = RGBAToYRow_Any_MMI; + RGBAToUVRow = RGBAToUVRow_Any_MMI; if 
(IS_ALIGNED(width, 8)) { RGBAToYRow = RGBAToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MMI; + } } #endif -#if defined(HAS_RGBATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToUVRow = RGBAToUVRow_Any_MMI; +#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + RGBAToUVRow = RGBAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_MMI; + RGBAToYRow = RGBAToYRow_MSA; + RGBAToUVRow = RGBAToUVRow_MSA; } } #endif @@ -1487,16 +1376,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } } -#elif defined(HAS_RGB24TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToUVRow = RGB24ToUVRow_Any_MSA; - RGB24ToYRow = RGB24ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYRow = RGB24ToYRow_MSA; - RGB24ToUVRow = RGB24ToUVRow_MSA; - } - } -#elif defined(HAS_RGB24TOYROW_MMI) +// MMI and MSA version does direct RGB24 to YUV. +#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA)) +#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToUVRow = RGB24ToUVRow_Any_MMI; RGB24ToYRow = RGB24ToYRow_Any_MMI; @@ -1507,6 +1389,17 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } } +#endif +#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -1598,8 +1491,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, int width, int height) { int y; -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI) void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVJRow_C; @@ -1625,7 +1518,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } // Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYJROW_NEON) +#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; RGB24ToYJRow = RGB24ToYJRow_Any_NEON; @@ -1636,16 +1529,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } } -#elif defined(HAS_RGB24TOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; - RGB24ToYJRow = RGB24ToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_MSA; - RGB24ToUVJRow = RGB24ToUVJRow_MSA; - } - } -#elif defined(HAS_RGB24TOYJROW_MMI) +// MMI and MSA version does direct RGB24 to YUV. +#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA)) +#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; RGB24ToYJRow = RGB24ToYJRow_Any_MMI; @@ -1656,7 +1542,17 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } } -// Other platforms do intermediate conversion from RGB24 to ARGB. 
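A pattern repeated across these hunks: dispatch blocks that install a Y row kernel and a UV row kernel together are now guarded on both HAS_ macros, because a build configuration that defines only one of them would otherwise assign a symbol that was never declared. Schematically (the Foo names are placeholders for any of the paired kernels above, not real libyuv symbols; with the macros undefined the block compiles away, which is the point of the guard):

#include "libyuv/cpu_id.h"  // TestCpuFlag, kCpuHasMMI

void PickFooRowFunctions(int width) {
#if defined(HAS_FOOTOYROW_MMI) && defined(HAS_FOOTOUVROW_MMI)
  // Both pointers are replaced in one block, so both symbols must exist.
  if (libyuv::TestCpuFlag(libyuv::kCpuHasMMI)) {
    FooToYRow = FooToYRow_Any_MMI;
    FooToUVRow = FooToUVRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      FooToYRow = FooToYRow_MMI;
    }
    if (IS_ALIGNED(width, 16)) {
      FooToUVRow = FooToUVRow_MMI;
    }
  }
#endif
  (void)width;
}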
+#endif +#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + RGB24ToUVJRow = RGB24ToUVJRow_MSA; + } + } +#endif #else #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -1689,16 +1585,16 @@ int RGB24ToJ420(const uint8_t* src_rgb24, #endif { -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -1715,8 +1611,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); #else @@ -1725,8 +1621,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, ARGBToYJRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1746,8 +1642,8 @@ int RAWToI420(const uint8_t* src_raw, int width, int height) { int y; -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) +#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \ + defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI) void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVRow_C; void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = @@ -1772,7 +1668,7 @@ int RAWToI420(const uint8_t* src_raw, } // Neon version does direct RAW to YUV. -#if defined(HAS_RAWTOYROW_NEON) +#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToUVRow = RAWToUVRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON; @@ -1783,16 +1679,9 @@ int RAWToI420(const uint8_t* src_raw, } } } -#elif defined(HAS_RAWTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToUVRow = RAWToUVRow_Any_MSA; - RAWToYRow = RAWToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYRow = RAWToYRow_MSA; - RAWToUVRow = RAWToUVRow_MSA; - } - } -#elif defined(HAS_RAWTOYROW_MMI) +// MMI and MSA version does direct RAW to YUV. 
+#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA)) +#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToUVRow = RAWToUVRow_Any_MMI; RAWToYRow = RAWToYRow_Any_MMI; @@ -1803,6 +1692,17 @@ int RAWToI420(const uint8_t* src_raw, } } } +#endif +#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -1931,16 +1831,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } } -#elif defined(HAS_RGB565TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB565ToUVRow = RGB565ToUVRow_Any_MSA; - RGB565ToYRow = RGB565ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB565ToYRow = RGB565ToYRow_MSA; - RGB565ToUVRow = RGB565ToUVRow_MSA; - } - } -#elif defined(HAS_RGB565TOYROW_MMI) +// MMI and MSA version does direct RGB565 to YUV. +#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA)) +#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB565ToUVRow = RGB565ToUVRow_Any_MMI; RGB565ToYRow = RGB565ToYRow_Any_MMI; @@ -1951,6 +1844,17 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } } +#endif +#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } +#endif // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -2086,16 +1990,9 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } } -#elif defined(HAS_ARGB1555TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; - ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToYRow = ARGB1555ToYRow_MSA; - ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; - } - } -#elif defined(HAS_ARGB1555TOYROW_MMI) +// MMI and MSA version does direct ARGB1555 to YUV. +#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA)) +#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; @@ -2106,6 +2003,17 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } } +#endif +#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } +#endif // Other platforms do intermediate conversion from ARGB1555 to ARGB. 
#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -2243,7 +2151,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } } -#elif defined(HAS_ARGB4444TOYROW_MMI) +#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; @@ -2300,19 +2208,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToUVRow = ARGBToUVRow_Any_MMI; ARGBToYRow = ARGBToYRow_Any_MMI; @@ -2324,6 +2220,18 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif #endif { @@ -2378,27 +2286,38 @@ int RGB24ToJ400(const uint8_t* src_rgb24, int width, int height) { int y; -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = RGB24ToYJRow_C; -#else - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; -#endif if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } - -// Neon version does direct RGB24 to YUV. + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_yj = 0; + } +#if defined(HAS_RGB24TOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_AVX2; + } + } +#endif #if defined(HAS_RGB24TOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToYJRow = RGB24ToYJRow_Any_NEON; @@ -2406,83 +2325,102 @@ int RGB24ToJ400(const uint8_t* src_rgb24, RGB24ToYJRow = RGB24ToYJRow_NEON; } } -#elif defined(HAS_RGB24TOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToYJRow = RGB24ToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_MSA; - } - } -#elif defined(HAS_RGB24TOYJROW_MMI) +#endif +#if defined(HAS_RGB24TOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToYJRow = RGB24ToYJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RGB24ToYJRow = RGB24ToYJRow_MMI; } } -// Other platforms do intermediate conversion from RGB24 to ARGB. 
-#else -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; +#endif +#if defined(HAS_RGB24TOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + RGB24ToYJRow = RGB24ToYJRow_MSA; } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) + + for (y = 0; y < height; ++y) { + RGB24ToYJRow(src_rgb24, dst_yj, width); + src_rgb24 += src_stride_rgb24; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert RAW to J400. +LIBYUV_API +int RAWToJ400(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = + RAWToYJRow_C; + if (!src_raw || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_raw = dst_stride_yj = 0; + } +#if defined(HAS_RAWTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + RAWToYJRow = RAWToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; + RAWToYJRow = RAWToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_RAWTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; + RAWToYJRow = RAWToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; + RAWToYJRow = RAWToYJRow_AVX2; } } #endif +#if defined(HAS_RAWTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_NEON; + } + } #endif - - { -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToYJRow(src_rgb24, dst_yj, width); - RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToYJRow(row, dst_yj, width); - ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width); -#endif - src_rgb24 += src_stride_rgb24 * 2; - dst_yj += dst_stride_yj * 2; +#if defined(HAS_RAWTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToYJRow = RAWToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_MMI; } - if (height & 1) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToYJRow(src_rgb24, dst_yj, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToYJRow(row, dst_yj, width); + } #endif +#if defined(HAS_RAWTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; } -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - free_aligned_buffer_64(row); + } #endif + + for (y = 0; y < height; ++y) { + RAWToYJRow(src_raw, dst_yj, width); + src_raw += src_stride_raw; + dst_yj += dst_stride_yj; } return 0; } diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc index 4217b1dc9e9..5e7225faf21 100644 --- a/chromium/third_party/libyuv/source/convert_argb.cc +++ b/chromium/third_party/libyuv/source/convert_argb.cc @@ -47,18 +47,19 @@ int ARGBCopy(const uint8_t* src_argb, return 0; } -// Convert I420 to ARGB with matrix -static int I420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert I420 to ARGB with matrix. 
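RGB24ToJ400 is rewritten from the two-row ARGB staging loop into a single pass over direct RGB24ToYJRow kernels with row coalescing, and a matching RAWToJ400 is added for the R,G,B byte order. A usage sketch for the new grayscale extraction (the helper name and packed-buffer assumption are illustrative):

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares RAWToJ400 after this roll

// Illustrative helper: reduces packed 3-bytes-per-pixel RAW (R,G,B order)
// to a full-range luma plane. Source rows are width * 3 bytes; the J400
// output is one byte per pixel.
int RawFrameToGray(const uint8_t* raw, int width, int height, uint8_t* gray) {
  return libyuv::RAWToJ400(raw, width * 3, gray, width, width, height);
}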
+LIBYUV_API +int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, @@ -97,14 +98,6 @@ static int I420ToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif #if defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; @@ -113,6 +106,14 @@ static int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -270,18 +271,19 @@ int U420ToABGR(const uint8_t* src_y, width, height); } -// Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert I422 to ARGB with matrix. +LIBYUV_API +int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, @@ -327,14 +329,6 @@ static int I422ToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif #if defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; @@ -343,6 +337,14 @@ static int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -498,18 +500,19 @@ int U422ToABGR(const uint8_t* src_y, width, height); } -// Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert I444 to ARGB with matrix. 
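I420ToARGBMatrix is one of several formerly static *Matrix helpers promoted to LIBYUV_API in this roll, letting callers pick the YUV-to-RGB matrix directly instead of going through a fixed wrapper per colorspace. A sketch converting BT.709 video with the exported entry point (equivalent to the existing H420ToARGB wrapper, which uses the same constants):

#include <stdint.h>
#include "libyuv.h"  // umbrella header; kYuvH709Constants, I420ToARGBMatrix

// Illustrative helper: BT.709 limited-range I420 to 4-bytes-per-pixel ARGB.
int H420FrameToARGB(const uint8_t* y, int y_stride,
                    const uint8_t* u, int u_stride,
                    const uint8_t* v, int v_stride,
                    uint8_t* argb, int width, int height) {
  return libyuv::I420ToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
                                  argb, width * 4,
                                  &libyuv::kYuvH709Constants, width, height);
}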
+LIBYUV_API +int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, @@ -555,14 +558,6 @@ static int I444ToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_I444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I444ToARGBRow = I444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_MSA; - } - } -#endif #if defined(HAS_I444TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I444ToARGBRow = I444ToARGBRow_Any_MMI; @@ -571,6 +566,14 @@ static int I444ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -726,20 +729,21 @@ int U444ToABGR(const uint8_t* src_y, width, height); } -// Convert 10 bit YUV to ARGB with matrix +// Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. -static int I010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { +LIBYUV_API +int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, @@ -884,20 +888,21 @@ int U010ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } -// Convert 10 bit YUV to ARGB with matrix +// Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. 
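The 10-bit converters get the same promotion. A sketch through the existing fixed-matrix I010ToAR30 wrapper; it assumes libyuv's convention that strides of uint16 planes are counted in elements, while the packed 2:10:10:10 AR30 destination stride is in bytes:

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares I010ToAR30

// Illustrative helper: 10-bit BT.601 I010 (4:2:0, uint16 samples) to AR30.
int I010FrameToAR30(const uint16_t* y, const uint16_t* u, const uint16_t* v,
                    int width, int height, uint8_t* ar30) {
  const int halfwidth = (width + 1) / 2;
  return libyuv::I010ToAR30(y, width, u, halfwidth, v, halfwidth,
                            ar30, width * 4, width, height);
}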
-static int I210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { +LIBYUV_API +int I210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, @@ -1040,18 +1045,19 @@ int U210ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } -// Convert 10 bit YUV to ARGB with matrix -static int I010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert 10 bit YUV to ARGB with matrix. +LIBYUV_API +int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, @@ -1210,18 +1216,19 @@ int U010ToABGR(const uint16_t* src_y, width, height); } -// Convert 10 bit 422 YUV to ARGB with matrix -static int I210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert 10 bit 422 YUV to ARGB with matrix. +LIBYUV_API +int I210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, @@ -1270,9 +1277,6 @@ static int I210ToARGBMatrix(const uint16_t* src_y, return 0; } - - - // Convert I210 to ARGB. LIBYUV_API int I210ToARGB(const uint16_t* src_y, @@ -1381,21 +1385,22 @@ int U210ToABGR(const uint16_t* src_y, width, height); } -// Convert I420 with Alpha to preattenuated ARGB. -static int I420AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { +// Convert I420 with Alpha to preattenuated ARGB with matrix. 
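I210ToARGBMatrix covers the 10-bit 4:2:2 layout, where chroma is subsampled horizontally but not vertically. A sketch via the fixed I210ToARGB wrapper, under the same stride conventions as above:

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares I210ToARGB

// Illustrative helper: 10-bit 4:2:2 I210 to ARGB. Chroma planes are half
// width (in uint16 elements) but full height.
int I210FrameToARGB(const uint16_t* y, const uint16_t* u, const uint16_t* v,
                    int width, int height, uint8_t* argb) {
  const int halfwidth = (width + 1) / 2;
  return libyuv::I210ToARGB(y, width, u, halfwidth, v, halfwidth,
                            argb, width * 4, width, height);
}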
+LIBYUV_API +int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { int y; void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, @@ -1437,14 +1442,6 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_I422ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; - } - } -#endif #if defined(HAS_I422ALPHATOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; @@ -1453,6 +1450,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -1477,14 +1482,6 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; @@ -1493,6 +1490,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -1554,16 +1559,18 @@ int I420AlphaToABGR(const uint8_t* src_y, width, height, attenuate); } -// Convert I400 to ARGB. +// Convert I400 to ARGB with matrix. 
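I420AlphaToARGBMatrix is exported with its attenuate flag intact: a nonzero value premultiplies the converted RGB by the alpha plane using the ARGBAttenuateRow kernels selected above. A sketch via the fixed-matrix I420AlphaToARGB wrapper:

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares I420AlphaToARGB

// Illustrative helper: I420 plus a full-resolution alpha plane to
// premultiplied ARGB (attenuate = 1 requests the premultiply pass).
int I420AlphaFrameToARGB(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         const uint8_t* a, int width, int height,
                         uint8_t* argb) {
  const int halfwidth = (width + 1) / 2;
  return libyuv::I420AlphaToARGB(y, width, u, halfwidth, v, halfwidth,
                                 a, width, argb, width * 4,
                                 width, height, /*attenuate=*/1);
}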
LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +int I400ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1604,14 +1611,6 @@ int I400ToARGB(const uint8_t* src_y, } } #endif -#if defined(HAS_I400TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I400ToARGBRow = I400ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I400ToARGBRow = I400ToARGBRow_MSA; - } - } -#endif #if defined(HAS_I400TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I400ToARGBRow = I400ToARGBRow_Any_MMI; @@ -1620,15 +1619,35 @@ int I400ToARGB(const uint8_t* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, width); + I400ToARGBRow(src_y, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } return 0; } +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + // Convert J400 to ARGB. LIBYUV_API int J400ToARGB(const uint8_t* src_y, @@ -1679,14 +1698,6 @@ int J400ToARGB(const uint8_t* src_y, } } #endif -#if defined(HAS_J400TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - J400ToARGBRow = J400ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - J400ToARGBRow = J400ToARGBRow_MSA; - } - } -#endif #if defined(HAS_J400TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { J400ToARGBRow = J400ToARGBRow_Any_MMI; @@ -1695,6 +1706,14 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -1817,14 +1836,6 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_MSA; - } - } -#endif #if defined(HAS_RGB24TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; @@ -1833,6 +1844,14 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -1884,14 +1903,6 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToARGBRow = RAWToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_MSA; - } - } -#endif #if defined(HAS_RAWTOARGBROW_MMI) 
if (TestCpuFlag(kCpuHasMMI)) { RAWToARGBRow = RAWToARGBRow_Any_MMI; @@ -1900,6 +1911,14 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -2010,14 +2029,6 @@ int RGB565ToARGB(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_RGB565TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_MSA; - } - } -#endif #if defined(HAS_RGB565TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; @@ -2026,6 +2037,14 @@ int RGB565ToARGB(const uint8_t* src_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -2085,14 +2104,6 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, } } #endif -#if defined(HAS_ARGB1555TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; - } - } -#endif #if defined(HAS_ARGB1555TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; @@ -2101,6 +2112,14 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -2160,14 +2179,6 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGB4444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; - } - } -#endif #if defined(HAS_ARGB4444TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; @@ -2176,6 +2187,14 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -2281,16 +2300,17 @@ int AR30ToAB30(const uint8_t* src_ar30, return 0; } -// Convert NV12 to ARGB with matrix -static int NV12ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV12 to ARGB with matrix. 
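Further up, I400ToARGB was reworked so the row kernels take a YuvConstants pointer, and the old entry point is now a thin wrapper over I400ToARGBMatrix with kYuvI601Constants. One thing the matrix form enables, sketched here as an assumed use rather than anything stated in the patch, is full-range grayscale expansion by passing the JPEG constants instead:

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares I400ToARGBMatrix

// Illustrative helper: expands a luma plane to gray ARGB with full-range
// (JPEG) levels, so Y=0 maps to black and Y=255 to white without the
// 16..235 studio-swing expansion that kYuvI601Constants applies.
int GrayFrameToARGB(const uint8_t* y, int width, int height, uint8_t* argb) {
  return libyuv::I400ToARGBMatrix(y, width, argb, width * 4,
                                  &libyuv::kYuvJPEGConstants, width, height);
}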
+LIBYUV_API +int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV12ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -2328,14 +2348,6 @@ static int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_NV12TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_MSA; - } - } -#endif #if defined(HAS_NV12TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV12ToARGBRow = NV12ToARGBRow_Any_MMI; @@ -2344,6 +2356,14 @@ static int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -2356,16 +2376,17 @@ static int NV12ToARGBMatrix(const uint8_t* src_y, return 0; } -// Convert NV21 to ARGB with matrix -static int NV21ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV21 to ARGB with matrix. +LIBYUV_API +int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV21ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -2403,14 +2424,6 @@ static int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_NV21TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV21ToARGBRow = NV21ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_MSA; - } - } -#endif #if defined(HAS_NV21TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV21ToARGBRow = NV21ToARGBRow_Any_MMI; @@ -2419,6 +2432,14 @@ static int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV21ToARGBRow = NV21ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); @@ -2490,16 +2511,17 @@ int NV21ToABGR(const uint8_t* src_y, } // TODO(fbarchard): Consider SSSE3 2 step conversion. -// Convert NV12 to RGB24 with matrix -static int NV12ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV12 to RGB24 with matrix. 
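NV12ToARGBMatrix and NV21ToARGBMatrix are exported in the same way. A minimal sketch through the fixed BT.601 NV12ToARGB wrapper:

#include <stdint.h>
#include "libyuv.h"  // umbrella header; declares NV12ToARGB

// Illustrative helper: NV12 is a full-resolution Y plane plus one
// half-height plane of interleaved U,V pairs, so the chroma stride in
// bytes is ((width + 1) / 2) * 2 (equal to the Y stride for even widths).
int Nv12FrameToARGB(const uint8_t* y, const uint8_t* uv,
                    int width, int height, uint8_t* argb) {
  const int uv_stride = ((width + 1) / 2) * 2;
  return libyuv::NV12ToARGB(y, width, uv, uv_stride,
                            argb, width * 4, width, height);
}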
+LIBYUV_API +int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV12ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -2557,16 +2579,17 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y, return 0; } -// Convert NV21 to RGB24 with matrix -static int NV21ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV21 to RGB24 with matrix. +LIBYUV_API +int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV21ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -2730,83 +2753,6 @@ int NV21ToYUV24(const uint8_t* src_y, return 0; } -// Convert M420 to ARGB. -LIBYUV_API -int M420ToARGB(const uint8_t* src_m420, - int src_stride_m420, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*NV12ToARGBRow)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; - if (!src_m420 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToARGBRow = NV12ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToARGBRow = NV12ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV12ToARGBRow = NV12ToARGBRow_MMI; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, - &kYuvI601Constants, width); - NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, - dst_argb + dst_stride_argb, &kYuvI601Constants, width); - dst_argb += dst_stride_argb * 2; - src_m420 += src_stride_m420 * 3; - } - if (height & 1) { - NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, - &kYuvI601Constants, width); - } - return 0; -} - // Convert YUY2 to ARGB. 
LIBYUV_API int YUY2ToARGB(const uint8_t* src_yuy2, @@ -2858,14 +2804,6 @@ int YUY2ToARGB(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - YUY2ToARGBRow = YUY2ToARGBRow_MSA; - } - } -#endif #if defined(HAS_YUY2TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI; @@ -2874,6 +2812,14 @@ int YUY2ToARGB(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); src_yuy2 += src_stride_yuy2; @@ -2933,14 +2879,6 @@ int UYVYToARGB(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_UYVYTOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToARGBRow = UYVYToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - UYVYToARGBRow = UYVYToARGBRow_MSA; - } - } -#endif #if defined(HAS_UYVYTOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { UYVYToARGBRow = UYVYToARGBRow_Any_MMI; @@ -2949,6 +2887,14 @@ int UYVYToARGB(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); src_uyvy += src_stride_uyvy; @@ -2971,7 +2917,7 @@ static void WeavePixels(const uint8_t* src_u, } } -// Convert Android420 to ARGB. +// Convert Android420 to ARGB with matrix. LIBYUV_API int Android420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, @@ -3072,6 +3018,1107 @@ int Android420ToABGR(const uint8_t* src_y, height); } +// Convert I422 to RGBA with matrix. +LIBYUV_API +int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToRGBARow = I422ToRGBARow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToRGBARow = I422ToRGBARow_MMI; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert NV12 to RGB565 with matrix. +LIBYUV_API +int NV12ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + NV12ToRGB565Row = NV12ToRGB565Row_MMI; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb565, dst_stride_rgb565, &kYuvI601Constants, + width, height); +} + +// Convert I422 to RGBA with matrix. +LIBYUV_API +int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToRGBARow = I422ToRGBARow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToRGBARow = I422ToRGBARow_MMI; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to RGB24 with matrix. +LIBYUV_API +int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
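// I420 is 4:2:0, so one chroma row covers two luma rows: the row loop
// below advances src_u/src_v only after odd rows (the `if (y & 1)` step),
// whereas the I422 converters advance chroma on every row.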
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToRGB24Row = I422ToRGB24Row_MMI; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to RGB24. +LIBYUV_API +int J420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvJPEGConstants, width, height); +} + +// Convert J420 to RAW. +LIBYUV_API +int J420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to RGB24. 
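// H420 is I420 with BT.709 constants; like the J420 (full-range JPEG)
// wrappers above, it only rebinds I420ToRGB24Matrix to a different
// constants table. Minimal caller sketch; the buffer names, strides and
// umbrella header are illustrative assumptions, not part of this diff:
//
//   #include <stdlib.h>
//   #include "libyuv.h"
//
//   uint8_t* rgb24 = (uint8_t*)malloc((size_t)width * height * 3);
//   if (rgb24 && H420ToRGB24(src_y, y_stride, src_u, uv_stride,
//                            src_v, uv_stride, rgb24, width * 3,
//                            width, height) == 0) {
//     // rgb24 now holds 3 bytes per pixel, BT.709-converted.
//   }
//   free(rgb24);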
+LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToARGB1555Row = I422ToARGB1555Row_MMI; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, + width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB4444. 
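// ARGB1555 packs 1 alpha bit plus 5 bits per color channel into 16 bits;
// ARGB4444 below packs 4 bits per channel. Both converters share the
// shape of I420ToARGB1555 above: fixed BT.601 constants, with the row
// kernel packing the converted pixels straight into the 16-bit format.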
+LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; + dst_stride_argb4444 = -dst_stride_argb4444; + } +#if defined(HAS_I422TOARGB4444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToARGB4444Row = I422ToARGB4444Row_MMI; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, + width); + dst_argb4444 += dst_stride_argb4444; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565 with specified color matrix. +LIBYUV_API +int I420ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
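// I420ToRGB565Matrix is the matrix-parameterized core; the I420/J420/
// H420ToRGB565 wrappers below bind it to the BT.601, JPEG and BT.709
// constants respectively. RGB565 packs 5/6/5 bits of R/G/B and drops
// alpha entirely.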
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToRGB565Row = I422ToRGB565Row_MMI; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); +} + +// Convert J420 to RGB565. +LIBYUV_API +int J420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvJPEGConstants, width, height); +} + +// Convert H420 to RGB565. +LIBYUV_API +int H420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvH709Constants, width, height); +} + +// Convert I422 to RGB565. +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
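// Unlike the I420 paths above, this 4:2:2 variant advances src_u and
// src_v on every row, is hard-wired to kYuvI601Constants (there is no
// Matrix form in this revision), and dispatches only SSSE3/AVX2/NEON/MSA
// kernels, with no MMI branch.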
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, +}; + +// Convert I420 to RGB565 with dithering. +LIBYUV_API +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
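// The dither path expands each source row into a temporary 32-bit ARGB
// row (align_buffer_64 below), then packs it to RGB565 while applying a
// 4x4 ordered-dither table. Each output row folds four table bytes into
// one value: row y reads dither4x4 + ((y & 3) << 2), so y = 5 selects
// bytes 4..7, loaded as one uint32 and passed to ARGBToRGB565DitherRow.
// Passing dither4x4 == NULL selects the built-in kDither565_4x4 table.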
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToARGBRow = I422ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + I422ToARGBRow = I422ToARGBRow_MMI; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + { + // Allocate a row of argb. + align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + +// Convert I420 to AR30 with matrix. +LIBYUV_API +int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
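// AR30 carries 2 bits of alpha and 10 bits per color channel. Note the
// narrower dispatch below: only SSSE3 and AVX2 kernels exist for
// I422ToAR30Row in this revision, so all other architectures fall back
// to the portable _C path.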
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc index 0c95f1f2918..f2cfc1d8f53 100644 --- a/chromium/third_party/libyuv/source/convert_from.cc +++ b/chromium/third_party/libyuv/source/convert_from.cc @@ -294,14 +294,6 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOYUY2ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_MSA; - } - } -#endif #if defined(HAS_I422TOYUY2ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToYUY2Row = I422ToYUY2Row_Any_MMI; @@ -310,6 +302,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -381,14 +381,6 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif #if defined(HAS_I422TOUYVYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToUYVYRow = I422ToUYVYRow_Any_MMI; @@ -397,6 +389,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -456,14 +456,6 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = 
I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif #if defined(HAS_I422TOUYVYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToUYVYRow = I422ToUYVYRow_Any_MMI; @@ -472,6 +464,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -488,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y, return 0; } -// TODO(fbarchard): test negative height for invert. LIBYUV_API int I420ToNV12(const uint8_t* src_y, int src_stride_y, @@ -502,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y, int dst_stride_uv, int width, int height) { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } - int halfwidth = (width + 1) / 2; - int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2; + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } @@ -534,899 +544,6 @@ int I420ToNV21(const uint8_t* src_y, width, height); } -// Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGBARow = I422ToRGBARow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGBARow = I422ToRGBARow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGBA. -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I420 to BGRA. -LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_I422TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToRGB24Row = I422ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB24Row = I422ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB24Row = I422ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_MSA; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGB24Row = I422ToRGB24Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB24. -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, width, height); -} - -// Convert I420 to RAW. -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J420 to RGB24. -LIBYUV_API -int J420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvJPEGConstants, width, height); -} - -// Convert J420 to RAW. -LIBYUV_API -int J420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H420 to RGB24. 
-LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvH709Constants, width, height); -} - -// Convert H420 to RAW. -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to ARGB1555. -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { - int y; - void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; - dst_stride_argb1555 = -dst_stride_argb1555; - } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB1555Row = I422ToARGB1555Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_MSA; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGB1555Row = I422ToARGB1555Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, - width); - dst_argb1555 += dst_stride_argb1555; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to ARGB4444. 
-LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height) { - int y; - void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; - dst_stride_argb4444 = -dst_stride_argb4444; - } -#if defined(HAS_I422TOARGB4444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB4444Row = I422ToARGB4444Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_MSA; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGB4444Row = I422ToARGB4444Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, - width); - dst_argb4444 += dst_stride_argb4444; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565 with specified color matrix. -LIBYUV_API -int I420ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGB565Row = I422ToRGB565Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565. -LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvI601Constants, width, height); -} - -// Convert J420 to RGB565. -LIBYUV_API -int J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvJPEGConstants, width, height); -} - -// Convert H420 to RGB565. -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvH709Constants, width, height); -} - -// Convert I422 to RGB565. -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8_t kDither565_4x4[16] = { - 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, -}; - -// Convert I420 to RGB565 with dithering. -LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = - ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } - if (!dither4x4) { - dither4x4 = kDither565_4x4; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; - } - } -#endif - { - // Allocate a row of argb. - align_buffer_64(row_argb, width * 4); - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); - ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), - width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - free_aligned_buffer_64(row_argb); - } - return 0; -} - -// Convert I420 to AR30 with matrix -static int I420ToAR30Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToAR30Row_C; - - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } - -#if defined(HAS_I422TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToAR30Row = I422ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToAR30Row = I422ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToAR30Row = I422ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToAR30Row = I422ToAR30Row_AVX2; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYvuH709Constants, width, height); -} - // Convert I420 to specified format LIBYUV_API int ConvertFromI420(const uint8_t* y, @@ -1528,7 +645,6 @@ int ConvertFromI420(const uint8_t* y, height); break; } - // TODO(fbarchard): Add M420. 
// Triplanar formats case FOURCC_I420: case FOURCC_YV12: { diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc index de301ebbc82..4ba4bb5e0f5 100644 --- a/chromium/third_party/libyuv/source/convert_from_argb.cc +++ b/chromium/third_party/libyuv/source/convert_from_argb.cc @@ -68,14 +68,6 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOUV444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUV444Row = ARGBToUV444Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_MSA; - } - } -#endif #if defined(HAS_ARGBTOUV444ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToUV444Row = ARGBToUV444Row_Any_MMI; @@ -84,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -108,14 +108,6 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif #if defined(HAS_ARGBTOYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; @@ -124,6 +116,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -207,36 +207,28 @@ int ARGBToI422(const uint8_t* src_argb, } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif - -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; + ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; + +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif @@ -315,35 +307,27 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) 
{ ARGBToYRow = ARGBToYRow_Any_MMI; + ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif @@ -371,14 +355,6 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow_ = MergeUVRow_Any_MMI; @@ -387,6 +363,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -475,39 +459,30 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; + ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif - #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -532,14 +507,6 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow_ = MergeUVRow_Any_MMI; @@ -548,6 +515,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif { // Allocate a rows of uv. 
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -635,35 +610,27 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) +#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ABGRToYRow = ABGRToYRow_Any_MMI; + ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } } #endif -#if defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToUVRow = ABGRToUVRow_Any_MMI; +#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; + ABGRToYRow = ABGRToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; } } #endif @@ -691,14 +658,6 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow_ = MergeUVRow_Any_MMI; @@ -707,6 +666,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif { // Allocate a rows of uv. 
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -795,39 +762,30 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) +#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ABGRToYRow = ABGRToYRow_Any_MMI; + ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } } #endif -#if defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToUVRow = ABGRToUVRow_Any_MMI; +#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; + ABGRToYRow = ABGRToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; } } #endif - #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -852,14 +810,6 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow_ = MergeUVRow_Any_MMI; @@ -868,6 +818,14 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif { // Allocate a rows of uv. 
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -961,35 +919,27 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; + ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif @@ -1017,14 +967,6 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif -#if defined(HAS_I422TOYUY2ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_MSA; - } - } -#endif #if defined(HAS_I422TOYUY2ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToYUY2Row = I422ToYUY2Row_Any_MMI; @@ -1033,6 +975,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif { // Allocate a rows of yuv. 
@@ -1122,35 +1072,27 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) +#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; + ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif @@ -1178,14 +1120,6 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif #if defined(HAS_I422TOUYVYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToUYVYRow = I422ToUYVYRow_Any_MMI; @@ -1194,6 +1128,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif { // Allocate a rows of yuv. 
@@ -1263,14 +1205,6 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif #if defined(HAS_ARGBTOYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; @@ -1279,6 +1213,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -1361,14 +1303,6 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_MSA; - } - } -#endif #if defined(HAS_ARGBTORGB24ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI; @@ -1377,6 +1311,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -1435,14 +1377,6 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORAWROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRAWRow = ARGBToRAWRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_MSA; - } - } -#endif #if defined(HAS_ARGBTORAWROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToRAWRow = ARGBToRAWRow_Any_MMI; @@ -1451,6 +1385,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1513,14 +1455,6 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif #if defined(HAS_ARGBTORGB565DITHERROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; @@ -1529,6 +1463,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, @@ -1590,14 +1532,6 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_MSA; - } - } -#endif #if defined(HAS_ARGBTORGB565ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI; @@ -1606,6 +1540,14 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + 
ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -1664,14 +1606,6 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; - } - } -#endif #if defined(HAS_ARGBTOARGB1555ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI; @@ -1680,6 +1614,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1738,14 +1680,6 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; - } - } -#endif #if defined(HAS_ARGBTOARGB4444ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI; @@ -1754,6 +1688,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1922,38 +1864,30 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) +#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYJRow = ARGBToYJRow_Any_MMI; + ARGBToUVJRow = ARGBToUVJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_MMI; + } } #endif -#if defined(HAS_ARGBTOUVJROW_MSA) +#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } if (IS_ALIGNED(width, 32)) { ARGBToUVJRow = ARGBToUVJRow_MSA; } } #endif -#if defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; - } - } -#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -2039,38 +1973,30 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) +#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYJRow = ARGBToYJRow_Any_MMI; + ARGBToUVJRow = ARGBToUVJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_MMI; } + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_MMI; + } } #endif -#if 
defined(HAS_ARGBTOUVJROW_MSA) +#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } if (IS_ALIGNED(width, 32)) { ARGBToUVJRow = ARGBToUVJRow_MSA; } } #endif -#if defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -2132,14 +2058,6 @@ int ARGBToJ400(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif #if defined(HAS_ARGBTOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYJRow = ARGBToYJRow_Any_MMI; @@ -2148,6 +2066,14 @@ int ARGBToJ400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); @@ -2206,14 +2132,6 @@ int RGBAToJ400(const uint8_t* src_rgba, } } #endif -#if defined(HAS_RGBATOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYJRow = RGBAToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_MSA; - } - } -#endif #if defined(HAS_RGBATOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGBAToYJRow = RGBAToYJRow_Any_MMI; @@ -2222,6 +2140,14 @@ int RGBAToJ400(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYJRow = RGBAToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGBAToYJRow(src_rgba, dst_yj, width); diff --git a/chromium/third_party/libyuv/source/convert_jpeg.cc b/chromium/third_party/libyuv/source/convert_jpeg.cc index f440c7c2e97..d7556ee91ba 100644 --- a/chromium/third_party/libyuv/source/convert_jpeg.cc +++ b/chromium/third_party/libyuv/source/convert_jpeg.cc @@ -328,6 +328,140 @@ int MJPGToNV21(const uint8_t* src_mjpg, return ret ? 0 : 1; } +static void JpegI420ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI422ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI444ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. 
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI400ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 since there is no UV plane. + I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, + dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +// MJPG (Motion JPEG) to NV12. +LIBYUV_API +int MJPGToNV12(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + // Use NV21Buffers but with UV instead of VU. + NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv, + dst_stride_uv, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width, + dst_height); + } else { + // Unknown colorspace. + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + struct ARGBBuffers { uint8_t* argb; int argb_stride; diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc index c08f61013bd..84df16c8c26 100644 --- a/chromium/third_party/libyuv/source/convert_to_argb.cc +++ b/chromium/third_party/libyuv/source/convert_to_argb.cc @@ -180,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample, r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; - case FOURCC_M420: - src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc index 584be0ac33a..ac6eeab24ef 100644 --- a/chromium/third_party/libyuv/source/convert_to_i420.cc +++ b/chromium/third_party/libyuv/source/convert_to_i420.cc @@ -179,11 +179,6 @@ int ConvertToI420(const uint8_t* sample, dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, crop_width, inv_crop_height, rotation); break; - case FOURCC_M420: - src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, crop_width, inv_crop_height); - break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc index 48e2b61526d..fe89452b772 100644 --- a/chromium/third_party/libyuv/source/cpu_id.cc +++ b/chromium/third_party/libyuv/source/cpu_id.cc @@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) { asm volatile( #if defined(__i386__) && defined(__PIC__) // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" + "mov %%ebx, %%edi \n" "cpuid \n" - "xchg %%edi, %%ebx \n" + "xchg %%edi, %%ebx \n" : "=D"(info_ebx), #else "cpuid \n" @@ -163,44 +163,38 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { } // TODO(fbarchard): Consider read_msa_ir(). -// TODO(fbarchard): Add unittest. -LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, - const char ase[]) { +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; + int flag = 0x0; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { - // ase enabled if /proc/cpuinfo is unavailable. - if (strcmp(ase, " msa") == 0) { - return kCpuHasMSA; - } - if (strcmp(ase, " mmi") == 0) { - return kCpuHasMMI; - } + // Assume nothing if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. return 0; } while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + // Workaround early kernel without mmi in ASEs line. 
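The MJPGToNV12 entry point added to convert_jpeg.cc above dispatches on the decoded JPEG's sampling factors (4:2:0, 4:2:2, 4:4:4, grayscale) and reuses the NV21 callbacks with the U and V plane arguments exchanged, which turns the VU interleave into the UV order NV12 wants. A hedged usage sketch follows; the libyuv/convert.h header and namespace placement are assumptions based on where the sibling MJPGToNV21 lives, and packed strides (stride == width) are assumed for brevity:

#include "libyuv/convert.h"  // assumed declaration site, next to MJPGToNV21

// Decode one MJPEG frame straight into a tightly packed NV12 buffer.
bool MjpegFrameToNV12(const uint8_t* mjpg, size_t mjpg_size,
                      int width, int height,
                      uint8_t* dst_y, uint8_t* dst_uv) {
  // Returns 0 on success; nonzero when the frame fails to load, has
  // unexpected dimensions, or uses an unsupported colorspace (see above).
  return libyuv::MJPGToNV12(mjpg, mjpg_size,
                            dst_y, width,   // Y plane, stride == width
                            dst_uv, width,  // interleaved UV, stride == width
                            width, height,  // expected source dimensions
                            width, height) == 0;  // destination dimensions
}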
+      if (strstr(cpuinfo_line, "Loongson-3")) {
+        flag |= kCpuHasMMI;
+      } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+        flag |= kCpuHasMMI | kCpuHasMSA;
+      }
+    }
     if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
-      char* p = strstr(cpuinfo_line, ase);
-      if (p) {
-        fclose(f);
-        if (strcmp(ase, " msa") == 0) {
-          return kCpuHasMSA;
-        }
-        return 0;
+      if (strstr(cpuinfo_line, "loongson-mmi") &&
+          strstr(cpuinfo_line, "loongson-ext")) {
+        flag |= kCpuHasMMI;
       }
-    } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
-      char* p = strstr(cpuinfo_line, "Loongson-3");
-      if (p) {
-        fclose(f);
-        if (strcmp(ase, " mmi") == 0) {
-          return kCpuHasMMI;
-        }
-        return 0;
+      if (strstr(cpuinfo_line, "msa")) {
+        flag |= kCpuHasMSA;
       }
+      // ASEs is the last line, so we can break here.
+      break;
     }
   }
   fclose(f);
-  return 0;
+  return flag;
 }
 
 static SAFEBUFFERS int GetCpuFlags(void) {
@@ -242,11 +236,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
   }
 #endif
 #if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
-  cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#elif defined(_MIPS_ARCH_LOONGSON3A)
-  cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi");
-#endif
+  cpu_info = MipsCpuCaps("/proc/cpuinfo");
   cpu_info |= kCpuHasMIPS;
 #endif
 #if defined(__arm__) || defined(__aarch64__)
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
index 1aa151b6258..4e8908c2eba 100644
--- a/chromium/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -349,6 +349,39 @@ int I420ToI400(const uint8_t* src_y,
   return 0;
 }
 
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+             int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_uv, int dst_stride_uv, int width, int height) {
+  if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+            halfheight);
+  return 0;
+}
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+             int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_vu, int dst_stride_vu, int width, int height) {
+  return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+                  dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
 // Support function for NV12 etc UV channels.
 // Width and height are plane sizes (typically half pixel width).
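NV12Copy above adopts the library-wide negative-height convention: a negative height rebases both source plane pointers to their last row and negates the strides, so the copy comes out vertically flipped. Note the UV plane is copied as halfwidth * 2 bytes per row (two bytes per chroma sample) over halfheight rows. A minimal sketch, assuming the matching declaration is exported from libyuv/planar_functions.h:

#include "libyuv/planar_functions.h"  // assumed declaration site

// Vertically flip a packed NV12 image while copying it.
bool FlipNV12(const uint8_t* src_y, const uint8_t* src_uv,
              uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  return libyuv::NV12Copy(src_y, width, src_uv, width,
                          dst_y, width, dst_uv, width,
                          width, -height) == 0;  // negative height => invert
}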
LIBYUV_API @@ -402,14 +435,6 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif #if defined(HAS_SPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitUVRow = SplitUVRow_Any_MMI; @@ -418,6 +443,14 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { // Copy a row of UV. @@ -477,14 +510,6 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow = MergeUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_MSA; - } - } -#endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow = MergeUVRow_Any_MMI; @@ -493,6 +518,14 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -579,6 +612,15 @@ int NV21ToNV12(const uint8_t* src_y, if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_vu = src_vu + (halfheight - 1) * src_stride_vu; + src_stride_vu = -src_stride_vu; + } + SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; @@ -625,14 +667,6 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitRGBRow = SplitRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_NEON; - } - } -#endif #if defined(HAS_SPLITRGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitRGBRow = SplitRGBRow_Any_MMI; @@ -641,6 +675,14 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -716,70 +758,6 @@ void MergeRGBPlane(const uint8_t* src_r, } } -// Mirror a plane of data. -void MirrorPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; - // Negative height means invert the image. 
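The NV21ToNV12 hunk above fixes flipped conversions: CopyPlane handles a negative height itself for the Y plane, but SwapUVPlane does not, so the new block rebases src_vu to the last chroma row and negates its stride before the swap; previously a flip request inverted luma only. A sketch of the now-correct call, with the planar_functions.h declaration and packed strides assumed:

#include "libyuv/planar_functions.h"  // assumed declaration site

// Flip an NV21 frame vertically while converting it to NV12 (VU -> UV).
bool FlipNV21ToNV12(const uint8_t* src_y, const uint8_t* src_vu,
                    uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  return libyuv::NV21ToNV12(src_y, width, src_vu, width,
                            dst_y, width, dst_uv, width,
                            width, -height) == 0;  // both planes now flip
}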
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } -#if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; - } - } -#endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - MirrorRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - // Convert YUY2 to I422. LIBYUV_API int YUY2ToI422(const uint8_t* src_yuy2, @@ -844,17 +822,7 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - YUY2ToUV422Row = YUY2ToUV422Row_MSA; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) +#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { YUY2ToYRow = YUY2ToYRow_Any_MMI; YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; @@ -864,6 +832,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -940,17 +918,7 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_UYVYTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToYRow = UYVYToYRow_Any_MSA; - UYVYToUV422Row = UYVYToUV422Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - UYVYToYRow = UYVYToYRow_MSA; - UYVYToUV422Row = UYVYToUV422Row_MSA; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MMI) +#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { UYVYToYRow = UYVYToYRow_Any_MMI; UYVYToUV422Row = UYVYToUV422Row_Any_MMI; @@ -960,6 +928,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -1022,6 +1000,14 @@ int YUY2ToY(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + } + } +#endif #if defined(HAS_YUY2TOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { YUY2ToYRow = YUY2ToYRow_Any_MSA; @@ 
-1030,21 +1016,137 @@ int YUY2ToY(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MMI) + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + +// Mirror a plane of data. +// See Also I400Mirror +LIBYUV_API +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; + MirrorRow = MirrorRow_Any_MMI; if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; + MirrorRow = MirrorRow_MMI; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; } } #endif + // Mirror plane for (y = 0; y < height; ++y) { - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; dst_y += dst_stride_y; } - return 0; +} + +// Mirror a plane of UV data. +LIBYUV_API +void MirrorUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) = + MirrorUVRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorUVRow = MirrorUVRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorUVRow = MirrorUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorUVRow = MirrorUVRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorUVRow = MirrorUVRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_MSA; + } + } +#endif + + // MirrorUV plane + for (y = 0; y < height; ++y) { + MirrorUVRow(src_uv, dst_uv, width); + src_uv += src_stride_uv; + dst_uv += dst_stride_uv; + } } // Mirror I400 with optional flipping @@ -1087,7 +1189,7 @@ int I420Mirror(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -1111,6 +1213,41 @@ int I420Mirror(const uint8_t* src_y, return 0; } +// NV12 mirror. +LIBYUV_API +int NV12Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth, + halfheight); + return 0; +} + // ARGB mirror. LIBYUV_API int ARGBMirror(const uint8_t* src_argb, @@ -1134,7 +1271,7 @@ int ARGBMirror(const uint8_t* src_argb, #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } @@ -1155,14 +1292,6 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBMirrorRow = ARGBMirrorRow_MSA; - } - } -#endif #if defined(HAS_ARGBMIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBMirrorRow = ARGBMirrorRow_Any_MMI; @@ -1171,6 +1300,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -1181,6 +1318,52 @@ int ARGBMirror(const uint8_t* src_argb, return 0; } +// RGB24 mirror. 
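Two related loosenings sit in the hunks above: I420Mirror no longer rejects a null dst_y, and the new NV12Mirror only requires dst_uv, guarding the luma mirror with "if (dst_y)", so callers can mirror chroma alone. (The ARGBMirror NEON hunk also tightens its full-SIMD path from width % 4 to width % 8.) The RGB24 mirror introduced next follows the same shape. A usage sketch for NV12Mirror, with the declaration site and packed strides assumed:

#include "libyuv/planar_functions.h"  // assumed declaration site

// Horizontally mirror a packed NV12 frame. Pass nullptr for dst_y to
// mirror only the interleaved UV plane.
bool MirrorNV12(const uint8_t* src_y, const uint8_t* src_uv,
                uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  return libyuv::NV12Mirror(src_y, width, src_uv, width,
                            dst_y, width, dst_uv, width,
                            width, height) == 0;
}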
+LIBYUV_API +int RGB24Mirror(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = + RGB24MirrorRow_C; + if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } +#if defined(HAS_RGB24MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24MirrorRow = RGB24MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24MirrorRow = RGB24MirrorRow_NEON; + } + } +#endif +#if defined(HAS_RGB24MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24MirrorRow = RGB24MirrorRow_SSSE3; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + RGB24MirrorRow(src_rgb24, dst_rgb24, width); + src_rgb24 += src_stride_rgb24; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + // Get a blender that optimized for the CPU and pixel count. // As there are 6 blenders to choose from, the caller should try to use // the same blend function for all pixels if possible. @@ -1199,16 +1382,16 @@ ARGBBlendRow GetARGBBlend() { ARGBBlendRow = ARGBBlendRow_NEON; } #endif -#if defined(HAS_ARGBBLENDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBBlendRow = ARGBBlendRow_MSA; - } -#endif #if defined(HAS_ARGBBLENDROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBBlendRow = ARGBBlendRow_MMI; } #endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif return ARGBBlendRow; } @@ -1517,14 +1700,6 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif -#if defined(HAS_ARGBMULTIPLYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; - if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_MSA; - } - } -#endif #if defined(HAS_ARGBMULTIPLYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; @@ -1533,6 +1708,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -1602,14 +1785,6 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif -#if defined(HAS_ARGBADDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAddRow = ARGBAddRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_MSA; - } - } -#endif #if defined(HAS_ARGBADDROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAddRow = ARGBAddRow_Any_MMI; @@ -1618,6 +1793,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -1682,14 +1865,6 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif -#if defined(HAS_ARGBSUBTRACTROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBSubtractRow = ARGBSubtractRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_MSA; - } - } -#endif #if defined(HAS_ARGBSUBTRACTROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBSubtractRow = ARGBSubtractRow_Any_MMI; @@ -1698,200 +1873,21 @@ int 
ARGBSubtract(const uint8_t* src_argb0, } } #endif - - // Subtract plane - for (y = 0; y < height; ++y) { - ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} -// Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGBARow = I422ToRGBARow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGBARow = I422ToRGBARow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert NV12 to RGB565. 
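The block removed above (I422ToRGBAMatrix and its I422ToRGBA/I422ToBGRA wrappers, with the NV12ToRGB565 removal continuing below) illustrates a standard libyuv trick worth noting: BGRA output reuses the RGBA converter by exchanging the U and V plane arguments and selecting the mirrored YVU coefficient table. The excerpt does not show where these functions land after their removal from this file. The trick, restated compactly from the deleted code:

// Shape of the removed I422ToBGRA: BGRA via the RGBA path.
int I422ToBGRASketch(const uint8_t* src_y, int src_stride_y,
                     const uint8_t* src_u, int src_stride_u,
                     const uint8_t* src_v, int src_stride_v,
                     uint8_t* dst_bgra, int dst_stride_bgra,
                     int width, int height) {
  return I422ToRGBAMatrix(src_y, src_stride_y,
                          src_v, src_stride_v,  // swap U and V ...
                          src_u, src_stride_u,
                          dst_bgra, dst_stride_bgra,
                          &kYvuI601Constants,   // ... and use the YVU matrix
                          width, height);
}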
-LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*NV12ToRGB565Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_NV12TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToRGB565Row = NV12ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MSA) +#if defined(HAS_ARGBSUBTRACTROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_MSA; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV12ToRGB565Row = NV12ToRGB565Row_MMI; + ARGBSubtractRow = ARGBSubtractRow_MSA; } } #endif + // Subtract plane for (y = 0; y < height; ++y) { - NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; } return 0; } @@ -1938,14 +1934,6 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToRGB24Row = RAWToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToRGB24Row = RAWToRGB24Row_MSA; - } - } -#endif #if defined(HAS_RAWTORGB24ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToRGB24Row = RAWToRGB24Row_Any_MMI; @@ -1954,6 +1942,14 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -2089,14 +2085,6 @@ int ARGBRect(uint8_t* dst_argb, ARGBSetRow = ARGBSetRow_X86; } #endif -#if defined(HAS_ARGBSETROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBSetRow = ARGBSetRow_Any_MSA; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_MSA; - } - } -#endif #if defined(HAS_ARGBSETROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBSetRow = ARGBSetRow_Any_MMI; @@ -2105,6 +2093,14 @@ int ARGBRect(uint8_t* dst_argb, } } #endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif // Set plane for (y = 0; y < 
height; ++y) { @@ -2175,14 +2171,6 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; @@ -2191,6 +2179,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -2286,16 +2282,16 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif -#if defined(HAS_ARGBGRAYROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_MSA; - } -#endif #if defined(HAS_ARGBGRAYROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBGrayRow = ARGBGrayRow_MMI; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -2336,16 +2332,16 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif -#if defined(HAS_ARGBGRAYROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_MSA; - } -#endif #if defined(HAS_ARGBGRAYROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBGrayRow = ARGBGrayRow_MMI; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); @@ -2384,16 +2380,16 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_NEON; } #endif -#if defined(HAS_ARGBSEPIAROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_MSA; - } -#endif #if defined(HAS_ARGBSEPIAROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBSepiaRow = ARGBSepiaRow_MMI; } #endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); @@ -2440,16 +2436,16 @@ int ARGBColorMatrix(const uint8_t* src_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } #endif -#if defined(HAS_ARGBCOLORMATRIXROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; - } -#endif #if defined(HAS_ARGBCOLORMATRIXROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; } #endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); src_argb += src_stride_argb; @@ -2814,16 +2810,16 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_NEON; } #endif -#if defined(HAS_ARGBSHADEROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_MSA; - } -#endif #if defined(HAS_ARGBSHADEROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBShadeRow = ARGBShadeRow_MMI; } 
#endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -2887,14 +2883,6 @@ int InterpolatePlane(const uint8_t* src0, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -2903,6 +2891,14 @@ int InterpolatePlane(const uint8_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -3018,14 +3014,6 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif -#if defined(HAS_ARGBSHUFFLEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBShuffleRow = ARGBShuffleRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_MSA; - } - } -#endif #if defined(HAS_ARGBSHUFFLEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBShuffleRow = ARGBShuffleRow_Any_MMI; @@ -3034,6 +3022,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -3043,6 +3039,80 @@ int ARGBShuffle(const uint8_t* src_bgra, return 0; } +// Gauss blur a float plane using Gaussian 5x5 filter with +// coefficients of 1, 4, 6, 4, 1. +// Each destination pixel is a blur of the 5x5 +// pixels from the source. +// Source edges are clamped. +// Edge is 2 pixels on each side, and interior is multiple of 4. +LIBYUV_API +int GaussPlane_F32(const float* src, + int src_stride, + float* dst, + int dst_stride, + int width, + int height) { + int y; + void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, + const float* src3, const float* src4, float* dst, + int width) = GaussCol_F32_C; + void (*GaussRow_F32)(const float* src, float* dst, int width) = + GaussRow_F32_C; + if (!src || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + +#if defined(HAS_GAUSSCOL_F32_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + GaussCol_F32 = GaussCol_F32_NEON; + } +#endif +#if defined(HAS_GAUSSROW_F32_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + GaussRow_F32 = GaussRow_F32_NEON; + } +#endif + { + // 2 pixels on each side, but aligned out to 16 bytes. + align_buffer_64(rowbuf, (4 + width + 4) * 4); + memset(rowbuf, 0, 16); + memset(rowbuf + (4 + width) * 4, 0, 16); + float* row = (float*)(rowbuf + 16); + const float* src0 = src; + const float* src1 = src; + const float* src2 = src; + const float* src3 = src2 + ((height > 1) ? src_stride : 0); + const float* src4 = src3 + ((height > 2) ? 
src_stride : 0); + + for (y = 0; y < height; ++y) { + GaussCol_F32(src0, src1, src2, src3, src4, row, width); + + // Extrude edge by 2 floats + row[-2] = row[-1] = row[0]; + row[width + 1] = row[width] = row[width - 1]; + + GaussRow_F32(row - 2, dst, width); + + src0 = src1; + src1 = src2; + src2 = src3; + src3 = src4; + if ((y + 2) < (height - 1)) { + src4 += src_stride; + } + dst += dst_stride; + } + free_aligned_buffer_64(rowbuf); + } + return 0; +} + // Sobel ARGB effect. static int ARGBSobelize(const uint8_t* src_argb, int src_stride_argb, @@ -3097,14 +3167,6 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif #if defined(HAS_ARGBTOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYJRow = ARGBToYJRow_Any_MMI; @@ -3113,6 +3175,14 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -3124,16 +3194,16 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelYRow = SobelYRow_NEON; } #endif -#if defined(HAS_SOBELYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelYRow = SobelYRow_MSA; - } -#endif #if defined(HAS_SOBELYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelYRow = SobelYRow_MMI; } #endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -3144,16 +3214,16 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelXRow = SobelXRow_NEON; } #endif -#if defined(HAS_SOBELXROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelXRow = SobelXRow_MSA; - } -#endif #if defined(HAS_SOBELXROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelXRow = SobelXRow_MMI; } #endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } +#endif { // 3 rows with edges before/after. 
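GaussPlane_F32, added above, separates the 5x5 Gaussian into a vertical pass (GaussCol_F32 over a five-row window) and a horizontal pass (GaussRow_F32 over a row extruded by two pixels at each edge). The 2-D kernel is the outer product of the binomial row [1 4 6 4 1] with itself, so the weights sum to 16 * 16 = 256. Where the 1/256 normalization lands inside the SIMD kernels is not visible in this hunk, so the direct reference below, for one output pixel with clamped edges, applies it once at the end:

// Direct (non-separated) reference for one blurred pixel, clamped edges.
static float GaussAt(const float* src, int stride, int w, int h, int x, int y) {
  static const float k[5] = {1.f, 4.f, 6.f, 4.f, 1.f};  // binomial row
  float sum = 0.f;
  for (int j = -2; j <= 2; ++j) {
    int yy = (y + j < 0) ? 0 : ((y + j >= h) ? h - 1 : y + j);    // clamp row
    for (int i = -2; i <= 2; ++i) {
      int xx = (x + i < 0) ? 0 : ((x + i >= w) ? w - 1 : x + i);  // clamp col
      sum += k[j + 2] * k[i + 2] * src[yy * stride + xx];
    }
  }
  return sum * (1.0f / 256.0f);  // total kernel weight is 256
}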
const int kRowSize = (width + kEdge + 31) & ~31; @@ -3228,14 +3298,6 @@ int ARGBSobel(const uint8_t* src_argb, } } #endif -#if defined(HAS_SOBELROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelRow = SobelRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_MSA; - } - } -#endif #if defined(HAS_SOBELROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelRow = SobelRow_Any_MMI; @@ -3244,6 +3306,14 @@ int ARGBSobel(const uint8_t* src_argb, } } #endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); } @@ -3274,14 +3344,6 @@ int ARGBSobelToPlane(const uint8_t* src_argb, } } #endif -#if defined(HAS_SOBELTOPLANEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelToPlaneRow = SobelToPlaneRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SobelToPlaneRow = SobelToPlaneRow_MSA; - } - } -#endif #if defined(HAS_SOBELTOPLANEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelToPlaneRow = SobelToPlaneRow_Any_MMI; @@ -3290,6 +3352,14 @@ int ARGBSobelToPlane(const uint8_t* src_argb, } } #endif +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, height, SobelToPlaneRow); } @@ -3321,14 +3391,6 @@ int ARGBSobelXY(const uint8_t* src_argb, } } #endif -#if defined(HAS_SOBELXYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelXYRow = SobelXYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_MSA; - } - } -#endif #if defined(HAS_SOBELXYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelXYRow = SobelXYRow_Any_MMI; @@ -3337,6 +3399,14 @@ int ARGBSobelXY(const uint8_t* src_argb, } } #endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); } @@ -3634,18 +3704,18 @@ int ARGBExtractAlpha(const uint8_t* src_argb, : ARGBExtractAlphaRow_Any_NEON; } #endif -#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA - : ARGBExtractAlphaRow_Any_MSA; - } -#endif #if defined(HAS_ARGBEXTRACTALPHAROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI : ARGBExtractAlphaRow_Any_MMI; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? 
ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); @@ -3766,14 +3836,6 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif #if defined(HAS_SPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitUVRow = SplitUVRow_Any_MMI; @@ -3782,6 +3844,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -3806,14 +3876,6 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -3822,6 +3884,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; @@ -3898,14 +3968,6 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif #if defined(HAS_SPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitUVRow = SplitUVRow_Any_MMI; @@ -3914,6 +3976,14 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -3938,14 +4008,6 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -3954,6 +4016,14 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; @@ -3981,6 +4051,56 @@ int UYVYToNV12(const uint8_t* src_uyvy, return 0; } +// width and height are src size allowing odd size handling. +LIBYUV_API +void HalfMergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u, + const uint8_t* src_v, int src_stride_v, + uint8_t* dst_uv, int width) = HalfMergeUVRow_C; + + // Negative height means invert the image. 
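HalfMergeUVPlane reduces full-resolution planar U and V to half-resolution interleaved UV, one output byte pair per 2x2 source block. A plausible scalar sketch of the row helper initialized above (assumed behavior; the actual C fallback is HalfMergeUVRow_C in row_common.cc):

    static void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
                                      const uint8_t* src_v, int src_stride_v,
                                      uint8_t* dst_uv, int width) {
      int x;
      for (x = 0; x < width - 1; x += 2) {
        // Rounded average of a 2x2 block of U, then of V, stored as UV.
        dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
                     src_u[src_stride_u + 1] + 2) >> 2;
        dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
                     src_v[src_stride_v + 1] + 2) >> 2;
        src_u += 2;
        src_v += 2;
        dst_uv += 2;
      }
      if (width & 1) {  // Odd width: average the last column vertically only.
        dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
        dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
      }
    }

Per the comment just above, a negative height inverts the image, so a caller can half-merge and vertically flip in one step (hypothetical buffers):

    HalfMergeUVPlane(src_u, u_stride, src_v, v_stride,
                     dst_uv, uv_stride, width, -height);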
+ if (height < 0) { + height = -height; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_HALFMERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + HalfMergeUVRow = HalfMergeUVRow_NEON; + } +#endif +#if defined(HAS_HALFMERGEUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + HalfMergeUVRow = HalfMergeUVRow_SSSE3; + } +#endif +#if defined(HAS_HALFMERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + HalfMergeUVRow = HalfMergeUVRow_AVX2; + } +#endif + for (y = 0; y < height - 1; y += 2) { + // Merge a row of U and V into a row of UV. + HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); + src_u += src_stride_u * 2; + src_v += src_stride_v * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width); + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc index d414186a5a0..32904e47312 100644 --- a/chromium/third_party/libyuv/source/rotate.cc +++ b/chromium/third_party/libyuv/source/rotate.cc @@ -36,6 +36,15 @@ void TransposePlane(const uint8_t* src, void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx8_C; #endif + +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; + } + } +#else #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; @@ -62,14 +71,7 @@ void TransposePlane(const uint8_t* src, } } #endif -#if defined(HAS_TRANSPOSEWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeWx16 = TransposeWx16_Any_MSA; - if (IS_ALIGNED(width, 16)) { - TransposeWx16 = TransposeWx16_MSA; - } - } -#endif +#endif /* defined(HAS_TRANSPOSEWX16_MSA) */ #if defined(HAS_TRANSPOSEWX16_MSA) // Work across the source in 16x16 tiles @@ -142,7 +144,7 @@ void RotatePlane180(const uint8_t* src, #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_NEON; } } @@ -163,14 +165,6 @@ void RotatePlane180(const uint8_t* src, } } #endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; - } - } -#endif #if defined(HAS_MIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MirrorRow = MirrorRow_Any_MMI; @@ -179,6 +173,14 @@ void RotatePlane180(const uint8_t* src, } } #endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -207,11 +209,11 @@ void RotatePlane180(const uint8_t* src, // Odd height will harmlessly mirror the middle row twice. 
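  // Reading guide for the rewritten loop below: each iteration
  //   1) copies src row y into the temporary 'row'   (CopyRow)
  //   2) mirrors src row (h-1-y) into dst row y      (MirrorRow)
  //   3) mirrors 'row' into dst row (h-1-y)          (MirrorRow)
  // so each pair of rows is exchanged and mirrored; buffering the top
  // row first appears intended to keep the swap safe when dst aliases
  // src (in-place rotation).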
for (y = 0; y < half_height; ++y) { - MirrorRow(src, row, width); // Mirror first row into a buffer - src += src_stride; + CopyRow(src, row, width); // Copy first row into buffer MirrorRow(src_bot, dst, width); // Mirror last row into first row + MirrorRow(row, dst_bot, width); // Mirror buffer into last row + src += src_stride; dst += dst_stride; - CopyRow(row, dst_bot, width); // Copy first mirrored row into last src_bot -= src_stride; dst_bot -= dst_stride; } @@ -237,6 +239,15 @@ void TransposeUV(const uint8_t* src, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; #endif + +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } + } +#else #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; @@ -258,14 +269,7 @@ void TransposeUV(const uint8_t* src, } } #endif -#if defined(HAS_TRANSPOSEUVWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeUVWx16 = TransposeUVWx16_Any_MSA; - if (IS_ALIGNED(width, 8)) { - TransposeUVWx16 = TransposeUVWx16_MSA; - } - } -#endif +#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */ #if defined(HAS_TRANSPOSEUVWX16_MSA) // Work through the source in 8x8 tiles. @@ -340,26 +344,26 @@ void RotateUV180(const uint8_t* src, int width, int height) { int i; - void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, - int width) = MirrorUVRow_C; -#if defined(HAS_MIRRORUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_NEON; + void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorSplitUVRow_C; +#if defined(HAS_MIRRORSPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorSplitUVRow = MirrorSplitUVRow_NEON; } #endif -#if defined(HAS_MIRRORUVROW_SSSE3) +#if defined(HAS_MIRRORSPLITUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorUVRow = MirrorUVRow_SSSE3; + MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { - MirrorUVRow = MirrorUVRow_MSA; +#if defined(HAS_MIRRORSPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { + MirrorSplitUVRow = MirrorSplitUVRow_MMI; } #endif -#if defined(HAS_MIRRORUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_MMI; +#if defined(HAS_MIRRORSPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorSplitUVRow = MirrorSplitUVRow_MSA; } #endif @@ -367,7 +371,7 @@ void RotateUV180(const uint8_t* src, dst_b += dst_stride_b * (height - 1); for (i = 0; i < height; ++i) { - MirrorUVRow(src, dst_a, dst_b, width); + MirrorSplitUVRow(src, dst_a, dst_b, width); src += src_stride; dst_a -= dst_stride_a; dst_b -= dst_stride_b; diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc index a93fd55f921..ae653886018 100644 --- a/chromium/third_party/libyuv/source/rotate_argb.cc +++ b/chromium/third_party/libyuv/source/rotate_argb.cc @@ -21,17 +21,21 @@ namespace libyuv { extern "C" { #endif -static void ARGBTranspose(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int 
width, + int height) { int i; int src_pixel_step = src_stride_argb >> 2; void (*ScaleARGBRowDownEven)( const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; + // Check stride is a multiple of 4. + if (src_stride_argb & 3) { + return -1; + } #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; @@ -48,14 +52,6 @@ static void ARGBTranspose(const uint8_t* src_argb, } } #endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; - } - } -#endif #if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; @@ -64,50 +60,59 @@ static void ARGBTranspose(const uint8_t* src_argb, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; + } + } +#endif for (i = 0; i < width; ++i) { // column of source to row of dest. ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); dst_argb += dst_stride_argb; src_argb += 4; } + return 0; } -void ARGBRotate90(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBRotate90(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. src_argb += src_stride_argb * (height - 1); src_stride_argb = -src_stride_argb; - ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); + return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); } -void ARGBRotate270(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBRotate270(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst_argb += dst_stride_argb * (width - 1); dst_stride_argb = -dst_stride_argb; - ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); + return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); } -void ARGBRotate180(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBRotate180(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. 
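With ARGBTranspose now returning a status, ARGBRotate propagates -1 instead of silently proceeding when the 90/270 paths see a source stride that is not a multiple of 4, in addition to the existing argument checks. For example (hypothetical buffers):

    if (ARGBRotate(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                   width, height, kRotate90) != 0) {
      // Rejected input: bad arguments, or an unaligned stride on the
      // transpose (90/270) paths.
    }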
align_buffer_64(row, width * 4); const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); @@ -121,7 +126,7 @@ void ARGBRotate180(const uint8_t* src_argb, #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } @@ -142,14 +147,6 @@ void ARGBRotate180(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBMirrorRow = ARGBMirrorRow_MSA; - } - } -#endif #if defined(HAS_ARGBMIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBMirrorRow = ARGBMirrorRow_Any_MMI; @@ -158,6 +155,14 @@ void ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb, dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); + return 0; } LIBYUV_API @@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb, return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: - ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); - return 0; + return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); case kRotate270: - ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); - return 0; + return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); case kRotate180: - ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); - return 0; + return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); default: break; } diff --git a/chromium/third_party/libyuv/source/rotate_gcc.cc b/chromium/third_party/libyuv/source/rotate_gcc.cc index 04e19e29eef..fd359d4ae69 100644 --- a/chromium/third_party/libyuv/source/rotate_gcc.cc +++ b/chromium/third_party/libyuv/source/rotate_gcc.cc @@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 
\n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
- "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc index fdc0dd476c6..844df2bf305 100644 --- a/chromium/third_party/libyuv/source/rotate_neon.cc +++ b/chromium/third_party/libyuv/source/rotate_neon.cc @@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src, // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "vld1.8 {d0}, [%0], %2 \n" - "vld1.8 {d1}, [%0], %2 \n" - "vld1.8 {d2}, [%0], %2 \n" - "vld1.8 {d3}, [%0], %2 \n" - "vld1.8 {d4}, [%0], %2 \n" - "vld1.8 {d5}, [%0], %2 \n" - "vld1.8 {d6}, [%0], %2 \n" - "vld1.8 {d7}, [%0] \n" - - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" - - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - - "mov %0, %3 \n" - - "vst1.8 {d1}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d3}, [%0], %4 \n" - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d5}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d7}, [%0], %4 \n" - "vst1.8 {d6}, [%0] \n" - - "add %1, #8 \n" // src += 8 - "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride - "subs %5, #8 \n" // w -= 8 - "bge 1b \n" + "mov %0, %1 \n" + + "vld1.8 {d0}, [%0], %2 \n" + "vld1.8 {d1}, [%0], %2 \n" + "vld1.8 {d2}, [%0], %2 \n" + "vld1.8 {d3}, [%0], %2 \n" + "vld1.8 {d4}, [%0], %2 \n" + "vld1.8 {d5}, [%0], %2 \n" + "vld1.8 {d6}, [%0], %2 \n" + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + "vst1.8 {d1}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d3}, [%0], %4 \n" + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d5}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d7}, [%0], %4 \n" + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. @@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src, // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "vld2.8 {d0, d1}, [%0], %2 \n" - "vld2.8 {d2, d3}, [%0], %2 \n" - "vld2.8 {d4, d5}, [%0], %2 \n" - "vld2.8 {d6, d7}, [%0], %2 \n" - "vld2.8 {d16, d17}, [%0], %2 \n" - "vld2.8 {d18, d19}, [%0], %2 \n" - "vld2.8 {d20, d21}, [%0], %2 \n" - "vld2.8 {d22, d23}, [%0] \n" - - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" - - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" - - "mov %0, %3 \n" - - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d6}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d18}, [%0], %4 \n" - "vst1.8 {d16}, [%0], %4 \n" - "vst1.8 {d22}, [%0], %4 \n" - "vst1.8 {d20}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.8 {d3}, [%0], %6 \n" - "vst1.8 {d1}, [%0], %6 \n" - "vst1.8 {d7}, [%0], %6 \n" - "vst1.8 {d5}, [%0], %6 \n" - "vst1.8 {d19}, [%0], %6 \n" - "vst1.8 {d17}, [%0], %6 \n" - "vst1.8 {d23}, [%0], %6 \n" - "vst1.8 {d21}, [%0] \n" - - "add %1, #8*2 \n" // src += 8*2 - "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %7, #8 \n" // w -= 8 - "bge 1b \n" + "mov %0, %1 \n" + + "vld2.8 {d0, d1}, [%0], %2 \n" + "vld2.8 {d2, d3}, [%0], %2 \n" + "vld2.8 {d4, d5}, [%0], %2 \n" + "vld2.8 {d6, d7}, [%0], %2 \n" + "vld2.8 {d16, d17}, [%0], %2 \n" + "vld2.8 {d18, d19}, [%0], %2 \n" + "vld2.8 {d20, d21}, [%0], %2 \n" + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d6}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d18}, [%0], %4 \n" + "vst1.8 {d16}, [%0], %4 \n" + "vst1.8 {d22}, [%0], %4 \n" + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.8 {d3}, [%0], %6 \n" + "vst1.8 {d1}, [%0], %6 \n" + "vst1.8 {d7}, [%0], %6 \n" + "vst1.8 {d5}, [%0], %6 \n" + "vst1.8 {d19}, [%0], %6 \n" + "vst1.8 {d17}, [%0], %6 \n" + "vst1.8 {d23}, [%0], %6 \n" + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. diff --git a/chromium/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libyuv/source/rotate_neon64.cc index f469baacf68..43c1581731d 100644 --- a/chromium/third_party/libyuv/source/rotate_neon64.cc +++ b/chromium/third_party/libyuv/source/rotate_neon64.cc @@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src, // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %w3, %w3, #8 \n" + "sub %w3, %w3, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" "mov %0, %1 \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - - "trn2 v16.8b, v0.8b, v1.8b \n" - "trn1 v17.8b, v0.8b, v1.8b \n" - "trn2 v18.8b, v2.8b, v3.8b \n" - "trn1 v19.8b, v2.8b, v3.8b \n" - "trn2 v20.8b, v4.8b, v5.8b \n" - "trn1 v21.8b, v4.8b, v5.8b \n" - "trn2 v22.8b, v6.8b, v7.8b \n" - "trn1 v23.8b, v6.8b, v7.8b \n" - - "trn2 v3.4h, v17.4h, v19.4h \n" - "trn1 v1.4h, v17.4h, v19.4h \n" - "trn2 v2.4h, v16.4h, v18.4h \n" - "trn1 v0.4h, v16.4h, v18.4h \n" - "trn2 v7.4h, v21.4h, v23.4h \n" - "trn1 v5.4h, v21.4h, v23.4h \n" - "trn2 v6.4h, v20.4h, v22.4h \n" - "trn1 v4.4h, v20.4h, v22.4h \n" - - "trn2 v21.2s, v1.2s, v5.2s \n" - "trn1 v17.2s, v1.2s, v5.2s \n" - "trn2 v20.2s, v0.2s, v4.2s \n" - "trn1 v16.2s, v0.2s, v4.2s \n" - "trn2 v23.2s, v3.2s, v7.2s \n" - "trn1 v19.2s, v3.2s, v7.2s \n" - "trn2 v22.2s, v2.2s, v6.2s \n" - "trn1 v18.2s, v2.2s, v6.2s \n" + "trn2 v16.8b, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "trn1 v17.8b, v0.8b, v1.8b \n" + "add %0, %0, %5 \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 1 + "trn1 v19.8b, v2.8b, v3.8b \n" + "add %0, %0, %5 \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 2 + "trn1 v21.8b, v4.8b, v5.8b \n" + "add %0, %0, %5 \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 3 + "trn1 v23.8b, v6.8b, v7.8b \n" + "add %0, %0, %5 \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 4 + "trn1 v1.4h, v17.4h, v19.4h \n" + "add %0, %0, %5 \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 5 + "trn1 v0.4h, v16.4h, v18.4h \n" + "add %0, %0, %5 \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 6 + "trn1 v5.4h, v21.4h, v23.4h \n" + "add %0, %0, %5 \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 7 + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" "mov %0, %2 \n" - "st1 {v17.8b}, [%0], %6 \n" - "st1 {v16.8b}, [%0], %6 \n" - "st1 {v19.8b}, [%0], %6 \n" - "st1 {v18.8b}, [%0], %6 \n" - "st1 {v21.8b}, [%0], %6 \n" - "st1 {v20.8b}, [%0], %6 \n" - "st1 {v23.8b}, [%0], %6 \n" - "st1 {v22.8b}, [%0] \n" + "st1 {v17.8b}, [%0], %6 \n" + "st1 {v16.8b}, [%0], %6 \n" + "st1 {v19.8b}, [%0], %6 \n" + "st1 {v18.8b}, [%0], %6 \n" + "st1 {v21.8b}, [%0], %6 \n" + "st1 {v20.8b}, [%0], %6 \n" + "st1 {v23.8b}, [%0], %6 \n" + "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride @@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src, // add 8 back to counter. if the result is 0 there are // no residuals. 
- "adds %w3, %w3, #8 \n" - "b.eq 4f \n" + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" + "cmp %w3, #2 \n" + "b.lt 3f \n" - "cmp %w3, #4 \n" - "b.lt 2f \n" + "cmp %w3, #4 \n" + "b.lt 2f \n" // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "ld1 {v2.16b}, [%4] \n" + "ld1 {v2.16b}, [%4] \n" - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" // TODO(frkoenig): Rework shuffle above to // write out with 4 instead of 8 writes. @@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src, // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this - "sub %w4, %w4, #8 \n" + "sub %w4, %w4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.16b}, [%0], %5 \n" - "ld1 {v1.16b}, [%0], %5 \n" - "ld1 {v2.16b}, [%0], %5 \n" - "ld1 {v3.16b}, [%0], %5 \n" - "ld1 {v4.16b}, [%0], %5 \n" - "ld1 {v5.16b}, [%0], %5 \n" - "ld1 {v6.16b}, [%0], %5 \n" - "ld1 {v7.16b}, [%0] \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "mov %0, %1 \n" - "mov %0, %2 \n" + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + "mov %0, %1 \n" - "st1 {v16.d}[0], [%0], %6 \n" - "st1 {v18.d}[0], [%0], %6 \n" - "st1 {v17.d}[0], [%0], %6 \n" - "st1 {v19.d}[0], [%0], %6 \n" - "st1 {v16.d}[1], [%0], %6 \n" - "st1 {v18.d}[1], [%0], %6 \n" - "st1 {v17.d}[1], [%0], %6 \n" - "st1 {v19.d}[1], [%0] \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, 
v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %0, %3 \n" + "mov %0, %2 \n" - "st1 {v20.d}[0], [%0], %7 \n" - "st1 {v22.d}[0], [%0], %7 \n" - "st1 {v21.d}[0], [%0], %7 \n" - "st1 {v23.d}[0], [%0], %7 \n" - "st1 {v20.d}[1], [%0], %7 \n" - "st1 {v22.d}[1], [%0], %7 \n" - "st1 {v21.d}[1], [%0], %7 \n" - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * // dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * // dst_stride_b - "subs %w4, %w4, #8 \n" // w -= 8 - "b.ge 1b \n" + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. - "adds %w4, %w4, #8 \n" - "b.eq 4f \n" + "adds %w4, %w4, #8 \n" + "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %w4, #2 \n" - "b.lt 3f \n" + "cmp %w4, #2 \n" + "b.lt 3f \n" - "cmp %w4, #4 \n" - "b.lt 2f \n" + "cmp %w4, #4 \n" + "b.lt 2f \n" // TODO(frkoenig): Clean this up // 4x8 block diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc index 9b29b2bfbff..7216373bcd1 100644 --- a/chromium/third_party/libyuv/source/row_any.cc +++ b/chromium/third_party/libyuv/source/row_any.cc @@ -546,12 +546,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif -#if defined(HAS_I400TOARGBROW_AVX2) -ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) -#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) @@ -581,7 +575,6 @@ ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) -ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif #if defined(HAS_ARGBTORGB24ROW_MSA) ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) @@ -590,7 +583,6 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) -ANY11(I400ToARGBRow_Any_MSA, 
I400ToARGBRow_MSA, 0, 1, 4, 15) #endif #if defined(HAS_ARGBTORGB24ROW_MMI) ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) @@ -599,7 +591,6 @@ ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) -ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7) #endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) @@ -695,6 +686,15 @@ ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7) #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RGB24TOYJROW_AVX2 +ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) +#endif +#ifdef HAS_RGB24TOYJROW_SSSE3 +ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_NEON +ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7) +#endif #ifdef HAS_RGB24TOYROW_MSA ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #endif @@ -704,6 +704,15 @@ ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7) #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RAWTOYJROW_AVX2 +ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) +#endif +#ifdef HAS_RAWTOYJROW_SSSE3 +ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_NEON +ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7) +#endif #ifdef HAS_RAWTOYROW_MSA ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) #endif @@ -901,6 +910,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11P(I400ToARGBRow_Any_SSE2, + I400ToARGBRow_SSE2, + const struct YuvConstants*, + 1, + 4, + 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11P(I400ToARGBRow_Any_AVX2, + I400ToARGBRow_AVX2, + const struct YuvConstants*, + 1, + 4, + 15) +#endif +#if defined(HAS_I400TOARGBROW_NEON) +ANY11P(I400ToARGBRow_Any_NEON, + I400ToARGBRow_NEON, + const struct YuvConstants*, + 1, + 4, + 7) +#endif +#if defined(HAS_I400TOARGBROW_MSA) +ANY11P(I400ToARGBRow_Any_MSA, + I400ToARGBRow_MSA, + const struct YuvConstants*, + 1, + 4, + 15) +#endif +#if defined(HAS_I400TOARGBROW_MMI) +ANY11P(I400ToARGBRow_Any_MMI, + I400ToARGBRow_MMI, + const struct YuvConstants*, + 1, + 4, + 7) +#endif + #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, @@ -1156,7 +1206,7 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif #ifdef HAS_MIRRORROW_NEON -ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #endif #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) @@ -1164,6 +1214,18 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #ifdef HAS_MIRRORROW_MMI ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) #endif +#ifdef HAS_MIRRORUVROW_AVX2 +ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) +#endif +#ifdef HAS_MIRRORUVROW_SSSE3 +ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) +#endif +#ifdef HAS_MIRRORUVROW_NEON +ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) +#endif +#ifdef HAS_MIRRORUVROW_MSA +ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -1171,7 +1233,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #endif #ifdef HAS_ARGBMIRRORROW_NEON -ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #endif #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) @@ -1179,12 +1241,19 @@ ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #ifdef HAS_ARGBMIRRORROW_MMI ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) #endif +#ifdef HAS_RGB24MIRRORROW_SSSE3 +ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) +#endif +#ifdef HAS_RGB24MIRRORROW_NEON +ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) +#endif #undef ANY11M // Any 1 plane. (memset) #define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ SIMD_ALIGNED(uint8_t temp[64]); \ + memset(temp, 0, 64); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ @@ -1371,7 +1440,7 @@ ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MSA -ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MMI ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) @@ -1380,7 +1449,7 @@ ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MSA -ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MMI ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) @@ -1389,7 +1458,7 @@ ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MSA -ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MMI ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc index 70aa2e13cf4..79aed5c7877 100644 --- a/chromium/third_party/libyuv/source/row_common.cc +++ b/chromium/third_party/libyuv/source/row_common.cc @@ -14,6 +14,7 @@ #include <string.h> // For memcpy and memset. #include "libyuv/basic_types.h" +#include "libyuv/convert_argb.h" // For kYuvI601Constants #ifdef __cplusplus namespace libyuv { @@ -26,10 +27,11 @@ extern "C" { (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) #define LIBYUV_RGB7 1 #endif -// mips use 7 bit RGBToY -#if (!defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)) || \ - (!defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)) -#define LIBYUV_RGB7 1 + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) +#define LIBYUV_ARGBTOUV_PAVGB 1 +#define LIBYUV_RGBTOU_TRUNCATE 1 #endif // llvm x86 is poor at ternary operator, so use branchless min/max. @@ -37,19 +39,19 @@ extern "C" { #define USE_BRANCHLESS 1 #if USE_BRANCHLESS static __inline int32_t clamp0(int32_t v) { - return ((-(v) >> 31) & (v)); + return -(v >= 0) & v; } - +// TODO(fbarchard): make clamp255 preserve negative values. 
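The rewritten clamps use the branchless mask idiom: in C, -(cond) evaluates to 0 when the comparison is false and to -1 (all bits set) when true, so the mask either passes the value through or forces the saturated result. A small worked check of the new expressions:

    static __inline int32_t clamp0_sketch(int32_t v) {
      return -(v >= 0) & v;  // v < 0: mask 0, result 0; v >= 0: mask ~0, result v
    }
    static __inline int32_t clamp255_sketch(int32_t v) {
      return (-(v >= 255) | v) & 255;  // v >= 255: all ones | v, & 255 -> 255
    }
    // clamp255_sketch(300) == 255 and clamp255_sketch(128) == 128, but
    // clamp255_sketch(-1) == 255: negatives are not preserved, which is
    // what the TODO above refers to; hence callers clamp to zero first.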
static __inline int32_t clamp255(int32_t v) { - return (((255 - (v)) >> 31) | (v)) & 255; + return (-(v >= 255) | v) & 255; } static __inline int32_t clamp1023(int32_t v) { - return (((1023 - (v)) >> 31) | (v)) & 1023; + return (-(v >= 1023) | v) & 1023; } static __inline uint32_t Abs(int32_t v) { - int m = v >> 31; + int m = -(v < 0); return (v + m) ^ m; } #else // USE_BRANCHLESS @@ -208,7 +210,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t ar30; + memcpy(&ar30, src_ar30, sizeof ar30); uint32_t b = (ar30 >> 2) & 0xff; uint32_t g = (ar30 >> 12) & 0xff; uint32_t r = (ar30 >> 22) & 0xff; @@ -222,7 +225,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t ar30; + memcpy(&ar30, src_ar30, sizeof ar30); uint32_t b = (ar30 >> 2) & 0xff; uint32_t g = (ar30 >> 12) & 0xff; uint32_t r = (ar30 >> 22) & 0xff; @@ -236,7 +240,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t ar30; + memcpy(&ar30, src_ar30, sizeof ar30); uint32_t b = ar30 & 0x3ff; uint32_t ga = ar30 & 0xc00ffc00; uint32_t r = (ar30 >> 20) & 0x3ff; @@ -425,14 +430,38 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { } #endif +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +#ifdef LIBYUV_RGBTOU_TRUNCATE +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; +} +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; +} +#else +// TODO(fbarchard): Add rounding to SIMD and use this static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } +#endif + +#if !defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { + return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; +} +static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; +} +#endif // ARGBToY_C and ARGBToUV_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB + #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ int x; \ @@ -447,15 +476,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP]) >> \ - 2; \ - uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP]) >> \ - 2; \ - uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP]) >> \ - 2; \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + 
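The memcpy loads introduced in the AR30 readers above are the portable way to read a 32-bit value from a byte pointer: the old *(const uint32_t*)src_ar30 cast is undefined under strict aliasing and can fault on alignment-strict targets, while memcpy into a local compiles to the same single load. A sketch of the idiom, reusing the AR30 unpack from above:

#include <stdint.h>
#include <string.h>

/* Alias- and alignment-safe 32-bit load; optimizers emit one mov/ldr. */
static uint32_t Load32(const uint8_t* p) {
  uint32_t v;
  memcpy(&v, p, sizeof v);
  return v;
}

/* Example: the blue channel of one 2:10:10:10 AR30 pixel, matching
   AR30ToARGBRow_C above. */
static uint8_t Ar30Blue8(const uint8_t* src_ar30) {
  return (uint8_t)((Load32(src_ar30) >> 2) & 0xff);
}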
BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ dst_u[0] = RGBToU(ar, ag, ab); \ dst_v[0] = RGBToV(ar, ag, ab); \ src_rgb0 += BPP * 2; \ @@ -464,13 +490,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { dst_v += 1; \ } \ if (width & 1) { \ - uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ dst_u[0] = RGBToU(ar, ag, ab); \ dst_v[0] = RGBToV(ar, ag, ab); \ } \ } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb0[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb0[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb0[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ + } +#endif MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -517,16 +584,25 @@ static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { } #endif +#if defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) +#else +static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; +} +static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; +} +#endif // ARGBToYJ_C and ARGBToUVJ_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ int x; \ @@ -562,9 +638,53 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { dst_v[0] = RGBToVJ(ar, ag, ab); \ } \ } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + 
uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ + } + +#endif MAKEROWYJ(ARGB, 2, 1, 0, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) +MAKEROWYJ(RGB24, 2, 1, 0, 3) +MAKEROWYJ(RAW, 0, 1, 2, 3) #undef MAKEROWYJ void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -632,13 +752,34 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, uint8_t b3 = next_rgb565[2] & 0x1f; uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); uint8_t r3 = next_rgb565[3] >> 3; - uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. - uint8_t g = (g0 + g1 + g2 + g3); - uint8_t r = (r0 + r1 + r2 + r3); - b = (b << 1) | (b >> 6); // 787 -> 888. - r = (r << 1) | (r >> 6); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 2) | (g0 >> 4); + r0 = (r0 << 3) | (r0 >> 2); + b1 = (b1 << 3) | (b1 >> 2); + g1 = (g1 << 2) | (g1 >> 4); + r1 = (r1 << 3) | (r1 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 2) | (g2 >> 4); + r2 = (r2 << 3) | (r2 >> 2); + b3 = (b3 << 3) | (b3 >> 2); + g3 = (g3 << 2) | (g3 >> 4); + r3 = (r3 << 3) | (r3 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + src_rgb565 += 4; next_rgb565 += 4; dst_u += 1; @@ -651,14 +792,27 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, uint8_t b2 = next_rgb565[0] & 0x1f; uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b = (b0 + b2); // 565 * 2 = 676. 
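The two MAKEROWY/MAKEROWYJ bodies above compute the same 2x2 chroma subsample in two equivalent ways: the x86 path mimics pavgb with a pair of rounding byte averages, while the ARM path keeps a rounded 9-bit half-sum and feeds it to RGB2xTo* helpers whose matrix coefficients are halved. A comparison sketch for one 2x2 block — the sample values are arbitrary, and RGBToU here is the rounding variant (x86 builds actually select the 0x8000 truncating one via LIBYUV_RGBTOU_TRUNCATE):

#include <stdint.h>
#include <stdio.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1)

static int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
/* Halved coefficients applied to a 2x-scaled (0..510) average. */
static int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
  return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8;
}

int main(void) {
  uint8_t b[4] = {100, 102, 98, 101};
  uint8_t g[4] = {50, 52, 49, 51};
  uint8_t r[4] = {200, 198, 202, 199};
  /* pavgb style: average rows, then average the averages. */
  uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
  uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
  uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
  /* ARM style: rounded sum / 2 keeps one extra bit of precision. */
  uint16_t ab2 = (uint16_t)((b[0] + b[1] + b[2] + b[3] + 1) >> 1);
  uint16_t ag2 = (uint16_t)((g[0] + g[1] + g[2] + g[3] + 1) >> 1);
  uint16_t ar2 = (uint16_t)((r[0] + r[1] + r[2] + r[3] + 1) >> 1);
  printf("pavgb U=%d  half-sum U=%d\n", RGBToU(ar, ag, ab),
         RGB2xToU(ar2, ag2, ab2));
  return 0;
}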
- uint8_t g = (g0 + g2); - uint8_t r = (r0 + r2); - b = (b << 2) | (b >> 4); // 676 -> 888 - g = (g << 1) | (g >> 6); - r = (r << 2) | (r >> 4); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 2) | (g0 >> 4); + r0 = (r0 << 3) | (r0 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 2) | (g2 >> 4); + r2 = (r2 << 3) | (r2 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif } } @@ -682,14 +836,34 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, uint8_t b3 = next_argb1555[2] & 0x1f; uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. - uint8_t g = (g0 + g1 + g2 + g3); - uint8_t r = (r0 + r1 + r2 + r3); - b = (b << 1) | (b >> 6); // 777 -> 888. - g = (g << 1) | (g >> 6); - r = (r << 1) | (r >> 6); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 3) | (g0 >> 2); + r0 = (r0 << 3) | (r0 >> 2); + b1 = (b1 << 3) | (b1 >> 2); + g1 = (g1 << 3) | (g1 >> 2); + r1 = (r1 << 3) | (r1 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + r2 = (r2 << 3) | (r2 >> 2); + b3 = (b3 << 3) | (b3 >> 2); + g3 = (g3 << 3) | (g3 >> 2); + r3 = (r3 << 3) | (r3 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + src_argb1555 += 4; next_argb1555 += 4; dst_u += 1; @@ -702,14 +876,27 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, uint8_t b2 = next_argb1555[0] & 0x1f; uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); uint8_t r2 = next_argb1555[1] >> 3; - uint8_t b = (b0 + b2); // 555 * 2 = 666. - uint8_t g = (g0 + g2); - uint8_t r = (r0 + r2); - b = (b << 2) | (b >> 4); // 666 -> 888. - g = (g << 2) | (g >> 4); - r = (r << 2) | (r >> 4); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 3) | (g0 >> 2); + r0 = (r0 << 3) | (r0 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + r2 = (r2 << 3) | (r2 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif } } @@ -733,14 +920,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t b3 = next_argb4444[2] & 0x0f; uint8_t g3 = next_argb4444[2] >> 4; uint8_t r3 = next_argb4444[3] & 0x0f; - uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. - uint8_t g = (g0 + g1 + g2 + g3); - uint8_t r = (r0 + r1 + r2 + r3); - b = (b << 2) | (b >> 4); // 666 -> 888. 
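Throughout these UV readers the old add-then-shift widening is replaced by per-sample bit replication before any averaging: shift the field into the top bits and refill the vacated low bits from the field's own most significant bits, so 0 stays 0 and the maximum code becomes exactly 255. A self-checking sketch of the three expansions used (5-, 6- and 4-bit):

#include <assert.h>
#include <stdint.h>

static uint8_t Expand5(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }
static uint8_t Expand6(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }
static uint8_t Expand4(uint8_t v) { return (uint8_t)((v << 4) | v); }

int main(void) {
  assert(Expand5(0) == 0 && Expand5(31) == 255); /* endpoints map exactly */
  assert(Expand6(0) == 0 && Expand6(63) == 255);
  assert(Expand4(0) == 0 && Expand4(15) == 255);
  assert(Expand5(16) == 132);                    /* 10000 -> 10000100 */
  return 0;
}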
- g = (g << 2) | (g >> 4); - r = (r << 2) | (r >> 4); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 4) | b0; + g0 = (g0 << 4) | g0; + r0 = (r0 << 4) | r0; + b1 = (b1 << 4) | b1; + g1 = (g1 << 4) | g1; + r1 = (r1 << 4) | r1; + b2 = (b2 << 4) | b2; + g2 = (g2 << 4) | g2; + r2 = (r2 << 4) | r2; + b3 = (b3 << 4) | b3; + g3 = (g3 << 4) | g3; + r3 = (r3 << 4) | r3; + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + src_argb4444 += 4; next_argb4444 += 4; dst_u += 1; @@ -753,14 +960,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t b2 = next_argb4444[0] & 0x0f; uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; - uint8_t b = (b0 + b2); // 444 * 2 = 555. - uint8_t g = (g0 + g2); - uint8_t r = (r0 + r2); - b = (b << 3) | (b >> 2); // 555 -> 888. - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 4) | b0; + g0 = (g0 << 4) | g0; + r0 = (r0 << 4) | r0; + b2 = (b2 << 4) | b2; + g2 = (g2 << 4) | g2; + r2 = (r2 << 4) | r2; + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif } } @@ -1136,26 +1356,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) // 32 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1167,7 +1387,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, 
YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1178,7 +1400,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1217,26 +1441,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1248,7 +1472,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1259,7 +1485,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1300,26 +1528,26 @@ const struct YuvConstants 
SIMD_ALIGNED(kYuvH709Constants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1331,7 +1559,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1342,7 +1572,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1357,7 +1589,7 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { // BT.2020 YUV to RGB reference // R = (Y - 16) * 1.164384 - V * -1.67867 -// G = (Y - 16) * 1.164384 - U * 0.187326 - V * -0.65042 +// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 // B = (Y - 16) * 1.164384 - U * -2.14177 // Y contribution to R,G,B. Scale and bias. @@ -1365,6 +1597,7 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ // TODO(fbarchard): Improve accuracy; the B channel is off by 7%. +// U and V contributions to R,G,B. 
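The UB/UG/VG defines that follow quantize each BT.2020 matrix coefficient to 6-bit fixed point (value * 64) so it fits the byte multiplies used by the SIMD paths; the max(-128, ...) clamp on UB appears to be where the ~7% B-channel error mentioned in the TODO comes from, since round(-2.142 * 64) = -137 is outside int8 range. A sketch of that quantization:

#include <math.h>
#include <stdio.h>

/* Coefficient -> 6-bit fixed point with int8 clamping, matching the
   comments on the defines below. */
static int Fix6(double coeff) {
  double r = round(coeff * 64.0);
  return (int)(r < -128 ? -128 : (r > 127 ? 127 : r));
}

int main(void) {
  printf("UB=%d\n", Fix6(-2.142));   /* -137 clamps to -128 */
  printf("UG=%d\n", Fix6(0.187326)); /* 12 */
  printf("VG=%d\n", Fix6(0.65042));  /* 42 */
  return 0;
}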
#define UB -128 /* max(-128, round(-2.142 * 64)) */ #define UG 12 /* round(0.187326 * 64) */ #define VG 42 /* round(0.65042 * 64) */ @@ -1381,26 +1614,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1412,7 +1645,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1423,7 +1658,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1438,7 +1675,6 @@ const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 16 bit. 
- static __inline void YuvPixel(uint8_t y, uint8_t u, uint8_t v, @@ -1454,7 +1690,7 @@ static __inline void YuvPixel(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #elif defined(__arm__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1463,7 +1699,7 @@ static __inline void YuvPixel(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #else int ub = yuvconstants->kUVToB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1497,7 +1733,7 @@ static __inline void YuvPixel8_16(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #elif defined(__arm__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1506,7 +1742,7 @@ static __inline void YuvPixel8_16(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #else int ub = yuvconstants->kUVToB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1541,7 +1777,7 @@ static __inline void YuvPixel16(int16_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #elif defined(__arm__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1550,7 +1786,7 @@ static __inline void YuvPixel16(int16_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #else int ub = yuvconstants->kUVToB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1588,21 +1824,26 @@ static __inline void YuvPixel10(uint16_t y, *r = Clamp(r16 >> 6); } -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - // C reference code that mimics the YUV assembly. -static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { - uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32_t)(y1 + YGB) >> 6); - *g = Clamp((int32_t)(y1 + YGB) >> 6); - *r = Clamp((int32_t)(y1 + YGB) >> 6); +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YPixel(uint8_t y, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) || defined(__arm__) + int ygb = yuvconstants->kUVBiasBGR[3]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp(((int32_t)(y1) + ygb) >> 6); + *g = Clamp(((int32_t)(y1) + ygb) >> 6); + *r = Clamp(((int32_t)(y1) + ygb) >> 6); } -#undef YG -#undef YGB - #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. 
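The reworked YPixel above now fetches both the luma gain (yg) and bias (ygb) from the YuvConstants table instead of hard-coding YG/YGB and dividing kYToRgb[0] by 0x0101. The arithmetic itself is unchanged; a standalone sketch using the BT.601 values quoted earlier in this file (YG = 18997, YGB = -1160) shows how it maps video range onto full range:

#include <stdint.h>
#include <stdio.h>

static uint8_t Clamp8(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Luma-only expansion as in YPixel: y * 0x0101 replicates the byte to 16
   bits, the multiply by yg and >> 16 apply the 1.164 gain in 6-bit fixed
   point, and ygb removes the 16 offset (with rounding) before >> 6.
   An arithmetic right shift of negative intermediates is assumed. */
static uint8_t ExpandY(uint8_t y) {
  const int yg = 18997;  /* round(1.164 * 64 * 256 * 256 / 257) */
  const int ygb = -1160; /* 1.164 * 64 * -16 + 64 / 2 */
  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
  return Clamp8(((int32_t)y1 + ygb) >> 6);
}

int main(void) {
  /* Video range maps to full range: 16 -> 0, 235 -> 255. */
  printf("%d %d %d\n", ExpandY(16), ExpandY(235), ExpandY(128));
  return 0;
}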
@@ -2136,18 +2377,21 @@ void I422ToRGBARow_C(const uint8_t* src_y, } } -void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2165,10 +2409,21 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } -void MirrorUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width; ++x) { + dst_uv[0] = src_uv[0]; + dst_uv[1] = src_uv[1]; + src_uv -= 2; + dst_uv += 2; + } +} + +void MirrorSplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -2199,6 +2454,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { + int x; + src_rgb24 += width * 3 - 3; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + src_rgb24 -= 3; + dst_rgb24 += 3; + } +} + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -2338,10 +2608,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) { } void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { - uint32_t* d = (uint32_t*)(dst_argb); int x; for (x = 0; x < width; ++x) { - d[x] = v32; + memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32); } } @@ -2439,7 +2708,7 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { } } -#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f +#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. @@ -2515,10 +2784,14 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND +#if defined(__aarch64__) || defined(__arm__) +#define ATTENUATE(f, a) (f * a + 128) >> 8 +#else +// This code mimics the SSSE3 version for better testability. #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 +#endif // Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width - 1; i += 2) { @@ -3305,6 +3578,70 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, } #endif +#ifdef HAS_RGB24TOYJROW_AVX2 +// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
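RGB24ToYJRow_AVX2 here (and its SSSE3 and RAW siblings below) synthesizes a kernel libyuv lacks by chaining two it already has, staging up to MAXTWIDTH pixels of intermediate ARGB in a stack row so the working set stays bounded. A sketch of that tiling pattern, with scalar stand-ins for the two fast kernels and a hypothetical tile size:

#include <stdint.h>

enum { kTileWidth = 2048 }; /* stand-in for libyuv's MAXTWIDTH */

/* Scalar stand-ins for the two chained kernels. */
static void Rgb24ToArgb(const uint8_t* src, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i * 4 + 0] = src[i * 3 + 0]; /* B */
    dst[i * 4 + 1] = src[i * 3 + 1]; /* G */
    dst[i * 4 + 2] = src[i * 3 + 2]; /* R */
    dst[i * 4 + 3] = 255;
  }
}
static void ArgbToYj(const uint8_t* src, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) { /* approximate full-range BT.601 luma */
    dst[i] = (uint8_t)((29 * src[i * 4 + 0] + 150 * src[i * 4 + 1] +
                        77 * src[i * 4 + 2] + 128) >> 8);
  }
}

/* Tile the row so the intermediate ARGB buffer has a fixed size. */
void Rgb24ToYjRow(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  uint8_t row[kTileWidth * 4];
  while (width > 0) {
    int twidth = width > kTileWidth ? kTileWidth : width;
    Rgb24ToArgb(src_rgb24, row, twidth);
    ArgbToYj(row, dst_yj, twidth);
    src_rgb24 += twidth * 3;
    dst_yj += twidth;
    width -= twidth;
  }
}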
MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RGB24TOYJROW_AVX2 + +#ifdef HAS_RAWTOYJROW_AVX2 +// Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RAWTOYJROW_AVX2 + +#ifdef HAS_RGB24TOYJROW_SSSE3 +// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RGB24TOYJROW_SSSE3 + +#ifdef HAS_RAWTOYJROW_SSSE3 +// Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RAWTOYJROW_SSSE3 + float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float fsum = 0.f; int i; @@ -3358,6 +3695,29 @@ void GaussCol_C(const uint16_t* src0, } } +void GaussRow_F32_C(const float* src, float* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * + (1.0f / 256.0f); + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
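GaussRow_F32_C above is the horizontal pass of a separable 5-tap binomial blur, and GaussCol_F32_C, which follows, is the matching vertical pass. Each [1 4 6 4 1] pass sums to 16, so the single 1/256 factor in the row pass normalizes the whole 2-D filter (16 * 16 = 256) and the column pass can stay unscaled. A sketch verifying that with an impulse image:

#include <stdio.h>

static float Tap5(const float* p) {
  return p[0] + p[1] * 4 + p[2] * 6 + p[3] * 4 + p[4];
}

int main(void) {
  /* 5x5 impulse: both passes over the center should yield the 2-D kernel
     peak, 6 * 6 / 256 = 0.140625. */
  float img[5][5] = {{0}};
  float rows[5];
  int y;
  img[2][2] = 1.0f;
  for (y = 0; y < 5; ++y) {
    rows[y] = Tap5(img[y]) * (1.0f / 256.0f); /* row pass, normalized */
  }
  /* column pass, unscaled, as in GaussCol_F32_C */
  printf("center = %f\n",
         rows[0] + rows[1] * 4 + rows[2] * 6 + rows[3] * 4 + rows[4]);
  return 0;
}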
+void GaussCol_F32_C(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_C(const uint8_t* src_y, const uint8_t* src_vu, @@ -3459,6 +3819,30 @@ void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { } } +void HalfMergeUVRow_C(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] + + src_u[src_stride_u + 1] + 2) >> + 2; + dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] + + src_v[src_stride_v + 1] + 2) >> + 2; + src_u += 2; + src_v += 2; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1; + dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc index 3088bb75517..a107c30e769 100644 --- a/chromium/third_party/libyuv/source/row_gcc.cc +++ b/chromium/third_party/libyuv/source/row_gcc.cc @@ -159,24 +159,24 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -190,35 +190,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por 
%%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -228,35 +228,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -267,35 +267,35 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + "psrld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu 
%%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 @@ -307,25 +307,25 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -337,44 +337,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -385,47 +385,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* 
dst, int width) { void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -436,34 +436,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + 
"punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -474,35 +474,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm6 \n" + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -513,35 +513,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm6 \n" + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -556,37 +556,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const 
uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" + "vmovdqa %4,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -615,26 +615,26 @@ static const ulvec8 kPermARGBToRGB24_2 = { void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vmovdqa %3,%%ymm5 \n" - "vmovdqa %4,%%ymm6 \n" - "vmovdqa %5,%%ymm7 \n" + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" - "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" - "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -650,37 +650,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" + "vmovdqa %4,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - 
"vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -694,34 +694,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -734,40 +734,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, const uint32_t dither4, int width) { asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw 
%%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -783,35 +783,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -824,38 +824,38 @@ void 
ARGBToRGB565DitherRow_AVX2(const uint8_t* src, void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -865,26 +865,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -928,31 +928,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand 
%%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -967,31 +967,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -1008,25 +1008,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand 
%%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -1045,25 +1045,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -1078,6 +1078,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +// clang-format off + // TODO(mraptis): Consider passing R, G, B multipliers as parameter. // round parameter is register containing value to add before shift. 
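Aside from the prefetcht0 1280(%0) that the next two hunks add to RGBTOY and RGBTOY_AVX2, this region is pure clang-format realignment. As a reading aid for the macro below, a scalar sketch of the luma step it implements; the weights and the 0x1080 round constant (16.5 * 256) are assumptions taken from libyuv's C reference path, and the function name is hypothetical:

#include <stdint.h>

// Studio-range luma: 66*R + 129*G + 25*B in 8.8 fixed point, with the
// "round" constant 0x1080 (= 16*256 + 128) added before the >> 8, so the
// result lands in [16, 235].
static uint8_t RGBToY_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}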
#define RGBTOY(round) \ @@ -1101,10 +1103,9 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%0),%0 \n" \ "phaddw %%xmm0,%%xmm6 \n" \ "phaddw %%xmm2,%%xmm1 \n" \ - "paddw %%" #round \ - ",%%xmm6 \n" \ - "paddw %%" #round \ - ",%%xmm1 \n" \ + "prefetcht0 1280(%0) \n" \ + "paddw %%" #round ",%%xmm6 \n" \ + "paddw %%" #round ",%%xmm1 \n" \ "psrlw $0x8,%%xmm6 \n" \ "psrlw $0x8,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm6 \n" \ @@ -1130,10 +1131,9 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x80(%0),%0 \n" \ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ - "vpaddw %%" #round \ - ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round \ - ",%%ymm2,%%ymm2 \n" \ + "prefetcht0 1280(%0) \n" \ + "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ + "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ @@ -1144,13 +1144,15 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "jg 1b \n" \ "vzeroupper \n" +// clang-format on + #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_argb), // %0 @@ -1169,8 +1171,8 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) : "+r"(src_argb), // %0 @@ -1187,8 +1189,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Same as ARGBToYRow but different coefficients, no add 16. 
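The YJ rows that follow use full-range (JPEG) weights with plain rounding and no +16 offset. A minimal sketch under the same caveat, assuming the 38/75/15 coefficients of libyuv's scalar RGBToYJ; the name is hypothetical:

#include <stdint.h>

// Full-range luma: 38*R + 75*G + 15*B (weights sum to 128), +64 to round,
// then >> 7 with no offset, so the result spans the full [0, 255].
static uint8_t RGBToYJ_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
}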
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) : "+r"(src_rgba), // %0 @@ -1210,7 +1212,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(ymm7) : "+r"(src_argb), // %0 @@ -1232,7 +1234,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(ymm7) : "+r"(src_abgr), // %0 @@ -1253,7 +1255,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(ymm5) : "+r"(src_argb), // %0 @@ -1273,7 +1275,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1294,52 +1296,52 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + 
"pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1366,44 +1368,44 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1429,44 +1431,44 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb 
%%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 @@ -1492,45 +1494,45 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 
\n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1553,53 +1555,53 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1618,47 +1620,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 
\n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1672,9 +1674,9 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_bgra), // %0 @@ -1693,52 +1695,52 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw 
$0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1752,9 +1754,9 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_abgr), // %0 @@ -1769,9 +1771,9 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_rgba), // %0 @@ -1790,52 +1792,52 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + 
"pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1853,52 +1855,52 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 
0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2115,16 +2117,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2144,27 +2146,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" - "subl $0x8,%[width] \n" - "jg 1b \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2190,16 +2192,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2219,21 +2221,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2254,16 +2256,16 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] 
\n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2284,21 +2286,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2321,15 +2323,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2356,15 +2358,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2384,15 +2386,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2412,15 +2414,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2440,15 +2442,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2469,16 +2471,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( 
YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2693,17 +2695,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2727,18 +2729,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2762,23 +2764,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2802,18 +2804,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2837,23 +2839,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - 
"vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2879,16 +2881,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2918,11 +2920,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2962,16 +2964,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2995,16 +2997,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -3028,16 +3030,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -3061,16 +3063,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [uyvy_buf]"+r"(uyvy_buf), // 
%[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -3085,17 +3087,15 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, #endif // HAS_UYVYTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { - asm volatile( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * - // 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" +void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164 + "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 + "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 + "pslld $0x18,%%xmm4 \n" LABELALIGN "1: \n" @@ -3104,8 +3104,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "lea 0x8(%0),%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" + "paddsw %%xmm3,%%xmm0 \n" + "psraw $6, %%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" // Step 2: Weave into ARGB @@ -3121,28 +3121,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. 
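Unlike the surrounding whitespace churn, the I400ToARGBRow hunks change behavior: both rows gain a const struct YuvConstants* parameter, load yg and ygb from that table (byte offsets 192 and 224) instead of materializing immediates through %eax, and replace the unsigned psubusw/psrlw pair with signed paddsw/psraw so the bias can be stored negated. A scalar sketch of the new Y expansion, using the BT.601 values named in the diff comments (yg = 18997, ygb = -1160); the helper name is hypothetical:

#include <stdint.h>

// One grey pixel of the updated I400 path: replicate Y into a 16-bit lane,
// scale by ~1.164 via the high half of the multiply, add the negated bias,
// arithmetic shift, then saturate as packuswb would.
static uint8_t I400ToY_sketch(uint8_t y) {
  uint16_t yy = (uint16_t)(y * 0x0101);      // punpcklbw %%xmm0,%%xmm0
  int32_t v = ((yy * 18997) >> 16) - 1160;   // pmulhuw yg; paddsw ygb
  v >>= 6;                                   // psraw $6
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}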
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { +void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * - // 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" + "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 + "vpslld $0x18,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" @@ -3152,8 +3150,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsraw $0x6,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" "vpermq $0xd8,%%ymm1,%%ymm1 \n" @@ -3163,15 +3161,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "vpor %%ymm4,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" + "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 @@ -3184,16 +3182,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %3,%%xmm5 \n" + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -3211,13 +3209,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { LABELALIGN "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3228,37 +3226,136 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the UV. 
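This roll also renames the old split-and-mirror routine to MirrorSplitUVRow and adds a new MirrorUVRow that mirrors interleaved UV while keeping each U/V pair intact; the kShuffleMirrorUV table below accordingly reverses 16-bit units rather than single bytes. A scalar sketch of the new behavior, with a hypothetical name:

#include <stdint.h>

// Mirror a row of interleaved UV (NV12/NV21 layout). width counts UV pairs;
// the pairs are reversed, but U and V keep their places within each pair.
static void MirrorUVRow_sketch(const uint8_t* src_uv, uint8_t* dst_uv,
                               int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0];
    dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1];
  }
}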
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_MIRRORUVROW_AVX2 +void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORUVROW_AVX2 + +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorSplitUV) // %4 : "memory", "cc", "xmm0", "xmm1"); } -#endif // HAS_MIRRORUVROW_SSSE3 +#endif // HAS_MIRRORSPLITUVROW_SSSE3 + +#ifdef HAS_RGB24MIRRORROW_SSSE3 + +// Shuffle first 5 pixels to last 5 mirrored. first byte zero +static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, + 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; + +// Shuffle last 5 pixels to first 5 mirrored. 
last byte zero +static const uvec8 kShuffleMirrorRGB1 = { + 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; + +// Shuffle 5 pixels at a time (15 bytes) +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + intptr_t temp_width = (intptr_t)(width); + src_rgb24 += width * 3 - 48; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // first 5 + "movdqu 15(%0),%%xmm1 \n" // next 5 + "movdqu 30(%0),%%xmm2 \n" // next 5 + "movdqu 32(%0),%%xmm3 \n" // last 1 special + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm4,%%xmm2 \n" + "pshufb %%xmm5,%%xmm3 \n" + "lea -0x30(%0),%0 \n" + "movdqu %%xmm0,32(%1) \n" // last 5 + "movdqu %%xmm1,17(%1) \n" // next 5 + "movdqu %%xmm2,2(%1) \n" // next 5 + "movlpd %%xmm3,0(%1) \n" // first 1 + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorRGB0), // %3 + "m"(kShuffleMirrorRGB1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_RGB24MIRRORROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 @@ -3266,17 +3363,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "lea -0x10(%0,%2,4),%0 \n" + "lea -0x10(%0,%2,4),%0 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -3292,15 +3389,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "vmovdqu %3,%%ymm5 \n" + "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3316,28 +3413,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,0x00(%1,%2,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" 
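// Scalar reference for the new RGB24MirrorRow above: pixels are reversed as
// 3-byte units, which the two 15-byte shuffle tables do five pixels at a
// time (a sketch of the intent, without the SIMD's 16-pixel blocking).
static void RGB24MirrorRow_Sketch(const uint8_t* src_rgb24,
                                  uint8_t* dst_rgb24, int width) {
  for (int i = 0; i < width; ++i) {
    const uint8_t* s = src_rgb24 + (width - 1 - i) * 3;
    dst_rgb24[i * 3 + 0] = s[0];
    dst_rgb24[i * 3 + 1] = s[1];
    dst_rgb24[i * 3 + 2] = s[2];
  }
}
// Note the rename above: the old MirrorUVRow (reverse and split to separate
// U and V planes) is now MirrorSplitUVRow, with its shuffle table renamed
// kShuffleMirrorSplitUV to match.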
+ "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 @@ -3354,28 +3451,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -3392,22 +3489,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm2,(%2) \n" "vextractf128 $0x0,%%ymm0,0x10(%2) \n" "vextractf128 $0x1,%%ymm2,0x20(%2) \n" "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 @@ -3425,21 +3522,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -3462,30 +3559,30 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, int width) { // clang-format off asm volatile ( - "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 16 pixels per loop. 
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -3508,24 +3605,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 16 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3546,23 +3643,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3578,25 +3675,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" // 32 pixels per loop. 
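// Scalar shape of MergeUVRow_16 / MultiplyRow_16 above: words are scaled with
// a plain multiply (vpmullw), MergeUVRow_16 additionally interleaving U and V.
// The scale is caller-supplied; a value like 64 to move 10-bit samples into
// the high bits is an assumption, not something visible in this hunk.
static void MergeUVRow_16_Sketch(const uint16_t* src_u, const uint16_t* src_v,
                                 uint16_t* dst_uv, int scale, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = (uint16_t)(src_u[i] * scale);  // vpmullw
    dst_uv[2 * i + 1] = (uint16_t)(src_v[i] * scale);  // vpunpck{l,h}wd
  }
}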
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "add $0x20,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3617,25 +3714,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, int width) { // clang-format off asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3651,26 +3748,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" // 32 pixels per loop. 
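// Scalar core of Convert16To8Row above: pmulhuw keeps the high 16 bits of
// v * scale, so the scale selects the source depth. 16384 for 10-bit input is
// an assumed example value, not one taken from this hunk.
static uint8_t Convert16To8_Sketch(uint16_t v, uint16_t scale) {
  uint32_t p = ((uint32_t)v * scale) >> 16;  // pmulhuw
  return (uint8_t)(p > 255 ? 255 : p);       // packuswb saturation
}
// Convert8To16Row (next hunks) mirrors this: punpcklbw-with-self widens v to
// v * 257, then pmulhuw by a scale sets the destination depth.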
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3722,41 +3819,41 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -3817,42 +3914,42 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - 
"jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -3874,35 +3971,35 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" LABELALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" LABELALIGN "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3917,14 +4014,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3939,7 +4036,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep movsb \n" + "rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -3952,29 +4049,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand 
%%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3987,21 +4084,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vmovdqu 0x20(%0),%%ymm2 \n" - "lea 0x40(%0),%0 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -4020,17 +4117,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+rm"(width) // %2 @@ -4048,28 +4145,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "vmovdqa %3,%%ymm4 \n" + "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0), %%ymm0 \n" - "vmovdqu 0x20(%0), %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu 0x40(%0), %%ymm2 \n" - "vmovdqu 0x60(%0), %%ymm3 \n" - "lea 0x80(%0), %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
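// Scalar reference for the alpha-channel helpers above:
static void ARGBExtractAlphaRow_Sketch(const uint8_t* src_argb,
                                       uint8_t* dst_a, int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[i] = src_argb[i * 4 + 3];  // psrld $0x18, then pack back to bytes
  }
}
// ARGBCopyAlphaRow is the reverse merge: dst keeps its BGR bytes (mask
// 0x00ffffff) and takes only A (mask 0xff000000) from src.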
- "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 @@ -4084,31 +4181,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4121,23 +4218,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN "1: \n" - "vpmovzxbd (%0),%%ymm1 \n" - "vpmovzxbd 0x8(%0),%%ymm2 \n" - "lea 0x10(%0),%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -4153,7 +4250,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
asm volatile( - "rep stosl \n" + "rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -4164,7 +4261,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep stosb \n" + "rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -4175,7 +4272,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep stosl \n" + "rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -4186,21 +4283,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -4214,32 +4311,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4253,28 +4350,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 
\n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4288,16 +4385,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -4311,32 +4408,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4350,28 +4447,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq 
%%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4384,22 +4481,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -4414,32 +4511,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -4454,30 +4551,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 
0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -4492,17 +4589,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -4516,32 +4613,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -4556,30 +4653,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - 
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -4601,71 +4698,71 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" // 4 pixel loop. LABELALIGN "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" + "add $0x3,%3 \n" + "jl 99f \n" // 1 pixel loop. 
"91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4689,36 +4786,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(alpha), // %2 @@ -4741,43 +4838,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" // 32 pixel loop. 
LABELALIGN "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src0), // %0 "+r"(src1), // %1 @@ -4791,7 +4888,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha +// Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, @@ -4801,35 +4898,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -4850,29 +4947,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4892,32 +4989,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, // 4 pixel loop. 
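// Scalar sketch of the attenuate kernels above: each colour channel is scaled
// by its pixel's alpha. Both operands are widened to v * 257 (punpcklbw with
// self for the pixel, pshufb via kShuffleAlpha for alpha) before pmulhuw:
static uint8_t AttenuatePixel_Sketch(uint8_t v, uint8_t a) {
  uint32_t p = (uint32_t)(v * 257) * (uint32_t)(a * 257);
  return (uint8_t)(p >> 24);  // pmulhuw (>>16) then psrlw $8
}
// The alpha byte itself passes through unchanged via the 0xff000000 mask.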
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 @@ -4937,52 +5034,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, int width) { uintptr_t alpha; asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. LABELALIGN "1: \n" // replace VPGATHER - "movzb 0x03(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x07(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "movzb 0x13(%0),%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x17(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x1b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x1f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" // end of VPGATHER - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 
\n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4999,42 +5096,42 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psubb %%xmm5,%%xmm0 \n" - "psubb %%xmm5,%%xmm1 \n" - "movdqu %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "movdqu %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "phaddw %%xmm0,%%xmm6 \n" - "paddw %%xmm5,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm6,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psubb %%xmm5,%%xmm0 \n" + "psubb %%xmm5,%%xmm1 \n" + "movdqu %%xmm4,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "movdqu %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "phaddw %%xmm0,%%xmm6 \n" + "paddw %%xmm5,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm6,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm6,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5061,50 +5158,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "m"(kARGBToSepiaB), // %2 @@ -5122,54 +5219,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5187,40 +5284,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -5238,27 +5335,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5275,28 +5372,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, int width) { asm volatile( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -5314,26 +5411,26 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, int width) { asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. 
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -5359,15 +5456,15 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -5387,14 +5484,14 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpaddusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -5415,15 +5512,15 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -5443,14 +5540,14 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, // 4 pixel loop. 
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpsubusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -5472,40 +5569,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0, uint8_t* dst_sobelx, int width) { asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -5526,39 +5623,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0, uint8_t* dst_sobely, int width) { asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -5579,37 +5676,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -5626,21 +5723,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_y, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -5661,36 +5758,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -5709,67 +5806,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, const int32_t* previous_cumsum, int width) { asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" // 4 pixel loop. 
LABELALIGN "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "add $0x3,%3 \n" + "jl 19f \n" // 1 pixel loop. LABELALIGN "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" "19: \n" : "+r"(row), // %0 @@ -5789,119 +5886,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, uint8_t* dst, int count) { asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" // 4 pixel small loop. 
LABELALIGN "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" // 4 pixel loop LABELALIGN "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq 
%%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "add $0x3,%3 \n" + "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" "19: \n" : "+r"(topleft), // %0 "+r"(botleft), // %1 @@ -5924,70 +6021,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" // 4 pixel loop LABELALIGN "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + 
"movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" + "add $0x3,%4 \n" + "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 @@ -6009,68 +6106,68 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" // General purpose row blend. LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -6090,61 +6187,61 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" "vbroadcastss %%xmm4,%%ymm4 \n" // General purpose row blend. LABELALIGN "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" - "rep movsb \n" - "jmp 999f \n" + "rep movsb \n" + "jmp 999f \n" "99: \n" "vzeroupper \n" @@ -6166,20 +6263,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, int width) { asm volatile( - "movdqu (%3),%%xmm5 \n" + "movdqu (%3),%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -6200,16 +6297,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -6227,24 +6324,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6263,24 +6360,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6299,26 +6396,26 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - 
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" "vextractf128 $0x0,%%ymm1,(%3) \n" "vextractf128 $0x0,%%ymm2,0x10(%3) \n" "vextractf128 $0x1,%%ymm1,0x20(%3) \n" "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -6338,26 +6435,26 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" "vextractf128 $0x0,%%ymm1,(%3) \n" "vextractf128 $0x0,%%ymm2,0x10(%3) \n" "vextractf128 $0x1,%%ymm1,0x20(%3) \n" "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -6376,47 +6473,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, int width) { asm volatile( - "pxor %%xmm3,%%xmm3 \n" + "pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. 
LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -6512,27 +6609,27 @@ void HalfFloatRow_AVX2(const uint16_t* src, int width) { scale *= kScaleBias; asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -6553,8 +6650,8 @@ void HalfFloatRow_F16C(const uint16_t* src, float scale, int width) { asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" // 16 pixel loop. 
LABELALIGN @@ -6588,7 +6685,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -6622,21 +6719,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, // 1 pixel loop. LABELALIGN "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "movzb -0x1(%0),%1 \n" - "movzb 0x03(%3,%1,4),%1 \n" - "mov %b1,-0x1(%0) \n" - "dec %2 \n" - "jg 1b \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "=&d"(pixel_temp), // %1 "+r"(width) // %2 @@ -6655,18 +6752,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, // 1 pixel loop. LABELALIGN "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "dec %2 \n" - "jg 1b \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "=&d"(pixel_temp), // %1 "+r"(width) // %2 @@ -6685,86 +6782,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uintptr_t pixel_temp; uintptr_t table_temp; asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. 
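// Editor's sketch (not part of the diff): the loop that follows classifies
// each pixel by luma. pmaddubsw against the broadcast lumacoeff plus the
// word mask built above (pcmpeqb/psllw $0x8) rounds the luma down to a
// multiple of 256, selecting one 256-byte row of the luma table; B, G and
// R are then remapped through that row and alpha is copied. A hedged
// scalar form (coefficient packing assumed to be B/G/R in the low bytes):
#include <stdint.h>
static void ARGBLumaColorTableRow_C_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb, int width,
                                           const uint8_t* luma_table,
                                           uint32_t lumacoeff) {
  uint32_t cb = lumacoeff & 0xff, cg = (lumacoeff >> 8) & 0xff,
           cr = (lumacoeff >> 16) & 0xff;
  for (int i = 0; i < width; ++i) {
    const uint8_t* row = luma_table +
        ((src_argb[0] * cb + src_argb[1] * cg + src_argb[2] * cr) & 0x7F00u);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}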
LABELALIGN "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" : "=&d"(pixel_temp), // %0 "=&a"(table_temp), // %1 "+r"(src_argb), // %2 @@ -6837,46 +6934,47 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, src_y_ptr = (uint8_t*)src_y; asm volatile( - 
"vmovdqu %5, %%ymm0 \n" // init blend value - "vmovdqu %6, %%ymm1 \n" // init blend value - "vmovdqu %7, %%ymm2 \n" // init blend value - // "sub $0x20, %3 \n" //sub 32 from width for final loop + "vmovdqu %5, %%ymm0 \n" // init blend value + "vmovdqu %6, %%ymm1 \n" // init blend value + "vmovdqu %7, %%ymm2 \n" // init blend value + // "sub $0x20, %3 \n" //sub 32 from + // width for final loop LABELALIGN - "1: \n" // label 1 - "vmovdqu (%0,%4), %%ymm3 \n" // src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 - "vmovdqu (%1), %%ymm5 \n" // src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for - // shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for - // shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for - // shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results - "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results - "add $0x20, %4 \n" // add to src buffer - // ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert - "vmovdqu %%ymm4, (%2) \n" // store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h - "add $0x60,%2 \n" // add to dst buffer - // ptr - // "cmp %3, %4 \n" //(width64 - + "1: \n" // label 1 + "vmovdqu (%0,%4), %%ymm3 \n" // src_y + "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 + "vmovdqu (%1), %%ymm5 \n" // src_uv + "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf + "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for + // shuf + "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for + // shuf + "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf + "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for + // shuf + "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 + "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 + "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 + "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 + "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const + "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results + "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h + "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results + "add $0x20, %4 \n" // add to src buffer + // ptr + "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert + "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert + "vmovdqu %%ymm4, (%2) \n" // store dst_yuv + "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h + "add $0x60,%2 \n" // add to dst buffer + // ptr + // "cmp %3, %4 \n" //(width64 - // 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop - "jg 1b \n" - "vzeroupper \n" // sse-avx2 - // transistions + "sub $0x20,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" // sse-avx2 + // transistions : "+r"(src_y), //%0 "+r"(src_vu), //%1 @@ -6907,20 +7005,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "movdqu %3,%%xmm5 \n" + "movdqu %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - 
"pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -6937,16 +7035,16 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 @@ -6956,6 +7054,119 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { } #endif // HAS_SWAPUVROW_AVX2 +void HalfMergeUVRow_SSSE3(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // load 16 U values + "movdqu (%1),%%xmm1 \n" // load 16 V values + "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row + "movdqu 0(%1,%5,1),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // half size + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x10(%1),%1 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" // store 8 UV pixels + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" // 16 src pixels per loop + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void HalfMergeUVRow_AVX2(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // load 32 U values + "vmovdqu (%1),%%ymm1 \n" // load 32 V values + "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row + "vmovdqu 0(%1,%5,1),%%ymm3 \n" + "lea 0x20(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpackuswb 
%%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels + "lea 0x20(%2),%2 \n" + "sub $0x20,%3 \n" // 32 src pixels per loop + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { + asm volatile( + "pxor %%xmm1,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" // load float + "maxss %%xmm1, %%xmm0 \n" // clamp to zero + "add 4, %0 \n" + "movd %%xmm0, (%1) \n" // store float + "add 4, %1 \n" + "sub $0x4,%2 \n" // 1 float per loop + "jg 1b \n" + : "+r"(src_x), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/row_mmi.cc b/chromium/third_party/libyuv/source/row_mmi.cc index 50cfca726f8..9a8e2cb2d16 100644 --- a/chromium/third_party/libyuv/source/row_mmi.cc +++ b/chromium/third_party/libyuv/source/row_mmi.cc @@ -21,6 +21,8 @@ extern "C" { // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +// clang-format off + void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -688,12 +690,15 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -707,7 +712,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" @@ -725,7 +731,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -752,7 +759,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" @@ -770,7 +778,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], 
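
A few hunks up, ClampFloatToZero_SSE2 is also new: a scalar-per-element loop whose maxss against 0.0f clamps negatives, and, because maxss returns its second source on an unordered compare, maps NaN inputs to zero as well. A C sketch of that behavior:

// Clamp each float to >= 0; NaN maps to 0.0f, mirroring maxss semantics.
static void ClampFloatToZeroRef(const float* src_x, float* dst_y, int width) {
  for (int i = 0; i < width; ++i) {
    float v = src_x[i];
    dst_y[i] = (v > 0.0f) ? v : 0.0f;  // NaN fails the compare -> 0.0f
  }
}
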
%[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -797,7 +806,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" @@ -815,7 +825,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -842,7 +853,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" @@ -860,7 +872,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -898,11 +911,12 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -992,12 +1006,15 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1011,7 +1028,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] 
\n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" @@ -1029,7 +1047,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1056,7 +1075,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" @@ -1074,7 +1094,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1101,7 +1122,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" @@ -1119,7 +1141,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1146,7 +1169,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" @@ -1164,7 +1188,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1202,11 
+1227,12 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -1296,12 +1322,15 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002F00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1315,7 +1344,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" @@ -1333,7 +1363,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1360,7 +1391,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" @@ -1378,7 +1410,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1405,7 +1438,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 
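
The change repeated through these *ToUVRow_MMI kernels (ARGB, BGRA, ABGR, RGBA, and RGB24/RAW below) swaps a truncating 2x2 chroma average for a rounding one: (a+b+c+d) >> 2 becomes (a+b+c+d+1) >> 1, and the mask_u/mask_v matrix coefficients are halved (0x70 -> 0x38, 0x4a -> 0x25, 0x26 -> 0x13) so the overall scale is unchanged while one extra bit of precision reaches the matrix multiply. A worked comparison, using the B-lane U coefficient:

// Old path: truncating average, full coefficient 112 (0x70).
static int OldSubsample(int a, int b, int c, int d) {
  return ((a + b + c + d) >> 2) * 112;
}
// New path: rounded half-sum, halved coefficient 56 (0x38).
static int NewSubsample(int a, int b, int c, int d) {
  return ((a + b + c + d + 1) >> 1) * 56;
}
// With a 2x2 block of {1, 1, 2, 2} (true average 1.5):
//   OldSubsample -> 1 * 112 = 112  (truncated down from 1.5)
//   NewSubsample -> 3 * 56  = 168  = 1.5 * 112 (exact)
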
%[dest2_v], %[dest2_v], %[value] \n\t" @@ -1423,7 +1457,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1450,7 +1485,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" @@ -1468,7 +1504,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1506,11 +1543,12 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -1600,12 +1638,15 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1619,7 +1660,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" @@ -1637,7 +1679,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl 
%[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1664,7 +1707,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" @@ -1682,7 +1726,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1709,7 +1754,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" @@ -1727,7 +1773,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1754,7 +1801,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" @@ -1772,7 +1820,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1810,11 +1859,12 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -1908,12 
+1958,15 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1929,7 +1982,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" @@ -1949,7 +2003,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -1978,7 +2033,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" @@ -1998,7 +2054,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2027,7 +2084,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" @@ -2047,7 +2105,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2076,7 +2135,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh 
%[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" @@ -2096,7 +2156,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2134,11 +2195,12 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -2232,12 +2294,15 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -2253,7 +2318,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" @@ -2273,7 +2339,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2302,7 +2369,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" @@ -2322,7 +2390,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], 
%[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2351,7 +2420,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" @@ -2371,7 +2441,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2400,7 +2471,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" @@ -2420,7 +2492,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2458,11 +2531,12 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -2471,10 +2545,10 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest, dest0, dest1, dest2, dest3; uint64_t tmp0, tmp1; - const uint64_t shift = 0x07; - const uint64_t value = 0x0040; + const uint64_t shift = 0x08; + const uint64_t value = 0x80; const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00010026004B000FULL; + const uint64_t mask1 = 0x0001004D0096001DULL; __asm__ volatile( "1: \n\t" @@ -2558,8 +2632,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, uint64_t src_rgb1; uint64_t ftmp[12]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x002b0054007f0002; - const uint64_t mask_v = 0x0002007f006b0014; + const uint64_t mask_u = 0x0015002a003f0002; + const uint64_t mask_v = 
0x0002003f0035000a; __asm__ volatile( "1: \n\t" @@ -2572,8 +2646,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" @@ -2589,8 +2663,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2615,8 +2689,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" @@ -2632,8 +2706,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2658,8 +2732,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" @@ -2675,8 +2749,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2701,8 +2775,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], 
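
ARGBToYJRow_MMI (and ARGBGrayRow_MMI below) moves from 7-bit to 8-bit full-range BT.601 luma coefficients, and the ARGBToUVJRow_MMI hunks here replace the first two pavgh ops with plain paddh so the 2x2 chroma average rounds once instead of at every step, again with halved mask coefficients. Written out in C, reading the constants straight from mask1/shift/value (lane order B, G, R, matching libyuv's little-endian ARGB layout):

#include <stdint.h>

// Old: y = (15 * b + 75 * g + 38 * r + 64) >> 7;    // 0x0F/0x4B/0x26, shift 7
// New: y = (29 * b + 150 * g + 77 * r + 128) >> 8;  // 0x1D/0x96/0x4D, shift 8
static uint8_t ArgbToYJRef(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}
// Chroma in ARGBToUVJRow:
//   old: pavgh(pavgh(p00, p10), pavgh(p01, p11))  // rounds three times
//   new: (p00 + p10 + p01 + p11 + 1) >> 1         // single pavgh at the end
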
%[value] \n\t" @@ -2718,8 +2792,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2762,7 +2836,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [sixteen] "f"(0x10) : "memory"); } @@ -4052,10 +4126,10 @@ void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uint64_t tmp0, tmp1; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x00400026004B000FULL; + const uint64_t mask2 = 0x0080004D0096001DULL; const uint64_t mask3 = 0xFF000000FF000000ULL; const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x07; + const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" @@ -4778,7 +4852,9 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { : "memory"); } -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +// TODO - respect YuvConstants +void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x55; @@ -4912,10 +4988,10 @@ void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { : "memory"); } -void MirrorUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { uint64_t src0, src1, dest0, dest1; const uint64_t mask0 = 0x00ff00ff00ff00ffULL; const uint64_t mask1 = 0x1b; @@ -6040,90 +6116,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub - "or %[ub], %[ub], %[mask] \n\t" // must - // sign - // extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" // sign - // extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - 
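
Two interface changes land in this file as well: I400ToARGBRow_MMI gains a const struct YuvConstants* parameter (still unused, per the TODO), and MirrorUVRow_MMI becomes MirrorSplitUVRow_MMI, a name that matches what the kernel does: mirror an interleaved UV row horizontally while splitting it into separate U and V planes. As a C sketch (illustrative name):

#include <stdint.h>

// Mirror interleaved UV horizontally and deinterleave into U and V planes.
static void MirrorSplitUVRowRef(const uint8_t* src_uv,
                                uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * (width - 1 - x) + 0];
    dst_v[x] = src_uv[2 * (width - 1 - x) + 1];
  }
}
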
"punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - - "punpcklbh %[u], %[u], %[zero] \n\t" // u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] \n\t" // v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + __asm__ volatile ( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + "punpcklbh %[u], %[u], %[zero] \n\t"//u + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], 
%[six] \n\t" + + "punpcklbh %[v], %[v], %[zero] \n\t"//v + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } // Also used for 420 @@ -6133,96 +6212,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub - "or %[ub], %[ub], %[mask] \n\t" // must - // sign - // extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" // sign - // extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh 
%[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" // v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh 
%[v], %[v], %[v] \n\t"//v + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } // 10 bit YUV to ARGB @@ -6232,96 +6314,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], 
%[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02), - [mask1] "f"(0x00ff00ff00ff00ff) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "psllh %[y], %[y], %[six] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "punpcklhw %[u], %[u], %[u] \n\t" + "psrah %[u], %[u], %[two] \n\t" + "punpcklhw %[v], %[v], %[v] \n\t" + "psrah %[v], %[v], %[two] \n\t" + "pminsh %[u], %[u], %[mask1] \n\t" + "pminsh %[v], %[v], %[mask1] \n\t" + + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + + "paddsh %[r_vec0], 
%[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask), [two]"f"(0x02), + [mask1]"f"(0x00ff00ff00ff00ff) + : "memory" + ); } void I422AlphaToARGBRow_MMI(const uint8_t* src_y, @@ -6331,96 +6419,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v, a; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v,a; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] 
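/* Editor's sketch (not part of the patch): what the I210 path changes
 * relative to the 8-bit kernels. Y arrives as 16-bit 10-bit-range data and
 * is pre-scaled with psllh before the same pmulhuh gain; the 16-bit U/V
 * samples are narrowed with psrah and clamped with pminsh against mask1
 * (0x00ff per lane) so the 8-bit coefficient math still applies. That is
 * also why this loop advances y_ptr by 8 and u_ptr/v_ptr by 4 bytes per
 * four pixels instead of 4 and 2. */
#include <stdint.h>

static int32_t I210YToY16(uint16_t y10, uint16_t yg) {
  /* psllh by 6 (halfword wrap), then pmulhuh keeps the high 16 bits */
  return (int32_t)(((uint32_t)(uint16_t)(y10 << 6) * yg) >> 16);
}

static int32_t I210UvTo8(uint16_t c10) {
  int32_t c = c10 >> 2;       /* psrah %[u], %[u], %[two] */
  return c > 0xff ? 0xff : c; /* pminsh against the 0x00ff lanes */
}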
\n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a), - [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]), - [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]), - [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), - [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), - [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00), - [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" + "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah 
%[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[a] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), [a]"=&f"(a), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [a_ptr]"r"(src_a), [zero]"f"(0x00), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } void I422ToRGB24Row_MMI(const uint8_t* src_y, @@ -6429,105 +6523,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh 
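/* Editor's sketch (not part of the patch): the daddiu/daddi/bnez control
 * flow shared by all of these row functions, shown for the alpha variant,
 * which only differs in streaming a per-pixel alpha plane into the pack
 * stage where the others use the constant -1 (0xff). width is assumed to
 * be a positive multiple of 4, since the countdown never handles a
 * remainder. */
#include <stdint.h>

void I422AlphaRowShape(const uint8_t* y_ptr, const uint8_t* u_ptr,
                       const uint8_t* v_ptr, const uint8_t* a_ptr,
                       uint8_t* rgb_buf, int width) {
  do {
    /* ...four pixels of the fixed-point math, argb[3] from a_ptr... */
    y_ptr += 4;    /* daddiu %[y_ptr], %[y_ptr], 0x04 */
    a_ptr += 4;    /* daddiu %[a_ptr], %[a_ptr], 0x04 */
    u_ptr += 2;    /* daddiu %[u_ptr], %[u_ptr], 0x02: 4:2:2 sampling */
    v_ptr += 2;
    rgb_buf += 16; /* four ARGB pixels */
    width -= 4;    /* daddi %[width], %[width], -0x04 */
  } while (width != 0); /* bnez %[width], 1b */
}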
%[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> 
v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + + "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" + "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" + "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" + "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" + "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" + "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" + "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" + "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" + "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(mask), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); } void I422ToARGB4444Row_MMI(const uint8_t* src_y, @@ -6538,103 +6640,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 
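/* Editor's sketch (not part of the patch): the pextrh/pinsrh/psllw
 * sequence in the RGB24 kernel exists only to drop the alpha byte, turning
 * four 32-bit xRGB pixels into 12 contiguous bytes. Byte-level
 * equivalent, names illustrative: */
#include <stdint.h>

static void Xrgb4ToRgb24(const uint8_t argb[16], uint8_t rgb24[12]) {
  for (int i = 0; i < 4; ++i) {
    rgb24[3 * i + 0] = argb[4 * i + 0]; /* B */
    rgb24[3 * i + 1] = argb[4 * i + 1]; /* G */
    rgb24[3 * i + 2] = argb[4 * i + 2]; /* R */
  }
  /* hence rgbbuf_ptr advances by 0x0c per iteration here, not 0x10 */
}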
%[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" - - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4), - [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) 
\n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "and %[g_vec], %[g_vec], %[mask1] \n\t" + "psrlw %[g_vec], %[g_vec], %[four] \n\t" + "psrlw %[r_vec], %[g_vec], %[four] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[g_vec], %[g_vec], %[r_vec] \n\t" + + "and %[b_vec], %[b_vec], %[mask1] \n\t" + "psrlw %[b_vec], %[b_vec], %[four] \n\t" + "psrlw %[r_vec], %[b_vec], %[four] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), + [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), + [alpha]"f"(-1) + : "memory" + ); } void I422ToARGB1555Row_MMI(const uint8_t* src_y, @@ -6645,118 +6754,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] 
\n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" - - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] 
"f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), - [mask3] "f"(0x800000008000), [lmove5] "f"(0x5) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlw %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "or %[g_vec], %[g_vec], %[mask3] \n\t" + + "psrlw %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "or %[b_vec], %[b_vec], %[mask3] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], 
%[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [mask3]"f"(0x800000008000), + [lmove5]"f"(0x5) + : "memory" + ); } void I422ToRGB565Row_MMI(const uint8_t* src_y, @@ -6767,120 +6883,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - 
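/* Editor's sketch (not part of the patch): the ARGB1555 pack stage.
 * psrlw by three plus the mask2 (0x1f) AND keeps 5 bits per color, the
 * lmove5 shifts place the fields, and the OR with mask3 (0x8000 per
 * pixel) forces the 1-bit alpha on. Scalar form, B-lowest layout assumed
 * as in libyuv's ARGBToARGB1555: */
#include <stdint.h>

static uint16_t PackArgb1555(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(0x8000u |               /* constant alpha bit */
                    (uint32_t)(r >> 3) << 10 |
                    (uint32_t)(g >> 3) << 5 |
                    (uint32_t)(b >> 3));
}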
"paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7), - [lmove5] "f"(0x5) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], 
%[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7), + [lmove5]"f"(0x5) + : "memory" + ); } void NV12ToARGBRow_MMI(const uint8_t* src_y, @@ -6890,83 +7013,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - 
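/* Editor's sketch (not part of the patch): RGB565 differs from ARGB1555
 * only in the pack stage: no alpha bit and six bits of green, which is
 * what the extra psrlw/paddb shift arithmetic above assembles. Scalar
 * form, B-lowest layout assumed: */
#include <stdint.h>

static uint16_t PackRgb565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((uint32_t)(r >> 3) << 11 |
                    (uint32_t)(g >> 2) << 5 |
                    (uint32_t)(b >> 3));
}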
"punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] 
\n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); } void NV21ToARGBRow_MMI(const uint8_t* src_y, @@ -6976,83 +7107,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - 
"gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + 
[yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); } void NV12ToRGB24Row_MMI(const uint8_t* src_y, @@ -7062,95 +7201,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] 
"r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18), - [one] "f"(0x1), [rmove1] "f"(0x8) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), 
[mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [lmove1]"f"(0x18), + [one]"f"(0x1), [rmove1]"f"(0x8) + : "memory" + ); } void NV21ToRGB24Row_MMI(const uint8_t* src_y, @@ -7160,95 +7307,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), 
- [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18), - [rmove1] "f"(0x8), [one] "f"(0x1) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [lmove1]"f"(0x18), 
[rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); } void NV12ToRGB565Row_MMI(const uint8_t* src_y, @@ -7258,115 +7413,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t" // 5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t" // 5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 
0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + 
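/* Note on the RGB565 pack below: it needs shift counts of 5, 9 and 11, but
   only the 3/6/7/8 constants are loaded as "f" operands; the psubb/paddb
   instructions synthesize the missing counts (8-3=5, 3+6=9, 3+8=11) so no
   extra operand registers are required. */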
"psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7) + : "memory" + ); } void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, @@ -7375,83 +7538,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], 
%[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" + "psrlh %[temp], %[y], %[eight] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" + + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[y], %[y], %[temp] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), 
+ [br]"=&f"(br), [yg]"=&f"(yg) + : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); } void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, @@ -7460,83 +7630,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 
0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[temp], %[y], %[temp] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" + + "psrlh %[y], %[y], %[eight] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); } void I422ToRGBARow_MMI(const uint8_t* src_y, @@ -7547,105 +7724,114 @@ void I422ToRGBARow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) 
\n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1) - : "memory"); + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], 
%[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" + "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" + "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [alpha]"f"(-1) + : "memory" + ); } void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32] "+&f"(v32) - : [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); + __asm__ volatile ( + "punpcklwd %[v32], %[v32], %[v32] \n\t" + "1: \n\t" + "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" + + "daddi %[width], %[width], -0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "bnez %[width], 1b \n\t" + : [v32]"+&f"(v32) + : [dst_ptr]"r"(dst_argb), [width]"r"(width) + : "memory" + ); } +// clang-format on // 10 bit YUV to ARGB #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) diff --git a/chromium/third_party/libyuv/source/row_msa.cc b/chromium/third_party/libyuv/source/row_msa.cc index 5c0239a37f0..fe6df93a601 100644 --- a/chromium/third_party/libyuv/source/row_msa.cc +++ b/chromium/third_party/libyuv/source/row_msa.cc @@ -155,11 +155,10 @@ extern "C" { } // Loads current and next row of ARGB input and averages it to calculate U and V -#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ { \ v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ v16u8 vec0_m, vec1_m, vec2_m, 
vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v16u8 vec8_m, vec9_m; \ v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ v8u16 reg8_m, reg9_m; \ \ @@ -195,81 +194,81 @@ extern "C" { reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ - reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ - argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ - argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - src0_m = (v16u8)__msa_ld_b((void*)s, 64); \ - src1_m = (v16u8)__msa_ld_b((void*)s, 80); \ - src2_m = (v16u8)__msa_ld_b((void*)s, 96); \ - src3_m = (v16u8)__msa_ld_b((void*)s, 112); \ - src4_m = (v16u8)__msa_ld_b((void*)t, 64); \ - src5_m = (v16u8)__msa_ld_b((void*)t, 80); \ - src6_m = (v16u8)__msa_ld_b((void*)t, 96); \ - src7_m = (v16u8)__msa_ld_b((void*)t, 112); \ - vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ - vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ - vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ - vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ - vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ - vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ - vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ - vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ - reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ - reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ - reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ - reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ - reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ - reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ - reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ - reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ - reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ - reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ - argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ - argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + reg8_m += const_0x0101; \ + reg9_m += const_0x0101; \ + reg0_m += const_0x0101; \ + reg1_m += const_0x0101; \ + argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ + argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ + argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ + argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ } -// Takes ARGB input and calculates U and V. 
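// For reference: the reworked READ_ARGB above keeps the 2x2 box sum at
// doubled precision -- (sum + 1) >> 1 -- instead of the old truncating
// sum >> 2, and the UV coefficients applied afterwards are halved to match.
// A minimal scalar sketch of the rounding scheme (illustrative only;
// subsample_2x2 is a hypothetical helper, not a libyuv function):
static uint16_t subsample_2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  // Round the four-pixel sum to 9 bits (twice the average) rather than
  // truncating it to 8 bits, keeping half a bit of extra precision.
  return (uint16_t)((a + b + c + d + 1) >> 1);
}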
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, v_out, u_out) \ + shf0, shf1, shf2, shf3, shift, u_out, v_out) \ { \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ \ - vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_h(vec0_m, const1); \ - reg1_m = __msa_dotp_u_h(vec1_m, const1); \ - reg2_m = __msa_dotp_u_h(vec4_m, const1); \ - reg3_m = __msa_dotp_u_h(vec5_m, const1); \ - reg0_m += const3; \ - reg1_m += const3; \ - reg2_m += const3; \ - reg3_m += const3; \ - reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ - reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ - reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ - reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ - v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ - u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const0); \ + reg1_m = __msa_dotp_u_w(vec1_m, const0); \ + reg2_m = __msa_dotp_u_w(vec4_m, const0); \ + reg3_m = __msa_dotp_u_w(vec5_m, const0); \ + reg0_m += const1; \ + reg1_m += const1; \ + reg2_m += const1; \ + reg3_m += const1; \ + reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ + reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ + reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ + reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ + reg0_m = __msa_srl_w(reg0_m, shift); \ + reg1_m = __msa_srl_w(reg1_m, shift); \ + reg2_m = __msa_srl_w(reg2_m, shift); \ + reg3_m = __msa_srl_w(reg3_m, shift); \ + u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Takes ARGB input and calculates U and V. 
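// The _H variant below consumes the 16-bit half-sums produced by the new
// READ_ARGB: the shuffles operate on halfwords (vshf_h), the dot products
// widen to 32 bits (dotp_u_w) so the doubled-precision inputs cannot
// overflow, and the 0x8080 bias is added before the results are packed
// back down to bytes via the odd-byte pick (an implicit >> 8).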
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const1); \ + reg1_m = __msa_dotp_u_w(vec1_m, const1); \ + reg2_m = __msa_dotp_u_w(vec4_m, const1); \ + reg3_m = __msa_dotp_u_w(vec5_m, const1); \ + reg0_m += (v4u32)const3; \ + reg1_m += (v4u32)const3; \ + reg2_m += (v4u32)const3; \ + reg3_m += (v4u32)const3; \ + reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ + u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ + v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ } // Load I444 pixel data @@ -302,6 +301,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { } } +void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + v8u16 src, dst; + v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; + src_uv += (width - 8) << 1; + for (x = 0; x < width; x += 8) { + src = LD_UH(src_uv); + dst = __msa_vshf_h(shuffler, src, src); + ST_UH(dst, dst_uv); + src_uv -= 16; + dst_uv += 16; + } +} + void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; @@ -825,12 +838,13 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; v16u8 dst0, dst1; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); @@ -889,12 +903,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 += __msa_hadd_u_h(vec5, vec5); reg4 += __msa_hadd_u_h(vec0, vec0); reg5 += __msa_hadd_u_h(vec1, vec1); - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); - reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg4 += const_0x0001; + reg5 += const_0x0001; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); + 
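// The const_0x0001 add / srai-by-1 pairs here replace the old truncating
// srai by 2: each 2x2 sum is rounded and kept at doubled precision,
// matching the rounding introduced in READ_ARGB above.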
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); reg6 = reg0 * const_0x70; reg7 = reg1 * const_0x70; reg8 = reg2 * const_0x4A; @@ -1412,17 +1432,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, vec1, dst0, dst1; v8u16 reg0; - v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); - v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); + v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); - reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + reg0 = __msa_dotp_u_h(vec0, const_0x961D); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); @@ -2031,12 +2051,13 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; @@ -2085,10 +2106,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 = __msa_srai_h((v8i16)reg0, 2); - reg1 = __msa_srai_h((v8i16)reg1, 2); - reg2 = __msa_srai_h((v8i16)reg2, 2); - reg3 = __msa_srai_h((v8i16)reg3, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h((v8i16)reg0, 1); + reg1 = __msa_srai_h((v8i16)reg1, 1); + reg2 = __msa_srai_h((v8i16)reg2, 1); + reg3 = __msa_srai_h((v8i16)reg3, 1); vec4 = (v8u16)__msa_pckev_h(reg1, reg0); vec5 = (v8u16)__msa_pckev_h(reg3, reg2); vec6 = (v8u16)__msa_pckod_h(reg1, reg0); @@ -2136,12 +2161,13 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0, v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = 
(v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; @@ -2190,10 +2216,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0, reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 = __msa_srai_h(reg0, 2); - reg1 = __msa_srai_h(reg1, 2); - reg2 = __msa_srai_h(reg2, 2); - reg3 = __msa_srai_h(reg3, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h(reg0, 1); + reg1 = __msa_srai_h(reg1, 1); + reg2 = __msa_srai_h(reg2, 1); + reg3 = __msa_srai_h(reg3, 1); vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); @@ -2419,16 +2449,16 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx, void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); - v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); - v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); + v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); + v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); - ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, dst0); ST_UB(dst0, dst_y); src_argb0 += 64; @@ -2504,61 +2534,123 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, vec2, vec3; - v16u8 dst0, dst1; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; - v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); - v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); - v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3; + v8u16 dst0, dst1, dst2, dst3; + v16u8 zero = {0}; + v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); + v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); + v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); + v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); + v4i32 shift = __msa_fill_w(0x00000008); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)s, 32); - src3 = (v16u8)__msa_ld_b((void*)s, 48); - src4 = 
(v16u8)__msa_ld_b((void*)t, 0); - src5 = (v16u8)__msa_ld_b((void*)t, 16); - src6 = (v16u8)__msa_ld_b((void*)t, 32); - src7 = (v16u8)__msa_ld_b((void*)t, 48); - src0 = __msa_aver_u_b(src0, src4); - src1 = __msa_aver_u_b(src1, src5); - src2 = __msa_aver_u_b(src2, src6); - src3 = __msa_aver_u_b(src3, src7); - src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); - src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); - vec0 = __msa_aver_u_b(src4, src6); - vec1 = __msa_aver_u_b(src5, src7); - src0 = (v16u8)__msa_ld_b((void*)s, 64); - src1 = (v16u8)__msa_ld_b((void*)s, 80); - src2 = (v16u8)__msa_ld_b((void*)s, 96); - src3 = (v16u8)__msa_ld_b((void*)s, 112); - src4 = (v16u8)__msa_ld_b((void*)t, 64); - src5 = (v16u8)__msa_ld_b((void*)t, 80); - src6 = (v16u8)__msa_ld_b((void*)t, 96); - src7 = (v16u8)__msa_ld_b((void*)t, 112); - src0 = __msa_aver_u_b(src0, src4); - src1 = __msa_aver_u_b(src1, src5); - src2 = __msa_aver_u_b(src2, src6); - src3 = __msa_aver_u_b(src3, src7); - src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); - src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); - vec2 = __msa_aver_u_b(src4, src6); - vec3 = __msa_aver_u_b(src5, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, - const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_v); - ST_UB(dst1, dst_u); + src1 = __msa_ld_b((void*)s, 0); + src3 = __msa_ld_b((void*)s, 16); + src5 = __msa_ld_b((void*)t, 0); + src7 = __msa_ld_b((void*)t, 16); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 32); + src3 = __msa_ld_b((void*)s, 48); + src5 = __msa_ld_b((void*)t, 32); + src7 = __msa_ld_b((void*)t, 48); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst0, dst1); + + src1 = __msa_ld_b((void*)s, 64); + src3 = __msa_ld_b((void*)s, 80); + src5 = __msa_ld_b((void*)t, 64); + src7 = __msa_ld_b((void*)t, 80); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, 
src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 96); + src3 = __msa_ld_b((void*)s, 112); + src5 = __msa_ld_b((void*)t, 96); + src7 = __msa_ld_b((void*)t, 112); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst2, dst3); + + dst0 = (v8u16)__msa_pckev_b(dst2, dst0); + dst1 = (v8u16)__msa_pckev_b(dst3, dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); s += 128; t += 128; dst_v += 16; @@ -2574,28 +2666,30 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 dst0, dst1, vec0, vec1, vec2, vec3; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); - v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); - v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, vec0, vec1, vec2, vec3); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, - const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_v); - ST_UB(dst1, dst_u); - s += 128; - t += 128; - dst_v += 16; - dst_u += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } @@ -2607,29 +2701,30 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const 
uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 src0, src1, src2, src3; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; - v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); - v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, src0, src1, src2, src3); - ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, - const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_u += 16; - dst_v += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } @@ -2641,28 +2736,30 @@ void RGBAToUVRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 dst0, dst1, vec0, vec1, vec2, vec3; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); - v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); - v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; + v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, vec0, vec1, vec2, vec3); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, - const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 
128; - dst_u += 16; - dst_v += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } @@ -2734,13 +2831,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y, } } -void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { +// TODO - respect YuvConstants +void I400ToARGBRow_MSA(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { int x; +#if defined(__aarch64__) || defined(__arm__) + int ygb = yuvconstants->kUVBiasBGR[3]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; v8i16 vec0, vec1; v4i32 reg0, reg1, reg2, reg3; - v4i32 vec_yg = __msa_fill_w(0x4A35); - v8i16 vec_ygb = __msa_fill_h(0xFB78); + v4i32 vec_yg = __msa_fill_w(yg); + v8i16 vec_ygb = __msa_fill_h(ygb); v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8i16 max = __msa_ldi_h(0xFF); v8i16 zero = {0}; @@ -3006,7 +3114,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { int x; - v16u8 src0, src1, src2, src3, dst0, dst1; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 vec8, vec9, vec10, vec11, vec12, vec13; v8u16 const_256 = (v8u16)__msa_ldi_h(256); @@ -3051,12 +3159,12 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); - vec0 += vec8; - vec1 += vec9; - vec2 += vec10; - vec3 += vec11; dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); + dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); dst0 = __msa_bmnz_v(dst0, const_255, mask); dst1 = __msa_bmnz_v(dst1, const_255, mask); ST_UB2(dst0, dst1, dst_argb, 16); @@ -3082,7 +3190,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb, v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; v16i8 zero = {0}; - for (x = 0; x < width; x += 8) { + for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); @@ -3315,10 +3423,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { } } -void MirrorUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc index 1cf8eefeaa3..a5aeaabfbd7 100644 --- a/chromium/third_party/libyuv/source/row_neon.cc +++ b/chromium/third_party/libyuv/source/row_neon.cc @@ -114,11 +114,11 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUV444 
YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -140,11 +140,11 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -168,10 +168,10 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %5, %5, #8 \n" - "vld1.8 {d23}, [%3]! \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -195,10 +195,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -221,9 +221,9 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -253,9 +253,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -287,10 +287,10 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB1555 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -321,14 +321,14 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "vmov.u8 d4, #0x0f \n" // vbic bits to clear "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -342,35 +342,38 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, "q12", "q13", "q14", "q15"); } -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUV400 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), - [kUVToG] "r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -384,11 +387,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -407,11 +410,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -436,9 +439,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb24), // %2 @@ -463,9 +466,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_rgb24), // %2 @@ -486,9 +489,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 @@ -506,11 +509,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -527,11 +530,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -550,11 +553,11 @@ void SplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store U - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -571,11 +574,11 @@ void MergeUVRow_NEON(const uint8_t* src_u, int width) { asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load U - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -593,13 +596,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, int width) { asm volatile( "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB - "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store R - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%3]! \n" // store B - "bgt 1b \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -618,13 +621,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q2}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB - "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! 
\n" // next 8 RGB + "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -639,10 +642,10 @@ void MergeRGBRow_NEON(const uint8_t* src_r, void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 // Output registers @@ -654,11 +657,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( - "vdup.8 q0, %2 \n" // duplicate 16 bytes + "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v8) // %2 @@ -668,11 +671,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v32) // %2 @@ -682,41 +685,62 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2 \n" - "sub %0, #16 \n" + "add %0, %0, %2 \n" + "sub %0, %0, #32 \n" // 32 bytes per loop "1: \n" - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" // dst += 16 - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 + "subs %2, #32 \n" // 32 pixels per loop. + "vrev64.8 q0, q2 \n" + "vrev64.8 q1, q1 \n" + "vswp d0, d1 \n" + "vswp d2, d3 \n" + "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %2, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 : - : "cc", "memory", "r3", "q0"); + : "cc", "memory", "r12", "q0"); } -void MirrorUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile( // Start at end of source row. 
- "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1]! \n" // dst += 8 - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -725,37 +749,57 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, : "cc", "memory", "r12", "q0"); } -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #32 \n" "1: \n" - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" // dst += 16 - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0"); + "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vrev64.8 d3, d3 \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + src_rgb24 += width * 3 - 24; + asm volatile( + "1: \n" + "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"(-24) // %3 + : "cc", "memory", "d0", "d1", "d2"); } void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d4, #255 \n" // Alpha + "vmov.u8 d4, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -766,13 +810,13 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d4, #255 \n" // Alpha + "vmov.u8 d4, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -783,13 +827,13 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( - "vmov.u8 d0, #255 \n" // Alpha + "vmov.u8 d0, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 @@ -800,12 +844,12 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of // RGB24. - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -830,13 +874,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -876,13 +920,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -905,13 +949,13 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -925,11 +969,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of // RGB24. - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -941,11 +985,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 @@ -957,10 +1001,10 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -972,10 +1016,10 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -990,11 +1034,11 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d1}, [%1]! \n" // store 8 U. - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1010,11 +1054,11 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d0}, [%1]! \n" // store 8 U. - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. 
+ "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1030,16 +1074,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // stride + src_yuy2 + "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.8 {d1}, [%2]! \n" // store 8 U. - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 "+r"(dst_u), // %2 @@ -1057,16 +1101,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // stride + src_uyvy + "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.8 {d0}, [%2]! \n" // store 8 U. - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1 "+r"(dst_u), // %2 @@ -1084,14 +1128,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, const uint8_t* shuffler, int width) { asm volatile( - "vld1.8 {q2}, [%3] \n" // shuffler + "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1107,12 +1151,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - "vld1.8 {d1}, [%1]! \n" // load 8 Us - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 
+ "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1129,12 +1173,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - "vld1.8 {d0}, [%1]! \n" // load 8 Us - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1149,11 +1193,11 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 @@ -1166,16 +1210,16 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "vdup.32 d2, %2 \n" // dither4 + "vdup.32 d2, %2 \n" // dither4 "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" // add for dither + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither ARGBTORGB565 - "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. - "bgt 1b \n" + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 @@ -1188,11 +1232,11 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 @@ -1204,14 +1248,14 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "vmov.u8 d4, #0x0f \n" // bits to clear with + "vmov.u8 d4, #0x0f \n" // bits to clear with // vbic. "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. 
+ "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 @@ -1221,20 +1265,20 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1247,11 +1291,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+r"(width) // %2 @@ -1262,18 +1306,18 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1283,18 +1327,18 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d1, d24 \n" // B - "vmlal.u8 q2, d2, d25 \n" // G - "vmlal.u8 q2, d3, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d1, d24 \n" // B + "vmlal.u8 q2, d2, d25 \n" // G + "vmlal.u8 q2, d3, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1308,32 +1352,32 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 + "vmov.u8 d24, #112 \n" // UB / VR 0.875 // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 
+ "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1365,34 +1409,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 @@ -1411,34 +1455,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. 
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 @@ -1456,34 +1500,34 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. 
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q3, q2, q1) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_stride_bgra), // %1 "+r"(dst_u), // %2 @@ -1501,34 +1545,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_stride_abgr), // %1 "+r"(dst_u), // %2 @@ -1546,34 +1590,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_stride_rgba), // %1 "+r"(dst_u), // %2 @@ -1591,34 +1635,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. 
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_stride_rgb24), // %1 "+r"(dst_u), // %2 @@ -1636,34 +1680,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(src_stride_raw), // %1 "+r"(dst_u), // %2 @@ -1682,55 +1726,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. 
- "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_stride_rgb565), // %1 "+r"(dst_u), // %2 @@ -1748,55 +1792,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. 
RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_stride_argb1555), // %1 "+r"(dst_u), // %2 @@ -1814,55 +1858,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q0, q4, #1 \n" // 2x average + "vrshr.u16 q1, q5, #1 \n" + "vrshr.u16 q2, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_stride_argb4444), // %1 "+r"(dst_u), // %2 @@ -1875,21 +1910,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1901,21 +1936,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! 
\n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1927,21 +1962,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1951,20 +1986,20 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1974,20 +2009,20 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1997,20 +2032,20 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2020,20 +2055,20 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2043,20 +2078,20 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2064,6 +2099,48 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + asm volatile( + "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q4, d0, d4 \n" // B + "vmlal.u8 q4, d1, d5 \n" // G + "vmlal.u8 q4, d2, d6 \n" // R + "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_yj), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + asm volatile( + "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q4, d0, d4 \n" // B + "vmlal.u8 q4, d1, d5 \n" // G + "vmlal.u8 q4, d2, d6 \n" // R + "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_yj), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); +} + // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, @@ -2072,46 +2149,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int source_y_fraction) { int y1_fraction = source_y_fraction; asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -2129,51 +2206,51 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { asm volatile( - "subs %3, #8 \n" - "blt 89f \n" + "subs %3, #8 \n" + "blt 89f \n" // Blend 8 pixels. "8: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" + "adds %3, #8-1 \n" + "blt 99f \n" // Blend 1 pixels. 
"1: \n" - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -2190,16 +2267,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, asm volatile( // Attenuate 8 pixels. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2215,32 +2292,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 
255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" "vqdmulh.s16 q0, q0, q8 \n" // b * scale "vqdmulh.s16 q1, q1, q8 \n" // g "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -2257,28 +2334,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 "vqrdmulh.s16 q11, q11, d0[1] \n" // g "vqrdmulh.s16 q12, q12, d0[2] \n" // r "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2291,20 +2368,20 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n" // store 8 ARGB pixels. - "bgt 1b \n" + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2318,32 +2395,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : @@ -2359,51 +2436,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2420,19 +2497,19 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2449,13 +2526,13 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2472,13 +2549,13 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2497,17 +2574,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // alpha + "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. 
- "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2524,12 +2601,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, asm volatile( // 16 pixel loop. "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -2548,15 +2625,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // alpha + "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2576,23 +2653,23 @@ void SobelXRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "vld1.8 {d0}, [%0],%5 \n" // top - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%2],%5 \n" // bottom - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx - "bgt 1b \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx + "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -2614,23 +2691,23 @@ void SobelYRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "vld1.8 {d0}, [%0],%4 \n" // left - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%0],%5 \n" // right - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -2652,18 +2729,18 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2678,18 +2755,18 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2704,17 +2781,17 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" - "vld1.8 {d2}, [%0]! 
\n" // load 8 bytes - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q1, d2 \n" // 8 shorts - "vmovl.u16 q2, d2 \n" // 8 ints - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // scale - "vmul.f32 q3, q3, %y3 \n" - "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats - "bgt 1b \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2731,26 +2808,26 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width) { asm volatile( - "vmov.u16 d6, #4 \n" // constant 4 - "vmov.u16 d7, #6 \n" // constant 6 - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows - "vld1.16 {q2}, [%4]! \n" - "vaddl.u16 q0, d2, d4 \n" // * 1 - "vaddl.u16 q1, d3, d5 \n" // * 1 - "vld1.16 {q2}, [%1]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "vld1.16 {q2}, [%2]! \n" - "vmlal.u16 q0, d4, d7 \n" // * 6 - "vmlal.u16 q1, d5, d7 \n" // * 6 - "vld1.16 {q2}, [%3]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop - "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples - "bgt 1b \n" + "vmov.u16 d6, #4 \n" // constant 4 + "vmov.u16 d7, #6 \n" // constant 6 + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows + "vld1.16 {q2}, [%4]! \n" + "vaddl.u16 q0, d2, d4 \n" // * 1 + "vaddl.u16 q1, d3, d5 \n" // * 1 + "vld1.16 {q2}, [%1]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "vld1.16 {q2}, [%2]! \n" + "vmlal.u16 q0, d4, d7 \n" // * 6 + "vmlal.u16 q1, d5, d7 \n" // * 6 + "vld1.16 {q2}, [%3]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "subs %6, %6, #8 \n" // 8 processed per loop + "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples + "bgt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2768,8 +2845,8 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( - "vmov.u32 q10, #4 \n" // constant 4 - "vmov.u32 q11, #6 \n" // constant 6 + "vmov.u32 q10, #4 \n" // constant 4 + "vmov.u32 q11, #6 \n" // constant 6 "1: \n" "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples @@ -2807,16 +2884,16 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! 
\n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -2830,24 +2907,24 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV + "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. - "bgt 1b \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_uv), // %2 @@ -2861,24 +2938,24 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV + "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_vu), // %2 @@ -2892,11 +2969,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! 
\n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2908,12 +2985,12 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values - "vld2.8 {d1, d3}, [%0]! \n" - "vorr.u8 q2, q0, q0 \n" // move U after V - "subs %2, %2, #16 \n" // 16 pixels per loop - "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels - "bgt 1b \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -2921,6 +2998,39 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { : "cc", "memory", "q0", "q1", "q2"); } +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + const uint8_t* src_u_1 = src_u + src_stride_u; + const uint8_t* src_v_1 = src_v + src_stride_v; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 U values + "vld1.8 {q1}, [%2]! \n" // load 16 V values + "vld1.8 {q2}, [%1]! \n" + "vld1.8 {q3}, [%3]! \n" + "vpaddl.u8 q0, q0 \n" // half size + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q1, q3 \n" + "vqrshrn.u16 d0, q0, #2 \n" + "vqrshrn.u16 d1, q1, #2 \n" + "subs %5, %5, #16 \n" // 16 src pixels per loop + "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_u_1), // %1 + "+r"(src_v), // %2 + "+r"(src_v_1), // %3 + "+r"(dst_uv), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. 
#ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libyuv/source/row_neon64.cc index 866e7bfc6e0..d5258a3aef3 100644 --- a/chromium/third_party/libyuv/source/row_neon64.cc +++ b/chromium/third_party/libyuv/source/row_neon64.cc @@ -68,13 +68,13 @@ extern "C" { "uzp2 v3.8b, v2.8b, v2.8b \n" \ "ins v1.s[1], v3.s[0] \n" -#define YUVTORGB_SETUP \ - "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" +#define YUVTORGB_SETUP \ + "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" + +// clang-format off #define YUVTORGB(vR, vG, vB) \ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ @@ -89,29 +89,23 @@ extern "C" { "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ "uxtl v2.8h, v2.8b \n" \ "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v1.8h, v27.8h \n" \ - "mul v5.8h, v1.8h, v29.8h \n" \ - "mul v6.8h, v2.8h, v30.8h \n" \ - "mul v7.8h, v2.8h, v28.8h \n" \ + "mul v3.8h, v27.8h, v1.8h \n" \ + "mul v5.8h, v29.8h, v1.8h \n" \ + "mul v6.8h, v30.8h, v2.8h \n" \ + "mul v7.8h, v28.8h, v2.8h \n" \ "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB \ - ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG \ - ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR \ - ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB \ - ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG \ - ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR \ - ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB \ - ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " #vG \ - ".8h, #6 \n" /* G */ \ + "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ +// clang-format on + void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -120,13 +114,16 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" + "movi v23.8b, #255 \n" /* A */ + "1: \n" READYUV444 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -149,13 +146,17 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" + "movi v23.8b, #255 \n" /* A */ + + "1: \n" READYUV422 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // 
%2 @@ -179,13 +180,17 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READYUV422 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" + "ld1 {v23.8b}, [%3], #8 \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "prfm pldl1keep, [%3, 448] \n" + "subs %w5, %w5, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -209,13 +214,16 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v20.8b, #255 \n" /* A */ - "1: \n" + "movi v20.8b, #255 \n" /* A */ + "1: \n" READYUV422 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v23, v22, v21) - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -238,12 +246,15 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READYUV422 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "subs %w4, %w4, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -265,6 +276,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, "sri v0.8h, v21.8h, #5 \n" /* RG */ \ "sri v0.8h, v20.8h, #11 \n" /* RGB */ +// clang-format off + void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -272,13 +285,17 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB( - v22, v21, - v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #8 \n" + ARGBTORGB565 + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -308,14 +325,18 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" READYUV422 YUVTORGB( - v22, v21, - v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #8 \n" + ARGBTOARGB1555 + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -328,6 +349,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } +// clang-format on #define ARGBTOARGB4444 \ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ @@ -347,15 +369,18 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" + "movi v4.16b, #0x0f \n" // bits to clear with vbic. + "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" ARGBTOARGB4444 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -370,23 +395,27 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, ); } -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READYUV400 YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); @@ -394,14 +423,15 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "movi v23.8b, #255 \n" + "movi v23.8b, #255 \n" "1: \n" - "ld1 {v20.8b}, [%0], #8 \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "ld1 {v20.8b}, [%0], #8 \n" + "prfm pldl1keep, [%0, 448] \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -416,13 +446,15 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READNV12 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -443,13 
+475,15 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READNV21 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -470,12 +504,14 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READNV12 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb24), // %2 @@ -496,12 +532,14 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READNV21 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_rgb24), // %2 @@ -521,13 +559,13 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB( - v22, v21, - v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" + YUVTORGB_SETUP "1: \n" READNV12 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB( + v22, v21, v20) ARGBTORGB565 + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 @@ -546,13 +584,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READYUY2 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -571,13 +610,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READUYVY YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -597,11 +637,12 @@ void SplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - "st1 {v0.16b}, [%1], #16 \n" // store U - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 
{v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -618,11 +659,13 @@ void MergeUVRow_NEON(const uint8_t* src_u, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -640,12 +683,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, int width) { asm volatile( "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "st1 {v0.16b}, [%1], #16 \n" // store R - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%3], #16 \n" // store B - "b.gt 1b \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -664,12 +708,16 @@ void MergeRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "prfm pldl1keep, [%0, 448] \n" + "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -684,10 +732,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "1: \n" - "ldp q0, q1, [%0], #32 \n" - "subs %w2, %w2, #32 \n" // 32 processed per loop - "stp q0, q1, [%1], #32 \n" - "b.gt 1b \n" + "ldp q0, q1, [%0], #32 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 // Output registers @@ -699,11 +748,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. 
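In scalar terms this is a vectorized memset; a one-line sketch of the equivalent (SetRow_sketch is a hypothetical name; libyuv's portable C path is effectively the same call):

#include <stdint.h>
#include <string.h>

// Scalar equivalent of the NEON store loop that follows: v8 is
// replicated across 16-byte vectors, i.e. a memset.
static void SetRow_sketch(uint8_t* dst, uint8_t v8, int width) {
  memset(dst, v8, width);
}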
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( - "dup v0.16b, %w2 \n" // duplicate 16 bytes + "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v8) // %2 @@ -712,89 +761,157 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( - "dup v0.4s, %w2 \n" // duplicate 4 ints + "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v32) // %2 : "cc", "memory", "v0"); } +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "rev64 v0.16b, v0.16b \n" - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0"); + "ld1 {v3.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q2, [%0, 16] \n" + "ldr q1, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #32 \n" // 32 pixels per loop. + "tbl v0.16b, {v2.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirror) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void MirrorUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +// Shuffle table for reversing the UV. +static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile( // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. - "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1"); -} - -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
+ "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorUV) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile( // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "rev64 v0.4s, v0.4s \n" - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0"); + "ld1 {v4.16b}, [%4] \n" // shuffler + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w3, %w3, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "uzp1 v0.16b, v2.16b, v3.16b \n" // U + "uzp2 v1.16b, v2.16b, v3.16b \n" // V + "st1 {v0.16b}, [%1], #16 \n" // dst += 16 + "st1 {v1.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(&kShuffleMirrorUV) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +// Shuffle table for reversing the ARGB. +static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, + 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; + +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + // Start at end of source row. + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #8 \n" // 8 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "ld1 {v3.16b}, [%4] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #48 \n" + + "1: \n" + "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v0.16b, {v0.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "tbl v2.16b, {v2.16b}, v3.16b \n" + "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-48), // %3 + "r"(&kShuffleMirror) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3"); } void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "movi v4.8b, #255 \n" // Alpha + "movi v4.8b, #255 \n" // Alpha "1: \n" - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of + // RGB24. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -805,14 +922,15 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "movi v5.8b, #255 \n" // Alpha + "movi v5.8b, #255 \n" // Alpha "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -823,14 +941,15 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( - "movi v0.8b, #255 \n" // Alpha + "movi v0.8b, #255 \n" // Alpha "1: \n" - "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g - "orr v1.8b, v5.8b, v5.8b \n" // move r - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r - "b.gt 1b \n" + "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v2.8b, v4.8b, v4.8b \n" // move g + "orr v1.8b, v5.8b, v5.8b \n" // move r + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 @@ -842,12 +961,13 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -873,13 +993,14 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // Alpha + "movi v3.8b, #255 \n" // Alpha "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
RGB565TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -929,14 +1050,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // Alpha + "movi v3.8b, #255 \n" // Alpha "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -945,6 +1066,8 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, ); } +// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b +// clobbers v3 #define ARGB4444TOARGB \ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ @@ -962,12 +1085,12 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -981,11 +1104,12 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of - // RGB24. - "b.gt 1b \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24 + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -997,12 +1121,13 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 @@ -1014,10 +1139,11 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. 
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1029,10 +1155,11 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1047,11 +1174,12 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1067,11 +1195,12 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1089,14 +1218,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
+ "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 @@ -1116,14 +1246,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 @@ -1141,13 +1272,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, const uint8_t* shuffler, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1163,13 +1295,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "orr v2.8b, v1.8b, v1.8b \n" - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "prfm pldl1keep, [%0, 448] \n" + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1186,13 +1319,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "prfm pldl1keep, [%0, 448] \n" + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
+ "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1207,11 +1341,13 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 @@ -1224,15 +1360,17 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 @@ -1245,12 +1383,13 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - // ARGB1555. - "b.gt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 @@ -1262,15 +1401,16 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "movi v4.16b, #0x0f \n" // bits to clear with + "movi v4.16b, #0x0f \n" // bits to clear with // vbic. "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - // ARGB4444. - "b.gt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 @@ -1280,20 +1420,21 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1306,11 +1447,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - // pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+r"(width) // %2 @@ -1321,18 +1462,19 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v6.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1342,18 +1484,19 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v1.8b, v4.8b \n" // B - "umlal v0.8h, v2.8b, v5.8b \n" // G - "umlal v0.8h, v3.8b, v6.8b \n" // R - "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" + "movi v4.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v6.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v1.8b, v4.8b \n" // B + "umlal v0.8h, v2.8b, v5.8b \n" // G + "umlal v0.8h, v3.8b, v6.8b \n" // R + "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1367,33 +1510,33 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 + "movi v24.8b, #112 \n" // UB / VR 0.875 // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - // pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1437,26 +1580,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 
- - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 @@ -1468,7 +1613,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match C code. void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1476,31 +1620,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 
+ + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 @@ -1520,25 +1666,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 @@ -1558,25 +1706,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 @@ -1596,25 +1746,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 @@ -1634,25 +1786,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. 
+ "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 @@ -1672,25 +1826,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "prfm pldl1keep, [%1, 448] \n" + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 @@ -1702,7 +1858,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, ); } -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, @@ -1710,67 +1866,54 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int width) { const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( - "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / - // 2 - "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 - "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 - "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 - "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + RGBTOUV_SETUP_REG "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%1, 448] \n" RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v17.D[0] \n" - "ins v18.D[1], v19.D[0] \n" - "ins v20.D[1], v21.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v18.8h, #1 \n" - "urshr v6.8h, v20.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v16.8h, v4.8h, v22.8h \n" // B - "mls v16.8h, v5.8h, v23.8h \n" // G - "mls v16.8h, v6.8h, v24.8h \n" // R - "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned - "mul v17.8h, v6.8h, v22.8h \n" // R - "mls v17.8h, v5.8h, v26.8h \n" // G - "mls v17.8h, v4.8h, v25.8h \n" // B - "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
@@ -1783,50 +1926,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, asm volatile( RGBTOUV_SETUP_REG "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%1, 448] \n" RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 @@ -1846,52 +1982,45 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int width) { const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( - RGBTOUV_SETUP_REG + RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%1, 448] \n" ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 @@ -1907,21 +2036,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1934,21 +2064,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1960,21 +2091,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1984,20 +2116,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #66 \n" // R * 0.2578 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #25 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2007,20 +2140,21 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v6.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v4.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2030,20 +2164,21 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2053,20 +2188,21 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2076,20 +2212,21 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v6.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v4.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2097,6 +2234,50 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + asm volatile( + "movi v4.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v6.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B + "umlal v0.8h, v1.8b, v5.8b \n" // G + "umlal v0.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_yj), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + asm volatile( + "movi v6.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v4.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B + "umlal v0.8h, v1.8b, v5.8b \n" // G + "umlal v0.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_yj), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, @@ -2107,44 +2288,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" // General purpose row blend. "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + "ld1 {v0.16b}, [%1], #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -2163,56 +2349,60 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { asm volatile( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" // Blend 8 pixels. "8: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 - // pixels - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels - "b.ge 8b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" // Blend 1 pixels. "1: \n" - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel + // ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel + // ARGB1. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 
+ "b.ge 1b \n" "99: \n" @@ -2232,17 +2422,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, asm volatile( // Attenuate 8 pixels. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2258,32 +2448,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "prfm pldl1keep, [%0, 448] \n" + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -2300,28 +2491,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 
// 8 pixel loop. "1: \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2334,20 +2526,21 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movi v24.8b, #29 \n" // B * 0.1140 coefficient - "movi v25.8b, #150 \n" // G * 0.5870 coefficient - "movi v26.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" + "movi v24.8b, #29 \n" // B * 0.1140 coefficient + "movi v25.8b, #150 \n" // G * 0.5870 coefficient + "movi v26.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2362,32 +2555,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. 
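// The sepia constants above form a fixed 3x3 color matrix in Q7
// fixed point: each output channel is a weighted sum of the input
// B/G/R narrowed with a saturating ">> 7" (uqshrn #7), and alpha
// passes through. Scalar sketch (hypothetical helper):
static inline uint8_t SepiaChannel_Sketch(uint8_t b, uint8_t g, uint8_t r,
                                          int wb, int wg, int wr) {
  int v = (b * wb + g * wg + r * wr) >> 7;  // e.g. 17/68/35 for new B
  return (uint8_t)(v > 255 ? 255 : v);      // uqshrn #7 saturates
}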
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : @@ -2403,51 +2597,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2465,19 +2660,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2494,15 +2691,17 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2519,15 +2718,17 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2546,17 +2747,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2573,12 +2776,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, asm volatile( // 16 pixel loop. "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -2597,15 +2802,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2625,23 +2832,26 @@ void SobelXRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "ld1 {v0.8b}, [%0],%5 \n" // top - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%2],%5 \n" // bottom - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "prfm pldl1keep, [%0, 448] \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "prfm pldl1keep, [%1, 448] \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -2663,23 +2873,25 @@ void SobelYRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "ld1 {v0.8b}, [%0],%4 \n" // left - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add 
v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%0],%5 \n" // right - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -2697,16 +2909,17 @@ void HalfFloat1Row_NEON(const uint16_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2720,18 +2933,19 @@ void HalfFloatRow_NEON(const uint16_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2745,17 +2959,18 @@ void ByteToFloatRow_NEON(const uint8_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v1.8h, v1.8b \n" // 8 shorts - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "fmul v3.4s, v3.4s, %3.s[0] \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 
bytes + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2769,20 +2984,21 @@ float ScaleMaxSamples_NEON(const float* src, int width) { float fmax; asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "fmul v4.4s, v2.4s, %4.s[0] \n" // scale - "fmax v5.4s, v5.4s, v1.4s \n" // max - "fmax v6.4s, v6.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "fmax v5.4s, v5.4s, v6.4s \n" // max - "fmaxv %s3, v5.4s \n" // signed max acculator + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -2798,21 +3014,22 @@ float ScaleSumSamples_NEON(const float* src, int width) { float fsum; asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" // max - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "fmul v4.4s, v2.4s, %4.s[0] \n" - "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares - "fmla v6.4s, v2.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "faddp v5.4s, v5.4s, v6.4s \n" - "faddp v5.4s, v5.4s, v5.4s \n" - "faddp %3.4s, v5.4s, v5.4s \n" // sum + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -2825,12 +3042,13 @@ float ScaleSumSamples_NEON(const float* src, void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { asm volatile( "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %3.s[0] \n" // scale - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" : 
"+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2847,26 +3065,31 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width) { asm volatile( - "movi v6.8h, #4 \n" // constant 4 - "movi v7.8h, #6 \n" // constant 6 - - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows - "ld1 {v2.8h}, [%4], #16 \n" - "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 - "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 - "ld1 {v2.8h}, [%1], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "ld1 {v2.8h}, [%2], #16 \n" - "umlal v0.4s, v2.4h, v7.4h \n" // * 6 - "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 - "ld1 {v2.8h}, [%3], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples - "b.gt 1b \n" + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "prfm pldl1keep, [%0, 448] \n" + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%1, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "prfm pldl1keep, [%2, 448] \n" + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%3, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "prfm pldl1keep, [%4, 448] \n" + "b.gt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2884,27 +3107,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( - "movi v6.4s, #4 \n" // constant 4 - "movi v7.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples - "add v0.4s, v0.4s, v1.4s \n" // * 1 - "add v1.4s, v1.4s, v2.4s \n" // * 1 - "ld1 {v2.4s,v3.4s}, [%2], #32 \n" - "mla v0.4s, v2.4s, v7.4s \n" // * 6 - "mla v1.4s, v3.4s, v7.4s \n" // * 6 - "ld1 {v2.4s,v3.4s}, [%1], #32 \n" - "ld1 {v4.4s,v5.4s}, [%3], #32 \n" - "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 - "add v3.4s, v3.4s, v5.4s \n" - "mla v0.4s, v2.4s, v6.4s \n" // * 4 - "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop - "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack - "uqrshrn2 v0.8h, v1.4s, #8 \n" - "st1 {v0.8h}, [%4], #16 \n" // store 8 samples - "b.gt 1b \n" + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src), // %0 "+r"(src1), // 
%1 "+r"(src2), // %2 @@ -2915,6 +3139,87 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } +static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_F32_NEON(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width) { + asm volatile( + "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 + + "1: \n" + "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows + "ld1 {v2.4s, v3.4s}, [%1], #32 \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%2], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%3], #32 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "prfm pldl1keep, [%1, 448] \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%4], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%2, 448] \n" + "fadd v0.4s, v0.4s, v4.4s \n" // * 1 + "prfm pldl1keep, [%3, 448] \n" + "fadd v1.4s, v1.4s, v5.4s \n" + "prfm pldl1keep, [%4, 448] \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : "r"(&kGaussCoefficients) // %7 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_F32_NEON(const float* src, float* dst, int width) { + asm volatile( + "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 + + "1: \n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 + // rows + "fadd v0.4s, v0.4s, v1.4s \n" // * 1 + "ld1 {v4.4s, v5.4s}, [%0], %5 \n" + "fadd v1.4s, v1.4s, v2.4s \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%0], %4 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "ld1 {v4.4s, v5.4s}, [%0], %6 \n" + "fadd v2.4s, v2.4s, v4.4s \n" + "fadd v3.4s, v3.4s, v5.4s \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmul v0.4s, v0.4s, v8.4s \n" // / 256 + "fmul v1.4s, v1.4s, v8.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kGaussCoefficients), // %3 + "r"(8LL), // %4 + "r"(-4LL), // %5 + "r"(20LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); +} + // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, @@ -2922,13 +3227,15 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 
v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -2945,17 +3252,19 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_uv), // %2 @@ -2972,18 +3281,19 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - // pixels. - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. 
+ "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_vu), // %2 @@ -2996,11 +3306,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - // pixels - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3008,22 +3318,67 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3"); } +// Shuffle table for swapping UV bytes. +static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "1: \n" - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values - "orr v2.16b, v0.16b, v0.16b \n" // move U after V - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values + "ld1 {v1.16b}, [%0], 16 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "tbl v0.16b, {v0.16b}, v2.16b \n" + "tbl v1.16b, {v1.16b}, v2.16b \n" + "stp q0, q1, [%1], 32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "r"(&kShuffleSwapUV) // %3 : "cc", "memory", "v0", "v1", "v2"); } +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + const uint8_t* src_u_1 = src_u + src_stride_u; + const uint8_t* src_v_1 = src_v + src_stride_v; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values + "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values + "ld1 {v2.16b}, [%1], #16 \n" + "ld1 {v3.16b}, [%3], #16 \n" + "uaddlp v0.8h, v0.16b \n" // half size + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v3.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uqrshrn v0.8b, v0.8h, #2 \n" + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w5, %w5, #16 \n" // 16 src pixels per loop + "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_u_1), // %1 + "+r"(src_v), // %2 + "+r"(src_v_1), // %3 + "+r"(dst_uv), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc index f976d40260a..9afcf060a4d 100644 --- a/chromium/third_party/libyuv/source/row_win.cc +++ b/chromium/third_party/libyuv/source/row_win.cc @@ -2898,10 +2898,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3( } #endif // HAS_I422TOARGBROW_SSSE3 +// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of 
Y converted to 8 pixels of ARGB (32 bytes). __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -2949,6 +2951,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, // note: vpunpcklbw mutates and vpackuswb unmutates. __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -3045,15 +3048,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src, } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORUVROW_SSSE3 +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi mov eax, [esp + 4 + 4] // src @@ -3078,7 +3081,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, ret } } -#endif // HAS_MIRRORUVROW_SSSE3 +#endif // HAS_MIRRORSPLITUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc index 5034c503296..cf3c0332573 100644 --- a/chromium/third_party/libyuv/source/scale.cc +++ b/chromium/third_party/libyuv/source/scale.cc @@ -17,6 +17,7 @@ #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" #include "libyuv/scale_row.h" +#include "libyuv/scale_uv.h" // For UVScale #ifdef __cplusplus namespace libyuv { @@ -103,21 +104,6 @@ static void ScalePlaneDown2(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA - : ScaleRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MSA - : ScaleRowDown2Box_MSA); - } - } -#endif #if defined(HAS_SCALEROWDOWN2_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleRowDown2 = @@ -133,6 +119,21 @@ static void ScalePlaneDown2(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -255,15 +256,6 @@ static void ScalePlaneDown4(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN4_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; - } - } -#endif #if defined(HAS_SCALEROWDOWN4_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleRowDown4 = @@ -273,6 +265,15 @@ static void ScalePlaneDown4(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -378,6 +379,18 @@ static void ScalePlaneDown34(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN34_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MMI; + ScaleRowDown34_1 = ScaleRowDown34_Any_MMI; + if (dst_width % 24 == 0) { + ScaleRowDown34_0 = ScaleRowDown34_MMI; + ScaleRowDown34_1 = ScaleRowDown34_MMI; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_MSA) if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { @@ -398,18 +411,6 @@ static void ScalePlaneDown34(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_MMI; - ScaleRowDown34_1 = ScaleRowDown34_Any_MMI; - if (dst_width % 24 == 0) { - ScaleRowDown34_0 = ScaleRowDown34_MMI; - ScaleRowDown34_1 = ScaleRowDown34_MMI; - } - } - } -#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -890,14 +891,6 @@ static void ScalePlaneBox(int src_width, } } #endif -#if defined(HAS_SCALEADDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleAddRow = ScaleAddRow_Any_MSA; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_MSA; - } - } -#endif #if defined(HAS_SCALEADDROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleAddRow = ScaleAddRow_Any_MMI; @@ -906,6 +899,14 @@ static void ScalePlaneBox(int src_width, } } #endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; @@ -1042,14 +1043,6 @@ void ScalePlaneBilinearDown(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -1058,6 +1051,14 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1670,7 +1671,7 @@ void ScalePlane_16(const uint16_t* src, } if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled vertically. + // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, 0, 0, dy, 1, filtering); return; @@ -1869,6 +1870,40 @@ int I444Scale_16(const uint16_t* src_y, return 0; } +// Scale an NV12 image. +// This function in turn calls a scaling function for each plane. 
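NV12Scale rounds the chroma plane up for odd dimensions: SUBSAMPLE(v, 1, 1) is (v + 1) >> 1, so a 1919x1079 source is paired with a 960x540 UV plane. A minimal caller sketch, assuming contiguous NV12 buffers with stride equal to width and that the declaration lands in libyuv/scale.h with the other scale entry points; ScalePackedNV12 and the buffer names are hypothetical, not library API:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/scale.h"  // NV12Scale, kFilterBox

// Scale one packed NV12 frame into another. Each buffer holds the Y plane
// followed immediately by the interleaved half-resolution UV plane.
static int ScalePackedNV12(const uint8_t* src, int src_width, int src_height,
                           uint8_t* dst, int dst_width, int dst_height) {
  const uint8_t* src_uv = src + (size_t)src_width * src_height;
  uint8_t* dst_uv = dst + (size_t)dst_width * dst_height;
  // NV12 UV rows are width bytes wide: width/2 pairs of 2 bytes each,
  // so the Y stride doubles as the UV stride.
  return NV12Scale(src, src_width, src_uv, src_width, src_width, src_height,
                   dst, dst_width, dst_uv, dst_width, dst_width, dst_height,
                   kFilterBox);
}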
+ +LIBYUV_API +int NV12Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_uv || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, + dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + // Deprecated api LIBYUV_API int Scale(const uint8_t* src_y, diff --git a/chromium/third_party/libyuv/source/scale_any.cc b/chromium/third_party/libyuv/source/scale_any.cc index d780cb1ffda..c93d70c5fc7 100644 --- a/chromium/third_party/libyuv/source/scale_any.cc +++ b/chromium/third_party/libyuv/source/scale_any.cc @@ -20,49 +20,6 @@ namespace libyuv { extern "C" { #endif -// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ - int dx) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ - } - -#ifdef HAS_SCALEFILTERCOLS_NEON -CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) -#endif -#ifdef HAS_SCALEFILTERCOLS_MSA -CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) -#endif -#ifdef HAS_SCALEARGBCOLS_NEON -CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) -#endif -#ifdef HAS_SCALEARGBCOLS_MSA -CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) -#endif -#ifdef HAS_SCALEARGBCOLS_MMI -CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, - ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_MSA -CANY(ScaleARGBFilterCols_Any_MSA, - ScaleARGBFilterCols_MSA, - ScaleARGBFilterCols_C, - 4, - 7) -#endif -#undef CANY - // Fixed scale down. 
// Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ @@ -113,6 +70,22 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +SDANY(ScaleUVRowDown2Box_Any_SSSE3, + ScaleUVRowDown2Box_SSSE3, + ScaleUVRowDown2Box_C, + 2, + 2, + 4) +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +SDANY(ScaleUVRowDown2Box_Any_AVX2, + ScaleUVRowDown2Box_AVX2, + ScaleUVRowDown2Box_C, + 2, + 2, + 8) +#endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_AVX2, @@ -155,6 +128,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_NEON +SDANY(ScaleUVRowDown2Box_Any_NEON, + ScaleUVRowDown2Box_NEON, + ScaleUVRowDown2Box_C, + 2, + 2, + 8) +#endif + #ifdef HAS_SCALEROWDOWN2_MSA SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_MSA, @@ -508,6 +490,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, 4, 1) #endif +#ifdef HAS_SCALEUVROWDOWNEVEN_NEON +SDAANY(ScaleUVRowDownEven_Any_NEON, + ScaleUVRowDownEven_NEON, + ScaleUVRowDownEven_C, + 2, + 3) +#endif #ifdef SASIMDONLY // This also works and uses memcpy and SIMD instead of C, but is slower on ARM @@ -577,6 +566,49 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) #endif // SASIMDONLY +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBCOLS_MMI +CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) +#endif +#undef CANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc index 58aa5ebbedf..451d4ec4d1b 100644 --- a/chromium/third_party/libyuv/source/scale_argb.cc +++ b/chromium/third_party/libyuv/source/scale_argb.cc @@ -95,22 +95,6 @@ static void ScaleARGBDown2(int src_width, } } #endif -#if defined(HAS_SCALEARGBROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA - : ScaleARGBRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_MSA - : (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_MSA - : ScaleARGBRowDown2Box_MSA); - } - } -#endif #if defined(HAS_SCALEARGBROWDOWN2_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBRowDown2 = @@ -127,6 +111,22 @@ static void ScaleARGBDown2(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -243,16 +243,6 @@ static void ScaleARGBDownEven(int src_width, } } #endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA - : ScaleARGBRowDownEven_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; - } - } -#endif #if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI @@ -263,6 +253,16 @@ static void ScaleARGBDownEven(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -436,14 +436,6 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -452,6 +444,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif if (src_width >= 32768) { ScaleARGBFilterCols = filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; @@ -490,14 +490,6 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBCols_MSA; - } - } -#endif #if defined(HAS_SCALEARGBCOLS_MMI) if (!filtering && TestCpuFlag(kCpuHasMMI)) { ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; @@ -506,6 +498,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -619,14 +619,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif #if defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; @@ -635,6 +627,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, @@ -713,14 +713,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBCols_MSA; - } - } -#endif #if defined(HAS_SCALEARGBCOLS_MMI) if (!filtering && TestCpuFlag(kCpuHasMMI)) { ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; @@ -729,6 +721,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -857,14 +857,6 @@ static void ScaleARGBSimple(int src_width, } } #endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBCols = ScaleARGBCols_MSA; - } - } -#endif #if defined(HAS_SCALEARGBCOLS_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBCols = ScaleARGBCols_Any_MMI; @@ -873,6 +865,14 @@ static void ScaleARGBSimple(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_MSA; + } + } +#endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -981,7 +981,7 @@ static void ScaleARGB(const uint8_t* src, } } if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled vertically. 
+ // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, y, dy, 4, filtering); return; diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc index 6369027175f..81959925c8a 100644 --- a/chromium/third_party/libyuv/source/scale_common.cc +++ b/chromium/third_party/libyuv/source/scale_common.cc @@ -776,6 +776,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr, } } +// ARGB scale row functions + void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -1018,6 +1020,235 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb, #undef BLENDERC #undef BLENDER +// UV scale row functions +// same as ARGB but 2 channels + +void ScaleUVRowDown2_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; + dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Box_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDownEven_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += src_stepx * 2; + dst_uv += 2; + } +} + +// Scales a single row of pixels using point sampling. 
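The UV column scalers that follow treat each interleaved UV pair as a single uint16_t and walk the source with a 16.16 fixed-point coordinate: the integer part x >> 16 selects a source pair and dx is the per-pixel step. A standalone sketch of that stepping with dx derived the simplest way (ExampleUVPointSample is hypothetical; the library computes x and dx centrally, with centering offsets, before calling these row functions):

#include <stdint.h>

// Point-sample src_width UV pairs to dst_width pairs.
static void ExampleUVPointSample(uint16_t* dst, const uint16_t* src,
                                 int dst_width, int src_width) {
  int dx = (int)(((int64_t)src_width << 16) / dst_width);
  int x = dx >> 1;  // start half a step in so samples are roughly centered
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
    x += dx;
  }
}

Halving 8 pairs to 4 gives dx = 0x20000 and a starting x of 0x10000, so the integer parts visit source pairs 1, 3, 5, 7, the same odd-pixel choice ScaleUVRowDown2_C makes above.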
+void ScaleUVCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleUVCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleUVColsUp2_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleUVFilterCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleUVFilterCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + // Scale plane vertically with bilinear interpolation. 
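// The 16.16 accumulator y advances by dy per destination row: yi = y >> 16
// picks the upper source row and yf = (y >> 8) & 255 is the 8-bit fraction
// handed to InterpolateRow. The C reference blend is
//   dst[x] = (row0[x] * (256 - yf) + row1[x] * yf + 128) >> 8,
// so yf = 0 copies row0 and yf = 128 is an even, rounded average of the
// two rows.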
void ScalePlaneVertical(int src_height, int dst_width, @@ -1067,14 +1298,6 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; @@ -1083,6 +1306,14 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -1181,8 +1412,8 @@ enum FilterMode ScaleFilterReduce(int src_width, src_height = -src_height; } if (filtering == kFilterBox) { - // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. - if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { filtering = kFilterBilinear; } } diff --git a/chromium/third_party/libyuv/source/scale_gcc.cc b/chromium/third_party/libyuv/source/scale_gcc.cc index 90a49f30d73..e575ee18bcb 100644 --- a/chromium/third_party/libyuv/source/scale_gcc.cc +++ b/chromium/third_party/libyuv/source/scale_gcc.cc @@ -102,16 +102,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, // 16 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -125,25 +125,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -156,33 +156,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - 
"movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -200,17 +200,17 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -225,26 +225,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -258,34 +258,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor 
%%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -301,24 +301,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -332,46 +332,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, int dst_width) { intptr_t stridex3; asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw 
%%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -387,26 +387,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -420,46 +420,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 
\n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -476,9 +476,9 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" : : "m"(kShuf0), // %0 "m"(kShuf1), // %1 @@ -488,20 +488,20 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -514,18 +514,18 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 @@ -535,37 +535,37 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - 
"pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -580,18 +580,18 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 @@ -602,40 +602,40 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + 
"pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -651,23 +651,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -681,10 +681,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" : : "m"(kShufAb0), // %0 "m"(kShufAb1), // %1 @@ -695,25 +695,25 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -726,10 +726,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + 
"movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" : : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 @@ -739,44 +739,44 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -791,25 +791,25 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, int src_width) { asm volatile( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -824,22 +824,22 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, int src_width) { asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -866,69 +866,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int dx) { intptr_t x0, x1, temp_pixel; asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. 
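// Worked note on the weight construction below (a sketch of the math, not
// extra code): psrlw $0x9 leaves the 7-bit fraction f in [0,127], and
// pmaddubsw needs the weight pair (128 - f, f). xmm6 holds 0x7f in the
// low byte of each word, so f ^ 0x7f == 127 - f, and adding the 1 held in
// xmm7 gives 128 - f; e.g. f = 32: 32 ^ 127 = 95, 95 + 1 = 96 = 128 - 32.
// Once the signed/unsigned bias is restored, the resulting blend is
//   dst = (src[x] * (128 - f) + src[x + 1] * f + 64) >> 7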
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" LABELALIGN "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -966,16 +966,16 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -993,14 +993,14 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -1017,17 +1017,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -1043,21 +1043,21 @@ 
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -1076,23 +1076,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x12; (void)src_stride; asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" LABELALIGN "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 @@ -1113,32 +1113,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 @@ -1156,56 +1156,56 @@ void ScaleARGBCols_SSE2(uint8_t* 
dst_argb, int dx) { intptr_t x0, x1; asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" LABELALIGN "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" "99: \n" : "=&a"(x0), // %0 "=&d"(x1), // %1 @@ -1230,16 +1230,16 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -1267,63 +1267,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int dx) { intptr_t x0, x1; asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - 
"movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" LABELALIGN "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN "99: \n" // clang-format error. + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN + "99: \n" // clang-format error. : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -1339,10 +1340,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int FixedDiv_X86(int num, int div) { asm volatile( "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); @@ -1353,19 +1354,108 @@ int FixedDiv_X86(int num, int div) { int FixedDiv1_X86(int num, int div) { asm volatile( "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); return num; } +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +// Shuffle table for splitting UV into upper and lower part of register. 
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; +static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, + 6u, 14u, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}; + +void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 + +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vbroadcastf128 %4,%%ymm1 \n" // split shuffler + "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc index 366b155ba4e..572b4bfa9b3 100644 --- a/chromium/third_party/libyuv/source/scale_neon.cc +++ b/chromium/third_party/libyuv/source/scale_neon.cc @@ -31,10 +31,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into q0, odd into q1 - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! 
\n" // store odd pixels + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -51,11 +51,11 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -71,21 +71,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %0 \n" + "add %1, %0 \n" "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + // row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and // pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [%3]! \n" - "vld1.8 {q2}, [%4]! \n" - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vld1.8 {q3}, [%3] \n" + "vld1.8 {q3}, [%3] \n" "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride * 2; asm volatile( - "vld1.16 {q13}, [%5] \n" - "vld1.8 {q14}, [%6] \n" - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" + "vmovn.u16 d4, q2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. 
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n @@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" + "vqrshrn.u16 d4, q2, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. 
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" // combine source lines - "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q3 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n @@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -511,13 +511,13 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, int src_width) { asm volatile( "1: \n" - "vld1.16 {q1, q2}, [%1] \n" // load accumulator - "vld1.8 {q0}, [%0]! \n" // load 16 bytes - "vaddw.u8 q2, q2, d1 \n" // add - "vaddw.u8 q1, q1, d0 \n" - "vst1.16 {q1, q2}, [%1]! \n" // store accumulator - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" + "vld1.16 {q1, q2}, [%1] \n" // load accumulator + "vld1.8 {q0}, [%0]! \n" // load 16 bytes + "vaddw.u8 q2, q2, d1 \n" // add + "vaddw.u8 q1, q1, d0 \n" + "vst1.16 {q1, q2}, [%1]! \n" // store accumulator + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -547,17 +547,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, int* tmp = dx_offset; const uint8_t* src_tmp = src_ptr; asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q3, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q1, q1, q0 \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "vadd.s32 q2, q1, q3 \n" - "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -566,27 +566,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) - "vmov q10, q1 \n" - "vmov q11, q2 \n" - "vuzp.16 q10, q11 \n" - "vmovl.u8 q8, d6 \n" - "vmovl.u8 q9, d7 \n" - "vsubl.s16 q11, d18, d16 \n" - "vsubl.s16 q12, d19, d17 \n" - "vmovl.u16 q13, d20 \n" - "vmovl.u16 q10, d21 \n" - "vmul.s32 q11, q11, q13 \n" - "vmul.s32 q12, q12, q10 \n" - "vrshrn.s32 d18, q11, #16 \n" - "vrshrn.s32 d19, q12, #16 \n" - "vadd.s16 q8, q8, q9 \n" - "vmovn.s16 d6, q8 \n" - - "vst1.8 {d6}, [%0]! 
\n" // store pixels - "vadd.s32 q1, q1, q0 \n" - "vadd.s32 q2, q2, q0 \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -609,75 +609,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" // Blend 25 / 75. "25: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" // Blend 75 / 25. "75: \n" - "vld1.8 {q1}, [%1]! \n" - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! 
\n" + "bgt 100b \n" "99: \n" - "vst1.8 {d1[7]}, [%0] \n" + "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -694,12 +694,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vmov q2, q1 \n" // load next 8 ARGB - "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -722,13 +722,13 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vrhadd.u8 q1, q2, q3 \n" // rounding half add - "vst2.32 {q0, q1}, [%1]! \n" - "bgt 1b \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -743,27 +743,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. 
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -781,15 +781,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, int dst_width) { (void)src_stride; asm volatile( - "mov r12, %3, lsl #2 \n" + "mov r12, %3, lsl #2 \n" "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -805,30 +805,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 @@ -865,8 +865,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, LOAD1_DATA32_LANE(d3, 1) // clang-format on "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -897,16 +897,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, int* tmp = dx_offset; const uint8_t* src_tmp = src_argb; asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q9, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - "vmov.i8 q3, #0x7f \n" // 0x7F - "vmov.i16 q15, #0x7f \n" // 0x7F + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q8, q1, q0 \n" - "1: \n" + "vadd.s32 q8, q1, q0 \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(d0, d2, 0) @@ -950,6 +950,64 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. + "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV + "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV + "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vst2.8 {d0, d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q8", "q9"); +} + +// Reads 4 pixels at a time. +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "vld1.16 {d0[0]}, [%0], %6 \n" + "vld1.16 {d0[1]}, [%1], %6 \n" + "vld1.16 {d0[2]}, [%2], %6 \n" + "vld1.16 {d0[3]}, [%3], %6 \n" + "subs %5, %5, #4 \n" // 4 pixels per loop. + "vst1.8 {d0}, [%4]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"(src_stepx * 8) // %6 + : "memory", "cc", "d0"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libyuv/source/scale_neon64.cc index 0a7b80ce1d4..185591cb55b 100644 --- a/chromium/third_party/libyuv/source/scale_neon64.cc +++ b/chromium/third_party/libyuv/source/scale_neon64.cc @@ -29,10 +29,11 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -50,11 +51,12 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -70,19 +72,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -99,10 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" : "+r"(src_ptr), 
// %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -119,19 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - "ld1 {v1.16b}, [%2], #16 \n" - "ld1 {v2.16b}, [%3], #16 \n" - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uadalp v0.8h, v3.16b \n" + "prfm pldl1keep, [%4, 448] \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_ptr1), // %2 @@ -151,12 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -169,49 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" 
- "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -226,33 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -279,14 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -303,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t tmp_src_stride = src_stride; asm volatile( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, 
[%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -372,35 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. 
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // combine source lines - "add v0.8h, v0.8h, v16.8h \n" + "add v0.8h, v0.8h, v16.8h \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" // Align for table lookup, vtbl requires registers to be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 @@ -422,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; asm volatile( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" + "uqrshrn v2.8b, v2.8h, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -478,33 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. 
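A scalar sketch may make the reciprocal trick concrete (illustrative only, not part of libyuv; the hypothetical helper below truncates, while sqrdmulh rounds and doubles its product, which is why the in-register constants are chosen as roughly 65536 / (2 * n)):

#include <stdint.h>

// Divide a small sum by a non-power-of-2 n without a divide:
// multiply by a 16.16 reciprocal and keep the upper 16 bits.
static inline uint8_t div_by_n_u16(uint16_t sum, int n) {
  uint32_t reciprocal = (65536u + n / 2) / n;  // n == 6 -> 10923
  return (uint8_t)(((uint32_t)sum * reciprocal) >> 16);
}
// div_by_n_u16(30, 6) == 5; div_by_n_u16(29, 6) == 4 (truncated).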
- "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" // Align for table lookup, vtbl requires registers to // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 @@ -522,13 +542,14 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, int src_width) { asm volatile( "1: \n" - "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator - "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes - "uaddw2 v2.8h, v2.8h, v0.16b \n" // add - "uaddw v1.8h, v1.8h, v0.8b \n" - "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator - "subs %w2, %w2, #16 \n" // 16 processed per loop - "b.gt 1b \n" + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -560,17 +581,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v3.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v1.4s, v1.4s, v0.4s \n" + "add v1.4s, v1.4s, v0.4s \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "add v2.4s, v1.4s, v3.4s \n" - "shl v0.4s, v3.4s, #1 \n" // 8 * dx - "1: \n" + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -579,27 +600,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) - "mov v6.16b, v1.16b \n" - "mov v7.16b, v2.16b \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "ushll v4.8h, v4.8b, #0 \n" - "ushll v5.8h, v5.8b, #0 \n" - "ssubl v16.4s, v5.4h, v4.4h \n" - "ssubl2 v17.4s, v5.8h, v4.8h \n" - "ushll v7.4s, v6.4h, #0 \n" - "ushll2 v6.4s, v6.8h, #0 \n" - "mul v16.4s, v16.4s, v7.4s \n" - "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" - "add v4.8h, v4.8h, v6.8h \n" - "xtn v4.8b, v4.8h \n" - - "st1 {v4.8b}, [%0], #8 \n" // store pixels - "add v1.4s, v1.4s, v0.4s \n" - "add v2.4s, v2.4s, v0.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" 
+ "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -623,74 +644,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" // General purpose row blend. "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" // Blend 25 / 75. "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" // Blend 75 / 25. "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. 
"100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" - "st1 {v0.b}[15], [%0] \n" + "st1 {v0.b}[15], [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -709,11 +739,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "mov v2.16b, v3.16b \n" - "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels - "b.gt 1b \n" + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -730,13 +761,14 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "urhadd v1.16b, v2.16b, v3.16b \n" - "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -751,25 +783,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. 
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -788,13 +822,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "1: \n" - "ld1 {v0.s}[0], [%0], %3 \n" - "ld1 {v0.s}[1], [%0], %3 \n" - "ld1 {v0.s}[2], [%0], %3 \n" - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -812,33 +847,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "prfm pldl1keep, [%1, 448] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
+ "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 @@ -875,10 +912,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, LOAD1_DATA32_LANE(v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead // clang-format on - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -911,16 +949,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v6.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - "movi v3.16b, #0x7f \n" // 0x7F - "movi v4.8h, #0x7f \n" // 0x7F + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v5.4s, v1.4s, v0.4s \n" - "1: \n" + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(v0, v1, 0) @@ -941,15 +979,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, "umull2 v17.8h, v0.16b, v7.16b \n" "umull v18.8h, v1.8b, v2.8b \n" "umull2 v19.8h, v1.16b, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "add v16.8h, v16.8h, v18.8h \n" "add v17.8h, v17.8h, v19.8h \n" "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" - "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -972,19 +1010,21 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" - "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #8 \n" // 8 processed per loop - "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent - "uaddlp v1.4s, v1.8h \n" - "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent - "uadalp v1.4s, v3.8h \n" - "rshrn v0.4h, v0.4s, #2 \n" // round and pack - "rshrn2 v0.8h, v1.4s, #2 \n" - "st1 {v0.8h}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1001,38 +1041,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, uint16_t* dst, int dst_width) { asm volatile( - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "movi v0.8h, #9 \n" // constants - "movi v1.4s, #3 \n" + "add 
%1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" "1: \n" - "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 - "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 - "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row - "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 - "subs %w3, %w3, #16 \n" // 16 dst pixels per loop - "umull v16.4s, v3.4h, v0.4h \n" - "umull2 v7.4s, v3.8h, v0.8h \n" - "umull v18.4s, v4.4h, v0.4h \n" - "umull2 v17.4s, v4.8h, v0.8h \n" - "uaddw v16.4s, v16.4s, v6.4h \n" - "uaddl2 v19.4s, v6.8h, v3.8h \n" - "uaddl v3.4s, v6.4h, v3.4h \n" - "uaddw2 v6.4s, v7.4s, v6.8h \n" - "uaddl2 v7.4s, v5.8h, v4.8h \n" - "uaddl v4.4s, v5.4h, v4.4h \n" - "uaddw v18.4s, v18.4s, v5.4h \n" - "mla v16.4s, v4.4s, v1.4s \n" - "mla v18.4s, v3.4s, v1.4s \n" - "mla v6.4s, v7.4s, v1.4s \n" - "uaddw2 v4.4s, v17.4s, v5.8h \n" - "uqrshrn v16.4h, v16.4s, #4 \n" - "mla v4.4s, v19.4s, v1.4s \n" - "uqrshrn2 v16.8h, v6.4s, #4 \n" - "uqrshrn v17.4h, v18.4s, #4 \n" - "uqrshrn2 v17.8h, v4.4s, #4 \n" - "st2 {v16.8h-v17.8h}, [%2], #32 \n" - "b.gt 1b \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1044,6 +1086,64 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. + "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 + "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "prfm pldl1keep, [%1, 448] \n" + "rshrn v1.8b, v1.8h, #2 \n" + "st2 {v0.8b,v1.8b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v16", "v17"); +} + +// Reads 4 pixels at a time. 
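// Four read pointers, offset by 0, 1, 2 and 3 steps of src_stepx UV
// pixels, each gather one 2-byte UV sample per iteration and advance by
// four steps (src_stepx * 8 bytes), so each pass of the loop interleaves
// the lanes into 4 consecutive output pixels.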
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.h}[0], [%0], %6 \n" + "ld1 {v1.h}[0], [%1], %6 \n" + "ld1 {v2.h}[0], [%2], %6 \n" + "ld1 {v3.h}[0], [%3], %6 \n" + "subs %w5, %w5, #4 \n" // 4 pixels per loop. + "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"((int64_t)(src_stepx * 8)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_uv.cc b/chromium/third_party/libyuv/source/scale_uv.cc new file mode 100644 index 00000000000..b0469f09b87 --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_uv.cc @@ -0,0 +1,891 @@ +/* + * Copyright 2020 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyUV +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Macros to enable specialized scalers + +#ifndef HAS_SCALEUVDOWN2 +#define HAS_SCALEUVDOWN2 1 +#endif +#ifndef HAS_SCALEUVDOWN4BOX +#define HAS_SCALEUVDOWN4BOX 1 +#endif +#ifndef HAS_SCALEUVDOWNEVEN +#define HAS_SCALEUVDOWNEVEN 1 +#endif +#ifndef HAS_SCALEUVBILINEARDOWN +#define HAS_SCALEUVBILINEARDOWN 1 +#endif +#ifndef HAS_SCALEUVBILINEARUP +#define HAS_SCALEUVBILINEARUP 1 +#endif +#ifndef HAS_UVCOPY +#define HAS_UVCOPY 1 +#endif +#ifndef HAS_SCALEPLANEVERTICAL +#define HAS_SCALEPLANEVERTICAL 1 +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// ScaleUV, 1/2 +// This is an optimized version for scaling down a UV to 1/2 of +// its original size. +#if HAS_SCALEUVDOWN2 +static void ScaleUVDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + filtering == kFilterNone + ? ScaleUVRowDown2_C + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C + : ScaleUVRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. 
+ if (filtering == kFilterBilinear) { + src_uv += (y >> 16) * src_stride + (x >> 16) * 2; + } else { + src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2; + } + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif + +// This code is not enabled. Only box filter is available at this time. +#if defined(HAS_SCALEUVROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3 + : ScaleUVRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3 + : ScaleUVRowDown2Box_SSSE3); + } + } +#endif +// This code is not enabled. Only box filter is available at this time. +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_MMI + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI + : ScaleUVRowDown2Box_Any_MMI); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_MMI + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI + : ScaleUVRowDown2Box_MMI); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA + : ScaleUVRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA + : ScaleUVRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif // HAS_SCALEUVDOWN2 + +// ScaleUV, 1/4 +// This is an optimized version for scaling down a UV to 1/4 of +// its original size. +#if HAS_SCALEUVDOWN4BOX +static void ScaleUVDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + // Allocate 2 rows of UV. 
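// Note: the 1/4 box filter below is built from three 1/2 box passes:
// each pair of source rows is first reduced to an intermediate row of
// dst_width * 2 UV pixels, then a third pass reduces the two
// intermediate rows to the final dst_width. kRowSize is therefore one
// intermediate row in bytes (2 bytes per UV pixel), rounded up to a
// 16-byte multiple for alignment.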
+ const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + ScaleUVRowDown2Box_C; + // Advance to odd row, even column. + src_uv += (y >> 16) * src_stride + (x >> 16) * 2; + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); + ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } + free_aligned_buffer_64(row); +} +#endif // HAS_SCALEUVDOWN4BOX + +// ScaleUV Even +// This is an optimized version for scaling down a UV to even +// multiple of its original size. +#if HAS_SCALEUVDOWNEVEN +static void ScaleUVDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, + int src_step, uint8_t* dst_uv, int dst_width) = + filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; + (void)src_width; + (void)src_height; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_uv += (y >> 16) * src_stride + (x >> 16) * 2; +#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 + : ScaleUVRowDownEven_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && !filtering) { + ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; + } + } +#endif// TODO(fbarchard): Enable Box filter +#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON + : ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDownEven = + filtering ? 
ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif + +// Scale UV down with bilinear interpolation. +#if HAS_SCALEUVBILINEARDOWN +static void ScaleUVBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. + src_uv += xl * 2; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of UV. 
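// Note: for each output row the loop below blends two adjacent source
// rows vertically into this temporary row (InterpolateRow, with the
// 8-bit fraction taken from bits 8..15 of y), then ScaleUVFilterCols
// resamples the blended row horizontally. Only the clipped x range
// [xl, xr) computed above is buffered; kFilterLinear skips the vertical
// blend and samples the source row directly.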
+ { + align_buffer_64(row, clip_src_width * 2); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); + } + dst_uv += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} +#endif + +// Scale UV up with bilinear interpolation. +#if HAS_SCALEUVBILINEARUP +static void ScaleUVBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + filtering ? ScaleUVFilterCols_C : ScaleUVCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + if (src_width >= 32768) { + ScaleUVFilterCols = filtering ? 
ScaleUVFilterCols64_C : ScaleUVCols64_C; + } +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_SSSE3) + if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MMI) + if (!filtering && TestCpuFlag(kCpuHasMMI)) { + ScaleUVFilterCols = ScaleUVCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleUVFilterCols = ScaleUVCols_MMI; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleUVFilterCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVColsUp2_MMI; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * src_stride; + + // Allocate 2 rows of UV. + const int kRowSize = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_uv + yi * src_stride; + } + if (yi != lasty) { + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); + } + dst_uv += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} +#endif // HAS_SCALEUVBILINEARUP + +// Scale UV to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx are the integer part of the source position and +// the lower 16 bits are the fixed decimal part. 
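Before the function itself, the 16.16 convention just described can be made concrete with a small standalone program. This is illustrative only: the sizes are invented, and the centered start value is just one common choice, not necessarily what ScaleSlope computes in every mode.

// Sketch: which source UV pixel each output pixel samples under 16.16 math.
#include <cstdio>

int main() {
  const int src_width = 8;
  const int dst_width = 3;
  // dx = source step per destination pixel, as a 16.16 value (~2.67 here).
  const int dx = (src_width << 16) / dst_width;
  int x = dx / 2 - 0x8000;  // center-sampled start (one common choice)
  if (x < 0) x = 0;
  for (int j = 0; j < dst_width; ++j) {
    const int xi = x >> 16;          // integer source index
    const int xf = (x >> 8) & 255;   // 8-bit fraction, used for filtering
    printf("dst[%d] <- src[%d] (frac %d/256)\n", j, xi, xf);
    x += dx;                         // advance by the fixed-point step
  }
  return 0;
}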
+ +static void ScaleUVSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, + int x, int dx) = + (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; + (void)src_height; +#if defined(HAS_SCALEUVCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVCols = ScaleUVCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleUVCols = ScaleUVCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleUVCols = ScaleUVCols_MMI; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVCols = ScaleUVCols_MSA; + } + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleUVCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVColsUp2_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleUVCols = ScaleUVColsUp2_MMI; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx); + dst_uv += dst_stride; + y += dy; + } +} + +// Copy UV with optional flipping +#if HAS_UVCOPY +static int UVCopy(const uint8_t* src_UV, + int src_stride_UV, + uint8_t* dst_UV, + int dst_stride_UV, + int width, + int height) { + if (!src_UV || !dst_UV || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_UV = src_UV + (height - 1) * src_stride_UV; + src_stride_UV = -src_stride_UV; + } + + CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height); + return 0; +} +#endif // HAS_UVCOPY + +// Scale a UV plane (from NV12) +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleUV(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // UV does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 2; + dst += clip_x * 2; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. 
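The dispatch below branches on the bit patterns of the 16.16 steps: a zero fractional part means an integer scale factor, and bit 16 then separates odd factors from even ones. A compilable illustration of those tests (example values only, not patch code):

// Classify a 16.16 step value the way the dispatch below does:
// zero fraction => integer scale factor; bit 16 clear => even factor.
#include <cstdio>

static void Classify(int dx) {
  if (dx & 0xffff) {
    printf("0x%x: fractional step, general scaler\n", dx);
  } else if (!(dx & 0x10000)) {
    printf("0x%x: even factor %d, fast even path\n", dx, dx >> 16);
  } else {
    printf("0x%x: odd factor %d, point sampling\n", dx, dx >> 16);
  }
}

int main() {
  Classify(0x20000);  // 2x down: even
  Classify(0x30000);  // 3x down: odd
  Classify(0x40000);  // 4x down: even (box filter candidate)
  Classify(0x18000);  // 1.5x down: fractional
  return 0;
}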
+ if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. e.g. 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { +#if HAS_SCALEUVDOWN2 + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleUVDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif +#if HAS_SCALEUVDOWN4BOX + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); + return; + } +#endif +#if HAS_SCALEUVDOWNEVEN + ScaleUVDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; +#endif + } + // Optimized odd scale down. e.g. 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; +#ifdef HAS_UVCOPY + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst, + dst_stride, clip_width, clip_height); + return; + } +#endif + } + } + } + // HAS_SCALEPLANEVERTICAL + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); + return; + } + +#if HAS_SCALEUVBILINEARUP + if (filtering && dy < 65536) { + ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif +#if HAS_SCALEUVBILINEARDOWN + if (filtering) { + ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif + ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); +} + +// Scale a UV image. 
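For orientation, a caller might use the entry point declared next to halve the interleaved chroma plane of an NV12 frame, along the lines of the sketch below. The dimensions, buffer handling, and the scale_uv.h include path are assumptions made for the example; only the argument order follows the UVScale declaration that follows.

// Sketch: shrink the UV (chroma) plane of a 1280x720 NV12 frame for a
// 640x360 output. NV12 chroma is subsampled 2x2 and stores two bytes
// (U, V) per sample, so the plane is 640x360 UV pixels in, 320x180 out.
#include <cstdint>
#include <vector>
#include "libyuv/scale_uv.h"  // assumed header exposing UVScale

bool HalveNV12Chroma(const uint8_t* src_uv) {
  const int src_w = 640, src_h = 360;
  const int dst_w = 320, dst_h = 180;
  std::vector<uint8_t> dst_uv(dst_w * dst_h * 2);
  // Strides are in bytes: two bytes per UV pixel. Returns 0 on success.
  return libyuv::UVScale(src_uv, src_w * 2, src_w, src_h, dst_uv.data(),
                         dst_w * 2, dst_w, dst_h,
                         libyuv::kFilterBilinear) == 0;
}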
+LIBYUV_API +int UVScale(const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { + return -1; + } + ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv, + dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/test.sh b/chromium/third_party/libyuv/source/test.sh new file mode 100755 index 00000000000..7f12c3c156e --- /dev/null +++ b/chromium/third_party/libyuv/source/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -x + +function runbenchmark1 { + perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 + perf report | grep AVX +} + +runbenchmark1 ABGRToI420 +runbenchmark1 Android420ToI420 +runbenchmark1 ARGBToI420 +runbenchmark1 Convert16To8Plane +runbenchmark1 ConvertToARGB +runbenchmark1 ConvertToI420 +runbenchmark1 CopyPlane +runbenchmark1 H010ToAB30 +runbenchmark1 H010ToAR30 +runbenchmark1 HalfFloatPlane +runbenchmark1 I010ToAB30 +runbenchmark1 I010ToAR30 +runbenchmark1 I420Copy +runbenchmark1 I420Psnr +runbenchmark1 I420Scale +runbenchmark1 I420Ssim +runbenchmark1 I420ToARGB +runbenchmark1 I420ToNV12 +runbenchmark1 I420ToUYVY +runbenchmark1 I422ToI420 +runbenchmark1 InitCpuFlags +runbenchmark1 J420ToARGB +runbenchmark1 NV12ToARGB +runbenchmark1 NV12ToI420 +runbenchmark1 NV12ToI420Rotate +runbenchmark1 SetCpuFlags +runbenchmark1 YUY2ToI420 diff --git a/chromium/third_party/libyuv/tools_libyuv/OWNERS b/chromium/third_party/libyuv/tools_libyuv/OWNERS index 2cb971d2b72..aae4fb6e021 100644 --- a/chromium/third_party/libyuv/tools_libyuv/OWNERS +++ b/chromium/third_party/libyuv/tools_libyuv/OWNERS @@ -1 +1,4 @@ -phoglund@chromium.org +mbonadei@chromium.org +fbarchard@chromium.org +pbos@chromium.org + diff --git a/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py b/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py index 2b95e302cdb..9b9660de0bb 100755 --- a/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py +++ b/chromium/third_party/libyuv/tools_libyuv/autoroller/roll_deps.py @@ -37,7 +37,7 @@ CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') -CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z]+)\'$') +CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$') ROLL_BRANCH_NAME = 'roll_chromium_revision' SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -69,6 +69,7 @@ def ParseDepsDict(deps_content): local_scope = {} global_scope = { 'Var': VarLookup(local_scope), + 'Str': lambda s: s, 'deps_os': {}, } exec(deps_content, global_scope, local_scope) @@ -274,7 +275,7 @@ def CalculateChangedClang(new_cr_rev): match = CLANG_REVISION_RE.match(line) if match: return match.group(1) - raise RollError('Could not parse Clang revision!') + raise RollError('Could not parse Clang revision from:\n' + '\n'.join(' ' + l for l in lines)) with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f: current_lines = f.readlines() diff 
--git a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS index 0a919805c2c..9b67a8f6789 100644 --- a/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS +++ b/chromium/third_party/libyuv/tools_libyuv/msan/OWNERS @@ -1,3 +1,3 @@ +mbonadei@chromium.org +fbarchard@chromium.org pbos@chromium.org -phoglund@chromium.org - diff --git a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS index da77b4ef23f..9b67a8f6789 100644 --- a/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS +++ b/chromium/third_party/libyuv/tools_libyuv/ubsan/OWNERS @@ -1,4 +1,3 @@ -pbos@webrtc.org -phoglund@webrtc.org +mbonadei@chromium.org fbarchard@chromium.org - +pbos@chromium.org diff --git a/chromium/third_party/libyuv/unit_test/color_test.cc b/chromium/third_party/libyuv/unit_test/color_test.cc index d440fc91bec..842fd994441 100644 --- a/chromium/third_party/libyuv/unit_test/color_test.cc +++ b/chromium/third_party/libyuv/unit_test/color_test.cc @@ -20,21 +20,19 @@ namespace libyuv { -// TODO(fbarchard): Port high accuracy YUV to RGB to Neon. -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define ERROR_R 1 -#define ERROR_G 1 -#define ERROR_B 3 +// TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB. +// Port to Visual C and other CPUs +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define ERROR_FULL 5 +#define ERROR_J420 4 +#else #define ERROR_FULL 6 #define ERROR_J420 6 -#else +#endif #define ERROR_R 1 #define ERROR_G 1 #define ERROR_B 3 -#define ERROR_FULL 5 -#define ERROR_J420 4 -#endif #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ TEST_F(LibYUVColorTest, TESTNAME) { \ @@ -187,6 +185,52 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) { *r = orig_pixels[2]; } +static void YUVHToRGB(int y, int u, int v, int* r, int* g, int* b) { + const int kWidth = 16; + const int kHeight = 1; + const int kPixels = kWidth * kHeight; + const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); + + SIMD_ALIGNED(uint8_t orig_y[16]); + SIMD_ALIGNED(uint8_t orig_u[8]); + SIMD_ALIGNED(uint8_t orig_v[8]); + SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); + memset(orig_y, y, kPixels); + memset(orig_u, u, kHalfPixels); + memset(orig_v, v, kHalfPixels); + + /* YUV converted to ARGB. */ + H422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, + orig_pixels, kWidth * 4, kWidth, kHeight); + + *b = orig_pixels[0]; + *g = orig_pixels[1]; + *r = orig_pixels[2]; +} + +static void YUVRec2020ToRGB(int y, int u, int v, int* r, int* g, int* b) { + const int kWidth = 16; + const int kHeight = 1; + const int kPixels = kWidth * kHeight; + const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); + + SIMD_ALIGNED(uint8_t orig_y[16]); + SIMD_ALIGNED(uint8_t orig_u[8]); + SIMD_ALIGNED(uint8_t orig_v[8]); + SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); + memset(orig_y, y, kPixels); + memset(orig_u, u, kHalfPixels); + memset(orig_v, v, kHalfPixels); + + /* YUV converted to ARGB. 
*/ + U422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, + orig_pixels, kWidth * 4, kWidth, kHeight); + + *b = orig_pixels[0]; + *g = orig_pixels[1]; + *r = orig_pixels[2]; +} + static void YToRGB(int y, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; @@ -335,18 +379,41 @@ TEST_F(LibYUVColorTest, TestRoundToByte) { EXPECT_LE(allb, 255); } +// BT.601 YUV to RGB reference static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) { *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596); *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813); *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018); } +// JPEG YUV to RGB reference static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) { *r = RoundToByte(y - (v - 128) * -1.40200); *g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414); *b = RoundToByte(y - (u - 128) * -1.77200); } +// BT.709 YUV to RGB reference +// See also http://www.equasys.de/colorconversion.html +static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) { + *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793); + *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533); + *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112); +} + +// BT.2020 YUV to RGB reference +static void YUVRec2020ToRGBReference(int y, + int u, + int v, + int* r, + int* g, + int* b) { + *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867); + *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 - + (v - 128) * 0.65042); + *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177); +} + TEST_F(LibYUVColorTest, TestYUV) { int r0, g0, b0, r1, g1, b1; @@ -473,7 +540,11 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) { // Step by 5 on inner loop goes from 0 to 255 inclusive. // Set to 1 for better coverage. 3, 5 or 17 for faster testing. +#ifdef ENABLE_SLOW_TESTS +#define FASTSTEP 1 +#else #define FASTSTEP 5 +#endif TEST_F(LibYUVColorTest, TestFullYUV) { int rh[256] = { 0, @@ -531,6 +602,66 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) { } PrintHistogram(rh, gh, bh); } + +TEST_F(LibYUVColorTest, TestFullYUVH) { + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; + for (int u = 0; u < 256; ++u) { + for (int v = 0; v < 256; ++v) { + for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { + int r0, g0, b0, r1, g1, b1; + int y = RANDOM256(y2); + YUVHToRGBReference(y, u, v, &r0, &g0, &b0); + YUVHToRGB(y, u, v, &r1, &g1, &b1); + EXPECT_NEAR(r0, r1, ERROR_R); + EXPECT_NEAR(g0, g1, ERROR_G); + // TODO(crbug.com/libyuv/862): Reduce the errors in the B channel. + EXPECT_NEAR(b0, b1, 15); + ++rh[r1 - r0 + 128]; + ++gh[g1 - g0 + 128]; + ++bh[b1 - b0 + 128]; + } + } + } + PrintHistogram(rh, gh, bh); +} + +TEST_F(LibYUVColorTest, TestFullYUVRec2020) { + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; + for (int u = 0; u < 256; ++u) { + for (int v = 0; v < 256; ++v) { + for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { + int r0, g0, b0, r1, g1, b1; + int y = RANDOM256(y2); + YUVRec2020ToRGBReference(y, u, v, &r0, &g0, &b0); + YUVRec2020ToRGB(y, u, v, &r1, &g1, &b1); + EXPECT_NEAR(r0, r1, ERROR_R); + EXPECT_NEAR(g0, g1, ERROR_G); + // TODO(crbug.com/libyuv/863): Reduce the errors in the B channel. 
+ EXPECT_NEAR(b0, b1, 18); + ++rh[r1 - r0 + 128]; + ++gh[g1 - g0 + 128]; + ++bh[b1 - b0 + 128]; + } + } + } + PrintHistogram(rh, gh, bh); +} #undef FASTSTEP TEST_F(LibYUVColorTest, TestGreyYUVJ) { diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc index fb9632a94fb..59a9480d679 100644 --- a/chromium/third_party/libyuv/unit_test/convert_test.cc +++ b/chromium/third_party/libyuv/unit_test/convert_test.cc @@ -31,18 +31,10 @@ #include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */ #endif -#if defined(__arm__) || defined(__aarch64__) -// arm version subsamples by summing 4 pixels then multiplying by matrix with -// 4x smaller coefficients which are rounded to nearest integer. -#define ARM_YUV_ERROR 4 -#else -#define ARM_YUV_ERROR 0 -#endif - // Some functions fail on big endian. Enable these tests on all cpus except -// PowerPC -#if !defined(__powerpc__) -#define LITTLE_ENDIAN_TEST 1 +// PowerPC, but they are not optimized so disabled by default. +#if !defined(__powerpc__) && defined(ENABLE_SLOW_TESTS) +#define LITTLE_ENDIAN_ONLY_TEST 1 #endif namespace libyuv { @@ -224,41 +216,23 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2) dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ - EXPECT_EQ(0, max_diff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = abs( \ - static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>( \ - dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ - EXPECT_LE(max_diff, 3); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = abs( \ - static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>( \ - dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ - EXPECT_LE(max_diff, 3); \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_u_c); \ free_aligned_buffer_page_end(dst_v_c); \ @@ -353,30 +327,17 @@ int I400ToNV21(const uint8_t* src_y, src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, 
SUBSAMP_X) * 2; ++j) { \ - int abs_diff = \ - abs(static_cast<int>( \ - dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ - static_cast<int>( \ - dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j], \ + dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ @@ -400,6 +361,7 @@ int I400ToNV21(const uint8_t* src_y, TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2) TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2) TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2) +TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2) TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2) TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) @@ -447,32 +409,19 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \ NEG kHeight); \ } \ - int max_diff = 0; \ if (DOY) { \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = \ - abs(static_cast<int>( \ - dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>( \ - dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ @@ -496,6 +445,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0) TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) +TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \ @@ -550,43 +500,25 @@ TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ if (DOY) { \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = abs( \ - static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>( \ - dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < 
SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = abs( \ - static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>( \ - dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ + dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ - EXPECT_LE(max_diff, 1); \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_u_c); \ free_aligned_buffer_page_end(dst_v_c); \ @@ -691,7 +623,7 @@ TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1) -#ifdef LITTLE_ENDIAN_TEST +#ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1) @@ -723,13 +655,13 @@ TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1) TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1) -#ifdef LITTLE_ENDIAN_TEST +#ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1) TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) #endif #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \ + YALIGN, W1280, N, NEG, OFF, ATTEN) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -764,15 +696,9 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \ ATTEN); \ } \ - int max_diff = 0; \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \ - static_cast<int>(dst_argb_opt[i + OFF])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ @@ -782,23 +708,48 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) } #define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, DIFF) \ + YALIGN) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \ + YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \ + YALIGN, benchmark_width_, _Invert, -, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \ + YALIGN, benchmark_width_, _Opt, +, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1) - -TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) -TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) + YALIGN, benchmark_width_, _Premult, +, 0, 1) + +#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) + +TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ - BPP_B, W1280, DIFF, N, NEG, OFF) \ + BPP_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ @@ -837,18 +788,12 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) kHeight); \ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ kHeight); \ - int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth * 4; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \ - static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j], \ + dst_argb32_opt[i * kWidth * 4 + j]); \ } \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ free_aligned_buffer_page_end(dst_argb_c); \ @@ -857,95 +802,62 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - DIFF) \ +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ + benchmark_width_ - 4, _Any, +, 0) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ + benchmark_width_, _Unaligned, +, 1) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ + benchmark_width_, _Invert, -, 0) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Opt, +, 0) - -TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2) -TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2) -TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2) -TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2) -TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2) -TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2) -TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2) -TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2) -#ifdef LITTLE_ENDIAN_TEST -TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9) + benchmark_width_, _Opt, +, 0) + +#define JNV12ToARGB(a, b, c, d, e, f, g, h) \ + NV12ToARGBMatrix(a, b, c, d, e, f, 
&kYuvJPEGConstants, g, h) +#define JNV21ToARGB(a, b, c, d, e, f, g, h) \ + NV21ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) +#define JNV12ToABGR(a, b, c, d, e, f, g, h) \ + NV21ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) +#define JNV21ToABGR(a, b, c, d, e, f, g, h) \ + NV12ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) +#define JNV12ToRGB24(a, b, c, d, e, f, g, h) \ + NV12ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) +#define JNV21ToRGB24(a, b, c, d, e, f, g, h) \ + NV21ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) +#define JNV12ToRAW(a, b, c, d, e, f, g, h) \ + NV21ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) +#define JNV21ToRAW(a, b, c, d, e, f, g, h) \ + NV12ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) +#define JNV12ToRGB565(a, b, c, d, e, f, g, h) \ + NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) + +TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4) +TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4) +TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4) +TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4) +TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3) +TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3) +TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3) +TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2) +#endif + +TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4) +TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4) +TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4) +TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4) +TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3) +TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3) +TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3) +TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3) +TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) #endif -TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2) -#ifdef DO_THREE_PLANES -// Do 3 allocations for yuv. conventional but slower. -#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - W1280, DIFF, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ - align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_u_opt, \ - kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_opt, \ - kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kStride; ++j) \ - src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \ - kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ - dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \ - kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \ - static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \ - static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \ - static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_argb); \ - } -#else #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - W1280, DIFF, N, NEG, OFF) \ + W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -977,14 +889,12 @@ TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2) } \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \ - static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ for (int j = 0; j < kStrideUV; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \ - static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \ + EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ } \ } \ free_aligned_buffer_page_end(dst_y_c); \ @@ -993,43 +903,39 @@ TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2) free_aligned_buffer_page_end(dst_uv_opt); \ free_aligned_buffer_page_end(src_argb); \ } -#endif -#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - DIFF) \ +#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ + benchmark_width_ - 4, _Any, +, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ + benchmark_width_, _Unaligned, +, 1) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, DIFF, _Invert, -, 0) \ + benchmark_width_, _Invert, -, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, DIFF, _Opt, +, 0) - -TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) -TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) -TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR) -TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR) -#ifdef LITTLE_ENDIAN_TEST -TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) -TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) -#endif -TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) -TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) -TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) -TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) -TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR) -#ifdef LITTLE_ENDIAN_TEST -TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) + benchmark_width_, _Opt, +, 0) + +TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2) +TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2) +TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) +TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) +TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) +TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) +TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) +TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2) #endif -TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2) -TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2) -TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) -TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) +TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) +TESTATOPLANAR(I400, 1, 1, I420, 2, 2) +TESTATOPLANAR(J400, 1, 1, J420, 2, 2) +TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) +TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2) +TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2) +TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2) +TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2) +TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1) +TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2) +TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) #define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, 
SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ @@ -1060,28 +966,17 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ - EXPECT_LE(max_diff, 4); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < kStrideUV * 2; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \ - static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \ + dst_uv_opt[i * kStrideUV * 2 + j]); \ } \ } \ - EXPECT_LE(max_diff, 4); \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ @@ -1109,7 +1004,7 @@ TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, W1280, DIFF, N, NEG, OFF) \ + HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ @@ -1135,22 +1030,16 @@ TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \ kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ - STRIDE_B, HEIGHT_B, DIFF) \ + STRIDE_B, HEIGHT_B) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ for (int times = 0; times < benchmark_iterations_; ++times) { \ const int kWidth = (fastrand() & 63) + 1; \ @@ -1176,7 +1065,7 @@ TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \ kHeight); \ for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ @@ -1185,76 +1074,79 @@ TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) } #define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, DIFF) \ + HEIGHT_B) \ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \ + HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \ + HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \ + HEIGHT_B, benchmark_width_, _Invert, -, 0) \ 
TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \ + HEIGHT_B, benchmark_width_, _Opt, +, 0) \ TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, DIFF) + HEIGHT_B) -// TODO(fbarchard): make ARM version of C code that matches NEON. -TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0) -TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST -TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0) +TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1) #endif -TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST -TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0) +TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1) #endif -TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST -TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0) -TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST -TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 0) -TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 0) -TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1, 0) -TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0) -TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) -#ifdef LITTLE_ENDIAN_TEST -TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1) +TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1) +TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1) +TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1) +TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1) +TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1) +TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1) +TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1) +TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1) +TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1) +TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif -TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4) -TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) -TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0) -TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0) -TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0) -TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0) -TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) -TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0) -TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0) -TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0) -TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1, 0) -TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0) -TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0) -TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0) -#ifdef LITTLE_ENDIAN_TEST -TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1) +TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1) +TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1) // 4 +TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1) +TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1) +TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1) 
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1) +TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1) +TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1) +TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1) +TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1) +TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1) +TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1) +TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1) +TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1) +TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1) +TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1) +TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1) #endif -TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR) -TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR) -TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) +TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1) +TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1) +TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, W1280, DIFF, N, NEG, OFF) \ + HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ @@ -1280,22 +1172,16 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \ kStrideB, NULL, kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ - STRIDE_B, HEIGHT_B, DIFF) \ + STRIDE_B, HEIGHT_B) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \ for (int times = 0; times < benchmark_iterations_; ++times) { \ const int kWidth = (fastrand() & 63) + 1; \ @@ -1320,15 +1206,9 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) MaskCpuFlags(benchmark_cpu_info_); \ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \ NULL, kWidth, kHeight); \ - int max_diff = 0; \ for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ @@ -1336,20 +1216,20 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) } #define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, DIFF) \ + HEIGHT_B) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \ + HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \ + HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \ + HEIGHT_B, benchmark_width_, _Invert, -, 0) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, 
\ - HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \ + HEIGHT_B, benchmark_width_, _Opt, +, 0) \ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, DIFF) + HEIGHT_B) -#ifdef LITTLE_ENDIAN_TEST -TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif #define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \ @@ -1930,6 +1810,65 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { free_aligned_buffer_page_end(dst_vu); } +TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + + // Convert to NV12 + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect success + EXPECT_EQ(0, ret); + + // Convert to I420 + align_buffer_page_end(dst2_y, width * height); + align_buffer_page_end(dst2_u, half_width * half_height); + align_buffer_page_end(dst2_v, half_width * half_height); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width, + dst2_v, half_width, width, height, width, height); + } + // Expect success + EXPECT_EQ(0, ret); + + // Convert I420 to NV12 + align_buffer_page_end(dst3_y, width * height); + align_buffer_page_end(dst3_uv, half_width * half_height * 2); + + I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y, + width, dst3_uv, half_width * 2, width, height); + + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_y[i], dst3_y[i]); + } + for (int i = 0; i < half_width * half_height * 2; ++i) { + EXPECT_EQ(dst_uv[i], dst3_uv[i]); + } + + free_aligned_buffer_page_end(dst3_y); + free_aligned_buffer_page_end(dst3_uv); + + free_aligned_buffer_page_end(dst2_y); + free_aligned_buffer_page_end(dst2_u); + free_aligned_buffer_page_end(dst2_v); + + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { int width = 0; int height = 0; @@ -1960,6 +1899,40 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { free_aligned_buffer_page_end(dst_uv); } +TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) { + int width = 0; + int height = 0; + int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height); + EXPECT_EQ(0, ret); + + int half_width = (width + 1) / 2; + int half_height = (height + 1) / 2; + int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * + benchmark_height_ / (width * height); + + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_uv, half_width * half_height * 2); + for (int times = 0; times < benchmark_iterations; ++times) { + ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv, + half_width * 2, width, height, width, height); + } + // Expect success + EXPECT_EQ(0, ret); + + // Test result matches known hash value. Hashes are for VU so flip the plane. 
 TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
   int width = 0;
   int height = 0;
@@ -1960,6 +1899,40 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
   free_aligned_buffer_page_end(dst_uv);
 }
 
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
+  int width = 0;
+  int height = 0;
+  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+  EXPECT_EQ(0, ret);
+
+  int half_width = (width + 1) / 2;
+  int half_height = (height + 1) / 2;
+  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+                             benchmark_height_ / (width * height);
+
+  align_buffer_page_end(dst_y, width * height);
+  align_buffer_page_end(dst_uv, half_width * half_height * 2);
+  for (int times = 0; times < benchmark_iterations; ++times) {
+    ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+                     half_width * 2, width, height, width, height);
+  }
+  // Expect success
+  EXPECT_EQ(0, ret);
+
+  // Test result matches known hash value. Hashes are for VU so flip the plane.
+  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+  align_buffer_page_end(dst_vu, half_width * half_height * 2);
+  SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+              half_height);
+  uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_vu_hash, 1069662856u);
+
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_uv);
+  free_aligned_buffer_page_end(dst_vu);
+}
+
 TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
   int width = 0;
   int height = 0;
@@ -1990,6 +1963,40 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
   free_aligned_buffer_page_end(dst_uv);
 }
 
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_422) {
+  int width = 0;
+  int height = 0;
+  int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+  EXPECT_EQ(0, ret);
+
+  int half_width = (width + 1) / 2;
+  int half_height = (height + 1) / 2;
+  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+                             benchmark_height_ / (width * height);
+
+  align_buffer_page_end(dst_y, width * height);
+  align_buffer_page_end(dst_uv, half_width * half_height * 2);
+  for (int times = 0; times < benchmark_iterations; ++times) {
+    ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+                     half_width * 2, width, height, width, height);
+  }
+  // Expect success
+  EXPECT_EQ(0, ret);
+
+  // Test result matches known hash value. Hashes are for VU so flip the plane.
+  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+  align_buffer_page_end(dst_vu, half_width * half_height * 2);
+  SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+              half_height);
+  uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_vu_hash, 3543430771u);
+
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_uv);
+  free_aligned_buffer_page_end(dst_vu);
+}
+
 TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
   int width = 0;
   int height = 0;
@@ -2020,6 +2027,40 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
   free_aligned_buffer_page_end(dst_uv);
 }
 
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
+  int width = 0;
+  int height = 0;
+  int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+  EXPECT_EQ(0, ret);
+
+  int half_width = (width + 1) / 2;
+  int half_height = (height + 1) / 2;
+  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+                             benchmark_height_ / (width * height);
+
+  align_buffer_page_end(dst_y, width * height);
+  align_buffer_page_end(dst_uv, half_width * half_height * 2);
+  for (int times = 0; times < benchmark_iterations; ++times) {
+    ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+                     half_width * 2, width, height, width, height);
+  }
+  // Expect success
+  EXPECT_EQ(0, ret);
+
+  // Test result matches known hash value. Hashes are for VU so flip the plane.
+  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+  align_buffer_page_end(dst_vu, half_width * half_height * 2);
+  SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+              half_height);
+  uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+  EXPECT_EQ(dst_y_hash, 330644005u);
+  EXPECT_EQ(dst_vu_hash, 135214341u);
+
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_uv);
+  free_aligned_buffer_page_end(dst_vu);
+}
+
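Each NV12 test above reuses the golden hashes from its NV21 twin: the decoded UV plane is flipped to VU order with SwapUVPlane and then hashed with HashDjb2 using the djb2 seed 5381, so one reference value covers both byte orders. The same idiom as a standalone sketch; the helper name and scratch-buffer contract are illustrative, while the library calls are from libyuv/planar_functions.h and libyuv/compare.h.

  #include <stdint.h>
  #include "libyuv/compare.h"           // HashDjb2
  #include "libyuv/planar_functions.h"  // SwapUVPlane

  // Hash an NV12 chroma plane as if it were NV21. half_width/half_height are
  // the chroma plane dimensions; vu_scratch must hold
  // half_width * 2 * half_height bytes.
  uint32_t HashUVAsVU(const uint8_t* uv, int half_width, int half_height,
                      uint8_t* vu_scratch) {
    libyuv::SwapUVPlane(uv, half_width * 2, vu_scratch, half_width * 2,
                        half_width, half_height);
    return libyuv::HashDjb2(
        vu_scratch, static_cast<uint64_t>(half_width) * 2 * half_height,
        5381);  // djb2's conventional starting seed.
  }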
 TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
   int width = 0;
   int height = 0;
@@ -2050,6 +2091,40 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
   free_aligned_buffer_page_end(dst_uv);
 }
 
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
+  int width = 0;
+  int height = 0;
+  int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+  EXPECT_EQ(0, ret);
+
+  int half_width = (width + 1) / 2;
+  int half_height = (height + 1) / 2;
+  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+                             benchmark_height_ / (width * height);
+
+  align_buffer_page_end(dst_y, width * height);
+  align_buffer_page_end(dst_uv, half_width * half_height * 2);
+  for (int times = 0; times < benchmark_iterations; ++times) {
+    ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+                     half_width * 2, width, height, width, height);
+  }
+  // Expect success
+  EXPECT_EQ(0, ret);
+
+  // Test result matches known hash value. Hashes are for VU so flip the plane.
+  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+  align_buffer_page_end(dst_vu, half_width * half_height * 2);
+  SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+              half_height);
+  uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_vu_hash, 506143297u);
+
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_uv);
+  free_aligned_buffer_page_end(dst_vu);
+}
+
 TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
   int width = 0;
   int height = 0;
@@ -2376,7 +2451,7 @@ TEST_F(LibYUVConvertTest, TestDither) {
 }
 
 #define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
-                        YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C)        \
+                        YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C)              \
   TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) {                \
     const int kWidth = ((W1280) > 0) ? (W1280) : 1;                            \
     const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
@@ -2407,7 +2482,6 @@ TEST_F(LibYUVConvertTest, TestDither) {
         src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV,   \
         dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight);              \
     }                                                                          \
-    int max_diff = 0;                                                          \
     /* Convert to ARGB so 565 is expanded to bytes that can be compared.
*/ \ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \ @@ -2418,13 +2492,8 @@ TEST_F(LibYUVConvertTest, TestDither) { FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \ kWidth * BPP_C, kWidth, kHeight); \ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ - int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \ - static_cast<int>(dst_argb32_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ @@ -2434,22 +2503,21 @@ TEST_F(LibYUVConvertTest, TestDither) { free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, DIFF, FMT_C, BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \ - BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \ - BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) - -#ifdef LITTLE_ENDIAN_TEST -TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) +#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) + +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) #endif + #define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ TEST_F(LibYUVConvertTest, NAME) { \ const int kWidth = benchmark_width_; \ @@ -2591,7 +2659,7 @@ TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3) TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3) TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4) -#ifdef LITTLE_ENDIAN_TEST +#ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2) @@ -2688,6 +2756,12 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) +TESTQPLANARTOE(J420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTQPLANARTOE(H420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) +TESTQPLANARTOE(H420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) +TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) #define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \ OFF, FMT_C, BPP_C) \ @@ -2738,7 +2812,7 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 
4) _Opt, +, 0, FMT_C, BPP_C) // Caveat: Destination needs to be 4 bytes -#ifdef LITTLE_ENDIAN_TEST +#ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4) TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4) TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4) @@ -2854,7 +2928,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { // TODO(fbarchard): Fix clamping issue affected by U channel. #define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \ + ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -2890,15 +2964,9 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ } \ - int max_diff = 0; \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \ - static_cast<int>(dst_argb_opt[i + DOFF])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ @@ -2907,41 +2975,41 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { } #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, DIFF) \ + YALIGN) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \ + YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1) \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \ + YALIGN, benchmark_width_, _Invert, -, 0, 0) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) - -TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2) -TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2) -TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2) -TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2) -TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1, 2) -TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1, 2) -TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1, 2) -TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1, 2) -TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 2) -TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 2) -TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 2) -TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 2) -#ifdef LITTLE_ENDIAN_TEST -TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2) -TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2) -TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2) -TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2) -TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1, 2) -TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1, 2) -TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1, 2) -TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1, 2) -TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1, 2) -TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1, 2) -TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1, 2) -TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1, 2) + YALIGN, benchmark_width_, _Opt, +, 0, 0) + +TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, ARGB, 
4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1) #endif static int Clamp(int y) { @@ -3300,4 +3368,66 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB24) { free_aligned_buffer_page_end(dest_rgb24); } +// Test I400 with jpeg matrix is same as J400 +TEST_F(LibYUVConvertTest, TestI400) { + const int kSize = 256; + align_buffer_page_end(orig_i400, kSize); + align_buffer_page_end(argb_pixels_i400, kSize * 4); + align_buffer_page_end(argb_pixels_j400, kSize * 4); + align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4); + align_buffer_page_end(argb_pixels_h709_i400, kSize * 4); + align_buffer_page_end(argb_pixels_2020_i400, kSize * 4); + + // Test grey scale + for (int i = 0; i < kSize; ++i) { + orig_i400[i] = i; + } + + J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1); + I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1); + I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants, + kSize, 1); + I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants, + kSize, 1); + I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants, + kSize, 1); + + EXPECT_EQ(0, argb_pixels_i400[0]); + EXPECT_EQ(0, argb_pixels_j400[0]); + EXPECT_EQ(0, argb_pixels_jpeg_i400[0]); + EXPECT_EQ(0, argb_pixels_h709_i400[0]); + EXPECT_EQ(0, argb_pixels_2020_i400[0]); + EXPECT_EQ(0, argb_pixels_i400[16 * 4]); + EXPECT_EQ(16, argb_pixels_j400[16 * 4]); + EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]); + EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]); + EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]); + EXPECT_EQ(130, argb_pixels_i400[128 * 4]); + EXPECT_EQ(128, argb_pixels_j400[128 * 4]); + EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]); + EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]); + EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]); + EXPECT_EQ(255, argb_pixels_i400[255 * 4]); + EXPECT_EQ(255, argb_pixels_j400[255 * 4]); + EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]); + EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]); + EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]); + + for (int i = 0; i < kSize * 4; ++i) { + if ((i & 3) == 3) { + EXPECT_EQ(255, argb_pixels_j400[i]); + } else { + EXPECT_EQ(i / 4, argb_pixels_j400[i]); + } + EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]); + } + + free_aligned_buffer_page_end(orig_i400); + free_aligned_buffer_page_end(argb_pixels_i400); + free_aligned_buffer_page_end(argb_pixels_j400); + free_aligned_buffer_page_end(argb_pixels_jpeg_i400); + free_aligned_buffer_page_end(argb_pixels_h709_i400); + free_aligned_buffer_page_end(argb_pixels_2020_i400); +} + } // namespace libyuv diff --git 
a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc index bc7af2f1574..7264de08016 100644 --- a/chromium/third_party/libyuv/unit_test/cpu_test.cc +++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc @@ -160,6 +160,23 @@ TEST_F(LibYUVBaseTest, TestLinuxNeon) { #endif } +TEST_F(LibYUVBaseTest, TestLinuxMipsMsaMmi) { + if (FileExists("../../unit_test/testdata/mips.txt")) { + printf("Note: testing to load \"../../unit_test/testdata/mips.txt\"\n"); + + EXPECT_EQ(0, MipsCpuCaps("../../unit_test/testdata/mips.txt")); + EXPECT_EQ(kCpuHasMMI, + MipsCpuCaps("../../unit_test/testdata/mips_loongson3.txt")); + EXPECT_EQ(kCpuHasMMI, + MipsCpuCaps("../../unit_test/testdata/mips_loongson_mmi.txt")); + EXPECT_EQ(kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_msa.txt")); + EXPECT_EQ(kCpuHasMMI | kCpuHasMSA, + MipsCpuCaps("../../unit_test/testdata/mips_loongson2k.txt")); + } else { + printf("WARNING: unable to load \"../../unit_test/testdata/mips.txt\"\n"); + } +} + // TODO(fbarchard): Fix clangcl test of cpuflags. #ifdef _MSC_VER TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) { diff --git a/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc b/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc index 59061b98e0b..69aab74e7c8 100644 --- a/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc +++ b/chromium/third_party/libyuv/unit_test/cpu_thread_test.cc @@ -12,7 +12,7 @@ #include "libyuv/cpu_id.h" -#if defined(__clang__) +#if defined(__clang__) && !defined(__wasm__) #if __has_include(<pthread.h>) #define LIBYUV_HAVE_PTHREAD 1 #endif @@ -30,7 +30,7 @@ namespace libyuv { void* ThreadMain(void* arg) { int* flags = static_cast<int*>(arg); - *flags = TestCpuFlag(kCpuHasSSSE3); + *flags = TestCpuFlag(kCpuInitialized); return nullptr; } #endif // LIBYUV_HAVE_PTHREAD diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc index 02cd1fbc39d..e05ff15640c 100644 --- a/chromium/third_party/libyuv/unit_test/planar_test.cc +++ b/chromium/third_party/libyuv/unit_test/planar_test.cc @@ -21,6 +21,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/scale.h" #ifdef ENABLE_ROW_TESTS // row.h defines SIMD_ALIGNED, overriding unit_test.h @@ -781,27 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) { } } -TEST_F(LibYUVPlanarTest, TestARGBMirror) { - SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); - SIMD_ALIGNED(uint8_t dst_pixels[1280][4]); +TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) { + align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4); + align_buffer_page_end(dst_pixels_opt, + benchmark_width_ * benchmark_height_ * 4); + align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4); - for (int i = 0; i < 1280; ++i) { - orig_pixels[i][0] = i; - orig_pixels[i][1] = i / 2; - orig_pixels[i][2] = i / 3; - orig_pixels[i][3] = i / 4; + MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4); + MaskCpuFlags(disable_cpu_flags_); + ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); + } + for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - 
ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1); + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} - for (int i = 0; i < 1280; ++i) { - EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]); - EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]); - EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]); - EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]); +TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) { + align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_); + align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_); + align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_); + + MemRandomize(src_pixels, benchmark_width_ * benchmark_height_); + MaskCpuFlags(disable_cpu_flags_); + MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_, + benchmark_width_, benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_, + benchmark_width_, benchmark_height_); } - for (int i = 0; i < benchmark_pixels_div1280_; ++i) { - ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1); + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + +TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) { + align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2); + align_buffer_page_end(dst_pixels_opt, + benchmark_width_ * benchmark_height_ * 2); + align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2); + + MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2); + MaskCpuFlags(disable_cpu_flags_); + MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c, + benchmark_width_ * 2, benchmark_width_, benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt, + benchmark_width_ * 2, benchmark_width_, benchmark_height_); + } + for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, TestShade) { @@ -1076,7 +1125,8 @@ static int TestBlend(int width, int disable_cpu_flags, int benchmark_cpu_info, int invert, - int off) { + int off, + int attenuate) { if (width < 1) { width = 1; } @@ -1090,10 +1140,12 @@ static int TestBlend(int width, src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); } - ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width, - height); - ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width, - height); + MemRandomize(src_argb_a, kStride * height + off); + MemRandomize(src_argb_b, kStride * height + off); + if (attenuate) { + ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width, + height); + } memset(dst_argb_c, 255, kStride * height); memset(dst_argb_opt, 255, kStride * height); @@ -1123,28 +1175,35 @@ static int TestBlend(int width, TEST_F(LibYUVPlanarTest, ARGBBlend_Any) { int max_diff = 
TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1); + EXPECT_LE(max_diff, 1); +} + +TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) { + int max_diff = + TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } @@ -3234,33 +3293,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width); extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width); TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { - SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]); - SIMD_ALIGNED(uint16_t dst_pixels_c[640]); - SIMD_ALIGNED(uint16_t dst_pixels_opt[640]); + SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]); + SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); + SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); - for (int i = 0; i < 640 + 4; ++i) { + for (int i = 0; i < 1280 + 8; ++i) { orig_pixels[i] = i * 256; } - GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640); - for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) { + GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { - GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640); + GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); } else { - GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640); + GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); } #else - GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640); + GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); #endif } - for (int i = 0; i < 640; ++i) { + for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } @@ -3286,48 +3345,115 @@ extern "C" void GaussCol_C(const uint16_t* src0, int width); TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { - SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]); - SIMD_ALIGNED(uint32_t dst_pixels_c[640]); - SIMD_ALIGNED(uint32_t dst_pixels_opt[640]); + SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]); + SIMD_ALIGNED(uint32_t dst_pixels_c[1280]); + SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); - for (int i = 0; i < 640 * 5; ++i) { - orig_pixels[i] = i; + for (int i = 0; i < 1280 * 5; ++i) { + orig_pixels[i] = static_cast<float>(i); } - 
GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2], - &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0], - 640); - for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) { + GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0], + 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { - GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2], - &orig_pixels[640 * 3], &orig_pixels[640 * 4], - &dst_pixels_opt[0], 640); + GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); } else { - GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2], - &orig_pixels[640 * 3], &orig_pixels[640 * 4], - &dst_pixels_opt[0], 640); + GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); } #else - GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2], - &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0], - 640); + GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); #endif } - for (int i = 0; i < 640; ++i) { + for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } +} - EXPECT_EQ(dst_pixels_c[0], - static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 + - 640 * 4 * 1)); - EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704)); +TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) { + SIMD_ALIGNED(float orig_pixels[1280 + 4]); + SIMD_ALIGNED(float dst_pixels_c[1280]); + SIMD_ALIGNED(float dst_pixels_opt[1280]); + + memset(orig_pixels, 0, sizeof(orig_pixels)); + memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); + memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); + + for (int i = 0; i < 1280 + 4; ++i) { + orig_pixels[i] = static_cast<float>(i); + } + GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + int has_neon = TestCpuFlag(kCpuHasNEON); + if (has_neon) { + GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); + } else { + GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); + } +#else + GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); +#endif + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } +} + +TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) { + SIMD_ALIGNED(float dst_pixels_c[1280]); + SIMD_ALIGNED(float dst_pixels_opt[1280]); + align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows + float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf); + + memset(orig_pixels, 0, 1280 * 5 * 4); + memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); + memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); + + for (int i = 0; i < 1280 * 5; ++i) { + orig_pixels[i] = static_cast<float>(i); + } + GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_c[0], 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + 
int has_neon = TestCpuFlag(kCpuHasNEON); + if (has_neon) { + GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280], + &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], + &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); + } else { + GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], + &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], + &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); + } +#else + GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); +#endif + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + free_aligned_buffer_page_end(orig_pixels_buf); } TEST_F(LibYUVPlanarTest, SwapUVRow) { @@ -3360,6 +3486,144 @@ TEST_F(LibYUVPlanarTest, SwapUVRow) { free_aligned_buffer_page_end(src_pixels_vu); free_aligned_buffer_page_end(dst_pixels_uv); } -#endif +#endif // ENABLE_ROW_TESTS + +TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) { + const int kSize = benchmark_width_ * benchmark_height_ * 4; + align_buffer_page_end(orig_pixels, kSize); + align_buffer_page_end(dst_pixels_opt, kSize); + align_buffer_page_end(dst_pixels_c, kSize); + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f; + } + memset(dst_pixels_opt, 1, kSize); + memset(dst_pixels_c, 2, kSize); + + MaskCpuFlags(disable_cpu_flags_); + GaussPlane_F32((const float*)(orig_pixels), benchmark_width_, + (float*)(dst_pixels_c), benchmark_width_, benchmark_width_, + benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + GaussPlane_F32((const float*)(orig_pixels), benchmark_width_, + (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_, + benchmark_height_); + } + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f) + << i; + } + + free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) { + int dst_width = (benchmark_width_ + 1) / 2; + int dst_height = (benchmark_height_ + 1) / 2; + align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_); + align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_); + align_buffer_page_end(tmp_pixels_u, dst_width * dst_height); + align_buffer_page_end(tmp_pixels_v, dst_width * dst_height); + align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height); + align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height); + + MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_); + MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_); + MemRandomize(tmp_pixels_u, dst_width * dst_height); + MemRandomize(tmp_pixels_v, dst_width * dst_height); + MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height); + MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height); + + MaskCpuFlags(disable_cpu_flags_); + HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, + benchmark_width_, dst_pixels_uv_c, dst_width * 2, + benchmark_width_, benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, + benchmark_width_, dst_pixels_uv_opt, dst_width * 2, + benchmark_width_, benchmark_height_); + } + + for (int i = 0; i < dst_width * 2 * dst_height; 
++i) { + EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]); + } + + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); + free_aligned_buffer_page_end(tmp_pixels_u); + free_aligned_buffer_page_end(tmp_pixels_v); + free_aligned_buffer_page_end(dst_pixels_uv_opt); + free_aligned_buffer_page_end(dst_pixels_uv_c); +} + +TEST_F(LibYUVPlanarTest, NV12Copy) { + const int halfwidth = (benchmark_width_ + 1) >> 1; + const int halfheight = (benchmark_height_ + 1) >> 1; + align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(src_uv, halfwidth * 2 * halfheight); + align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight); + + MemRandomize(src_y, benchmark_width_ * benchmark_height_); + MemRandomize(src_uv, halfwidth * 2 * halfheight); + MemRandomize(dst_y, benchmark_width_ * benchmark_height_); + MemRandomize(dst_uv, halfwidth * 2 * halfheight); + + for (int i = 0; i < benchmark_iterations_; ++i) { + NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y, + benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + EXPECT_EQ(src_y[i], dst_y[i]); + } + for (int i = 0; i < halfwidth * 2 * halfheight; ++i) { + EXPECT_EQ(src_uv[i], dst_uv[i]); + } + + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_uv); + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVPlanarTest, NV21Copy) { + const int halfwidth = (benchmark_width_ + 1) >> 1; + const int halfheight = (benchmark_height_ + 1) >> 1; + align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(src_vu, halfwidth * 2 * halfheight); + align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight); + + MemRandomize(src_y, benchmark_width_ * benchmark_height_); + MemRandomize(src_vu, halfwidth * 2 * halfheight); + MemRandomize(dst_y, benchmark_width_ * benchmark_height_); + MemRandomize(dst_vu, halfwidth * 2 * halfheight); + + for (int i = 0; i < benchmark_iterations_; ++i) { + NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y, + benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + EXPECT_EQ(src_y[i], dst_y[i]); + } + for (int i = 0; i < halfwidth * 2 * halfheight; ++i) { + EXPECT_EQ(src_vu[i], dst_vu[i]); + } + + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_vu); + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_vu); +} } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc index d2003895961..3208b66a2ad 100644 --- a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc +++ b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc @@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) { benchmark_cpu_info_); } +TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) { + int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_); + + align_buffer_page_end(src_argb, argb_plane_size); + align_buffer_page_end(dst_argb, argb_plane_size); + + EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, + benchmark_width_ * 4, benchmark_width_, + 
benchmark_height_, kRotate0)); + + EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, + benchmark_width_ * 4 - 1, benchmark_width_ - 1, + benchmark_height_, kRotate0)); + + EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, + benchmark_width_ * 4, benchmark_width_, + benchmark_height_, kRotate180)); + + EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, + benchmark_width_ * 4 - 1, benchmark_width_ - 1, + benchmark_height_, kRotate180)); + + EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, + abs(benchmark_height_) * 4, benchmark_width_, + benchmark_height_, kRotate90)); + + EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, + abs(benchmark_height_) * 4, benchmark_width_ - 1, + benchmark_height_, kRotate90)); + + EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, + abs(benchmark_height_) * 4, benchmark_width_, + benchmark_height_, kRotate270)); + + EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, + abs(benchmark_height_) * 4, benchmark_width_ - 1, + benchmark_height_, kRotate270)); + + free_aligned_buffer_page_end(dst_argb); + free_aligned_buffer_page_end(src_argb); +} + } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc index 93b77f56a9c..2fdf5f60341 100644 --- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc @@ -305,8 +305,10 @@ TEST_SCALETO(ARGBScale, 1, 1) TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 640, 360) +#ifdef ENABLE_SLOW_TESTS TEST_SCALETO(ARGBScale, 1280, 720) TEST_SCALETO(ARGBScale, 1920, 1080) +#endif // ENABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO @@ -454,4 +456,79 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) { EXPECT_LE(diff, 10); } +TEST_F(LibYUVScaleTest, ARGBTest3x) { + const int kSrcStride = 48 * 4; + const int kDstStride = 16 * 4; + const int kSize = kSrcStride * 3; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 48 * 3; ++i) { + orig_pixels[i * 4 + 0] = i; + orig_pixels[i * 4 + 1] = 255 - i; + orig_pixels[i * 4 + 2] = i + 1; + orig_pixels[i * 4 + 3] = i + 10; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations16 = + benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_; + for (int i = 0; i < iterations16; ++i) { + ARGBScale(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1, + kFilterBilinear); + } + + EXPECT_EQ(49, dest_pixels[0]); + EXPECT_EQ(255 - 49, dest_pixels[1]); + EXPECT_EQ(50, dest_pixels[2]); + EXPECT_EQ(59, dest_pixels[3]); + + ARGBScale(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1, + kFilterNone); + + EXPECT_EQ(49, dest_pixels[0]); + EXPECT_EQ(255 - 49, dest_pixels[1]); + EXPECT_EQ(50, dest_pixels[2]); + EXPECT_EQ(59, dest_pixels[3]); + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, ARGBTest4x) { + const int kSrcStride = 64 * 4; + const int kDstStride = 16 * 4; + const int kSize = kSrcStride * 4; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 64 * 4; ++i) { + orig_pixels[i * 4 + 0] = i; + orig_pixels[i * 4 + 1] = 255 - i; + orig_pixels[i * 4 + 2] = i + 1; + orig_pixels[i * 4 + 3] = i + 10; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations16 = + benchmark_width_ * benchmark_height_ / (16 * 1) * 
benchmark_iterations_; + for (int i = 0; i < iterations16; ++i) { + ARGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, + kFilterBilinear); + } + + EXPECT_NEAR((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0], 4); + EXPECT_NEAR((255 - 65 + 255 - 66 + 255 - 129 + 255 - 130 + 2) / 4, + dest_pixels[1], 4); + EXPECT_NEAR((1 * 4 + 65 + 66 + 129 + 130 + 2) / 4, dest_pixels[2], 4); + EXPECT_NEAR((10 * 4 + 65 + 66 + 129 + 130 + 2) / 4, dest_pixels[3], 4); + + ARGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, + kFilterNone); + + EXPECT_EQ(130, dest_pixels[0]); + EXPECT_EQ(255 - 130, dest_pixels[1]); + EXPECT_EQ(130 + 1, dest_pixels[2]); + EXPECT_EQ(130 + 10, dest_pixels[3]); + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc index 94a78590067..d627af02d63 100644 --- a/chromium/third_party/libyuv/unit_test/scale_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_test.cc @@ -494,53 +494,173 @@ static int I444TestFilter_16(int src_width, return max_diff; } +// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. +static int NV12TestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i, j; + int src_width_uv = (Abs(src_width) + 1) >> 1; + int src_height_uv = (Abs(src_height) + 1) >> 1; + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv)*2; + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv * 2; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_uv, src_uv_plane_size); + if (!src_y || !src_uv) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_uv, src_uv_plane_size); + + int dst_width_uv = (dst_width + 1) >> 1; + int dst_height_uv = (dst_height + 1) >> 1; + + int64_t dst_y_plane_size = (dst_width) * (dst_height); + int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv)*2; + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv * 2; + + align_buffer_page_end(dst_y_c, dst_y_plane_size); + align_buffer_page_end(dst_uv_c, dst_uv_plane_size); + align_buffer_page_end(dst_y_opt, dst_y_plane_size); + align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); + if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + double c_time = get_time(); + NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height, + dst_y_c, dst_stride_y, dst_uv_c, dst_stride_uv, dst_width, + dst_height, f); + c_time = (get_time() - c_time); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + double opt_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { + NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height, + dst_y_opt, dst_stride_y, dst_uv_opt, dst_stride_uv, dst_width, + dst_height, f); + } + opt_time = (get_time() - opt_time) / benchmark_iterations; + // Report performance of C vs OPT. 
+ printf("filter %d - %8d us C - %8d us OPT\n", f, + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference + // of the buffers and look to see that the max difference is not + // over 3. + int max_diff = 0; + for (i = 0; i < (dst_height); ++i) { + for (j = 0; j < (dst_width); ++j) { + int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - + dst_y_opt[(i * dst_stride_y) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } + + for (i = 0; i < (dst_height_uv); ++i) { + for (j = 0; j < (dst_width_uv * 2); ++j) { + int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - + dst_uv_opt[(i * dst_stride_uv) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } + + free_aligned_buffer_page_end(dst_y_c); + free_aligned_buffer_page_end(dst_uv_c); + free_aligned_buffer_page_end(dst_y_opt); + free_aligned_buffer_page_end(dst_uv_opt); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_uv); + + return max_diff; +} + // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. // 2 is chroma subsample. #define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ - int diff = I420TestFilter( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \ - int diff = I444TestFilter( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) { \ - int diff = I420TestFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) { \ - int diff = I444TestFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ +#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ + int diff = I420TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \ + int diff = I444TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + 
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_16) { \ + int diff = I420TestFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_16) { \ + int diff = I444TestFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) { \ + int diff = NV12TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom, boxdiff) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, boxdiff) +#ifdef ENABLE_SLOW_TESTS +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(, name, None, nom, denom, 0) \ + TEST_FACTOR1(, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(, name, Box, nom, denom, boxdiff) +#else +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ + TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff) +#endif TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) @@ -553,7 +673,7 @@ TEST_FACTOR(3, 1, 3, 0) #undef SX #undef DX -#define TEST_SCALETO1(name, width, height, filter, max_diff) \ +#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \ int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -566,18 +686,26 @@ TEST_FACTOR(3, 1, 3, 0) disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ int diff = I420TestFilter_16( \ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ int diff = I444TestFilter_16( \ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ benchmark_iterations_, 
disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \ + int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \ int diff = I420TestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ @@ -593,7 +721,7 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, \ - I420##name##From##width##x##height##_##filter##_16) { \ + DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ @@ -601,27 +729,45 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, \ - I444##name##From##width##x##height##_##filter##_16) { \ + DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \ + int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ } +#ifdef ENABLE_SLOW_TESTS +// Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ + TEST_SCALETO1(, name, width, height, Bilinear, 3) \ + TEST_SCALETO1(, name, width, height, Box, 3) +#else // Test scale to a specified size with all 4 filters. 
-#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 3) \ - TEST_SCALETO1(name, width, height, Bilinear, 3) \ - TEST_SCALETO1(name, width, height, Box, 3) +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ + TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \ + TEST_SCALETO1(DISABLED_, name, width, height, Box, 3) +#endif TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 640, 360) TEST_SCALETO(Scale, 1280, 720) +#ifdef ENABLE_SLOW_TESTS TEST_SCALETO(Scale, 1920, 1080) +#endif // ENABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO @@ -878,14 +1024,14 @@ static int TestPlaneFilter_16(int src_width, #define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \ - int diff = TestPlaneFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \ + int diff = TestPlaneFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. 
Expect unfiltered to be exact, but @@ -906,4 +1052,171 @@ TEST_FACTOR(3, 1, 3, 0) #undef TEST_FACTOR #undef SX #undef DX + +TEST_F(LibYUVScaleTest, PlaneTest3x) { + const int kSrcStride = 48; + const int kDstStride = 16; + const int kSize = kSrcStride * 3; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 48 * 3; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations16 = + benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_; + for (int i = 0; i < iterations16; ++i) { + ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1, + kFilterBilinear); + } + + EXPECT_EQ(49, dest_pixels[0]); + + ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1, + kFilterNone); + + EXPECT_EQ(49, dest_pixels[0]); + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTest4x) { + const int kSrcStride = 64; + const int kDstStride = 16; + const int kSize = kSrcStride * 4; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 64 * 4; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations16 = + benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_; + for (int i = 0; i < iterations16; ++i) { + ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, + kFilterBilinear); + } + + EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]); + + ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, + kFilterNone); + + EXPECT_EQ(130, dest_pixels[0]); // expect the 3rd pixel of the 3rd row + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +// Intent is to test 200x50 to 50x200 but width and height can be parameters. +TEST_F(LibYUVScaleTest, PlaneTestRotate_None) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, + benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, + kFilterNone); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. + + + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, + benchmark_width_, benchmark_height_, + dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, + kFilterNone); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. 
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
+  const int kSize = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(orig_pixels, kSize);
+  for (int i = 0; i < kSize; ++i) {
+    orig_pixels[i] = i;
+  }
+  align_buffer_page_end(dest_opt_pixels, kSize);
+  align_buffer_page_end(dest_c_pixels, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);  // Disable all CPU optimization.
+  ScalePlane(orig_pixels, benchmark_width_,
+             benchmark_width_, benchmark_height_,
+             dest_c_pixels, benchmark_height_,
+             benchmark_height_, benchmark_width_,
+             kFilterNone);
+  MaskCpuFlags(benchmark_cpu_info_);  // Enable all CPU optimization.
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ScalePlane(orig_pixels, benchmark_width_,
+               benchmark_width_, benchmark_height_,
+               dest_opt_pixels, benchmark_height_,
+               benchmark_height_, benchmark_width_,
+               kFilterNone);
+  }
+
+  for (int i = 0; i < kSize; ++i) {
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+  }
+
+  free_aligned_buffer_page_end(dest_c_pixels);
+  free_aligned_buffer_page_end(dest_opt_pixels);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
+  const int kSize = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(orig_pixels, kSize);
+  for (int i = 0; i < kSize; ++i) {
+    orig_pixels[i] = i;
+  }
+  align_buffer_page_end(dest_opt_pixels, kSize);
+  align_buffer_page_end(dest_c_pixels, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);  // Disable all CPU optimization.
+  ScalePlane(orig_pixels, benchmark_width_,
+             benchmark_width_, benchmark_height_,
+             dest_c_pixels, benchmark_height_,
+             benchmark_height_, benchmark_width_,
+             kFilterBilinear);
+  MaskCpuFlags(benchmark_cpu_info_);  // Enable all CPU optimization.
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ScalePlane(orig_pixels, benchmark_width_,
+               benchmark_width_, benchmark_height_,
+               dest_opt_pixels, benchmark_height_,
+               benchmark_height_, benchmark_width_,
+               kFilterBilinear);
+  }
+
+  for (int i = 0; i < kSize; ++i) {
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+  }
+
+  free_aligned_buffer_page_end(dest_c_pixels);
+  free_aligned_buffer_page_end(dest_opt_pixels);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
+  const int kSize = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(orig_pixels, kSize);
+  for (int i = 0; i < kSize; ++i) {
+    orig_pixels[i] = i;
+  }
+  align_buffer_page_end(dest_opt_pixels, kSize);
+  align_buffer_page_end(dest_c_pixels, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);  // Disable all CPU optimization.
+  ScalePlane(orig_pixels, benchmark_width_,
+             benchmark_width_, benchmark_height_,
+             dest_c_pixels, benchmark_height_,
+             benchmark_height_, benchmark_width_,
+             kFilterBox);
+  MaskCpuFlags(benchmark_cpu_info_);  // Enable all CPU optimization.
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ScalePlane(orig_pixels, benchmark_width_,
+               benchmark_width_, benchmark_height_,
+               dest_opt_pixels, benchmark_height_,
+               benchmark_height_, benchmark_width_,
+               kFilterBox);
+  }
+
+  for (int i = 0; i < kSize; ++i) {
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+  }
+
+  free_aligned_buffer_page_end(dest_c_pixels);
+  free_aligned_buffer_page_end(dest_opt_pixels);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
 }  // namespace libyuv
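
All three PlaneTestRotate_* tests use the same harness: compute a reference output with every CPU optimization masked off, recompute with the benchmark CPU flags, and require bit-exact agreement while the width and height swap roles (the 200x50 to 50x200 "rotation" of the plane dimensions). A condensed sketch of that pattern outside the gtest fixture; MaskCpuFlags(1) stands in for the fixture's disable_cpu_flags_ (C paths only) and MaskCpuFlags(-1) for benchmark_cpu_info_ (all detected features):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

#include "libyuv/cpu_id.h"  // MaskCpuFlags
#include "libyuv/scale.h"   // ScalePlane, kFilterNone

int main() {
  const int src_w = 200, src_h = 50;  // scale 200x50 ...
  const int dst_w = 50, dst_h = 200;  // ... to 50x200
  std::vector<uint8_t> src(src_w * src_h);
  std::vector<uint8_t> dst_c(dst_w * dst_h), dst_opt(dst_w * dst_h);
  for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<uint8_t>(i);

  libyuv::MaskCpuFlags(1);  // Mask off every feature bit: C paths only.
  libyuv::ScalePlane(src.data(), src_w, src_w, src_h, dst_c.data(), dst_w,
                     dst_w, dst_h, libyuv::kFilterNone);
  libyuv::MaskCpuFlags(-1);  // Restore auto-detected CPU features.
  libyuv::ScalePlane(src.data(), src_w, src_w, src_h, dst_opt.data(), dst_w,
                     dst_w, dst_h, libyuv::kFilterNone);

  printf("%s\n", memcmp(dst_c.data(), dst_opt.data(), dst_c.size()) == 0
                     ? "match"
                     : "mismatch");
  return 0;
}
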
diff --git a/chromium/third_party/libyuv/unit_test/video_common_test.cc b/chromium/third_party/libyuv/unit_test/video_common_test.cc
index a84206a2adb..eb183aaa796 100644
--- a/chromium/third_party/libyuv/unit_test/video_common_test.cc
+++ b/chromium/third_party/libyuv/unit_test/video_common_test.cc
@@ -65,7 +65,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
   EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
   EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
   EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
-  EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));  // deprecated.
   EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
   EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
   EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c
index 84c0602287b..46f9c1bfff4 100644
--- a/chromium/third_party/libyuv/util/cpuid.c
+++ b/chromium/third_party/libyuv/util/cpuid.c
@@ -12,10 +12,11 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define INCLUDE_LIBYUV_COMPARE_H_
-#include "libyuv.h"
-#include "./psnr.h"
-#include "./ssim.h"
+#include "libyuv/cpu_id.h"
+
+#ifdef __cplusplus
+using namespace libyuv;
+#endif
 
 int main(int argc, const char* argv[]) {
   int cpu_flags = TestCpuFlag(-1);
@@ -83,7 +84,7 @@ int main(int argc, const char* argv[]) {
     int has_avx2 = TestCpuFlag(kCpuHasAVX2);
     int has_erms = TestCpuFlag(kCpuHasERMS);
     int has_fma3 = TestCpuFlag(kCpuHasFMA3);
-    int has_f16c = TestCpuFlag(kCpuHasF16C);
+    int has_f16c = TestCpuFlag(kCpuHasF16C);
     int has_gfni = TestCpuFlag(kCpuHasGFNI);
     int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
     int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
diff --git a/chromium/third_party/libyuv/util/i444tonv12_eg.cc b/chromium/third_party/libyuv/util/i444tonv12_eg.cc
new file mode 100644
index 00000000000..0fcb4095a80
--- /dev/null
+++ b/chromium/third_party/libyuv/util/i444tonv12_eg.cc
@@ -0,0 +1,28 @@
+
+#include "libyuv/convert.h"
+
+#include <stdio.h>   // for printf
+#include <string.h>  // for memset
+
+int main(int, char**) {
+  unsigned char src_i444[640 * 400 * 3];
+  unsigned char dst_nv12[640 * 400 * 3 / 2];
+
+  for (size_t i = 0; i < sizeof(src_i444); ++i) {
+    src_i444[i] = i & 255;
+  }
+  memset(dst_nv12, 0, sizeof(dst_nv12));
+  libyuv::I444ToNV12(&src_i444[0], 640,              // source Y
+                     &src_i444[640 * 400], 640,      // source U
+                     &src_i444[640 * 400 * 2], 640,  // source V
+                     &dst_nv12[0], 640,              // dest Y
+                     &dst_nv12[640 * 400], 640,      // dest UV
+                     640, 400);                      // width and height
+
+  int checksum = 0;
+  for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
+    checksum += dst_nv12[i];
+  }
+  printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
+  return 0;
+}
\ No newline at end of file
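
The hard-coded offsets in i444tonv12_eg.cc follow from the two plane layouts: I444 stores three full-resolution planes back to back, while NV12 stores a full-resolution Y plane followed by a half-width, half-height plane of interleaved U,V byte pairs, for w*h*3/2 bytes total. A restatement of that sizing (my own arithmetic, not part of the patch):

#include <cstdio>

int main() {
  const int w = 640, h = 400;
  // I444: three w*h planes, back to back.
  printf("I444 Y at 0, U at %d, V at %d, total %d bytes\n",
         w * h, 2 * w * h, 3 * w * h);
  // NV12: one w*h Y plane plus (w/2)*(h/2) interleaved U,V pairs,
  // i.e. w*h/2 bytes of chroma.
  printf("NV12 Y at 0, UV at %d, total %d bytes\n", w * h, w * h * 3 / 2);
  return 0;
}
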
diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk
index c4307a431f9..b0a344ae06d 100644
--- a/chromium/third_party/libyuv/winarm.mk
+++ b/chromium/third_party/libyuv/winarm.mk
@@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \
 	source/scale_any.o\
 	source/scale_argb.o\
 	source/scale_common.o\
+	source/scale_uv.o\
 	source/video_common.o
 
 .cc.o:
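
source/scale_uv.o above is the object file for the new biplanar UV scaler this roll introduces (the UVScale commits in the log). A minimal sketch of calling it, assuming the UVScale entry point declared in include/libyuv/scale_uv.h by the roll; widths and heights count UV sample pairs, and strides are in bytes (two bytes per pair):

#include <cstdint>
#include <cstdio>
#include <vector>

#include "libyuv/scale.h"     // FilterMode
#include "libyuv/scale_uv.h"  // UVScale (added by this roll)

int main() {
  const int src_w = 320, src_h = 240;  // dimensions in UV pairs
  const int dst_w = 160, dst_h = 120;
  std::vector<uint8_t> src_uv(src_w * 2 * src_h, 0x80);  // neutral chroma
  std::vector<uint8_t> dst_uv(dst_w * 2 * dst_h);

  // Halve an NV12-style chroma plane in both dimensions; 0 means success.
  int ret = libyuv::UVScale(src_uv.data(), src_w * 2, src_w, src_h,
                            dst_uv.data(), dst_w * 2, dst_w, dst_h,
                            libyuv::kFilterBilinear);
  printf("UVScale returned %d, dst_uv[0] = %d\n", ret, dst_uv[0]);
  return 0;
}
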