代码拉取完成,页面将自动刷新
同步操作将从 src-openEuler/firefox 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
# HG changeset patch
# User Lee Salzman <lsalzman@mozilla.com>
# Date 1601995009 0
# Tue Oct 06 14:36:49 2020 +0000
# Node ID 48c0f5033c286bd515b6f16e0905ff4ca94faf98
# Parent 5bc02423412647e3ee9a0681b38e418a10901601
Bug 1642028 - cherry-pick Skia blitting cleanups. r=jrmuizel
Differential Revision: https://phabricator.services.mozilla.com/D92476
diff -r 5bc024234126 -r 48c0f5033c28 gfx/skia/skia/src/opts/SkBlitRow_opts.h
--- a/gfx/skia/skia/src/opts/SkBlitRow_opts.h Tue Oct 06 16:58:11 2020 +0000
+++ b/gfx/skia/skia/src/opts/SkBlitRow_opts.h Tue Oct 06 14:36:49 2020 +0000
@@ -58,37 +58,114 @@
return _mm256_add_epi32(src, _mm256_or_si256(rb, ga));
}
+#endif
-#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
#include <immintrin.h>
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
- auto SkAlphaMulQ_SSE2 = [](const __m128i& c, const __m128i& scale) {
- const __m128i mask = _mm_set1_epi32(0xFF00FF);
- __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
+ __m128i scale = _mm_sub_epi32(_mm_set1_epi32(256),
+ _mm_srli_epi32(src, 24));
+ __m128i scale_x2 = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
+
+ __m128i rb = _mm_and_si128(_mm_set1_epi32(0x00ff00ff), dst);
+ rb = _mm_mullo_epi16(rb, scale_x2);
+ rb = _mm_srli_epi16(rb, 8);
- // uint32_t rb = ((c & mask) * scale) >> 8
- __m128i rb = _mm_and_si128(mask, c);
- rb = _mm_mullo_epi16(rb, s);
- rb = _mm_srli_epi16(rb, 8);
+ __m128i ga = _mm_srli_epi16(dst, 8);
+ ga = _mm_mullo_epi16(ga, scale_x2);
+ ga = _mm_andnot_si128(_mm_set1_epi32(0x00ff00ff), ga);
+
+ return _mm_add_epi32(src, _mm_or_si128(rb, ga));
+ }
+#endif
- // uint32_t ag = ((c >> 8) & mask) * scale
- __m128i ag = _mm_srli_epi16(c, 8);
- ag = _mm_mullo_epi16(ag, s);
-
- // (rb & mask) | (ag & ~mask)
- ag = _mm_andnot_si128(mask, ag);
- return _mm_or_si128(rb, ag);
+#if defined(SK_ARM_HAS_NEON)
+ #include <arm_neon.h>
+ // SkMulDiv255Round() applied to each lane.
+ static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
+ uint16x8_t prod = vmull_u8(x, y);
+ return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
+ }
+ static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
+ uint8x8_t nalphas = vmvn_u8(src.val[3]); // 256 - alpha
+ return {
+ vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0])),
+ vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1])),
+ vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2])),
+ vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3])),
};
- return _mm_add_epi32(src,
- SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
- _mm_srli_epi32(src, 24))));
+ }
+ // Variant assuming dst and src contain the color components of two consecutive pixels.
+ static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
+ const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
+ uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
+ return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}
#endif
namespace SK_OPTS_NS {
+/*not static*/
+inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
+ SkASSERT(alpha == 0xFF);
+ sk_msan_assert_initialized(src, src+len);
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+ while (len >= 8) {
+ _mm256_storeu_si256((__m256i*)dst,
+ SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src),
+ _mm256_loadu_si256((const __m256i*)dst)));
+ src += 8;
+ dst += 8;
+ len -= 8;
+ }
+#endif
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ while (len >= 4) {
+ _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
+ _mm_loadu_si128((const __m128i*)dst)));
+ src += 4;
+ dst += 4;
+ len -= 4;
+ }
+#endif
+
+#if defined(SK_ARM_HAS_NEON)
+ while (len >= 8) {
+ vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst),
+ vld4_u8((const uint8_t*)src)));
+ src += 8;
+ dst += 8;
+ len -= 8;
+ }
+
+ while (len >= 2) {
+ vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst),
+ vld1_u8((const uint8_t*)src)));
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+
+ if (len != 0) {
+ uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst),
+ vcreate_u8((uint64_t)*src));
+ vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
+ }
+ return;
+#endif
+
+ while (len --> 0) {
+ *dst = SkPMSrcOver(*src, *dst);
+ src++;
+ dst++;
+ }
+}
+
// Blend constant color over count src pixels, writing into dst.
+/*not static*/
inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
constexpr int N = 4; // 8, 16 also reasonable choices
using U32 = skvx::Vec< N, uint32_t>;
@@ -120,259 +197,6 @@
}
}
-#if defined(SK_ARM_HAS_NEON)
-
-// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
-// y[i] are the i-th lanes of the corresponding NEON vectors.
-static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
- uint16x8_t prod = vmull_u8(x, y);
- return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
-}
-
-// The implementations of SkPMSrcOver below perform alpha blending consistently with
-// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
-//
-// result_i = src_i + rint(g(src_alpha, dst_i))
-//
-// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.
-
-// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
-// of the same color component for 8 consecutive pixels. The result of this function follows the
-// same convention.
-static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
- uint8x8_t nalphas = vmvn_u8(src.val[3]);
- uint8x8x4_t result;
- result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0]));
- result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1]));
- result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2]));
- result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3]));
- return result;
-}
-
-// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
-// pixels. The return value follows the same convention.
-static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
- const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
- uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
- return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
-}
-
-#endif
-
-/*not static*/ inline
-void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
- SkASSERT(alpha == 0xFF);
- sk_msan_assert_initialized(src, src+len);
-// Require AVX2 because of AVX2 integer calculation intrinsics in SrcOver
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
- while (len >= 32) {
- // Load 32 source pixels.
- auto s0 = _mm256_loadu_si256((const __m256i*)(src) + 0),
- s1 = _mm256_loadu_si256((const __m256i*)(src) + 1),
- s2 = _mm256_loadu_si256((const __m256i*)(src) + 2),
- s3 = _mm256_loadu_si256((const __m256i*)(src) + 3);
-
- const auto alphaMask = _mm256_set1_epi32(0xFF000000);
-
- auto ORed = _mm256_or_si256(s3, _mm256_or_si256(s2, _mm256_or_si256(s1, s0)));
- if (_mm256_testz_si256(ORed, alphaMask)) {
- // All 32 source pixels are transparent. Nothing to do.
- src += 32;
- dst += 32;
- len -= 32;
- continue;
- }
-
- auto d0 = (__m256i*)(dst) + 0,
- d1 = (__m256i*)(dst) + 1,
- d2 = (__m256i*)(dst) + 2,
- d3 = (__m256i*)(dst) + 3;
-
- auto ANDed = _mm256_and_si256(s3, _mm256_and_si256(s2, _mm256_and_si256(s1, s0)));
- if (_mm256_testc_si256(ANDed, alphaMask)) {
- // All 32 source pixels are opaque. SrcOver becomes Src.
- _mm256_storeu_si256(d0, s0);
- _mm256_storeu_si256(d1, s1);
- _mm256_storeu_si256(d2, s2);
- _mm256_storeu_si256(d3, s3);
- src += 32;
- dst += 32;
- len -= 32;
- continue;
- }
-
- // TODO: This math is wrong.
- // Do SrcOver.
- _mm256_storeu_si256(d0, SkPMSrcOver_AVX2(s0, _mm256_loadu_si256(d0)));
- _mm256_storeu_si256(d1, SkPMSrcOver_AVX2(s1, _mm256_loadu_si256(d1)));
- _mm256_storeu_si256(d2, SkPMSrcOver_AVX2(s2, _mm256_loadu_si256(d2)));
- _mm256_storeu_si256(d3, SkPMSrcOver_AVX2(s3, _mm256_loadu_si256(d3)));
- src += 32;
- dst += 32;
- len -= 32;
- }
-
-#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
- while (len >= 16) {
- // Load 16 source pixels.
- auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
- s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
- s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
- s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
-
- const auto alphaMask = _mm_set1_epi32(0xFF000000);
-
- auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
- if (_mm_testz_si128(ORed, alphaMask)) {
- // All 16 source pixels are transparent. Nothing to do.
- src += 16;
- dst += 16;
- len -= 16;
- continue;
- }
-
- auto d0 = (__m128i*)(dst) + 0,
- d1 = (__m128i*)(dst) + 1,
- d2 = (__m128i*)(dst) + 2,
- d3 = (__m128i*)(dst) + 3;
-
- auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
- if (_mm_testc_si128(ANDed, alphaMask)) {
- // All 16 source pixels are opaque. SrcOver becomes Src.
- _mm_storeu_si128(d0, s0);
- _mm_storeu_si128(d1, s1);
- _mm_storeu_si128(d2, s2);
- _mm_storeu_si128(d3, s3);
- src += 16;
- dst += 16;
- len -= 16;
- continue;
- }
-
- // TODO: This math is wrong.
- // Do SrcOver.
- _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
- _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
- _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
- _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
- src += 16;
- dst += 16;
- len -= 16;
- }
-
-#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
- while (len >= 16) {
- // Load 16 source pixels.
- auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
- s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
- s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
- s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
-
- const auto alphaMask = _mm_set1_epi32(0xFF000000);
-
- auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
- if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
- _mm_setzero_si128()))) {
- // All 16 source pixels are transparent. Nothing to do.
- src += 16;
- dst += 16;
- len -= 16;
- continue;
- }
-
- auto d0 = (__m128i*)(dst) + 0,
- d1 = (__m128i*)(dst) + 1,
- d2 = (__m128i*)(dst) + 2,
- d3 = (__m128i*)(dst) + 3;
-
- auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
- if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
- alphaMask))) {
- // All 16 source pixels are opaque. SrcOver becomes Src.
- _mm_storeu_si128(d0, s0);
- _mm_storeu_si128(d1, s1);
- _mm_storeu_si128(d2, s2);
- _mm_storeu_si128(d3, s3);
- src += 16;
- dst += 16;
- len -= 16;
- continue;
- }
-
- // TODO: This math is wrong.
- // Do SrcOver.
- _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
- _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
- _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
- _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
-
- src += 16;
- dst += 16;
- len -= 16;
- }
-
-#elif defined(SK_ARM_HAS_NEON)
- // Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
- // underperformed on some of the platforms under test for inputs with frequent transitions of
- // alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
- // revisiting the situation in the future.
- while (len >= 8) {
- // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
- // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
- // pixels).
- uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
- src += 8;
- len -= 8;
-
- // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
- // are all transparent), the second when all alphas are fully set (they are all opaque).
- uint8x8_t alphas = src_col.val[3];
- uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
- if (alphas_u64 == 0) {
- // All pixels transparent.
- dst += 8;
- continue;
- }
-
- if (~alphas_u64 == 0) {
- // All pixels opaque.
- vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
- dst += 8;
- continue;
- }
-
- uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
- vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
- dst += 8;
- }
-
- // Deal with leftover pixels.
- for (; len >= 2; len -= 2, src += 2, dst += 2) {
- uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
- uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
- vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
- }
-
- if (len != 0) {
- uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
- vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
- }
- return;
-#endif
-
- while (len-- > 0) {
- // This 0xFF000000 is not semantically necessary, but for compatibility
- // with chromium:611002 we need to keep it until we figure out where
- // the non-premultiplied src values (like 0x00FFFFFF) are coming from.
- // TODO(mtklein): sort this out and assert *src is premul here.
- if (*src & 0xFF000000) {
- *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
- }
- src++;
- dst++;
- }
-}
-
} // SK_OPTS_NS
#endif//SkBlitRow_opts_DEFINED
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。