compare_win.cc 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. /*
  2. * Copyright 2012 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/basic_types.h"
  11. #include "libyuv/row.h"
  12. #if defined(_MSC_VER)
  13. #include <intrin.h> // For __popcnt
  14. #endif
  15. #ifdef __cplusplus
  16. namespace libyuv {
  17. extern "C" {
  18. #endif
  19. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  20. /* Visual Studio 2005 doesn't support __popcnt(). */
  21. #if (_MSC_VER > 1400)
  22. uint32 HammingDistance_SSE42(const uint8* src_a,
  23. const uint8* src_b,
  24. int count) {
  25. uint32 diff = 0u;
  26. int i;
  27. for (i = 0; i < count - 3; i += 4) {
  28. uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
  29. src_a += 4;
  30. src_b += 4;
  31. diff += __popcnt(x);
  32. }
  33. return diff;
  34. }
  35. #endif
  36. #if (_MSC_VER >= 1900)
  37. __declspec(naked)
  38. #else
  39. __declspec(naked) __declspec(align(16))
  40. #endif
  41. uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  42. __asm {
  43. mov eax, [esp + 4] // src_a
  44. mov edx, [esp + 8] // src_b
  45. mov ecx, [esp + 12] // count
  46. pxor xmm0, xmm0
  47. pxor xmm5, xmm5
  48. align 4
  49. wloop:
  50. movdqa xmm1, [eax]
  51. lea eax, [eax + 16]
  52. movdqa xmm2, [edx]
  53. lea edx, [edx + 16]
  54. sub ecx, 16
  55. movdqa xmm3, xmm1 // abs trick
  56. psubusb xmm1, xmm2
  57. psubusb xmm2, xmm3
  58. por xmm1, xmm2
  59. movdqa xmm2, xmm1
  60. punpcklbw xmm1, xmm5
  61. punpckhbw xmm2, xmm5
  62. pmaddwd xmm1, xmm1
  63. pmaddwd xmm2, xmm2
  64. paddd xmm0, xmm1
  65. paddd xmm0, xmm2
  66. jg wloop
  67. pshufd xmm1, xmm0, 0xee
  68. paddd xmm0, xmm1
  69. pshufd xmm1, xmm0, 0x01
  70. paddd xmm0, xmm1
  71. movd eax, xmm0
  72. ret
  73. }
  74. }
  75. // Visual C 2012 required for AVX2.
  76. #if _MSC_VER >= 1700
  77. // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
  78. #pragma warning(disable: 4752)
  79. #if (_MSC_VER >= 1900)
  80. __declspec(naked)
  81. #else
  82. __declspec(naked) __declspec(align(16))
  83. #endif
  84. uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  85. __asm {
  86. mov eax, [esp + 4] // src_a
  87. mov edx, [esp + 8] // src_b
  88. mov ecx, [esp + 12] // count
  89. vpxor ymm0, ymm0, ymm0 // sum
  90. vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
  91. sub edx, eax
  92. align 4
  93. wloop:
  94. vmovdqu ymm1, [eax]
  95. vmovdqu ymm2, [eax + edx]
  96. lea eax, [eax + 32]
  97. sub ecx, 32
  98. vpsubusb ymm3, ymm1, ymm2 // abs difference trick
  99. vpsubusb ymm2, ymm2, ymm1
  100. vpor ymm1, ymm2, ymm3
  101. vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
  102. vpunpckhbw ymm1, ymm1, ymm5
  103. vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
  104. vpmaddwd ymm1, ymm1, ymm1
  105. vpaddd ymm0, ymm0, ymm1
  106. vpaddd ymm0, ymm0, ymm2
  107. jg wloop
  108. vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
  109. vpaddd ymm0, ymm0, ymm1
  110. vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
  111. vpaddd ymm0, ymm0, ymm1
  112. vpermq ymm1, ymm0, 0x02 // high + low lane.
  113. vpaddd ymm0, ymm0, ymm1
  114. vmovd eax, xmm0
  115. vzeroupper
  116. ret
  117. }
  118. }
  119. #endif // _MSC_VER >= 1700
  120. #define HAS_HASHDJB2_SSE41
  121. static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
  122. static uvec32 kHashMul0 = {
  123. 0x0c3525e1, // 33 ^ 15
  124. 0xa3476dc1, // 33 ^ 14
  125. 0x3b4039a1, // 33 ^ 13
  126. 0x4f5f0981, // 33 ^ 12
  127. };
  128. static uvec32 kHashMul1 = {
  129. 0x30f35d61, // 33 ^ 11
  130. 0x855cb541, // 33 ^ 10
  131. 0x040a9121, // 33 ^ 9
  132. 0x747c7101, // 33 ^ 8
  133. };
  134. static uvec32 kHashMul2 = {
  135. 0xec41d4e1, // 33 ^ 7
  136. 0x4cfa3cc1, // 33 ^ 6
  137. 0x025528a1, // 33 ^ 5
  138. 0x00121881, // 33 ^ 4
  139. };
  140. static uvec32 kHashMul3 = {
  141. 0x00008c61, // 33 ^ 3
  142. 0x00000441, // 33 ^ 2
  143. 0x00000021, // 33 ^ 1
  144. 0x00000001, // 33 ^ 0
  145. };
  // pmulld(reg) emits the instruction "pmulld xmmA, xmmB" as raw bytes:
  // the fixed opcode 66 0F 38 40 followed by the ModRM byte passed as reg.
  // ModRM values used in this file:
  //   0xC6  pmulld xmm0, xmm6
  //   0xDD  pmulld xmm3, xmm5
  //   0xE5  pmulld xmm4, xmm5
  //   0xD5  pmulld xmm2, xmm5
  //   0xCD  pmulld xmm1, xmm5
  #define pmulld(reg) \
    _asm _emit 0x66   \
    _asm _emit 0x0F   \
    _asm _emit 0x38   \
    _asm _emit 0x40   \
    _asm _emit reg
  153. #if (_MSC_VER >= 1900)
  154. __declspec(naked)
  155. #else
  156. __declspec(naked) __declspec(align(16))
  157. #endif
  158. uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  159. __asm {
  160. mov eax, [esp + 4] // src
  161. mov ecx, [esp + 8] // count
  162. movd xmm0, [esp + 12] // seed
  163. pxor xmm7, xmm7 // constant 0 for unpck
  164. movdqa xmm6, kHash16x33
  165. align 4
  166. wloop:
  167. movdqu xmm1, [eax] // src[0-15]
  168. lea eax, [eax + 16]
  169. pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
  170. movdqa xmm5, kHashMul0
  171. movdqa xmm2, xmm1
  172. punpcklbw xmm2, xmm7 // src[0-7]
  173. movdqa xmm3, xmm2
  174. punpcklwd xmm3, xmm7 // src[0-3]
  175. pmulld(0xdd) // pmulld xmm3, xmm5
  176. movdqa xmm5, kHashMul1
  177. movdqa xmm4, xmm2
  178. punpckhwd xmm4, xmm7 // src[4-7]
  179. pmulld(0xe5) // pmulld xmm4, xmm5
  180. movdqa xmm5, kHashMul2
  181. punpckhbw xmm1, xmm7 // src[8-15]
  182. movdqa xmm2, xmm1
  183. punpcklwd xmm2, xmm7 // src[8-11]
  184. pmulld(0xd5) // pmulld xmm2, xmm5
  185. movdqa xmm5, kHashMul3
  186. punpckhwd xmm1, xmm7 // src[12-15]
  187. pmulld(0xcd) // pmulld xmm1, xmm5
  188. paddd xmm3, xmm4 // add 16 results
  189. paddd xmm1, xmm2
  190. sub ecx, 16
  191. paddd xmm1, xmm3
  192. pshufd xmm2, xmm1, 0x0e // upper 2 dwords
  193. paddd xmm1, xmm2
  194. pshufd xmm2, xmm1, 0x01
  195. paddd xmm1, xmm2
  196. paddd xmm0, xmm1
  197. jg wloop
  198. movd eax, xmm0 // return hash
  199. ret
  200. }
  201. }
  202. // Visual C 2012 required for AVX2.
  203. #if _MSC_VER >= 1700
  204. #if (_MSC_VER >= 1900)
  205. __declspec(naked)
  206. #else
  207. __declspec(naked) __declspec(align(16))
  208. #endif
  209. uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  210. __asm {
  211. mov eax, [esp + 4] // src
  212. mov ecx, [esp + 8] // count
  213. movd xmm0, [esp + 12] // seed
  214. movdqa xmm6, kHash16x33
  215. align 4
  216. wloop:
  217. vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
  218. pmulld xmm0, xmm6 // hash *= 33 ^ 16
  219. vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
  220. pmulld xmm3, kHashMul0
  221. vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
  222. pmulld xmm4, kHashMul1
  223. vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
  224. pmulld xmm2, kHashMul2
  225. lea eax, [eax + 16]
  226. pmulld xmm1, kHashMul3
  227. paddd xmm3, xmm4 // add 16 results
  228. paddd xmm1, xmm2
  229. sub ecx, 16
  230. paddd xmm1, xmm3
  231. pshufd xmm2, xmm1, 0x0e // upper 2 dwords
  232. paddd xmm1, xmm2
  233. pshufd xmm2, xmm1, 0x01
  234. paddd xmm1, xmm2
  235. paddd xmm0, xmm1
  236. jg wloop
  237. movd eax, xmm0 // return hash
  238. ret
  239. }
  240. }
  241. #endif // _MSC_VER >= 1700
  242. #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  243. #ifdef __cplusplus
  244. } // extern "C"
  245. } // namespace libyuv
  246. #endif