/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(__x86_64__)
uint32 HammingDistance_SSE42(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint64 diff = 0u;

  asm volatile(
      "xor %3,%3 \n"
      "xor %%r8,%%r8 \n"
      "xor %%r9,%%r9 \n"
      "xor %%r10,%%r10 \n"

      // Process 32 bytes per loop.
      LABELALIGN
      "1: \n"
      "mov (%0),%%rcx \n"
      "mov 0x8(%0),%%rdx \n"
      "xor (%1),%%rcx \n"
      "xor 0x8(%1),%%rdx \n"
      "popcnt %%rcx,%%rcx \n"
      "popcnt %%rdx,%%rdx \n"
      "mov 0x10(%0),%%rsi \n"
      "mov 0x18(%0),%%rdi \n"
      "xor 0x10(%1),%%rsi \n"
      "xor 0x18(%1),%%rdi \n"
      "popcnt %%rsi,%%rsi \n"
      "popcnt %%rdi,%%rdi \n"
      "add $0x20,%0 \n"
      "add $0x20,%1 \n"
      "add %%rcx,%3 \n"
      "add %%rdx,%%r8 \n"
      "add %%rsi,%%r9 \n"
      "add %%rdi,%%r10 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"

      "add %%r8, %3 \n"
      "add %%r9, %3 \n"
      "add %%r10, %3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=r"(diff)    // %3
      :
      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");

  return static_cast<uint32>(diff);
}
#else
uint32 HammingDistance_SSE42(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 diff = 0u;

  asm volatile(
      // Process 16 bytes per loop.
      LABELALIGN
      "1: \n"
      "mov (%0),%%ecx \n"
      "mov 0x4(%0),%%edx \n"
      "xor (%1),%%ecx \n"
      "xor 0x4(%1),%%edx \n"
      "popcnt %%ecx,%%ecx \n"
      "add %%ecx,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      "mov 0x8(%0),%%ecx \n"
      "mov 0xc(%0),%%edx \n"
      "xor 0x8(%1),%%ecx \n"
      "xor 0xc(%1),%%edx \n"
      "popcnt %%ecx,%%ecx \n"
      "add %%ecx,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      "add $0x10,%0 \n"
      "add $0x10,%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "+r"(diff)    // %3
      :
      : "memory", "cc", "ecx", "edx");

  return diff;
}
#endif
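
// Illustrative sketch (not part of the libyuv API): the SSE4.2 paths above
// compute a Hamming distance by XOR'ing corresponding 8-byte (or 4-byte)
// words and accumulating popcnt of the result. Assuming count is a positive
// multiple of the block size, the plain C loop they are meant to match looks
// like this; the function name here is purely for illustration.
static inline uint32 HammingDistanceScalarSketch(const uint8* src_a,
                                                 const uint8* src_b,
                                                 int count) {
  uint32 diff = 0u;
  for (int i = 0; i < count; ++i) {
    uint8 x = static_cast<uint8>(src_a[i] ^ src_b[i]);
    while (x) {  // Count the set bits of the XOR'd byte.
      diff += x & 1;
      x >>= 1;
    }
  }
  return diff;
}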

static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
                           15, 15, 15, 15, 15, 15, 15, 15};
static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

uint32 HammingDistance_SSSE3(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 diff = 0u;

  asm volatile(
      "movdqa %4,%%xmm2 \n"
      "movdqa %5,%%xmm3 \n"
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm1,%%xmm1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqa (%0),%%xmm4 \n"
      "movdqa 0x10(%0), %%xmm5 \n"
      "pxor (%0,%1), %%xmm4 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pand %%xmm2,%%xmm6 \n"
      "psrlw $0x4,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "pshufb %%xmm6,%%xmm7 \n"
      "pand %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "paddb %%xmm7,%%xmm6 \n"
      "pxor 0x10(%0,%1),%%xmm5 \n"
      "add $0x20,%0 \n"
      "movdqa %%xmm5,%%xmm4 \n"
      "pand %%xmm2,%%xmm5 \n"
      "psrlw $0x4,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "pshufb %%xmm5,%%xmm7 \n"
      "pand %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "pshufb %%xmm4,%%xmm5 \n"
      "paddb %%xmm7,%%xmm5 \n"
      "paddb %%xmm5,%%xmm6 \n"
      "psadbw %%xmm1,%%xmm6 \n"
      "paddd %%xmm6,%%xmm0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"

      "pshufd $0xaa,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "movd %%xmm0, %3 \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");

  return diff;
}
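
// Illustrative sketch (not part of the libyuv API): kNibbleMask and kBitCount
// above turn pshufb into a 16-entry lookup table -- kBitCount[n] is the number
// of set bits in the nibble n, so the popcount of a byte is the table value of
// its low nibble plus the table value of its high nibble. A scalar rendering
// of that idea (name chosen here for illustration only):
static inline uint32 PopcountByteViaNibbleTableSketch(uint8 x) {
  static const uint8 kNibblePopcount[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                            1, 2, 2, 3, 2, 3, 3, 4};
  return static_cast<uint32>(kNibblePopcount[x & 15]) +
         static_cast<uint32>(kNibblePopcount[(x >> 4) & 15]);
}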

#ifdef HAS_HAMMINGDISTANCE_AVX2
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 diff = 0u;

  asm volatile(
      "vbroadcastf128 %4,%%ymm2 \n"
      "vbroadcastf128 %5,%%ymm3 \n"
      "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
      "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "vmovdqa (%0),%%ymm4 \n"
      "vmovdqa 0x20(%0), %%ymm5 \n"
      "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
      "vpand %%ymm2,%%ymm4,%%ymm6 \n"
      "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
      "vpand %%ymm2,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
      "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
      "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
      "add $0x40,%0 \n"
      "vpand %%ymm2,%%ymm4,%%ymm5 \n"
      "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
      "vpand %%ymm2,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
      "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
      "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
      "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
      "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
      "sub $0x40,%2 \n"
      "jg 1b \n"

      "vpermq $0xb1,%%ymm0,%%ymm1 \n"
      "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xaa,%%ymm0,%%ymm1 \n"
      "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovd %%xmm0, %3 \n"
      "vzeroupper \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");

  return diff;
}
#endif  // HAS_HAMMINGDISTANCE_AVX2

uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm5,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu " MEMACCESS(0) ",%%xmm1 \n"
      "lea " MEMLEA(0x10, 0) ",%0 \n"
      "movdqu " MEMACCESS(1) ",%%xmm2 \n"
      "lea " MEMLEA(0x10, 1) ",%1 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "psubusb %%xmm2,%%xmm1 \n"
      "psubusb %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpckhbw %%xmm5,%%xmm2 \n"
      "pmaddwd %%xmm1,%%xmm1 \n"
      "pmaddwd %%xmm2,%%xmm2 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "paddd %%xmm2,%%xmm0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "pshufd $0xee,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "pshufd $0x1,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "movd %%xmm0,%3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  return sse;
}
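
// Illustrative sketch (not part of the libyuv API): SumSquareError_SSE2 above
// forms the absolute byte difference with two saturating subtractions OR'd
// together, widens to 16 bits, squares and horizontally sums with pmaddwd, and
// accumulates a running 32-bit total. Assuming count is a positive multiple of
// 16, it matches this plain C loop (name here is illustrative):
static inline uint32 SumSquareErrorScalarSketch(const uint8* src_a,
                                                const uint8* src_b,
                                                int count) {
  uint32 sse = 0u;
  for (int i = 0; i < count; ++i) {
    int d = src_a[i] - src_b[i];
    sse += static_cast<uint32>(d * d);
  }
  return sse;
}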

static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16

static uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
static uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
static uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
static uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
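
// Illustrative sketch (not part of the libyuv API): the tables above are the
// powers 33^0 .. 33^16 reduced modulo 2^32, which is exactly what wrapping
// uint32 multiplication produces. They let HashDjb2_SSE41 fold 16 bytes per
// iteration using
//   hash' = hash * 33^16 + sum(src[i] * 33^(15 - i), i = 0..15)  (mod 2^32),
// with kHash16x33 scaling the running hash and kHashMul0..kHashMul3 scaling
// the 16 bytes of the block. A tiny helper like this (name is illustrative)
// regenerates any of the table entries:
static inline uint32 Pow33ModuloSketch(int k) {
  uint32 p = 1u;
  for (int i = 0; i < k; ++i) {
    p *= 33u;  // Wraps modulo 2^32, matching the constants above.
  }
  return p;
}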

uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  uint32 hash;
  asm volatile (
      "movd %2,%%xmm0 \n"
      "pxor %%xmm7,%%xmm7 \n"
      "movdqa %4,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu " MEMACCESS(0) ",%%xmm1 \n"
      "lea " MEMLEA(0x10, 0) ",%0 \n"
      "pmulld %%xmm6,%%xmm0 \n"
      "movdqa %5,%%xmm5 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm7,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm7,%%xmm3 \n"
      "pmulld %%xmm5,%%xmm3 \n"
      "movdqa %6,%%xmm5 \n"
      "movdqa %%xmm2,%%xmm4 \n"
      "punpckhwd %%xmm7,%%xmm4 \n"
      "pmulld %%xmm5,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "punpckhbw %%xmm7,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklwd %%xmm7,%%xmm2 \n"
      "pmulld %%xmm5,%%xmm2 \n"
      "movdqa %8,%%xmm5 \n"
      "punpckhwd %%xmm7,%%xmm1 \n"
      "pmulld %%xmm5,%%xmm1 \n"
      "paddd %%xmm4,%%xmm3 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm1 \n"
      "pshufd $0xe,%%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "pshufd $0x1,%%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "sub $0x10,%1 \n"
      "jg 1b \n"
      "movd %%xmm0,%3 \n"
      : "+r"(src),          // %0
        "+r"(count),        // %1
        "+rm"(seed),        // %2
        "=g"(hash)          // %3
      : "m"(kHash16x33),    // %4
        "m"(kHashMul0),     // %5
        "m"(kHashMul1),     // %6
        "m"(kHashMul2),     // %7
        "m"(kHashMul3)      // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
  return hash;
}
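
// Illustrative sketch (not part of the libyuv API): HashDjb2_SSE41 above
// vectorizes the classic djb2 recurrence, hash = hash * 33 + byte, using the
// precomputed powers of 33 so that 16 bytes are folded in per iteration.
// Assuming count is a positive multiple of 16, the scalar recurrence it
// reproduces is simply (name here is illustrative):
static inline uint32 HashDjb2ScalarSketch(const uint8* src,
                                          int count,
                                          uint32 seed) {
  uint32 hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}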

#endif  // !defined(LIBYUV_DISABLE_X86) &&
        // (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif