/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters: add 2 before the shift right by 2.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};
static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                         6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                         6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0, 0};

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
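
// A minimal scalar sketch (illustrative only; the function name below is not
// part of libyuv) of what ScaleRowDown2_SSSE3 computes: psrlw $8 keeps the
// high byte of each 16-bit pair and packuswb repacks, so every output pixel
// is the second (odd-indexed) pixel of each source pair.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown2_PointSketch(const uint8* src_ptr,
                                      uint8* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];
  }
}
#endif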

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
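
// Illustrative sketch (name not part of libyuv): pmaddubsw with a vector of
// 0x01 bytes sums each horizontal pair into a 16-bit word, and pavgw against
// zero computes (sum + 1) >> 1, i.e. a rounded average of the two pixels.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown2Linear_Sketch(const uint8* src_ptr,
                                       uint8* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
  }
}
#endif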

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "psrlw $0x1,%%xmm0 \n"
    "psrlw $0x1,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
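
// Illustrative sketch (name not part of libyuv) of the 2x2 box filter above:
// each output pixel is the rounded average of a 2x2 block from two source
// rows; psrlw $1 followed by pavgw against zero matches (sum + 2) >> 2.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8* dst_ptr,
                                    int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8)((s[x * 2] + s[x * 2 + 1] + t[x * 2] + t[x * 2 + 1] + 2) >> 2);
  }
}
#endif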

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
    "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    "pslld $0x10,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "psllw $0x3,%%xmm5 \n"
    "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "=&r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
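
// Illustrative sketch (name not part of libyuv): the code above sums a 4x4
// block of source pixels (pmaddubsw sums column pairs, phaddw combines the
// pair sums), adds 8 for rounding and shifts right by 4, i.e. a rounded
// average of 16 pixels.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown4Box_Sketch(const uint8* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    int r, c;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + x * 4 + c];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}
#endif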

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrld $0x18,%%ymm5,%%ymm5 \n"
    "vpslld $0x10,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpsllw $0x3,%%ymm4,%%ymm5 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),     // %3
    "r"((intptr_t)(src_stride * 3))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa %0,%%xmm3 \n"
    "movdqa %1,%%xmm4 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "palignr $0x8,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
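
// Illustrative sketch (name not part of libyuv): the kShuf0/kShuf1/kShuf2
// tables point-sample 3 of every 4 source pixels (indices 0, 1 and 3),
// producing 24 output pixels from 32 input pixels per loop.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown34_PointSketch(const uint8* src_ptr,
                                       uint8* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
#endif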

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
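
// Illustrative sketch (name not part of libyuv) of the horizontal 3/4 filter
// shared by the two Box variants above: each group of 4 (vertically blended)
// source pixels yields 3 output pixels weighted (3,1), (2,2) and (1,3), with
// kRound34 adding 2 before the shift right by 2.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown34_HFilter_Sketch(const uint8* src,
                                          uint8* dst,
                                          int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[x + 0] = (uint8)((src[0] * 3 + src[1] * 1 + 2) >> 2);
    dst[x + 1] = (uint8)((src[1] * 2 + src[2] * 2 + 2) >> 2);
    dst[x + 2] = (uint8)((src[2] * 1 + src[3] * 3 + 2) >> 2);
    src += 4;
  }
}
#endif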

void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movhlps %%xmm0,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
    "lea " MEMLEA(0xc,1) ",%1 \n"
    "sub $0xc,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
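
// Illustrative sketch (name not part of libyuv): kShuf38a/kShuf38b pick source
// bytes 0, 3, 6, 8, 11 and 14 out of every 16, i.e. 8 source pixels map to 3
// destination pixels by point sampling.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown38_PointSketch(const uint8* src_ptr,
                                       uint8* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}
#endif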

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "movdqa %3,%%xmm5 \n"
  :
  : "m"(kShufAb0),  // %0
    "m"(kShufAb1),  // %1
    "m"(kShufAb2),  // %2
    "m"(kScaleAb2)  // %3
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pshufb %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "paddusw %%xmm6,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "sub $0x6,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
    "movhlps %%xmm0,%%xmm1 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "pshufb %%xmm3,%%xmm7 \n"
    "paddusw %%xmm7,%%xmm6 \n"
    "pmulhuw %%xmm4,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "sub $0x6,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
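
// Illustrative sketch (name not part of libyuv): 8x3 source pixels become 3
// destination pixels; the first two outputs average 3x3 boxes and the third
// averages a 2x3 box, using the 16.16 reciprocals in kScaleAc33 (65536/9 and
// 65536/6) with pmulhuw in place of a divide.
#if 0  // Reference sketch; not compiled.
static void ScaleRowDown38_3_Box_Sketch(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr,
                                        int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    int r, sum0 = 0, sum1 = 0, sum2 = 0;
    for (r = 0; r < 3; ++r) {
      const uint8* s = src_ptr + r * src_stride;
      sum0 += s[0] + s[1] + s[2];
      sum1 += s[3] + s[4] + s[5];
      sum2 += s[6] + s[7];
    }
    dst_ptr[x + 0] = (uint8)((sum0 * (65536 / 9)) >> 16);
    dst_ptr[x + 1] = (uint8)((sum1 * (65536 / 9)) >> 16);
    dst_ptr[x + 2] = (uint8)((sum2 * (65536 / 6)) >> 16);
    src_ptr += 8;
  }
}
#endif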

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"  // src_ptr += 16
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "paddusw %%xmm2,%%xmm0 \n"
    "paddusw %%xmm3,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
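
// Illustrative sketch (name not part of libyuv): each source byte is widened
// and accumulated into the 16-bit destination row. The SSE2 code uses paddusw,
// so the accumulation saturates at 65535 rather than wrapping.
#if 0  // Reference sketch; not compiled.
static void ScaleAddRow_Sketch(const uint8* src_ptr,
                               uint16* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    int sum = dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16)(sum > 65535 ? 65535 : sum);
  }
}
#endif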

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"  // src_ptr += 32
    "vpermq $0xd8,%%ymm3,%%ymm3 \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
    "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd %6,%%xmm2 \n"
    "movd %7,%%xmm3 \n"
    "movl $0x04040000,%k2 \n"
    "movd %k2,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"  // 0x007f007f
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $15,%%xmm7 \n"  // 0x00010001
    "pextrw $0x1,%%xmm2,%k3 \n"
    "subl $0x2,%5 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm1 \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
    "movd %k2,%%xmm4 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "punpcklwd %%xmm4,%%xmm0 \n"
    "psubb %8,%%xmm0 \n"  // make pixels signed.
    "pxor %%xmm6,%%xmm1 \n"  // 128 - f = (f ^ 127 ) + 1
    "paddusb %%xmm7,%%xmm1 \n"
    "pmaddubsw %%xmm0,%%xmm1 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "paddw %9,%%xmm1 \n"  // make pixels unsigned.
    "psrlw $0x7,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movd %%xmm1,%k2 \n"
    "mov %w2," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x2,0) ",%0 \n"
    "subl $0x2,%5 \n"
    "jge 2b \n"
    LABELALIGN
    "29: \n"
    "addl $0x1,%5 \n"
    "jl 99f \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm2 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "psubb %8,%%xmm0 \n"  // make pixels signed.
    "pxor %%xmm6,%%xmm2 \n"
    "paddusb %%xmm7,%%xmm2 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "paddw %9,%%xmm2 \n"  // make pixels unsigned.
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "movd %%xmm2,%k2 \n"
    "mov %b2," MEMACCESS(0) " \n"
    "99: \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),   // %6
    "rm"(dx),  // %7
#if defined(__x86_64__)
    "x"(kFsub80),  // %8
    "x"(kFadd40)   // %9
#else
    "m"(kFsub80),  // %8
    "m"(kFadd40)   // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
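
// Illustrative sketch (name not part of libyuv) of the blend performed above:
// x and dx are 16.16 fixed-point source positions, the asm keeps a 7-bit
// fraction f, and each output is (a * (128 - f) + b * f + 64) >> 7 (the
// kFsub80 bias and kFadd40 constant implement the signed pmaddubsw trick and
// the rounding term).
#if 0  // Reference sketch; not compiled.
static void ScaleFilterCols_Sketch(uint8* dst_ptr,
                                   const uint8* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer part of the source position
    int f = (x >> 9) & 0x7f;  // 7-bit fraction, as in the asm
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)((a * (128 - f) + b * f + 64) >> 7);
    x += dx;
  }
}
#endif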

// Reads 16 pixels, duplicates them and writes 32 pixels.
void ScaleColsUp2_SSE2(uint8* dst_ptr,
                       const uint8* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "shufps $0xdd,%%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb,
                               int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"
    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"
    "49: \n"
    "test $0x2,%4 \n"
    "je 29f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "=&a"(x0),       // %0
    "=&d"(x1),       // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
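
// Illustrative sketch (name not part of libyuv): nearest-neighbor column
// scaling of 32-bit ARGB pixels with a 16.16 fixed-point position x stepped
// by dx; the asm above unrolls this loop four pixels at a time.
#if 0  // Reference sketch; not compiled.
static void ScaleARGBCols_Sketch(uint8* dst_argb,
                                 const uint8* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}
#endif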

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                           const uint8* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,       // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                               const uint8* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "sub $0x2,%2 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
    "psrlw $0x9,%%xmm1 \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
    "pshufb %%xmm5,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"
    LABELALIGN
    "29: \n"
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"
    LABELALIGN
    "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx");
  return num;
}
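
// Illustrative sketch (name not part of libyuv) of the same 16.16 division in
// portable C: widen, shift the numerator up by 16 bits, then divide.
#if 0  // Reference sketch; not compiled.
static int FixedDiv_Sketch(int num, int div) {
  return (int)(((long long)(num) << 16) / div);
}
#endif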

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx");
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif