// scale_win.cc

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant (+2) added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};
static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                         6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                         6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0, 0};

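// Illustrative note (not part of the original file): the kScaleAc33 and
// kScaleAb2 tables implement division of box sums by 9, 6, 3 or 2 with
// pmulhuw, i.e. multiply by 65536 / n and keep the high 16 bits. A scalar
// sketch of that trick, with a hypothetical helper name, assuming the sum
// fits in 16 bits:
static uint16 ScaleSumByReciprocal_Ref(uint16 sum, uint16 recip /* 65536/n */) {
  // Approximately sum / n; slightly low for exact multiples, which the
  // 3/8 scalers tolerate.
  return (uint16)(((uint32)sum * recip) >> 16);
}
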
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    ret
  }
}

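// Illustrative scalar sketch (not part of the original file) of what
// ScaleRowDown2_SSSE3 above computes: the psrlw/packuswb sequence keeps the
// odd source pixels, so each output pixel is src_ptr[2 * i + 1].
static void ScaleRowDown2_Ref(const uint8* src_ptr, uint8* dst_ptr,
    int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i * 2 + 1];
  }
}
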
// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    ret
  }
}

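// Illustrative scalar sketch (not part of the original file) of the linear
// 2:1 filter above: pmaddubsw with the 0x0101 constant sums each horizontal
// pair and pavgw against zero halves it with rounding.
static void ScaleRowDown2Linear_Ref(const uint8* src_ptr, uint8* dst_ptr,
    int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i * 2] + src_ptr[i * 2 + 1] + 1) >> 1);
  }
}
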
// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    pop esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

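// Illustrative scalar sketch (not part of the original file) of the 2x2 box
// filters above and of the rounding identity in their comment:
// (sum + 2) / 4 == (((sum >> 1) + 1) >> 1), which is what the shift-by-1
// followed by pavgw against zero computes.
static uint8 ScaleBox2x2_Ref(uint8 a, uint8 b, uint8 c, uint8 d) {
  int sum = a + b + c + d;
  return (uint8)(((sum >> 1) + 1) >> 1);  // same as (sum + 2) >> 2
}
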
// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop
    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3  // constant 0x0008

  wloop:
    movdqu xmm0, [eax]  // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // + 8 for round
    psrlw xmm0, 4  // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop
    pop edi
    pop esi
    ret
  }
}

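// Illustrative scalar sketch (not part of the original file) of the 4x4 box
// average above: sixteen source pixels are summed and divided by 16 with
// rounding (+ 8 before the shift), matching the paddw/phaddw/psrlw sequence.
static uint8 ScaleBox4x4_Ref(const uint8* src_ptr, ptrdiff_t src_stride) {
  int sum = 0;
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      sum += src_ptr[y * src_stride + x];
    }
  }
  return (uint8)((sum + 8) >> 4);
}
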
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3  // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4

  wloop:
    vmovdqu ymm0, [eax]  // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1  // mutates
    vpermq ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    pop esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    pop esi
    ret
  }
}

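// Illustrative scalar sketch (not part of the original file) of the
// horizontal 3/4 weighting used by the ScaleRowDown34 box filters above:
// kShuf01/kMadd01 and friends turn every 4 (already row-blended) source
// pixels into 3 destination pixels with weights 3:1, 2:2 and 1:3, then add
// the kRound34 constant before the shift.
static void ScaleRowDown34_Horizontal_Ref(const uint8* src_ptr,
    uint8* dst_ptr, int dst_width) {
  int i;
  for (i = 0; i < dst_width; i += 3) {
    const uint8* s = src_ptr + (i / 3) * 4;
    dst_ptr[i + 0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
    dst_ptr[i + 1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
    dst_ptr[i + 2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
  }
}
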
// 3/8 point sampler
// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop
    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5

  xloop:
    movdqu xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2
    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7
    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6
    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop
    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
    ptrdiff_t src_stride,
    uint8* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu xmm0, [eax]  // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1
    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0
    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1
    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop
    pop esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
    uint16* dst_ptr,
    int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5

  // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx]  // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0  // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop
    ret
  }
}

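// Illustrative scalar sketch (not part of the original file) of the row
// accumulator above: each source byte is added into a 16-bit accumulator,
// saturating the way paddusw does.
static void ScaleAddRow_Ref(const uint8* src_ptr, uint16* dst_ptr,
    int src_width) {
  int i;
  for (i = 0; i < src_width; ++i) {
    uint32 sum = (uint32)dst_ptr[i] + src_ptr[i];
    dst_ptr[i] = (uint16)(sum < 65535 ? sum : 65535);
  }
}
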
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
    uint16* dst_ptr,
    int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5

  // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0  // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
    const uint8* src_ptr,
    int dst_width,
    int x,
    int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:
    pop edi
    pop esi
    pop ebx
    ret
  }
}

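// Illustrative scalar sketch (not part of the original file) of the bilinear
// column filter above. x and dx are 16.16 fixed point; only the top 7 bits
// of the fraction are used, so the blend weights sum to 128. Subtracting
// 0x80 from the pixels (kFsub80) and adding 0x4040 afterwards (kFadd40) is
// the signed/unsigned trick that keeps pmaddubsw from saturating; it cancels
// out to a plain weighted average with +64 rounding. Assumes the caller
// supplies x/dx that keep xi + 1 in bounds, as the scaler does.
static void ScaleFilterCols_Ref(uint8* dst_ptr, const uint8* src_ptr,
    int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source position
    int f = (x >> 9) & 0x7f;   // 7-bit fraction
    dst_ptr[j] = (uint8)(
        (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
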
// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
    const uint8* src_ptr,
    int dst_width,
    int x,
    int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
    ptrdiff_t src_stride,
    uint8* dst_argb,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd  // odd pixels
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
    ptrdiff_t src_stride,
    uint8* dst_argb,
    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
    ptrdiff_t src_stride,
    uint8* dst_argb,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
    ptrdiff_t src_stride,
    int src_stepx,
    uint8* dst_argb,
    int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
    ptrdiff_t src_stride,
    int src_stepx,
    uint8* dst_argb,
    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
    const uint8* src_argb,
    int dst_width,
    int x,
    int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.
    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1
    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixel.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0

  xloop99:
    pop esi
    pop edi
    ret
  }
}

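// Illustrative scalar sketch (not part of the original file) of the
// unfiltered column scaler above: x and dx are 16.16 fixed point, and each
// destination ARGB pixel is a straight copy of the source pixel at x >> 16,
// treating every ARGB pixel as one 32-bit value.
static void ScaleARGBCols_Ref(uint8* dst_argb, const uint8* src_argb,
    int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}
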
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
    const uint8* src_argb,
    int dst_width,
    int x,
    int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:
    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
    const uint8* src_argb,
    int dst_width,
    int x,
    int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}

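// Illustrative scalar sketch (not part of the original file) of the two
// fixed-point helpers above, assuming a 64-bit intermediate: FixedDiv_X86
// returns (num << 16) / div and FixedDiv1_X86 returns
// ((num << 16) - 0x00010001) / (div - 1), both as 16.16 fixed point.
static int FixedDiv_Ref(int num, int div) {
  return (int)((((int64)num) << 16) / div);
}
static int FixedDiv1_Ref(int num, int div) {
  return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
}
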
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif