// scale_dspr2.cc (libyuv) — MIPS DSPR2 row-scaling functions.
  1. /*
  2. * Copyright 2012 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/basic_types.h"
  11. #include "libyuv/row.h"
  12. #ifdef __cplusplus
  13. namespace libyuv {
  14. extern "C" {
  15. #endif
  16. // This module is for GCC MIPS DSPR2
  17. #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
  18. (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
// Point-sample 1/2 horizontal scale: keeps one pixel of each input pair
// (the odd one, per the precrq packing comments below).  src_stride is
// unused — this is a single-row, unfiltered path.
// Main loop emits 16 output pixels per iteration; the tail loop ("21:")
// emits the remaining dst_width % 16 pixels one byte at a time.
void ScaleRowDown2_DSPR2(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst,
                         int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      // t9 = number of 16-pixel main-loop iterations.
      "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
      "beqz $t9, 2f \n"
      " nop \n"
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
      "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
      "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
      "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
      "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
      // precrq keeps the upper byte of each halfword, i.e. the odd pixels.
      // TODO(fbarchard): Use odd pixels instead of even.
      "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
      "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
      "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
      "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu $t9, $t9, -1 \n"
      "sw $t8, 0(%[dst]) \n"
      "sw $t0, 4(%[dst]) \n"
      "sw $t1, 8(%[dst]) \n"
      "sw $t2, 12(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 16 \n"   // branch delay slot
      "2: \n"
      "andi $t9, %[dst_width], 0xf \n" // residue
      "beqz $t9, 3f \n"
      " nop \n"
      // Tail: copy src[2i + 1] one byte per iteration.
      "21: \n"
      "lbu $t0, 1(%[src_ptr]) \n"
      "addiu %[src_ptr], %[src_ptr], 2 \n"
      "addiu $t9, $t9, -1 \n"
      "sb $t0, 0(%[dst]) \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 1 \n"
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
// Box-filter 1/2 scale: each output pixel is the rounded average
// (shra_r.w by 2) of a 2x2 block taken from src_ptr and the next row
// (src_ptr + src_stride).  Main loop emits 8 output pixels per
// iteration; the tail loop ("21:") emits 2 per iteration for the
// remaining dst_width % 8 pixels.
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  const uint8* t = src_ptr + src_stride;  // second input row
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
      // NOTE(review): srl never yields a negative value, so this bltz is
      // never taken; for dst_width < 8 the main loop still runs once.
      // Presumably callers guarantee dst_width >= 8 — confirm (compare the
      // beqz guard used by ScaleRowDown2_DSPR2).
      "bltz $t9, 2f \n"
      " nop \n"
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
      "lw $t4, 0(%[t]) \n" // |19|18|17|16|
      "lw $t5, 4(%[t]) \n" // |23|22|21|20|
      "lw $t6, 8(%[t]) \n" // |27|26|25|24|
      "lw $t7, 12(%[t]) \n" // |31|30|29|28|
      "addiu $t9, $t9, -1 \n"
      // Pair the two pixels of each row with the two pixels below them,
      // then raddu sums all four bytes and shra_r.w rounds the /4.
      "srl $t8, $t0, 16 \n" // |X|X|3|2|
      "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
      "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
      "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
      "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
      "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
      "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
      "srl $t8, $t1, 16 \n" // |X|X|7|6|
      "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
      "ins $t5, $t8, 0, 16 \n" // |23|22|7|6|
      "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
      "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
      "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
      "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
      "srl $t8, $t2, 16 \n" // |X|X|11|10|
      "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
      "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
      "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
      "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
      "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
      "shra_r.w $t6, $t6, 2 \n" // |t6+2|>>2
      "srl $t8, $t3, 16 \n" // |X|X|15|14|
      "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
      "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
      "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
      "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
      "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
      "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
      "addiu %[src_ptr], %[src_ptr], 16 \n"
      "addiu %[t], %[t], 16 \n"
      "sb $t0, 0(%[dst]) \n"
      "sb $t4, 1(%[dst]) \n"
      "sb $t1, 2(%[dst]) \n"
      "sb $t5, 3(%[dst]) \n"
      "sb $t2, 4(%[dst]) \n"
      "sb $t6, 5(%[dst]) \n"
      "sb $t3, 6(%[dst]) \n"
      "sb $t7, 7(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 8 \n"   // branch delay slot
      "2: \n"
      "andi $t9, %[dst_width], 0x7 \n" // x = residue
      "beqz $t9, 3f \n"
      " nop \n"
      // Tail: unaligned-safe lwr/lwl loads, 2 output pixels per pass.
      "21: \n"
      "lwr $t1, 0(%[src_ptr]) \n"
      "lwl $t1, 3(%[src_ptr]) \n"
      "lwr $t2, 0(%[t]) \n"
      "lwl $t2, 3(%[t]) \n"
      "srl $t8, $t1, 16 \n"
      "ins $t1, $t2, 16, 16 \n"
      "ins $t2, $t8, 0, 16 \n"
      "raddu.w.qb $t1, $t1 \n"
      "raddu.w.qb $t2, $t2 \n"
      "shra_r.w $t1, $t1, 2 \n"
      "shra_r.w $t2, $t2, 2 \n"
      "sb $t1, 0(%[dst]) \n"
      "sb $t2, 1(%[dst]) \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "addiu $t9, $t9, -2 \n"
      "addiu %[t], %[t], 4 \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 2 \n"
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
// Point-sample 1/4 horizontal scale: keeps pixel 2 of every group of 4
// (dst[i] = src_ptr[4 * i + 2], per the packing comments and the tail's
// "lbu ..., 2(...)").  src_stride is unused.  Main loop emits 8 output
// pixels per iteration; tail loop handles dst_width % 8.
void ScaleRowDown4_DSPR2(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst,
                         int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      // t9 = number of 8-pixel main-loop iterations.
      "srl $t9, %[dst_width], 3 \n"
      "beqz $t9, 2f \n"
      " nop \n"
      "1: \n"
      "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
      "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
      "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
      "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
      "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
      "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
      "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
      // First pass keeps even pixels, second pass keeps the odd of those,
      // i.e. indices 2, 6, 10, ... of the original row.
      "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
      "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
      "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
      "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
      "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
      "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu $t9, $t9, -1 \n"
      "sw $t1, 0(%[dst]) \n"
      "sw $t5, 4(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 8 \n"   // branch delay slot
      "2: \n"
      "andi $t9, %[dst_width], 7 \n" // residue
      "beqz $t9, 3f \n"
      " nop \n"
      // Tail: copy src[4i + 2] one byte per iteration.
      "21: \n"
      "lbu $t1, 2(%[src_ptr]) \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "addiu $t9, $t9, -1 \n"
      "sb $t1, 0(%[dst]) \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 1 \n"
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
// Box-filter 1/4 scale: each output pixel is the rounded average
// (shra_r.w by 4) of a 4x4 block spanning src_ptr and the next three
// rows.  Main loop emits 2 output pixels per iteration; an odd final
// pixel (t8 = dst_width & 1) is handled after the loop.
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;   // rows 1..3 of the 4x4 block
  const uint8* s2 = s1 + stride;
  const uint8* s3 = s2 + stride;
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "srl $t9, %[dst_width], 1 \n"   // t9 = pairs of output pixels
      "andi $t8, %[dst_width], 1 \n"  // t8 = odd trailing pixel flag
      // NOTE(review): no zero-check before "1:", so the loop body always
      // runs at least once — presumably dst_width >= 2 is guaranteed by
      // callers; confirm.
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
      "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
      "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
      "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
      "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
      "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
      "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
      // raddu sums the 4 bytes of each word; the adds below combine the
      // four rows, giving a 16-pixel sum per output.
      "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
      "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
      "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
      "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
      "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
      "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
      "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
      "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
      "add $t0, $t0, $t1 \n"
      "add $t1, $t2, $t3 \n"
      "add $t0, $t0, $t1 \n"
      "add $t4, $t4, $t5 \n"
      "add $t6, $t6, $t7 \n"
      "add $t4, $t4, $t6 \n"
      "shra_r.w $t0, $t0, 4 \n"  // rounded /16
      "shra_r.w $t4, $t4, 4 \n"
      "sb $t0, 0(%[dst]) \n"
      "sb $t4, 1(%[dst]) \n"
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[s1], %[s1], 8 \n"
      "addiu %[s2], %[s2], 8 \n"
      "addiu %[s3], %[s3], 8 \n"
      "addiu $t9, $t9, -1 \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 2 \n"   // branch delay slot
      // Trailing odd output pixel, if any.
      "beqz $t8, 2f \n"
      " nop \n"
      "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
      "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
      "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
      "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
      "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
      "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
      "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
      "add $t0, $t0, $t1 \n"
      "add $t1, $t2, $t3 \n"
      "add $t0, $t0, $t1 \n"
      "shra_r.w $t0, $t0, 4 \n"
      "sb $t0, 0(%[dst]) \n"
      "2: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
        [s3] "+r"(s3)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
// Point-sample 3/4 scale: drops every pixel at index 2 mod 4, keeping
// indices 0, 1, 3 of each group of 4 (see the packing comments — the
// words written out are |4|3|1|0|, |9|8|7|5|, |15|13|12|11|, ...).
// src_stride is unused.  Each iteration consumes 32 source pixels and
// emits 24; dst_width is counted down by 24 and tested with bnez, so
// dst_width must be a multiple of 24.
void ScaleRowDown34_DSPR2(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst,
                          int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "1: \n"
      "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
      "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
      "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
      "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
      "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
      "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
      "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
      "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
      "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
      "addiu %[dst_width], %[dst_width], -24 \n"
      "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
      "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
      "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
      "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
      "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
      "prepend $t1, $t2, 8 \n" // |4|3|1|0|
      "prepend $t3, $t4, 24 \n" // |15|13|12|11|
      "prepend $t5, $t6, 8 \n" // |20|19|17|16|
      "prepend $t7, $t8, 24 \n" // |31|29|28|27|
      "sw $t1, 0(%[dst]) \n"
      "sw $t0, 4(%[dst]) \n"
      "sw $t3, 8(%[dst]) \n"
      "sw $t5, 12(%[dst]) \n"
      "sw $t9, 16(%[dst]) \n"
      "sw $t7, 20(%[dst]) \n"
      "bnez %[dst_width], 1b \n"
      " addiu %[dst], %[dst], 24 \n"   // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
// Filtered 3/4 scale blending two rows: each iteration reads 4 pixels
// from src_ptr and 4 from src_ptr + src_stride (via lwx) and emits 3
// output pixels.  The per-row 3:1 neighbor weighting comes from the
// muleu_s.ph.qbl multiply by the repl.ph constant 3 plus the shra_r
// rounding shifts; the two rows are then combined with a further 3x
// weight on one of them (the sll/add by-3 sequences).  NOTE(review):
// exact output weights not re-derived here — presumably this is the
// "0" (3/4 row bias) variant paired with ScaleRowDown34_1_Box below;
// confirm against the portable C ScaleRowDown34_0_Box.
// dst_width is decremented by 3 per iteration (bgtz loop).
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* d,
                                int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "repl.ph $t3, 3 \n" // 0x00030003
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
      "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
      "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
      "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
      "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
      "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t1, $t1 \n"
      "shra_r.w $t0, $t0, 1 \n"  // rounded (S1+S2)/2
      "shra_r.w $t1, $t1, 1 \n"  // rounded (T1+T2)/2
      "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
      "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
      "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
      "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
      "addu.ph $t2, $t2, $t4 \n"
      "addu.ph $t6, $t6, $t5 \n"
      "sll $t5, $t0, 1 \n"       // t0 * 3 (middle pixel, row S weight)
      "add $t0, $t5, $t0 \n"
      "shra_r.ph $t2, $t2, 2 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "shll.ph $t4, $t2, 1 \n"   // t2 * 3 (row S weight)
      "addq.ph $t4, $t4, $t2 \n"
      "addu $t0, $t0, $t1 \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "shra_r.w $t0, $t0, 2 \n"
      "addu.ph $t6, $t6, $t4 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "srl $t1, $t6, 16 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "sb $t1, 0(%[d]) \n"
      "sb $t0, 1(%[d]) \n"
      "sb $t6, 2(%[d]) \n"
      "bgtz %[dst_width], 1b \n"
      " addiu %[d], %[d], 3 \n"   // branch delay slot
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
        [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
// Filtered 3/4 scale blending two rows with equal row weights: like
// ScaleRowDown34_0_Box_DSPR2, each iteration reads 4 pixels from
// src_ptr and 4 from src_ptr + src_stride (lwx) and emits 3 output
// pixels, but the two rows are combined with a plain add and a final
// rounded /2 (shra_r by 1) instead of the 3:1 row bias.
// dst_width is decremented by 3 per iteration (bgtz loop).
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* d,
                                int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "repl.ph $t2, 3 \n" // 0x00030003
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
      "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
      "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
      "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
      "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
      "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t1, $t1 \n"
      "shra_r.w $t0, $t0, 1 \n"  // rounded (S1+S2)/2
      "shra_r.w $t1, $t1, 1 \n"  // rounded (T1+T2)/2
      "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
      "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
      "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
      "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
      "addu.ph $t4, $t4, $t3 \n"
      "addu.ph $t6, $t6, $t5 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "shra_r.ph $t4, $t4, 2 \n"
      "addu.ph $t6, $t6, $t4 \n"   // row S + row T, equal weights
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "shra_r.ph $t6, $t6, 1 \n"   // rounded /2
      "addu $t0, $t0, $t1 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "shra_r.w $t0, $t0, 1 \n"
      "srl $t1, $t6, 16 \n"
      "sb $t1, 0(%[d]) \n"
      "sb $t0, 1(%[d]) \n"
      "sb $t6, 2(%[d]) \n"
      "bgtz %[dst_width], 1b \n"
      " addiu %[d], %[d], 3 \n"   // branch delay slot
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
        [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
// Point-sample 3/8 scale: each iteration consumes 32 source pixels and
// emits 12 (the three words stored are |8|6|3|0|, |19|16|14|11|,
// |30|27|24|22| per the packing comments).  src_stride is unused.
// The loop continues while dst_width (already decremented by 12) still
// has at least 12 remaining (bgez on dst_width - 12).
void ScaleRowDown38_DSPR2(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst,
                          int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
      "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
      "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
      "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
      "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
      "wsbh $t0, $t0 \n" // |2|3|0|1|
      "wsbh $t6, $t6 \n" // |26|27|24|25|
      "srl $t0, $t0, 8 \n" // |X|2|3|0|
      "srl $t3, $t3, 16 \n" // |X|X|15|14|
      "srl $t5, $t5, 16 \n" // |X|X|23|22|
      "srl $t7, $t7, 16 \n" // |X|X|31|30|
      "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
      "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
      "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
      "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
      "prepend $t2, $t3, 24 \n" // |X|15|14|11|
      "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
      "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu %[dst_width], %[dst_width], -12 \n"
      "addiu $t8,%[dst_width], -12 \n"  // t8 < 0 when fewer than 12 remain
      "sw $t1, 0(%[dst]) \n"
      "sw $t4, 4(%[dst]) \n"
      "sw $t6, 8(%[dst]) \n"
      "bgez $t8, 1b \n"
      " addiu %[dst], %[dst], 12 \n"   // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
// Filtered 3/8 scale averaging two rows: each iteration reads 8 pixels
// from src_ptr and 8 from the next row (t = src_ptr + src_stride) and
// emits 3 output pixels.  Sums of 4 pixels are divided by 4 (srl 2);
// sums of 6 pixels are divided by ~6 via the fixed-point multiplier
// c = 0x2AAA (~= 65536/6) followed by srl 16.
// dst_width is decremented by 3 per iteration (bgtz loop).
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;  // second input row
  const int c = 0x2AAA;               // ~= 65536 / 6, fixed-point divide
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
      "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
      "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
      "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
      "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
      "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
      "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
      "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
      "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
      "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
      "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
      "srl $t4, $t4, 2 \n" // t4 / 4
      "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
      "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
      "addu $t6, $t5, $t6 \n"  // 6-pixel sum: S5+S4+T5+T4+S3+T3
      "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
      "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
      "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
      "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
      "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
      "addu $t0, $t0, $t2 \n"  // 6-pixel sum: S0..S2 + T0..T2
      "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[t], %[t], 8 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "addiu %[dst_ptr], %[dst_ptr], 3 \n"
      "srl $t6, $t6, 16 \n"  // take the integer part of sum/6
      "srl $t0, $t0, 16 \n"
      "sb $t4, -1(%[dst_ptr]) \n"
      "sb $t6, -2(%[dst_ptr]) \n"
      "bgtz %[dst_width], 1b \n"
      " sb $t0, -3(%[dst_ptr]) \n"   // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
        [dst_width] "+r"(dst_width)
      : [c] "r"(c)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
// Filtered 3/8 scale averaging three rows: each iteration reads 8 pixels
// from src_ptr plus 8 each from the next two rows (s1, s2) and emits 3
// output pixels.  Fixed-point multipliers implement the divides:
// c1 = 0x1C71 (~= 65536/9) for 9-pixel sums, c2 = 0x2AAA (~= 65536/6)
// for 6-pixel sums, each followed by srl 16.
// dst_width is decremented by 3 per iteration (bgtz loop).
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;  // second input row
  stride += stride;
  const uint8* s2 = src_ptr + stride;  // third input row
  const int c1 = 0x1C71;               // ~= 65536 / 9
  const int c2 = 0x2AAA;               // ~= 65536 / 6
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
      "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
      "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
      "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
      "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
      "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
      "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
      "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
      "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
      "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
      "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
      "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
      "raddu.w.qb $t8, $t8 \n" // R5+R4
      "addu $t7, $t7, $t8 \n"
      "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
      "raddu.w.qb $t8, $t8 \n" // R7 + R6
      "addu $t6, $t6, $t8 \n"  // 6-pixel sum: S6,S7,T6,T7,R6,R7
      "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
      "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
      "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
      "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
      "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
      "addu $t7, $t7, $t8 \n"  // 9-pixel sum
      "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
      "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
      "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
      "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t2, $t2 \n"
      "raddu.w.qb $t4, $t4 \n"
      "addu $t0, $t0, $t2 \n"
      "addu $t0, $t0, $t4 \n"  // 9-pixel sum: S0..S2, T0..T2, R0..R2
      "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[s1], %[s1], 8 \n"
      "addiu %[s2], %[s2], 8 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "addiu %[dst_ptr], %[dst_ptr], 3 \n"
      "srl $t6, $t6, 16 \n"  // take the integer part of each divide
      "srl $t7, $t7, 16 \n"
      "srl $t0, $t0, 16 \n"
      "sb $t6, -1(%[dst_ptr]) \n"
      "sb $t7, -2(%[dst_ptr]) \n"
      "bgtz %[dst_width], 1b \n"
      " sb $t0, -3(%[dst_ptr]) \n"   // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
        [s2] "+r"(s2), [dst_width] "+r"(dst_width)
      : [c1] "r"(c1), [c2] "r"(c2)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
  574. void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  575. int x;
  576. for (x = 0; x < ((src_width - 1)); x += 8) {
  577. uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
  578. uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
  579. __asm__ __volatile__(
  580. ".set push \n"
  581. ".set noreorder \n"
  582. "lw %[tmp_t5], 0(%[src_ptr]) \n"
  583. "lw %[tmp_t6], 4(%[src_ptr]) \n"
  584. "lw %[tmp_t1], 0(%[dst_ptr]) \n"
  585. "lw %[tmp_t2], 4(%[dst_ptr]) \n"
  586. "lw %[tmp_t3], 8(%[dst_ptr]) \n"
  587. "lw %[tmp_t4], 12(%[dst_ptr]) \n"
  588. "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
  589. "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
  590. "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
  591. "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
  592. "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
  593. "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
  594. "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
  595. "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
  596. "sw %[tmp_t1], 0(%[dst_ptr]) \n"
  597. "sw %[tmp_t2], 4(%[dst_ptr]) \n"
  598. "sw %[tmp_t3], 8(%[dst_ptr]) \n"
  599. "sw %[tmp_t4], 12(%[dst_ptr]) \n"
  600. ".set pop \n"
  601. :
  602. [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
  603. [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  604. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
  605. : [dst_ptr] "r"(dst_ptr));
  606. src_ptr += 8;
  607. dst_ptr += 8;
  608. }
  609. if ((src_width)&7) {
  610. for (x = 0; x < ((src_width - 1) & 7); x += 1) {
  611. dst_ptr[0] += src_ptr[0];
  612. src_ptr += 1;
  613. dst_ptr += 1;
  614. }
  615. }
  616. }
  617. #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
  618. #ifdef __cplusplus
  619. } // extern "C"
  620. } // namespace libyuv
  621. #endif