row_dspr2.cc 90 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
66116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721
  1. /*
  2. * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #ifdef __cplusplus
  12. namespace libyuv {
  13. extern "C" {
  14. #endif
  15. // The following are available on Mips platforms:
  16. #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
  17. (_MIPS_SIM == _MIPS_SIM_ABI32)
  18. #ifdef HAS_COPYROW_MIPS
  19. void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
  20. __asm__ __volatile__(
  21. ".set noreorder \n"
  22. ".set noat \n"
  23. "slti $at, %[count], 8 \n"
  24. "bne $at ,$zero, $last8 \n"
  25. "xor $t8, %[src], %[dst] \n"
  26. "andi $t8, $t8, 0x3 \n"
  27. "bne $t8, $zero, unaligned \n"
  28. "negu $a3, %[dst] \n"
  29. // make dst/src aligned
  30. "andi $a3, $a3, 0x3 \n"
  31. "beq $a3, $zero, $chk16w \n"
  32. // word-aligned now count is the remining bytes count
  33. "subu %[count], %[count], $a3 \n"
  34. "lwr $t8, 0(%[src]) \n"
  35. "addu %[src], %[src], $a3 \n"
  36. "swr $t8, 0(%[dst]) \n"
  37. "addu %[dst], %[dst], $a3 \n"
  38. // Now the dst/src are mutually word-aligned with word-aligned addresses
  39. "$chk16w: \n"
  40. "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
  41. // t8 is the byte count after 64-byte chunks
  42. "beq %[count], $t8, chk8w \n"
  43. // There will be at most 1 32-byte chunk after it
  44. "subu $a3, %[count], $t8 \n" // the reminder
  45. // Here a3 counts bytes in 16w chunks
  46. "addu $a3, %[dst], $a3 \n"
  47. // Now a3 is the final dst after 64-byte chunks
  48. "addu $t0, %[dst], %[count] \n"
  49. // t0 is the "past the end" address
  50. // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be
  51. // past
  52. // the "t0-32" address
  53. // This means: for x=128 the last "safe" a1 address is "t0-160"
  54. // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
  55. // we will use "pref 30,128(a1)", so "t0-160" is the limit
  56. "subu $t9, $t0, 160 \n"
  57. // t9 is the "last safe pref 30,128(a1)" address
  58. "pref 0, 0(%[src]) \n" // first line of src
  59. "pref 0, 32(%[src]) \n" // second line of src
  60. "pref 0, 64(%[src]) \n"
  61. "pref 30, 32(%[dst]) \n"
  62. // In case the a1 > t9 don't use "pref 30" at all
  63. "sltu $v1, $t9, %[dst] \n"
  64. "bgtz $v1, $loop16w \n"
  65. "nop \n"
  66. // otherwise, start with using pref30
  67. "pref 30, 64(%[dst]) \n"
  68. "$loop16w: \n"
  69. "pref 0, 96(%[src]) \n"
  70. "lw $t0, 0(%[src]) \n"
  71. "bgtz $v1, $skip_pref30_96 \n" // skip
  72. "lw $t1, 4(%[src]) \n"
  73. "pref 30, 96(%[dst]) \n" // continue
  74. "$skip_pref30_96: \n"
  75. "lw $t2, 8(%[src]) \n"
  76. "lw $t3, 12(%[src]) \n"
  77. "lw $t4, 16(%[src]) \n"
  78. "lw $t5, 20(%[src]) \n"
  79. "lw $t6, 24(%[src]) \n"
  80. "lw $t7, 28(%[src]) \n"
  81. "pref 0, 128(%[src]) \n"
  82. // bring the next lines of src, addr 128
  83. "sw $t0, 0(%[dst]) \n"
  84. "sw $t1, 4(%[dst]) \n"
  85. "sw $t2, 8(%[dst]) \n"
  86. "sw $t3, 12(%[dst]) \n"
  87. "sw $t4, 16(%[dst]) \n"
  88. "sw $t5, 20(%[dst]) \n"
  89. "sw $t6, 24(%[dst]) \n"
  90. "sw $t7, 28(%[dst]) \n"
  91. "lw $t0, 32(%[src]) \n"
  92. "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
  93. "lw $t1, 36(%[src]) \n"
  94. "pref 30, 128(%[dst]) \n" // set dest, addr 128
  95. "$skip_pref30_128: \n"
  96. "lw $t2, 40(%[src]) \n"
  97. "lw $t3, 44(%[src]) \n"
  98. "lw $t4, 48(%[src]) \n"
  99. "lw $t5, 52(%[src]) \n"
  100. "lw $t6, 56(%[src]) \n"
  101. "lw $t7, 60(%[src]) \n"
  102. "pref 0, 160(%[src]) \n"
  103. // bring the next lines of src, addr 160
  104. "sw $t0, 32(%[dst]) \n"
  105. "sw $t1, 36(%[dst]) \n"
  106. "sw $t2, 40(%[dst]) \n"
  107. "sw $t3, 44(%[dst]) \n"
  108. "sw $t4, 48(%[dst]) \n"
  109. "sw $t5, 52(%[dst]) \n"
  110. "sw $t6, 56(%[dst]) \n"
  111. "sw $t7, 60(%[dst]) \n"
  112. "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
  113. "sltu $v1, $t9, %[dst] \n"
  114. "bne %[dst], $a3, $loop16w \n"
  115. " addiu %[src], %[src], 64 \n" // adding 64 to src
  116. "move %[count], $t8 \n"
  117. // Here we have src and dest word-aligned but less than 64-bytes to go
  118. "chk8w: \n"
  119. "pref 0, 0x0(%[src]) \n"
  120. "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
  121. // the t8 is the reminder count past 32-bytes
  122. "beq %[count], $t8, chk1w \n"
  123. // count=t8,no 32-byte chunk
  124. " nop \n"
  125. "lw $t0, 0(%[src]) \n"
  126. "lw $t1, 4(%[src]) \n"
  127. "lw $t2, 8(%[src]) \n"
  128. "lw $t3, 12(%[src]) \n"
  129. "lw $t4, 16(%[src]) \n"
  130. "lw $t5, 20(%[src]) \n"
  131. "lw $t6, 24(%[src]) \n"
  132. "lw $t7, 28(%[src]) \n"
  133. "addiu %[src], %[src], 32 \n"
  134. "sw $t0, 0(%[dst]) \n"
  135. "sw $t1, 4(%[dst]) \n"
  136. "sw $t2, 8(%[dst]) \n"
  137. "sw $t3, 12(%[dst]) \n"
  138. "sw $t4, 16(%[dst]) \n"
  139. "sw $t5, 20(%[dst]) \n"
  140. "sw $t6, 24(%[dst]) \n"
  141. "sw $t7, 28(%[dst]) \n"
  142. "addiu %[dst], %[dst], 32 \n"
  143. "chk1w: \n"
  144. "andi %[count], $t8, 0x3 \n"
  145. // now count is the reminder past 1w chunks
  146. "beq %[count], $t8, $last8 \n"
  147. " subu $a3, $t8, %[count] \n"
  148. // a3 is count of bytes in 1w chunks
  149. "addu $a3, %[dst], $a3 \n"
  150. // now a3 is the dst address past the 1w chunks
  151. // copying in words (4-byte chunks)
  152. "$wordCopy_loop: \n"
  153. "lw $t3, 0(%[src]) \n"
  154. // the first t3 may be equal t0 ... optimize?
  155. "addiu %[src], %[src],4 \n"
  156. "addiu %[dst], %[dst],4 \n"
  157. "bne %[dst], $a3,$wordCopy_loop \n"
  158. " sw $t3, -4(%[dst]) \n"
  159. // For the last (<8) bytes
  160. "$last8: \n"
  161. "blez %[count], leave \n"
  162. " addu $a3, %[dst], %[count] \n" // a3 -last dst address
  163. "$last8loop: \n"
  164. "lb $v1, 0(%[src]) \n"
  165. "addiu %[src], %[src], 1 \n"
  166. "addiu %[dst], %[dst], 1 \n"
  167. "bne %[dst], $a3, $last8loop \n"
  168. " sb $v1, -1(%[dst]) \n"
  169. "leave: \n"
  170. " j $ra \n"
  171. " nop \n"
  172. //
  173. // UNALIGNED case
  174. //
  175. "unaligned: \n"
  176. // got here with a3="negu a1"
  177. "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
  178. "beqz $a3, $ua_chk16w \n"
  179. " subu %[count], %[count], $a3 \n"
  180. // bytes left after initial a3 bytes
  181. "lwr $v1, 0(%[src]) \n"
  182. "lwl $v1, 3(%[src]) \n"
  183. "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
  184. "swr $v1, 0(%[dst]) \n"
  185. "addu %[dst], %[dst], $a3 \n"
  186. // below the dst will be word aligned (NOTE1)
  187. "$ua_chk16w: \n"
  188. "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
  189. // t8 is the byte count after 64-byte chunks
  190. "beq %[count], $t8, ua_chk8w \n"
  191. // if a2==t8, no 64-byte chunks
  192. // There will be at most 1 32-byte chunk after it
  193. "subu $a3, %[count], $t8 \n" // the reminder
  194. // Here a3 counts bytes in 16w chunks
  195. "addu $a3, %[dst], $a3 \n"
  196. // Now a3 is the final dst after 64-byte chunks
  197. "addu $t0, %[dst], %[count] \n" // t0 "past the end"
  198. "subu $t9, $t0, 160 \n"
  199. // t9 is the "last safe pref 30,128(a1)" address
  200. "pref 0, 0(%[src]) \n" // first line of src
  201. "pref 0, 32(%[src]) \n" // second line addr 32
  202. "pref 0, 64(%[src]) \n"
  203. "pref 30, 32(%[dst]) \n"
  204. // safe, as we have at least 64 bytes ahead
  205. // In case the a1 > t9 don't use "pref 30" at all
  206. "sltu $v1, $t9, %[dst] \n"
  207. "bgtz $v1, $ua_loop16w \n"
  208. // skip "pref 30,64(a1)" for too short arrays
  209. " nop \n"
  210. // otherwise, start with using pref30
  211. "pref 30, 64(%[dst]) \n"
  212. "$ua_loop16w: \n"
  213. "pref 0, 96(%[src]) \n"
  214. "lwr $t0, 0(%[src]) \n"
  215. "lwl $t0, 3(%[src]) \n"
  216. "lwr $t1, 4(%[src]) \n"
  217. "bgtz $v1, $ua_skip_pref30_96 \n"
  218. " lwl $t1, 7(%[src]) \n"
  219. "pref 30, 96(%[dst]) \n"
  220. // continue setting up the dest, addr 96
  221. "$ua_skip_pref30_96: \n"
  222. "lwr $t2, 8(%[src]) \n"
  223. "lwl $t2, 11(%[src]) \n"
  224. "lwr $t3, 12(%[src]) \n"
  225. "lwl $t3, 15(%[src]) \n"
  226. "lwr $t4, 16(%[src]) \n"
  227. "lwl $t4, 19(%[src]) \n"
  228. "lwr $t5, 20(%[src]) \n"
  229. "lwl $t5, 23(%[src]) \n"
  230. "lwr $t6, 24(%[src]) \n"
  231. "lwl $t6, 27(%[src]) \n"
  232. "lwr $t7, 28(%[src]) \n"
  233. "lwl $t7, 31(%[src]) \n"
  234. "pref 0, 128(%[src]) \n"
  235. // bring the next lines of src, addr 128
  236. "sw $t0, 0(%[dst]) \n"
  237. "sw $t1, 4(%[dst]) \n"
  238. "sw $t2, 8(%[dst]) \n"
  239. "sw $t3, 12(%[dst]) \n"
  240. "sw $t4, 16(%[dst]) \n"
  241. "sw $t5, 20(%[dst]) \n"
  242. "sw $t6, 24(%[dst]) \n"
  243. "sw $t7, 28(%[dst]) \n"
  244. "lwr $t0, 32(%[src]) \n"
  245. "lwl $t0, 35(%[src]) \n"
  246. "lwr $t1, 36(%[src]) \n"
  247. "bgtz $v1, ua_skip_pref30_128 \n"
  248. " lwl $t1, 39(%[src]) \n"
  249. "pref 30, 128(%[dst]) \n"
  250. // continue setting up the dest, addr 128
  251. "ua_skip_pref30_128: \n"
  252. "lwr $t2, 40(%[src]) \n"
  253. "lwl $t2, 43(%[src]) \n"
  254. "lwr $t3, 44(%[src]) \n"
  255. "lwl $t3, 47(%[src]) \n"
  256. "lwr $t4, 48(%[src]) \n"
  257. "lwl $t4, 51(%[src]) \n"
  258. "lwr $t5, 52(%[src]) \n"
  259. "lwl $t5, 55(%[src]) \n"
  260. "lwr $t6, 56(%[src]) \n"
  261. "lwl $t6, 59(%[src]) \n"
  262. "lwr $t7, 60(%[src]) \n"
  263. "lwl $t7, 63(%[src]) \n"
  264. "pref 0, 160(%[src]) \n"
  265. // bring the next lines of src, addr 160
  266. "sw $t0, 32(%[dst]) \n"
  267. "sw $t1, 36(%[dst]) \n"
  268. "sw $t2, 40(%[dst]) \n"
  269. "sw $t3, 44(%[dst]) \n"
  270. "sw $t4, 48(%[dst]) \n"
  271. "sw $t5, 52(%[dst]) \n"
  272. "sw $t6, 56(%[dst]) \n"
  273. "sw $t7, 60(%[dst]) \n"
  274. "addiu %[dst],%[dst],64 \n" // adding 64 to dest
  275. "sltu $v1,$t9,%[dst] \n"
  276. "bne %[dst],$a3,$ua_loop16w \n"
  277. " addiu %[src],%[src],64 \n" // adding 64 to src
  278. "move %[count],$t8 \n"
  279. // Here we have src and dest word-aligned but less than 64-bytes to go
  280. "ua_chk8w: \n"
  281. "pref 0, 0x0(%[src]) \n"
  282. "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
  283. // the t8 is the reminder count
  284. "beq %[count], $t8, $ua_chk1w \n"
  285. // when count==t8, no 32-byte chunk
  286. "lwr $t0, 0(%[src]) \n"
  287. "lwl $t0, 3(%[src]) \n"
  288. "lwr $t1, 4(%[src]) \n"
  289. "lwl $t1, 7(%[src]) \n"
  290. "lwr $t2, 8(%[src]) \n"
  291. "lwl $t2, 11(%[src]) \n"
  292. "lwr $t3, 12(%[src]) \n"
  293. "lwl $t3, 15(%[src]) \n"
  294. "lwr $t4, 16(%[src]) \n"
  295. "lwl $t4, 19(%[src]) \n"
  296. "lwr $t5, 20(%[src]) \n"
  297. "lwl $t5, 23(%[src]) \n"
  298. "lwr $t6, 24(%[src]) \n"
  299. "lwl $t6, 27(%[src]) \n"
  300. "lwr $t7, 28(%[src]) \n"
  301. "lwl $t7, 31(%[src]) \n"
  302. "addiu %[src], %[src], 32 \n"
  303. "sw $t0, 0(%[dst]) \n"
  304. "sw $t1, 4(%[dst]) \n"
  305. "sw $t2, 8(%[dst]) \n"
  306. "sw $t3, 12(%[dst]) \n"
  307. "sw $t4, 16(%[dst]) \n"
  308. "sw $t5, 20(%[dst]) \n"
  309. "sw $t6, 24(%[dst]) \n"
  310. "sw $t7, 28(%[dst]) \n"
  311. "addiu %[dst], %[dst], 32 \n"
  312. "$ua_chk1w: \n"
  313. "andi %[count], $t8, 0x3 \n"
  314. // now count is the reminder past 1w chunks
  315. "beq %[count], $t8, ua_smallCopy \n"
  316. "subu $a3, $t8, %[count] \n"
  317. // a3 is count of bytes in 1w chunks
  318. "addu $a3, %[dst], $a3 \n"
  319. // now a3 is the dst address past the 1w chunks
  320. // copying in words (4-byte chunks)
  321. "$ua_wordCopy_loop: \n"
  322. "lwr $v1, 0(%[src]) \n"
  323. "lwl $v1, 3(%[src]) \n"
  324. "addiu %[src], %[src], 4 \n"
  325. "addiu %[dst], %[dst], 4 \n"
  326. // note: dst=a1 is word aligned here, see NOTE1
  327. "bne %[dst], $a3, $ua_wordCopy_loop \n"
  328. " sw $v1,-4(%[dst]) \n"
  329. // Now less than 4 bytes (value in count) left to copy
  330. "ua_smallCopy: \n"
  331. "beqz %[count], leave \n"
  332. " addu $a3, %[dst], %[count] \n" // a3 = last dst address
  333. "$ua_smallCopy_loop: \n"
  334. "lb $v1, 0(%[src]) \n"
  335. "addiu %[src], %[src], 1 \n"
  336. "addiu %[dst], %[dst], 1 \n"
  337. "bne %[dst],$a3,$ua_smallCopy_loop \n"
  338. " sb $v1, -1(%[dst]) \n"
  339. "j $ra \n"
  340. " nop \n"
  341. ".set at \n"
  342. ".set reorder \n"
  343. : [dst] "+r"(dst), [src] "+r"(src)
  344. : [count] "r"(count)
  345. : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
  346. "at");
  347. }
  348. #endif // HAS_COPYROW_MIPS
  349. // DSPR2 functions
  350. #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
  351. (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
  352. (__mips_isa_rev < 6)
  353. void SplitUVRow_DSPR2(const uint8* src_uv,
  354. uint8* dst_u,
  355. uint8* dst_v,
  356. int width) {
  357. __asm__ __volatile__(
  358. ".set push \n"
  359. ".set noreorder \n"
  360. "srl $t4, %[width], 4 \n" // multiplies of 16
  361. "blez $t4, 2f \n"
  362. " andi %[width], %[width], 0xf \n" // residual
  363. "1: \n"
  364. "addiu $t4, $t4, -1 \n"
  365. "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
  366. "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
  367. "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
  368. "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
  369. "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
  370. "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 |
  371. // U10
  372. "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 |
  373. // U12
  374. "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 |
  375. // U14
  376. "addiu %[src_uv], %[src_uv], 32 \n"
  377. "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
  378. "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
  379. "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
  380. "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
  381. "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
  382. "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
  383. "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 |
  384. // V12
  385. "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 |
  386. // U12
  387. "sw $t9, 0(%[dst_v]) \n"
  388. "sw $t0, 0(%[dst_u]) \n"
  389. "sw $t1, 4(%[dst_v]) \n"
  390. "sw $t2, 4(%[dst_u]) \n"
  391. "sw $t3, 8(%[dst_v]) \n"
  392. "sw $t5, 8(%[dst_u]) \n"
  393. "sw $t6, 12(%[dst_v]) \n"
  394. "sw $t7, 12(%[dst_u]) \n"
  395. "addiu %[dst_v], %[dst_v], 16 \n"
  396. "bgtz $t4, 1b \n"
  397. " addiu %[dst_u], %[dst_u], 16 \n"
  398. "beqz %[width], 3f \n"
  399. " nop \n"
  400. "2: \n"
  401. "lbu $t0, 0(%[src_uv]) \n"
  402. "lbu $t1, 1(%[src_uv]) \n"
  403. "addiu %[src_uv], %[src_uv], 2 \n"
  404. "addiu %[width], %[width], -1 \n"
  405. "sb $t0, 0(%[dst_u]) \n"
  406. "sb $t1, 0(%[dst_v]) \n"
  407. "addiu %[dst_u], %[dst_u], 1 \n"
  408. "bgtz %[width], 2b \n"
  409. " addiu %[dst_v], %[dst_v], 1 \n"
  410. "3: \n"
  411. ".set pop \n"
  412. : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
  413. [dst_v] "+r"(dst_v)
  414. :
  415. : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
  416. }
  417. void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
  418. __asm__ __volatile__(
  419. ".set push \n"
  420. ".set noreorder \n"
  421. "srl $t4, %[width], 4 \n" // multiplies of 16
  422. "andi $t5, %[width], 0xf \n"
  423. "blez $t4, 2f \n"
  424. " addu %[src], %[src], %[width] \n" // src += width
  425. "1: \n"
  426. "lw $t0, -16(%[src]) \n" // |3|2|1|0|
  427. "lw $t1, -12(%[src]) \n" // |7|6|5|4|
  428. "lw $t2, -8(%[src]) \n" // |11|10|9|8|
  429. "lw $t3, -4(%[src]) \n" // |15|14|13|12|
  430. "wsbh $t0, $t0 \n" // |2|3|0|1|
  431. "wsbh $t1, $t1 \n" // |6|7|4|5|
  432. "wsbh $t2, $t2 \n" // |10|11|8|9|
  433. "wsbh $t3, $t3 \n" // |14|15|12|13|
  434. "rotr $t0, $t0, 16 \n" // |0|1|2|3|
  435. "rotr $t1, $t1, 16 \n" // |4|5|6|7|
  436. "rotr $t2, $t2, 16 \n" // |8|9|10|11|
  437. "rotr $t3, $t3, 16 \n" // |12|13|14|15|
  438. "addiu %[src], %[src], -16 \n"
  439. "addiu $t4, $t4, -1 \n"
  440. "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
  441. "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
  442. "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
  443. "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
  444. "bgtz $t4, 1b \n"
  445. " addiu %[dst], %[dst], 16 \n"
  446. "beqz $t5, 3f \n"
  447. " nop \n"
  448. "2: \n"
  449. "lbu $t0, -1(%[src]) \n"
  450. "addiu $t5, $t5, -1 \n"
  451. "addiu %[src], %[src], -1 \n"
  452. "sb $t0, 0(%[dst]) \n"
  453. "bgez $t5, 2b \n"
  454. " addiu %[dst], %[dst], 1 \n"
  455. "3: \n"
  456. ".set pop \n"
  457. : [src] "+r"(src), [dst] "+r"(dst)
  458. : [width] "r"(width)
  459. : "t0", "t1", "t2", "t3", "t4", "t5");
  460. }
  461. void MirrorUVRow_DSPR2(const uint8* src_uv,
  462. uint8* dst_u,
  463. uint8* dst_v,
  464. int width) {
  465. int x;
  466. int y;
  467. __asm__ __volatile__(
  468. ".set push \n"
  469. ".set noreorder \n"
  470. "addu $t4, %[width], %[width] \n"
  471. "srl %[x], %[width], 4 \n"
  472. "andi %[y], %[width], 0xf \n"
  473. "blez %[x], 2f \n"
  474. " addu %[src_uv], %[src_uv], $t4 \n"
  475. "1: \n"
  476. "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
  477. "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
  478. "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
  479. "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
  480. "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
  481. "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
  482. "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
  483. "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
  484. "rotr $t0, $t0, 16 \n" // |1|0|3|2|
  485. "rotr $t1, $t1, 16 \n" // |5|4|7|6|
  486. "rotr $t2, $t2, 16 \n" // |9|8|11|10|
  487. "rotr $t3, $t3, 16 \n" // |13|12|15|14|
  488. "rotr $t4, $t4, 16 \n" // |17|16|19|18|
  489. "rotr $t6, $t6, 16 \n" // |21|20|23|22|
  490. "rotr $t7, $t7, 16 \n" // |25|24|27|26|
  491. "rotr $t8, $t8, 16 \n" // |29|28|31|30|
  492. "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
  493. "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
  494. "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
  495. "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
  496. "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
  497. "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
  498. "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
  499. "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
  500. "addiu %[src_uv], %[src_uv], -32 \n"
  501. "addiu %[x], %[x], -1 \n"
  502. "swr $t4, 0(%[dst_u]) \n"
  503. "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
  504. "swr $t6, 0(%[dst_v]) \n"
  505. "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
  506. "swr $t2, 4(%[dst_u]) \n"
  507. "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
  508. "swr $t3, 4(%[dst_v]) \n"
  509. "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
  510. "swr $t0, 8(%[dst_u]) \n"
  511. "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
  512. "swr $t1, 8(%[dst_v]) \n"
  513. "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
  514. "swr $t9, 12(%[dst_u]) \n"
  515. "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
  516. "swr $t5, 12(%[dst_v]) \n"
  517. "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
  518. "addiu %[dst_v], %[dst_v], 16 \n"
  519. "bgtz %[x], 1b \n"
  520. " addiu %[dst_u], %[dst_u], 16 \n"
  521. "beqz %[y], 3f \n"
  522. " nop \n"
  523. "b 2f \n"
  524. " nop \n"
  525. "2: \n"
  526. "lbu $t0, -2(%[src_uv]) \n"
  527. "lbu $t1, -1(%[src_uv]) \n"
  528. "addiu %[src_uv], %[src_uv], -2 \n"
  529. "addiu %[y], %[y], -1 \n"
  530. "sb $t0, 0(%[dst_u]) \n"
  531. "sb $t1, 0(%[dst_v]) \n"
  532. "addiu %[dst_u], %[dst_u], 1 \n"
  533. "bgtz %[y], 2b \n"
  534. " addiu %[dst_v], %[dst_v], 1 \n"
  535. "3: \n"
  536. ".set pop \n"
  537. : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
  538. [x] "=&r"(x), [y] "=&r"(y)
  539. : [width] "r"(width)
  540. : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
  541. }
  542. void I422ToARGBRow_DSPR2(const uint8* src_y,
  543. const uint8* src_u,
  544. const uint8* src_v,
  545. uint8* rgb_buf,
  546. const struct YuvConstants* yuvconstants,
  547. int width) {
  548. int x;
  549. uint32 tmp_ub = yuvconstants->kUVToB[0];
  550. uint32 tmp_ug = yuvconstants->kUVToG[0];
  551. uint32 tmp_vg = yuvconstants->kUVToG[1];
  552. uint32 tmp_vr = yuvconstants->kUVToR[1];
  553. uint32 tmp_bb = yuvconstants->kUVBiasB[0];
  554. uint32 tmp_bg = yuvconstants->kUVBiasG[0];
  555. uint32 tmp_br = yuvconstants->kUVBiasR[0];
  556. uint32 yg = yuvconstants->kYToRgb[0];
  557. uint32 tmp_yg;
  558. uint32 tmp_mask = 0x7fff7fff;
  559. tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
  560. tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
  561. tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
  562. tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
  563. tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
  564. tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
  565. tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
  566. tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
  567. yg = yg * 0x0101;
  568. for (x = 0; x < width - 1; x += 2) {
  569. uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
  570. uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
  571. __asm__ __volatile__(
  572. ".set push \n"
  573. ".set noreorder \n"
  574. "lbu %[tmp_t7], 0(%[src_y]) \n"
  575. "lbu %[tmp_t1], 1(%[src_y]) \n"
  576. "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
  577. "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
  578. "lbu %[tmp_t2], 0(%[src_u]) \n"
  579. "lbu %[tmp_t3], 0(%[src_v]) \n"
  580. "replv.ph %[tmp_t2], %[tmp_t2] \n"
  581. "replv.ph %[tmp_t3], %[tmp_t3] \n"
  582. "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
  583. "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
  584. "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
  585. "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
  586. "srl %[tmp_t7], %[tmp_t7], 16 \n"
  587. "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
  588. "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
  589. "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
  590. "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
  591. "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
  592. "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
  593. "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
  594. "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
  595. "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
  596. "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
  597. "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
  598. "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
  599. "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
  600. "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
  601. "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
  602. "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
  603. "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
  604. "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
  605. "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
  606. "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
  607. "sw %[tmp_t8], 0(%[rgb_buf]) \n"
  608. "sw %[tmp_t7], 4(%[rgb_buf]) \n"
  609. ".set pop \n"
  610. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  611. [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
  612. [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  613. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
  614. : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
  615. [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
  616. [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
  617. [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
  618. [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
  619. src_y += 2;
  620. src_u += 1;
  621. src_v += 1;
  622. rgb_buf += 8; // Advance 4 pixels.
  623. }
  624. }
  625. // Bilinear filter 8x2 -> 8x1
  626. void InterpolateRow_DSPR2(uint8* dst_ptr,
  627. const uint8* src_ptr,
  628. ptrdiff_t src_stride,
  629. int dst_width,
  630. int source_y_fraction) {
  631. int y0_fraction = 256 - source_y_fraction;
  632. const uint8* src_ptr1 = src_ptr + src_stride;
  633. __asm__ __volatile__(
  634. ".set push \n"
  635. ".set noreorder \n"
  636. "replv.ph $t0, %[y0_fraction] \n"
  637. "replv.ph $t1, %[source_y_fraction] \n"
  638. "1: \n"
  639. "lw $t2, 0(%[src_ptr]) \n"
  640. "lw $t3, 0(%[src_ptr1]) \n"
  641. "lw $t4, 4(%[src_ptr]) \n"
  642. "lw $t5, 4(%[src_ptr1]) \n"
  643. "muleu_s.ph.qbl $t6, $t2, $t0 \n"
  644. "muleu_s.ph.qbr $t7, $t2, $t0 \n"
  645. "muleu_s.ph.qbl $t8, $t3, $t1 \n"
  646. "muleu_s.ph.qbr $t9, $t3, $t1 \n"
  647. "muleu_s.ph.qbl $t2, $t4, $t0 \n"
  648. "muleu_s.ph.qbr $t3, $t4, $t0 \n"
  649. "muleu_s.ph.qbl $t4, $t5, $t1 \n"
  650. "muleu_s.ph.qbr $t5, $t5, $t1 \n"
  651. "addq.ph $t6, $t6, $t8 \n"
  652. "addq.ph $t7, $t7, $t9 \n"
  653. "addq.ph $t2, $t2, $t4 \n"
  654. "addq.ph $t3, $t3, $t5 \n"
  655. "shra_r.ph $t6, $t6, 8 \n"
  656. "shra_r.ph $t7, $t7, 8 \n"
  657. "shra_r.ph $t2, $t2, 8 \n"
  658. "shra_r.ph $t3, $t3, 8 \n"
  659. "precr.qb.ph $t6, $t6, $t7 \n"
  660. "precr.qb.ph $t2, $t2, $t3 \n"
  661. "addiu %[src_ptr], %[src_ptr], 8 \n"
  662. "addiu %[src_ptr1], %[src_ptr1], 8 \n"
  663. "addiu %[dst_width], %[dst_width], -8 \n"
  664. "sw $t6, 0(%[dst_ptr]) \n"
  665. "sw $t2, 4(%[dst_ptr]) \n"
  666. "bgtz %[dst_width], 1b \n"
  667. " addiu %[dst_ptr], %[dst_ptr], 8 \n"
  668. ".set pop \n"
  669. : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
  670. [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
  671. : [source_y_fraction] "r"(source_y_fraction),
  672. [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
  673. : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
  674. }
  675. #include <stdio.h>
  676. void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
  677. int x;
  678. uint32 tmp_mask = 0xff;
  679. uint32 tmp_t1;
  680. for (x = 0; x < (width - 1); ++x) {
  681. __asm__ __volatile__(
  682. ".set push \n"
  683. ".set noreorder \n"
  684. "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
  685. "addiu %[dst_argb], %[dst_argb], 4 \n"
  686. "addiu %[src_rgb24], %[src_rgb24], 3 \n"
  687. "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
  688. "sw %[tmp_t1], -4(%[dst_argb]) \n"
  689. ".set pop \n"
  690. : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
  691. [tmp_t1] "=&r"(tmp_t1)
  692. : [tmp_mask] "r"(tmp_mask)
  693. : "memory");
  694. }
  695. uint8 b = src_rgb24[0];
  696. uint8 g = src_rgb24[1];
  697. uint8 r = src_rgb24[2];
  698. dst_argb[0] = b;
  699. dst_argb[1] = g;
  700. dst_argb[2] = r;
  701. dst_argb[3] = 255u;
  702. }
  703. void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
  704. int x;
  705. uint32 tmp_mask = 0xff;
  706. uint32 tmp_t1, tmp_t2;
  707. for (x = 0; x < (width - 1); ++x) {
  708. __asm__ __volatile__(
  709. ".set push \n"
  710. ".set noreorder \n"
  711. "ulw %[tmp_t1], 0(%[src_raw]) \n"
  712. "addiu %[dst_argb], %[dst_argb], 4 \n"
  713. "addiu %[src_raw], %[src_raw], 3 \n"
  714. "srl %[tmp_t2], %[tmp_t1], 16 \n"
  715. "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
  716. "ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
  717. "ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
  718. "sw %[tmp_t1], -4(%[dst_argb]) \n"
  719. ".set pop \n"
  720. : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
  721. [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
  722. : [tmp_mask] "r"(tmp_mask)
  723. : "memory");
  724. }
  725. uint8 r = src_raw[0];
  726. uint8 g = src_raw[1];
  727. uint8 b = src_raw[2];
  728. dst_argb[0] = b;
  729. dst_argb[1] = g;
  730. dst_argb[2] = r;
  731. dst_argb[3] = 255u;
  732. }
  733. void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
  734. uint8* dst_argb,
  735. int width) {
  736. int x;
  737. uint32 tmp_mask = 0xff;
  738. uint32 tmp_t1, tmp_t2, tmp_t3;
  739. for (x = 0; x < width; ++x) {
  740. __asm__ __volatile__(
  741. ".set push \n"
  742. ".set noreorder \n"
  743. "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
  744. "addiu %[dst_argb], %[dst_argb], 4 \n"
  745. "addiu %[src_rgb565], %[src_rgb565], 2 \n"
  746. "sll %[tmp_t2], %[tmp_t1], 8 \n"
  747. "ins %[tmp_t2], %[tmp_mask], 24,8 \n"
  748. "ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
  749. "ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
  750. "srl %[tmp_t3], %[tmp_t1], 9 \n"
  751. "ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
  752. "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
  753. "srl %[tmp_t3], %[tmp_t1], 2 \n"
  754. "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
  755. "sw %[tmp_t2], -4(%[dst_argb]) \n"
  756. ".set pop \n"
  757. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  758. [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
  759. [dst_argb] "+r"(dst_argb)
  760. : [tmp_mask] "r"(tmp_mask));
  761. }
  762. }
  763. void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
  764. uint8* dst_argb,
  765. int width) {
  766. int x;
  767. uint32 tmp_t1, tmp_t2, tmp_t3;
  768. for (x = 0; x < width; ++x) {
  769. __asm__ __volatile__(
  770. ".set push \n"
  771. ".set noreorder \n"
  772. "lh %[tmp_t1], 0(%[src_argb1555]) \n"
  773. "addiu %[dst_argb], %[dst_argb], 4 \n"
  774. "addiu %[src_argb1555], %[src_argb1555], 2 \n"
  775. "sll %[tmp_t2], %[tmp_t1], 9 \n"
  776. "ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
  777. "ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
  778. "srl %[tmp_t3], %[tmp_t1], 7 \n"
  779. "ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
  780. "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
  781. "srl %[tmp_t3], %[tmp_t1], 2 \n"
  782. "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
  783. "sw %[tmp_t2], -4(%[dst_argb]) \n"
  784. ".set pop \n"
  785. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  786. [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
  787. [dst_argb] "+r"(dst_argb)
  788. :);
  789. }
  790. }
  791. void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
  792. uint8* dst_argb,
  793. int width) {
  794. int x;
  795. uint32 tmp_t1;
  796. for (x = 0; x < width; ++x) {
  797. __asm__ __volatile__(
  798. ".set push \n"
  799. ".set noreorder \n"
  800. "lh %[tmp_t1], 0(%[src_argb4444]) \n"
  801. "addiu %[dst_argb], %[dst_argb], 4 \n"
  802. "addiu %[src_argb4444], %[src_argb4444], 2 \n"
  803. "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
  804. "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
  805. "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
  806. "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
  807. "sw %[tmp_t1], -4(%[dst_argb]) \n"
  808. ".set pop \n"
  809. : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
  810. [tmp_t1] "=&r"(tmp_t1));
  811. }
  812. }
  813. void I444ToARGBRow_DSPR2(const uint8* y_buf,
  814. const uint8* u_buf,
  815. const uint8* v_buf,
  816. uint8* rgb_buf,
  817. const struct YuvConstants* yuvconstants,
  818. int width) {
  819. int x;
  820. uint32 tmp_ub = yuvconstants->kUVToB[0];
  821. uint32 tmp_ug = yuvconstants->kUVToG[0];
  822. uint32 tmp_vg = yuvconstants->kUVToG[1];
  823. uint32 tmp_vr = yuvconstants->kUVToR[1];
  824. uint32 tmp_bb = yuvconstants->kUVBiasB[0];
  825. uint32 tmp_bg = yuvconstants->kUVBiasG[0];
  826. uint32 tmp_br = yuvconstants->kUVBiasR[0];
  827. uint32 yg = yuvconstants->kYToRgb[0];
  828. uint32 tmp_mask = 0x7fff7fff;
  829. uint32 tmp_yg;
  830. tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
  831. tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
  832. tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
  833. tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
  834. tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
  835. tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
  836. tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
  837. tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
  838. yg = yg * 0x0101;
  839. for (x = 0; x < width - 1; x += 2) {
  840. uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
  841. uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
  842. __asm__ __volatile__(
  843. ".set push \n"
  844. ".set noreorder \n"
  845. "lbu %[tmp_t7], 0(%[y_buf]) \n"
  846. "lbu %[tmp_t1], 1(%[y_buf]) \n"
  847. "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
  848. "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
  849. "lh %[tmp_t2], 0(%[u_buf]) \n"
  850. "lh %[tmp_t3], 0(%[v_buf]) \n"
  851. "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
  852. "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
  853. "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
  854. "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
  855. "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
  856. "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
  857. "srl %[tmp_t7], %[tmp_t7], 16 \n"
  858. "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
  859. "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
  860. "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
  861. "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
  862. "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
  863. "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
  864. "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
  865. "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
  866. "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
  867. "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
  868. "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
  869. "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
  870. "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
  871. "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
  872. "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
  873. "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
  874. "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
  875. "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
  876. "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
  877. "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
  878. "sw %[tmp_t8], 0(%[rgb_buf]) \n"
  879. "sw %[tmp_t7], 4(%[rgb_buf]) \n"
  880. ".set pop \n"
  881. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  882. [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
  883. [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  884. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
  885. : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
  886. [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
  887. [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
  888. [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
  889. [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
  890. y_buf += 2;
  891. u_buf += 2;
  892. v_buf += 2;
  893. rgb_buf += 8; // Advance 1 pixel.
  894. }
  895. }
  896. void I422ToARGB4444Row_DSPR2(const uint8* src_y,
  897. const uint8* src_u,
  898. const uint8* src_v,
  899. uint8* dst_argb4444,
  900. const struct YuvConstants* yuvconstants,
  901. int width) {
  902. int x;
  903. uint32 tmp_ub = yuvconstants->kUVToB[0];
  904. uint32 tmp_ug = yuvconstants->kUVToG[0];
  905. uint32 tmp_vg = yuvconstants->kUVToG[1];
  906. uint32 tmp_vr = yuvconstants->kUVToR[1];
  907. uint32 tmp_bb = yuvconstants->kUVBiasB[0];
  908. uint32 tmp_bg = yuvconstants->kUVBiasG[0];
  909. uint32 tmp_br = yuvconstants->kUVBiasR[0];
  910. uint32 yg = yuvconstants->kYToRgb[0];
  911. uint32 tmp_yg;
  912. uint32 tmp_mask = 0x7fff7fff;
  913. tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
  914. tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
  915. tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
  916. tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
  917. tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
  918. tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
  919. tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
  920. tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
  921. yg = yg * 0x0101;
  922. for (x = 0; x < width - 1; x += 2) {
  923. uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
  924. uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
  925. __asm__ __volatile__(
  926. ".set push \n"
  927. ".set noreorder \n"
  928. "lbu %[tmp_t7], 0(%[src_y]) \n"
  929. "lbu %[tmp_t1], 1(%[src_y]) \n"
  930. "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
  931. "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
  932. "lbu %[tmp_t2], 0(%[src_u]) \n"
  933. "lbu %[tmp_t3], 0(%[src_v]) \n"
  934. "replv.ph %[tmp_t2], %[tmp_t2] \n"
  935. "replv.ph %[tmp_t3], %[tmp_t3] \n"
  936. "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
  937. "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
  938. "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
  939. "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
  940. "srl %[tmp_t7], %[tmp_t7], 16 \n"
  941. "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
  942. "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
  943. "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
  944. "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
  945. "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
  946. "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
  947. "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
  948. "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
  949. "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
  950. "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
  951. "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
  952. "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
  953. "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
  954. "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
  955. "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
  956. "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
  957. "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
  958. "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
  959. "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
  960. "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
  961. "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
  962. "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
  963. "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
  964. "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
  965. "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
  966. "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
  967. "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
  968. "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
  969. ".set pop \n"
  970. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  971. [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
  972. [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  973. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
  974. : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
  975. [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
  976. [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
  977. [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
  978. [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
  979. src_y += 2;
  980. src_u += 1;
  981. src_v += 1;
  982. dst_argb4444 += 4; // Advance 2 pixels.
  983. }
  984. }
  985. void I422ToARGB1555Row_DSPR2(const uint8* src_y,
  986. const uint8* src_u,
  987. const uint8* src_v,
  988. uint8* dst_argb1555,
  989. const struct YuvConstants* yuvconstants,
  990. int width) {
  991. int x;
  992. uint32 tmp_ub = yuvconstants->kUVToB[0];
  993. uint32 tmp_ug = yuvconstants->kUVToG[0];
  994. uint32 tmp_vg = yuvconstants->kUVToG[1];
  995. uint32 tmp_vr = yuvconstants->kUVToR[1];
  996. uint32 tmp_bb = yuvconstants->kUVBiasB[0];
  997. uint32 tmp_bg = yuvconstants->kUVBiasG[0];
  998. uint32 tmp_br = yuvconstants->kUVBiasR[0];
  999. uint32 yg = yuvconstants->kYToRgb[0];
  1000. uint32 tmp_yg;
  1001. uint32 tmp_mask = 0x80008000;
  1002. tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
  1003. tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
  1004. tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
  1005. tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
  1006. tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
  1007. tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
  1008. tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
  1009. tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
  1010. yg = yg * 0x0101;
  1011. for (x = 0; x < width - 1; x += 2) {
  1012. uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
  1013. uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
  1014. __asm__ __volatile__(
  1015. ".set push \n"
  1016. ".set noreorder \n"
  1017. "lbu %[tmp_t7], 0(%[src_y]) \n"
  1018. "lbu %[tmp_t1], 1(%[src_y]) \n"
  1019. "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
  1020. "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
  1021. "lbu %[tmp_t2], 0(%[src_u]) \n"
  1022. "lbu %[tmp_t3], 0(%[src_v]) \n"
  1023. "replv.ph %[tmp_t2], %[tmp_t2] \n"
  1024. "replv.ph %[tmp_t3], %[tmp_t3] \n"
  1025. "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
  1026. "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
  1027. "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
  1028. "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
  1029. "srl %[tmp_t7], %[tmp_t7], 16 \n"
  1030. "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
  1031. "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
  1032. "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
  1033. "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
  1034. "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
  1035. "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
  1036. "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
  1037. "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
  1038. "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
  1039. "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
  1040. "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
  1041. "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
  1042. "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
  1043. "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
  1044. "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
  1045. "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
  1046. "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
  1047. "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
  1048. "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
  1049. "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
  1050. "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
  1051. "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
  1052. "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
  1053. "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
  1054. "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
  1055. "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
  1056. "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
  1057. "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
  1058. "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
  1059. ".set pop \n"
  1060. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  1061. [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
  1062. [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  1063. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
  1064. : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
  1065. [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
  1066. [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
  1067. [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
  1068. [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
  1069. src_y += 2;
  1070. src_u += 1;
  1071. src_v += 1;
  1072. dst_argb1555 += 4; // Advance 2 pixels.
  1073. }
  1074. }
  1075. void NV12ToARGBRow_DSPR2(const uint8* src_y,
  1076. const uint8* src_uv,
  1077. uint8* rgb_buf,
  1078. const struct YuvConstants* yuvconstants,
  1079. int width) {
  1080. int x;
  1081. uint32 tmp_ub = yuvconstants->kUVToB[0];
  1082. uint32 tmp_ug = yuvconstants->kUVToG[0];
  1083. uint32 tmp_vg = yuvconstants->kUVToG[1];
  1084. uint32 tmp_vr = yuvconstants->kUVToR[1];
  1085. uint32 tmp_bb = yuvconstants->kUVBiasB[0];
  1086. uint32 tmp_bg = yuvconstants->kUVBiasG[0];
  1087. uint32 tmp_br = yuvconstants->kUVBiasR[0];
  1088. uint32 yg = yuvconstants->kYToRgb[0];
  1089. uint32 tmp_mask = 0x7fff7fff;
  1090. uint32 tmp_yg;
  1091. tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
  1092. tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
  1093. tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
  1094. tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
  1095. tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
  1096. tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
  1097. tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
  1098. tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
  1099. yg = yg * 0x0101;
  1100. for (x = 0; x < width - 1; x += 2) {
  1101. uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
  1102. uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
  1103. __asm__ __volatile__(
  1104. ".set push \n"
  1105. ".set noreorder \n"
  1106. "lbu %[tmp_t7], 0(%[src_y]) \n"
  1107. "lbu %[tmp_t1], 1(%[src_y]) \n"
  1108. "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
  1109. "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
  1110. "lbu %[tmp_t2], 0(%[src_uv]) \n"
  1111. "lbu %[tmp_t3], 1(%[src_uv]) \n"
  1112. "replv.ph %[tmp_t2], %[tmp_t2] \n"
  1113. "replv.ph %[tmp_t3], %[tmp_t3] \n"
  1114. "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
  1115. "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
  1116. "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
  1117. "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
  1118. "srl %[tmp_t7], %[tmp_t7], 16 \n"
  1119. "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
  1120. "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
  1121. "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
  1122. "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
  1123. "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
  1124. "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
  1125. "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
  1126. "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
  1127. "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
  1128. "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
  1129. "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
  1130. "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
  1131. "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
  1132. "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
  1133. "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
  1134. "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
  1135. "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
  1136. "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
  1137. "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
  1138. "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
  1139. "sw %[tmp_t8], 0(%[rgb_buf]) \n"
  1140. "sw %[tmp_t7], 4(%[rgb_buf]) \n"
  1141. ".set pop \n"
  1142. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  1143. [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
  1144. [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  1145. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
  1146. : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
  1147. [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
  1148. [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
  1149. [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
  1150. [tmp_mask] "r"(tmp_mask));
  1151. src_y += 2;
  1152. src_uv += 2;
  1153. rgb_buf += 8; // Advance 2 pixels.
  1154. }
  1155. }
  1156. void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
  1157. int src_stride_rgb,
  1158. uint8* dst_u,
  1159. uint8* dst_v,
  1160. int width) {
  1161. const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
  1162. int x;
  1163. int const1 = 0xffda0000;
  1164. int const2 = 0x0070ffb6;
  1165. int const3 = 0x00700000;
  1166. int const4 = 0xffeeffa2;
  1167. int const5 = 0x100;
  1168. for (x = 0; x < width - 1; x += 2) {
  1169. int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
  1170. int tmp_t6, tmp_t7, tmp_t8;
  1171. __asm__ __volatile__(
  1172. ".set push \n"
  1173. ".set noreorder \n"
  1174. "lw %[tmp_t1], 0(%[src_rgb0]) \n"
  1175. "lw %[tmp_t2], 4(%[src_rgb0]) \n"
  1176. "lw %[tmp_t3], 0(%[src_rgb1]) \n"
  1177. "lw %[tmp_t4], 4(%[src_rgb1]) \n"
  1178. "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
  1179. "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
  1180. "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
  1181. "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
  1182. "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
  1183. "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
  1184. "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
  1185. "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
  1186. "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
  1187. "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
  1188. "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
  1189. "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
  1190. "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
  1191. "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
  1192. "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
  1193. "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
  1194. "mult $ac0, %[const5], %[const5] \n"
  1195. "mult $ac1, %[const5], %[const5] \n"
  1196. "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
  1197. "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
  1198. "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
  1199. "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
  1200. "extr_r.w %[tmp_t7], $ac0, 9 \n"
  1201. "extr_r.w %[tmp_t8], $ac1, 9 \n"
  1202. "addiu %[dst_u], %[dst_u], 1 \n"
  1203. "addiu %[dst_v], %[dst_v], 1 \n"
  1204. "addiu %[src_rgb0], %[src_rgb0], 8 \n"
  1205. "addiu %[src_rgb1], %[src_rgb1], 8 \n"
  1206. "sb %[tmp_t7], -1(%[dst_u]) \n"
  1207. "sb %[tmp_t8], -1(%[dst_v]) \n"
  1208. ".set pop \n"
  1209. : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
  1210. [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
  1211. [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
  1212. [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
  1213. [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
  1214. [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
  1215. : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
  1216. [const4] "r"(const4), [const5] "r"(const5)
  1217. : "hi", "lo", "$ac1lo", "$ac1hi");
  1218. }
  1219. }
void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  int const1 = 0x00420000;
  int const2 = 0x00190081;
  int const5 = 0x40;
  for (x = 0; x < width; x += 4) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "lw              %[tmp_t1], 0(%[src_argb0])        \n"
        "lw              %[tmp_t2], 4(%[src_argb0])        \n"
        "lw              %[tmp_t3], 8(%[src_argb0])        \n"
        "lw              %[tmp_t4], 12(%[src_argb0])       \n"
        "preceu.ph.qbr   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbl   %[tmp_t4], %[tmp_t4]              \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "mult            $ac2, %[const5], %[const5]        \n"
        "mult            $ac3, %[const5], %[const5]        \n"
        "dpa.w.ph        $ac0, %[tmp_t5], %[const1]        \n"
        "dpa.w.ph        $ac1, %[tmp_t6], %[const1]        \n"
        "dpa.w.ph        $ac2, %[tmp_t7], %[const1]        \n"
        "dpa.w.ph        $ac3, %[tmp_t8], %[const1]        \n"
        "dpa.w.ph        $ac0, %[tmp_t1], %[const2]        \n"
        "dpa.w.ph        $ac1, %[tmp_t2], %[const2]        \n"
        "dpa.w.ph        $ac2, %[tmp_t3], %[const2]        \n"
        "dpa.w.ph        $ac3, %[tmp_t4], %[const2]        \n"
        "extr_r.w        %[tmp_t1], $ac0, 8                \n"
        "extr_r.w        %[tmp_t2], $ac1, 8                \n"
        "extr_r.w        %[tmp_t3], $ac2, 8                \n"
        "extr_r.w        %[tmp_t4], $ac3, 8                \n"
        "addiu           %[src_argb0], %[src_argb0], 16    \n"
        "addiu           %[dst_y], %[dst_y], 4             \n"
        "sb              %[tmp_t1], -4(%[dst_y])           \n"
        "sb              %[tmp_t2], -3(%[dst_y])           \n"
        "sb              %[tmp_t3], -2(%[dst_y])           \n"
        "sb              %[tmp_t4], -1(%[dst_y])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
          "$ac3hi");
  }
}
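// ABGRToUVRow_DSPR2 produces one U and one V sample per 2x2 block of ABGR
// pixels taken from two adjacent rows. Each block is box-averaged first
// (addu.ph sums, shrl.ph by 2), then dpaq_s.w.ph applies the chroma weights.
// With the accumulators seeded to 0x100 * 0x100, the doubled dpaq_s products
// and the rounding 9-bit extract work out to (again assuming little-endian
// byte order)
//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8
//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8
// evaluated on the averaged pixel.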
void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
                       int src_stride_rgb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
  int x;
  int const1 = 0xffb6ffda;
  int const2 = 0x00000070;
  int const3 = 0xffa20070;
  int const4 = 0x0000ffee;
  int const5 = 0x100;
  for (x = 0; x < width - 1; x += 2) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "lw              %[tmp_t1], 0(%[src_rgb0])         \n"
        "lw              %[tmp_t2], 4(%[src_rgb0])         \n"
        "lw              %[tmp_t3], 0(%[src_rgb1])         \n"
        "lw              %[tmp_t4], 4(%[src_rgb1])         \n"
        "preceu.ph.qbr   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbl   %[tmp_t4], %[tmp_t4]              \n"
        "addu.ph         %[tmp_t5], %[tmp_t5], %[tmp_t6]   \n"
        "addu.ph         %[tmp_t7], %[tmp_t7], %[tmp_t8]   \n"
        "addu.ph         %[tmp_t1], %[tmp_t1], %[tmp_t2]   \n"
        "addu.ph         %[tmp_t3], %[tmp_t3], %[tmp_t4]   \n"
        "addu.ph         %[tmp_t5], %[tmp_t5], %[tmp_t7]   \n"
        "addu.ph         %[tmp_t1], %[tmp_t1], %[tmp_t3]   \n"
        "shrl.ph         %[tmp_t5], %[tmp_t5], 2           \n"
        "shrl.ph         %[tmp_t1], %[tmp_t1], 2           \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "dpaq_s.w.ph     $ac0, %[tmp_t5], %[const1]        \n"
        "dpaq_s.w.ph     $ac1, %[tmp_t5], %[const3]        \n"
        "dpaq_s.w.ph     $ac0, %[tmp_t1], %[const2]        \n"
        "dpaq_s.w.ph     $ac1, %[tmp_t1], %[const4]        \n"
        "extr_r.w        %[tmp_t7], $ac0, 9                \n"
        "extr_r.w        %[tmp_t8], $ac1, 9                \n"
        "addiu           %[dst_u], %[dst_u], 1             \n"
        "addiu           %[dst_v], %[dst_v], 1             \n"
        "addiu           %[src_rgb0], %[src_rgb0], 8       \n"
        "addiu           %[src_rgb1], %[src_rgb1], 8       \n"
        "sb              %[tmp_t7], -1(%[dst_u])           \n"
        "sb              %[tmp_t8], -1(%[dst_v])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
          [const4] "r"(const4), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi");
  }
}
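// ARGBToYRow_DSPR2 is the ARGB (bytes B, G, R, A in memory) luma variant; the
// arithmetic matches BGRAToYRow_DSPR2 above, with const1/const2 repacked for
// this byte order.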
void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  int const1 = 0x00810019;
  int const2 = 0x00000042;
  int const5 = 0x40;
  for (x = 0; x < width; x += 4) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "lw              %[tmp_t1], 0(%[src_argb0])        \n"
        "lw              %[tmp_t2], 4(%[src_argb0])        \n"
        "lw              %[tmp_t3], 8(%[src_argb0])        \n"
        "lw              %[tmp_t4], 12(%[src_argb0])       \n"
        "preceu.ph.qbr   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbl   %[tmp_t4], %[tmp_t4]              \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "mult            $ac2, %[const5], %[const5]        \n"
        "mult            $ac3, %[const5], %[const5]        \n"
        "dpa.w.ph        $ac0, %[tmp_t5], %[const1]        \n"
        "dpa.w.ph        $ac1, %[tmp_t6], %[const1]        \n"
        "dpa.w.ph        $ac2, %[tmp_t7], %[const1]        \n"
        "dpa.w.ph        $ac3, %[tmp_t8], %[const1]        \n"
        "dpa.w.ph        $ac0, %[tmp_t1], %[const2]        \n"
        "dpa.w.ph        $ac1, %[tmp_t2], %[const2]        \n"
        "dpa.w.ph        $ac2, %[tmp_t3], %[const2]        \n"
        "dpa.w.ph        $ac3, %[tmp_t4], %[const2]        \n"
        "extr_r.w        %[tmp_t1], $ac0, 8                \n"
        "extr_r.w        %[tmp_t2], $ac1, 8                \n"
        "extr_r.w        %[tmp_t3], $ac2, 8                \n"
        "extr_r.w        %[tmp_t4], $ac3, 8                \n"
        "addiu           %[dst_y], %[dst_y], 4             \n"
        "addiu           %[src_argb0], %[src_argb0], 16    \n"
        "sb              %[tmp_t1], -4(%[dst_y])           \n"
        "sb              %[tmp_t2], -3(%[dst_y])           \n"
        "sb              %[tmp_t3], -2(%[dst_y])           \n"
        "sb              %[tmp_t4], -1(%[dst_y])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
          "$ac3hi");
  }
}
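// ABGRToYRow_DSPR2 is the ABGR (bytes R, G, B, A in memory) luma variant; it
// swaps the R and B weights in the coefficient words relative to
// ARGBToYRow_DSPR2.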
void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  int const1 = 0x00810042;
  int const2 = 0x00000019;
  int const5 = 0x40;
  for (x = 0; x < width; x += 4) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "lw              %[tmp_t1], 0(%[src_argb0])        \n"
        "lw              %[tmp_t2], 4(%[src_argb0])        \n"
        "lw              %[tmp_t3], 8(%[src_argb0])        \n"
        "lw              %[tmp_t4], 12(%[src_argb0])       \n"
        "preceu.ph.qbr   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbl   %[tmp_t4], %[tmp_t4]              \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "mult            $ac2, %[const5], %[const5]        \n"
        "mult            $ac3, %[const5], %[const5]        \n"
        "dpa.w.ph        $ac0, %[tmp_t5], %[const1]        \n"
        "dpa.w.ph        $ac1, %[tmp_t6], %[const1]        \n"
        "dpa.w.ph        $ac2, %[tmp_t7], %[const1]        \n"
        "dpa.w.ph        $ac3, %[tmp_t8], %[const1]        \n"
        "dpa.w.ph        $ac0, %[tmp_t1], %[const2]        \n"
        "dpa.w.ph        $ac1, %[tmp_t2], %[const2]        \n"
        "dpa.w.ph        $ac2, %[tmp_t3], %[const2]        \n"
        "dpa.w.ph        $ac3, %[tmp_t4], %[const2]        \n"
        "extr_r.w        %[tmp_t1], $ac0, 8                \n"
        "extr_r.w        %[tmp_t2], $ac1, 8                \n"
        "extr_r.w        %[tmp_t3], $ac2, 8                \n"
        "extr_r.w        %[tmp_t4], $ac3, 8                \n"
        "addiu           %[src_argb0], %[src_argb0], 16    \n"
        "addiu           %[dst_y], %[dst_y], 4             \n"
        "sb              %[tmp_t1], -4(%[dst_y])           \n"
        "sb              %[tmp_t2], -3(%[dst_y])           \n"
        "sb              %[tmp_t3], -2(%[dst_y])           \n"
        "sb              %[tmp_t4], -1(%[dst_y])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
          "$ac3hi");
  }
}
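// RGBAToUVRow_DSPR2 handles RGBA (bytes A, B, G, R in memory) by loading each
// word one byte past the pixel start with unaligned ulw loads, which lines the
// B, G and R bytes up the same way as in the ARGB variant, so the same chroma
// coefficients apply.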
void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
                       int src_stride_rgb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
  int x;
  int const1 = 0xffb60070;
  int const2 = 0x0000ffda;
  int const3 = 0xffa2ffee;
  int const4 = 0x00000070;
  int const5 = 0x100;
  for (x = 0; x < width - 1; x += 2) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "ulw             %[tmp_t1], 0+1(%[src_rgb0])       \n"
        "ulw             %[tmp_t2], 4+1(%[src_rgb0])       \n"
        "ulw             %[tmp_t3], 0+1(%[src_rgb1])       \n"
        "ulw             %[tmp_t4], 4+1(%[src_rgb1])       \n"
        "preceu.ph.qbr   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbl   %[tmp_t4], %[tmp_t4]              \n"
        "addu.ph         %[tmp_t5], %[tmp_t5], %[tmp_t6]   \n"
        "addu.ph         %[tmp_t7], %[tmp_t7], %[tmp_t8]   \n"
        "addu.ph         %[tmp_t1], %[tmp_t1], %[tmp_t2]   \n"
        "addu.ph         %[tmp_t3], %[tmp_t3], %[tmp_t4]   \n"
        "addu.ph         %[tmp_t5], %[tmp_t5], %[tmp_t7]   \n"
        "addu.ph         %[tmp_t1], %[tmp_t1], %[tmp_t3]   \n"
        "shrl.ph         %[tmp_t5], %[tmp_t5], 2           \n"
        "shrl.ph         %[tmp_t1], %[tmp_t1], 2           \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "dpaq_s.w.ph     $ac0, %[tmp_t5], %[const1]        \n"
        "dpaq_s.w.ph     $ac1, %[tmp_t5], %[const3]        \n"
        "dpaq_s.w.ph     $ac0, %[tmp_t1], %[const2]        \n"
        "dpaq_s.w.ph     $ac1, %[tmp_t1], %[const4]        \n"
        "extr_r.w        %[tmp_t7], $ac0, 9                \n"
        "extr_r.w        %[tmp_t8], $ac1, 9                \n"
        "addiu           %[src_rgb0], %[src_rgb0], 8       \n"
        "addiu           %[src_rgb1], %[src_rgb1], 8       \n"
        "addiu           %[dst_u], %[dst_u], 1             \n"
        "addiu           %[dst_v], %[dst_v], 1             \n"
        "sb              %[tmp_t7], -1(%[dst_u])           \n"
        "sb              %[tmp_t8], -1(%[dst_v])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
          [const4] "r"(const4), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi");
  }
}
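// RGBAToYRow_DSPR2 reads RGBA pixels with aligned loads; since R and G land in
// the upper halfwords here, the preceu.ph.qbl/qbr roles are swapped relative
// to the other luma variants, applying the 66/129 weights to the high halves
// and the 25 weight to the low ones.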
void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  int const1 = 0x00420081;
  int const2 = 0x00190000;
  int const5 = 0x40;
  for (x = 0; x < width; x += 4) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "lw              %[tmp_t1], 0(%[src_argb0])        \n"
        "lw              %[tmp_t2], 4(%[src_argb0])        \n"
        "lw              %[tmp_t3], 8(%[src_argb0])        \n"
        "lw              %[tmp_t4], 12(%[src_argb0])       \n"
        "preceu.ph.qbl   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbr   %[tmp_t4], %[tmp_t4]              \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "mult            $ac2, %[const5], %[const5]        \n"
        "mult            $ac3, %[const5], %[const5]        \n"
        "dpa.w.ph        $ac0, %[tmp_t5], %[const1]        \n"
        "dpa.w.ph        $ac1, %[tmp_t6], %[const1]        \n"
        "dpa.w.ph        $ac2, %[tmp_t7], %[const1]        \n"
        "dpa.w.ph        $ac3, %[tmp_t8], %[const1]        \n"
        "dpa.w.ph        $ac0, %[tmp_t1], %[const2]        \n"
        "dpa.w.ph        $ac1, %[tmp_t2], %[const2]        \n"
        "dpa.w.ph        $ac2, %[tmp_t3], %[const2]        \n"
        "dpa.w.ph        $ac3, %[tmp_t4], %[const2]        \n"
        "extr_r.w        %[tmp_t1], $ac0, 8                \n"
        "extr_r.w        %[tmp_t2], $ac1, 8                \n"
        "extr_r.w        %[tmp_t3], $ac2, 8                \n"
        "extr_r.w        %[tmp_t4], $ac3, 8                \n"
        "addiu           %[dst_y], %[dst_y], 4             \n"
        "addiu           %[src_argb0], %[src_argb0], 16    \n"
        "sb              %[tmp_t1], -4(%[dst_y])           \n"
        "sb              %[tmp_t2], -3(%[dst_y])           \n"
        "sb              %[tmp_t3], -2(%[dst_y])           \n"
        "sb              %[tmp_t4], -1(%[dst_y])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
          "$ac3hi");
  }
}
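// ARGBToUVRow_DSPR2 is the ARGB chroma variant; it uses aligned word loads and
// the same coefficient packing as RGBAToUVRow_DSPR2, since both see B, G and R
// in the low three bytes of each loaded word.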
void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
                       int src_stride_rgb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
  int x;
  int const1 = 0xffb60070;
  int const2 = 0x0000ffda;
  int const3 = 0xffa2ffee;
  int const4 = 0x00000070;
  int const5 = 0x100;
  for (x = 0; x < width - 1; x += 2) {
    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
    int tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push                                         \n"
        ".set noreorder                                    \n"
        "lw              %[tmp_t1], 0(%[src_rgb0])         \n"
        "lw              %[tmp_t2], 4(%[src_rgb0])         \n"
        "lw              %[tmp_t3], 0(%[src_rgb1])         \n"
        "lw              %[tmp_t4], 4(%[src_rgb1])         \n"
        "preceu.ph.qbr   %[tmp_t5], %[tmp_t1]              \n"
        "preceu.ph.qbl   %[tmp_t1], %[tmp_t1]              \n"
        "preceu.ph.qbr   %[tmp_t6], %[tmp_t2]              \n"
        "preceu.ph.qbl   %[tmp_t2], %[tmp_t2]              \n"
        "preceu.ph.qbr   %[tmp_t7], %[tmp_t3]              \n"
        "preceu.ph.qbl   %[tmp_t3], %[tmp_t3]              \n"
        "preceu.ph.qbr   %[tmp_t8], %[tmp_t4]              \n"
        "preceu.ph.qbl   %[tmp_t4], %[tmp_t4]              \n"
        "addu.ph         %[tmp_t5], %[tmp_t5], %[tmp_t6]   \n"
        "addu.ph         %[tmp_t7], %[tmp_t7], %[tmp_t8]   \n"
        "addu.ph         %[tmp_t1], %[tmp_t1], %[tmp_t2]   \n"
        "addu.ph         %[tmp_t3], %[tmp_t3], %[tmp_t4]   \n"
        "addu.ph         %[tmp_t5], %[tmp_t5], %[tmp_t7]   \n"
        "addu.ph         %[tmp_t1], %[tmp_t1], %[tmp_t3]   \n"
        "shrl.ph         %[tmp_t5], %[tmp_t5], 2           \n"
        "shrl.ph         %[tmp_t1], %[tmp_t1], 2           \n"
        "mult            $ac0, %[const5], %[const5]        \n"
        "mult            $ac1, %[const5], %[const5]        \n"
        "dpaq_s.w.ph     $ac0, %[tmp_t5], %[const1]        \n"
        "dpaq_s.w.ph     $ac1, %[tmp_t5], %[const3]        \n"
        "dpaq_s.w.ph     $ac0, %[tmp_t1], %[const2]        \n"
        "dpaq_s.w.ph     $ac1, %[tmp_t1], %[const4]        \n"
        "extr_r.w        %[tmp_t7], $ac0, 9                \n"
        "extr_r.w        %[tmp_t8], $ac1, 9                \n"
        "addiu           %[src_rgb0], %[src_rgb0], 8       \n"
        "addiu           %[src_rgb1], %[src_rgb1], 8       \n"
        "addiu           %[dst_u], %[dst_u], 1             \n"
        "addiu           %[dst_v], %[dst_v], 1             \n"
        "sb              %[tmp_t7], -1(%[dst_u])           \n"
        "sb              %[tmp_t8], -1(%[dst_v])           \n"
        ".set pop                                          \n"
        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
          [const4] "r"(const4), [const5] "r"(const5)
        : "hi", "lo", "$ac1lo", "$ac1hi");
  }
}
#endif  // __mips_dsp_rev >= 2
#endif  // defined(__mips__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif