row_neon64.cc 129 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786
  1. /*
  2. * Copyright 2014 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #ifdef __cplusplus
  12. namespace libyuv {
  13. extern "C" {
  14. #endif
  15. // This module is for GCC Neon armv8 64 bit.
  16. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
  // READYUV422: asm fragment that reads 8 Y, 4 U and 4 V (4:2:2 planar).
  // Operand %0 is the Y pointer, %1 the U pointer, %2 the V pointer; all three
  // are post-incremented. Result: v0 = 8 Y bytes, v1 = 4 U bytes then 4 V bytes.
  #define READYUV422 \
    "ld1 {v0.8b}, [%0], #8 \n"   /* v0 <- 8 Y, %0 += 8 */ \
    "ld1 {v1.s}[0], [%1], #4 \n" /* low 32 bits of v1 <- 4 U, %1 += 4 */ \
    "ld1 {v1.s}[1], [%2], #4 \n" /* next 32 bits of v1 <- 4 V, %2 += 4 */
  // READYUV444: asm fragment that reads 8 Y, 8 U and 8 V (4:4:4 planar).
  // Operands %0/%1/%2 are the Y/U/V pointers, each post-incremented by 8.
  // Adjacent chroma bytes are summed pairwise and halved with rounding, so the
  // fragment leaves v0 = 8 Y bytes and v1 = 4 averaged U then 4 averaged V.
  #define READYUV444 \
    "ld1 {v0.8b}, [%0], #8 \n"     /* v0 <- 8 Y */ \
    "ld1 {v1.d}[0], [%1], #8 \n"   /* low half of v1 <- 8 U */ \
    "ld1 {v1.d}[1], [%2], #8 \n"   /* high half of v1 <- 8 V */ \
    "uaddlp v1.8h, v1.16b \n"      /* add adjacent byte pairs -> 16 bit sums */ \
    "rshrn v1.8b, v1.8h, #1 \n"    /* rounded (sum >> 1), narrowed to bytes */
  // READYUV400: asm fragment for Y-only input. Reads 8 Y bytes through operand
  // %0 (post-incremented by 8) into v0 and fills all 8 chroma bytes of v1 with
  // the constant 128.
  #define READYUV400 \
    "ld1 {v0.8b}, [%0], #8 \n" /* v0 <- 8 Y, %0 += 8 */ \
    "movi v1.8b , #128 \n"     /* v1 <- 128 in every byte (U and V) */
  // READNV12: asm fragment that reads 8 Y and 4 interleaved UV pairs (NV12).
  // Operand %0 is the Y pointer and %1 the UV pointer; both advance by 8.
  // Result: v0 = 8 Y bytes, v1 = 4 U bytes then 4 V bytes. Uses v2, v3 as scratch.
  #define READNV12 \
    "ld1 {v0.8b}, [%0], #8 \n"       /* v0 <- 8 Y */ \
    "ld1 {v2.8b}, [%1], #8 \n"       /* v2 <- U0 V0 U1 V1 U2 V2 U3 V3 */ \
    "uzp1 v1.8b, v2.8b, v2.8b \n"    /* even bytes of v2 (U) */ \
    "uzp2 v3.8b, v2.8b, v2.8b \n"    /* odd bytes of v2 (V) */ \
    "ins v1.s[1], v3.s[0] \n"        /* v1 <- U0..U3 V0..V3 */
  // READNV21: asm fragment that reads 8 Y and 4 interleaved VU pairs (NV21).
  // Operand %0 is the Y pointer and %1 the VU pointer; both advance by 8.
  // Same output layout as the other readers (v0 = Y, v1 = 4 U then 4 V); only
  // the de-interleave direction differs from NV12. Uses v2, v3 as scratch.
  #define READNV21 \
    "ld1 {v0.8b}, [%0], #8 \n"       /* v0 <- 8 Y */ \
    "ld1 {v2.8b}, [%1], #8 \n"       /* v2 <- V0 U0 V1 U1 V2 U2 V3 U3 */ \
    "uzp1 v3.8b, v2.8b, v2.8b \n"    /* even bytes of v2 (V) */ \
    "uzp2 v1.8b, v2.8b, v2.8b \n"    /* odd bytes of v2 (U) */ \
    "ins v1.s[1], v3.s[0] \n"        /* v1 <- U0..U3 V0..V3 */
  // READYUY2: asm fragment that reads 8 packed YUY2 pixels (16 bytes) through
  // operand %0, which advances by 16. The 2-way de-interleaving load puts the
  // even bytes (Y) in v0 and the odd bytes (alternating U, V) in v1; v1 is then
  // regrouped to 4 U followed by 4 V. Uses v3 as scratch.
  #define READYUY2 \
    "ld2 {v0.8b, v1.8b}, [%0], #16 \n" /* v0 <- 8 Y, v1 <- U V U V ... */ \
    "uzp2 v3.8b, v1.8b, v1.8b \n"      /* odd bytes of v1 (V) */ \
    "uzp1 v1.8b, v1.8b, v1.8b \n"      /* even bytes of v1 (U) */ \
    "ins v1.s[1], v3.s[0] \n"          /* v1 <- U0..U3 V0..V3 */
  // READUYVY: asm fragment that reads 8 packed UYVY pixels (16 bytes) through
  // operand %0, which advances by 16. The 2-way de-interleaving load puts the
  // even bytes (alternating U, V) in v2 and the odd bytes (Y) in v3; the
  // fragment then produces v0 = 8 Y and v1 = 4 U followed by 4 V.
  #define READUYVY \
    "ld2 {v2.8b, v3.8b}, [%0], #16 \n" /* v2 <- U V U V ..., v3 <- 8 Y */ \
    "orr v0.8b, v3.8b, v3.8b \n"       /* v0 <- copy of Y */ \
    "uzp1 v1.8b, v2.8b, v2.8b \n"      /* even bytes of v2 (U) */ \
    "uzp2 v3.8b, v2.8b, v2.8b \n"      /* odd bytes of v2 (V) */ \
    "ins v1.s[1], v3.s[0] \n"          /* v1 <- U0..U3 V0..V3 */
  // YUVTORGB_SETUP: asm fragment that loads the conversion constants used by
  // YUVTORGB into v24-v31. It expects the named asm operands kUVBiasBGR,
  // kYToRgb, kUVToRB and kUVToG to hold the addresses of the matching tables.
  // Note that the register behind %[kUVBiasBGR] is advanced by 4 bytes in
  // total, and that v31 is written as well as v24-v30.
  #define YUVTORGB_SETUP \
    "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n"     /* v24 <- 1st bias, all lanes */ \
    "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n"     /* v25 <- 2nd bias, all lanes */ \
    "ld1r {v26.8h}, [%[kUVBiasBGR]] \n"         /* v26 <- 3rd bias, all lanes */ \
    "ld1r {v31.4s}, [%[kYToRgb]] \n"            /* v31 <- Y scale, all lanes */ \
    "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n"     /* de-interleave into v27/v28 */ \
    "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"      /* de-interleave into v29/v30 */
  // YUVTORGB(vR, vG, vB): asm fragment converting 8 pixels of YUV to 8 bit R, G
  // and B, written to the low 8 bytes of the registers named by vR, vG and vB.
  //
  // Inputs:  v0 = 8 Y bytes, v1 = 4 U bytes followed by 4 V bytes (as produced
  //          by the READ* fragments), v24-v31 = constants from YUVTORGB_SETUP.
  // Scratch: v0, v1, v2, v3, v5, v6 and v7 are overwritten.
  //
  // Steps: Y is widened, multiplied by v31 and narrowed with a 16 bit shift;
  // each chroma byte is duplicated so it covers two pixels; B, G and R are
  // built with saturating adds/subtracts of the bias, the scaled Y and the
  // chroma products, then narrowed to unsigned bytes with a 6 bit shift.
  #define YUVTORGB(vR, vG, vB) \
    "uxtl v0.8h, v0.8b \n"                       /* Y -> 16 bit */ \
    "shll v2.8h, v1.8b, #8 \n"                   /* UV << 8 */ \
    "ushll2 v3.4s, v0.8h, #0 \n"                 /* upper 4 Y -> 32 bit */ \
    "ushll v0.4s, v0.4h, #0 \n"                  /* lower 4 Y -> 32 bit */ \
    "mul v3.4s, v3.4s, v31.4s \n" \
    "mul v0.4s, v0.4s, v31.4s \n" \
    "sqshrun v0.4h, v0.4s, #16 \n" \
    "sqshrun2 v0.8h, v3.4s, #16 \n"              /* v0 <- 8 scaled Y */ \
    "uaddw v1.8h, v2.8h, v1.8b \n"               /* each UV byte duplicated */ \
    "mov v2.d[0], v1.d[1] \n"                    /* v2 <- V bytes */ \
    "uxtl v2.8h, v2.8b \n"                       /* V -> 16 bit */ \
    "uxtl v1.8h, v1.8b \n"                       /* U -> 16 bit */ \
    "mul v3.8h, v1.8h, v27.8h \n"                /* U term for B */ \
    "mul v5.8h, v1.8h, v29.8h \n"                /* U term for G */ \
    "mul v6.8h, v2.8h, v30.8h \n"                /* V term for G */ \
    "mul v7.8h, v2.8h, v28.8h \n"                /* V term for R */ \
    "sqadd v6.8h, v6.8h, v5.8h \n"               /* combined G chroma term */ \
    "sqadd " #vB ".8h, v24.8h, v0.8h \n"         /* B = bias + Y */ \
    "sqadd " #vG ".8h, v25.8h, v0.8h \n"         /* G = bias + Y */ \
    "sqadd " #vR ".8h, v26.8h, v0.8h \n"         /* R = bias + Y */ \
    "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n"     /* B += U term */ \
    "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n"     /* G -= chroma term */ \
    "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n"     /* R += V term */ \
    "sqshrun " #vB ".8b, " #vB ".8h, #6 \n"      /* B -> 8 bit */ \
    "sqshrun " #vG ".8b, " #vG ".8h, #6 \n"      /* G -> 8 bit */ \
    "sqshrun " #vR ".8b, " #vR ".8h, #6 \n"      /* R -> 8 bit */
  102. void I444ToARGBRow_NEON(const uint8* src_y,
  103. const uint8* src_u,
  104. const uint8* src_v,
  105. uint8* dst_argb,
  106. const struct YuvConstants* yuvconstants,
  107. int width) {
  108. asm volatile (
  109. YUVTORGB_SETUP
  110. "movi v23.8b, #255 \n" /* A */
  111. "1: \n"
  112. READYUV444
  113. YUVTORGB(v22, v21, v20)
  114. "subs %w4, %w4, #8 \n"
  115. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  116. "b.gt 1b \n"
  117. : "+r"(src_y), // %0
  118. "+r"(src_u), // %1
  119. "+r"(src_v), // %2
  120. "+r"(dst_argb), // %3
  121. "+r"(width) // %4
  122. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  123. [kUVToG]"r"(&yuvconstants->kUVToG),
  124. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  125. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  126. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  127. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  128. );
  129. }
  130. void I422ToARGBRow_NEON(const uint8* src_y,
  131. const uint8* src_u,
  132. const uint8* src_v,
  133. uint8* dst_argb,
  134. const struct YuvConstants* yuvconstants,
  135. int width) {
  136. asm volatile (
  137. YUVTORGB_SETUP
  138. "movi v23.8b, #255 \n" /* A */
  139. "1: \n"
  140. READYUV422
  141. YUVTORGB(v22, v21, v20)
  142. "subs %w4, %w4, #8 \n"
  143. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  144. "b.gt 1b \n"
  145. : "+r"(src_y), // %0
  146. "+r"(src_u), // %1
  147. "+r"(src_v), // %2
  148. "+r"(dst_argb), // %3
  149. "+r"(width) // %4
  150. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  151. [kUVToG]"r"(&yuvconstants->kUVToG),
  152. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  153. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  154. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  155. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  156. );
  157. }
  158. void I422AlphaToARGBRow_NEON(const uint8* src_y,
  159. const uint8* src_u,
  160. const uint8* src_v,
  161. const uint8* src_a,
  162. uint8* dst_argb,
  163. const struct YuvConstants* yuvconstants,
  164. int width) {
  165. asm volatile (
  166. YUVTORGB_SETUP
  167. "1: \n"
  168. READYUV422
  169. YUVTORGB(v22, v21, v20)
  170. "ld1 {v23.8b}, [%3], #8 \n"
  171. "subs %w5, %w5, #8 \n"
  172. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
  173. "b.gt 1b \n"
  174. : "+r"(src_y), // %0
  175. "+r"(src_u), // %1
  176. "+r"(src_v), // %2
  177. "+r"(src_a), // %3
  178. "+r"(dst_argb), // %4
  179. "+r"(width) // %5
  180. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  181. [kUVToG]"r"(&yuvconstants->kUVToG),
  182. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  183. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  184. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  185. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  186. );
  187. }
  188. void I422ToRGBARow_NEON(const uint8* src_y,
  189. const uint8* src_u,
  190. const uint8* src_v,
  191. uint8* dst_rgba,
  192. const struct YuvConstants* yuvconstants,
  193. int width) {
  194. asm volatile (
  195. YUVTORGB_SETUP
  196. "movi v20.8b, #255 \n" /* A */
  197. "1: \n"
  198. READYUV422
  199. YUVTORGB(v23, v22, v21)
  200. "subs %w4, %w4, #8 \n"
  201. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
  202. "b.gt 1b \n"
  203. : "+r"(src_y), // %0
  204. "+r"(src_u), // %1
  205. "+r"(src_v), // %2
  206. "+r"(dst_rgba), // %3
  207. "+r"(width) // %4
  208. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  209. [kUVToG]"r"(&yuvconstants->kUVToG),
  210. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  211. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  212. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  213. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  214. );
  215. }
  216. void I422ToRGB24Row_NEON(const uint8* src_y,
  217. const uint8* src_u,
  218. const uint8* src_v,
  219. uint8* dst_rgb24,
  220. const struct YuvConstants* yuvconstants,
  221. int width) {
  222. asm volatile (
  223. YUVTORGB_SETUP
  224. "1: \n"
  225. READYUV422
  226. YUVTORGB(v22, v21, v20)
  227. "subs %w4, %w4, #8 \n"
  228. "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
  229. "b.gt 1b \n"
  230. : "+r"(src_y), // %0
  231. "+r"(src_u), // %1
  232. "+r"(src_v), // %2
  233. "+r"(dst_rgb24), // %3
  234. "+r"(width) // %4
  235. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  236. [kUVToG]"r"(&yuvconstants->kUVToG),
  237. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  238. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  239. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  240. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  241. );
  242. }
  // ARGBTORGB565: asm fragment that packs 8 pixels into 16 bit RGB565.
  // Input:  v20 = B, v21 = G, v22 = R (8 bytes each).
  // Output: v0.8h, one halfword per pixel with R in the top 5 bits, G in the
  //         middle 6 bits and B in the low 5 bits.
  // v20 and v21 are overwritten with their left-shifted values.
  #define ARGBTORGB565 \
    "shll v0.8h, v22.8b, #8 \n"   /* R into the high byte */ \
    "shll v21.8h, v21.8b, #8 \n"  /* G into the high byte */ \
    "shll v20.8h, v20.8b, #8 \n"  /* B into the high byte */ \
    "sri v0.8h, v21.8h, #5 \n"    /* insert G below the top 5 bits */ \
    "sri v0.8h, v20.8h, #11 \n"   /* insert B below the top 11 bits */
  249. void I422ToRGB565Row_NEON(const uint8* src_y,
  250. const uint8* src_u,
  251. const uint8* src_v,
  252. uint8* dst_rgb565,
  253. const struct YuvConstants* yuvconstants,
  254. int width) {
  255. asm volatile(
  256. YUVTORGB_SETUP
  257. "1: \n" READYUV422 YUVTORGB(
  258. v22, v21,
  259. v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
  260. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
  261. // RGB565.
  262. "b.gt 1b \n"
  263. : "+r"(src_y), // %0
  264. "+r"(src_u), // %1
  265. "+r"(src_v), // %2
  266. "+r"(dst_rgb565), // %3
  267. "+r"(width) // %4
  268. : [kUVToRB] "r"(&yuvconstants->kUVToRB),
  269. [kUVToG] "r"(&yuvconstants->kUVToG),
  270. [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
  271. [kYToRgb] "r"(&yuvconstants->kYToRgb)
  272. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  273. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
  274. }
  // ARGBTOARGB1555: asm fragment that packs 8 pixels into 16 bit ARGB1555.
  // Input:  v20 = B, v21 = G, v22 = R, v23 = A (8 bytes each).
  // Output: v0.8h, one halfword per pixel with A in the top bit followed by
  //         5 bits each of R, G and B.
  // v20, v21 and v22 are overwritten with their left-shifted values; v23 is
  // only read.
  #define ARGBTOARGB1555 \
    "shll v0.8h, v23.8b, #8 \n"   /* A into the high byte */ \
    "shll v22.8h, v22.8b, #8 \n"  /* R into the high byte */ \
    "shll v21.8h, v21.8b, #8 \n"  /* G into the high byte */ \
    "shll v20.8h, v20.8b, #8 \n"  /* B into the high byte */ \
    "sri v0.8h, v22.8h, #1 \n"    /* insert R below the top bit */ \
    "sri v0.8h, v21.8h, #6 \n"    /* insert G below the top 6 bits */ \
    "sri v0.8h, v20.8h, #11 \n"   /* insert B below the top 11 bits */
  283. void I422ToARGB1555Row_NEON(const uint8* src_y,
  284. const uint8* src_u,
  285. const uint8* src_v,
  286. uint8* dst_argb1555,
  287. const struct YuvConstants* yuvconstants,
  288. int width) {
  289. asm volatile(
  290. YUVTORGB_SETUP
  291. "movi v23.8b, #255 \n"
  292. "1: \n" READYUV422 YUVTORGB(
  293. v22, v21,
  294. v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
  295. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
  296. // RGB565.
  297. "b.gt 1b \n"
  298. : "+r"(src_y), // %0
  299. "+r"(src_u), // %1
  300. "+r"(src_v), // %2
  301. "+r"(dst_argb1555), // %3
  302. "+r"(width) // %4
  303. : [kUVToRB] "r"(&yuvconstants->kUVToRB),
  304. [kUVToG] "r"(&yuvconstants->kUVToG),
  305. [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
  306. [kYToRgb] "r"(&yuvconstants->kYToRgb)
  307. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  308. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
  309. }
  // ARGBTOARGB4444: asm fragment that packs 8 pixels into 16 bit ARGB4444.
  // Input:  v20 = B, v21 = G, v22 = R, v23 = A (8 bytes each) and v4 = 0x0f in
  //         every byte.
  // Output: v0.16b, two bytes per pixel: (G high nibble | B >> 4) followed by
  //         (A high nibble | R >> 4).
  // v1 and v20-v23 are overwritten.
  #define ARGBTOARGB4444 \
    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
    "ushr v20.8b, v20.8b, #4 \n"       /* B -> low nibble */ \
    "bic v21.8b, v21.8b, v4.8b \n"     /* G, low nibble cleared */ \
    "ushr v22.8b, v22.8b, #4 \n"       /* R -> low nibble */ \
    "bic v23.8b, v23.8b, v4.8b \n"     /* A, low nibble cleared */ \
    "orr v0.8b, v20.8b, v21.8b \n"     /* BG bytes */ \
    "orr v1.8b, v22.8b, v23.8b \n"     /* RA bytes */ \
    "zip1 v0.16b, v0.16b, v1.16b \n"   /* interleave BG and RA bytes */
  319. void I422ToARGB4444Row_NEON(const uint8* src_y,
  320. const uint8* src_u,
  321. const uint8* src_v,
  322. uint8* dst_argb4444,
  323. const struct YuvConstants* yuvconstants,
  324. int width) {
  325. asm volatile (
  326. YUVTORGB_SETUP
  327. "movi v4.16b, #0x0f \n" // bits to clear with vbic.
  328. "1: \n"
  329. READYUV422
  330. YUVTORGB(v22, v21, v20)
  331. "subs %w4, %w4, #8 \n"
  332. "movi v23.8b, #255 \n"
  333. ARGBTOARGB4444
  334. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
  335. "b.gt 1b \n"
  336. : "+r"(src_y), // %0
  337. "+r"(src_u), // %1
  338. "+r"(src_v), // %2
  339. "+r"(dst_argb4444), // %3
  340. "+r"(width) // %4
  341. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  342. [kUVToG]"r"(&yuvconstants->kUVToG),
  343. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  344. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  345. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  346. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  347. );
  348. }
  349. void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
  350. asm volatile (
  351. YUVTORGB_SETUP
  352. "movi v23.8b, #255 \n"
  353. "1: \n"
  354. READYUV400
  355. YUVTORGB(v22, v21, v20)
  356. "subs %w2, %w2, #8 \n"
  357. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  358. "b.gt 1b \n"
  359. : "+r"(src_y), // %0
  360. "+r"(dst_argb), // %1
  361. "+r"(width) // %2
  362. : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
  363. [kUVToG]"r"(&kYuvI601Constants.kUVToG),
  364. [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
  365. [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
  366. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  367. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  368. );
  369. }
  370. void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
  371. asm volatile(
  372. "movi v23.8b, #255 \n"
  373. "1: \n"
  374. "ld1 {v20.8b}, [%0], #8 \n"
  375. "orr v21.8b, v20.8b, v20.8b \n"
  376. "orr v22.8b, v20.8b, v20.8b \n"
  377. "subs %w2, %w2, #8 \n"
  378. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  379. "b.gt 1b \n"
  380. : "+r"(src_y), // %0
  381. "+r"(dst_argb), // %1
  382. "+r"(width) // %2
  383. :
  384. : "cc", "memory", "v20", "v21", "v22", "v23");
  385. }
  386. void NV12ToARGBRow_NEON(const uint8* src_y,
  387. const uint8* src_uv,
  388. uint8* dst_argb,
  389. const struct YuvConstants* yuvconstants,
  390. int width) {
  391. asm volatile (
  392. YUVTORGB_SETUP
  393. "movi v23.8b, #255 \n"
  394. "1: \n"
  395. READNV12
  396. YUVTORGB(v22, v21, v20)
  397. "subs %w3, %w3, #8 \n"
  398. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
  399. "b.gt 1b \n"
  400. : "+r"(src_y), // %0
  401. "+r"(src_uv), // %1
  402. "+r"(dst_argb), // %2
  403. "+r"(width) // %3
  404. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  405. [kUVToG]"r"(&yuvconstants->kUVToG),
  406. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  407. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  408. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  409. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  410. );
  411. }
  412. void NV21ToARGBRow_NEON(const uint8* src_y,
  413. const uint8* src_vu,
  414. uint8* dst_argb,
  415. const struct YuvConstants* yuvconstants,
  416. int width) {
  417. asm volatile (
  418. YUVTORGB_SETUP
  419. "movi v23.8b, #255 \n"
  420. "1: \n"
  421. READNV21
  422. YUVTORGB(v22, v21, v20)
  423. "subs %w3, %w3, #8 \n"
  424. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
  425. "b.gt 1b \n"
  426. : "+r"(src_y), // %0
  427. "+r"(src_vu), // %1
  428. "+r"(dst_argb), // %2
  429. "+r"(width) // %3
  430. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  431. [kUVToG]"r"(&yuvconstants->kUVToG),
  432. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  433. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  434. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  435. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  436. );
  437. }
  438. void NV12ToRGB565Row_NEON(const uint8* src_y,
  439. const uint8* src_uv,
  440. uint8* dst_rgb565,
  441. const struct YuvConstants* yuvconstants,
  442. int width) {
  443. asm volatile(
  444. YUVTORGB_SETUP
  445. "1: \n" READNV12 YUVTORGB(
  446. v22, v21,
  447. v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
  448. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
  449. // RGB565.
  450. "b.gt 1b \n"
  451. : "+r"(src_y), // %0
  452. "+r"(src_uv), // %1
  453. "+r"(dst_rgb565), // %2
  454. "+r"(width) // %3
  455. : [kUVToRB] "r"(&yuvconstants->kUVToRB),
  456. [kUVToG] "r"(&yuvconstants->kUVToG),
  457. [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
  458. [kYToRgb] "r"(&yuvconstants->kYToRgb)
  459. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  460. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
  461. }
  462. void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
  463. uint8* dst_argb,
  464. const struct YuvConstants* yuvconstants,
  465. int width) {
  466. asm volatile (
  467. YUVTORGB_SETUP
  468. "movi v23.8b, #255 \n"
  469. "1: \n"
  470. READYUY2
  471. YUVTORGB(v22, v21, v20)
  472. "subs %w2, %w2, #8 \n"
  473. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
  474. "b.gt 1b \n"
  475. : "+r"(src_yuy2), // %0
  476. "+r"(dst_argb), // %1
  477. "+r"(width) // %2
  478. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  479. [kUVToG]"r"(&yuvconstants->kUVToG),
  480. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  481. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  482. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  483. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  484. );
  485. }
  486. void UYVYToARGBRow_NEON(const uint8* src_uyvy,
  487. uint8* dst_argb,
  488. const struct YuvConstants* yuvconstants,
  489. int width) {
  490. asm volatile (
  491. YUVTORGB_SETUP
  492. "movi v23.8b, #255 \n"
  493. "1: \n"
  494. READUYVY
  495. YUVTORGB(v22, v21, v20)
  496. "subs %w2, %w2, #8 \n"
  497. "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
  498. "b.gt 1b \n"
  499. : "+r"(src_uyvy), // %0
  500. "+r"(dst_argb), // %1
  501. "+r"(width) // %2
  502. : [kUVToRB]"r"(&yuvconstants->kUVToRB),
  503. [kUVToG]"r"(&yuvconstants->kUVToG),
  504. [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
  505. [kYToRgb]"r"(&yuvconstants->kYToRgb)
  506. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  507. "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
  508. );
  509. }
  510. // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
  511. void SplitUVRow_NEON(const uint8* src_uv,
  512. uint8* dst_u,
  513. uint8* dst_v,
  514. int width) {
  515. asm volatile(
  516. "1: \n"
  517. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
  518. "subs %w3, %w3, #16 \n" // 16 processed per loop
  519. "st1 {v0.16b}, [%1], #16 \n" // store U
  520. "st1 {v1.16b}, [%2], #16 \n" // store V
  521. "b.gt 1b \n"
  522. : "+r"(src_uv), // %0
  523. "+r"(dst_u), // %1
  524. "+r"(dst_v), // %2
  525. "+r"(width) // %3 // Output registers
  526. : // Input registers
  527. : "cc", "memory", "v0", "v1" // Clobber List
  528. );
  529. }
  530. // Reads 16 U's and V's and writes out 16 pairs of UV.
  531. void MergeUVRow_NEON(const uint8* src_u,
  532. const uint8* src_v,
  533. uint8* dst_uv,
  534. int width) {
  535. asm volatile(
  536. "1: \n"
  537. "ld1 {v0.16b}, [%0], #16 \n" // load U
  538. "ld1 {v1.16b}, [%1], #16 \n" // load V
  539. "subs %w3, %w3, #16 \n" // 16 processed per loop
  540. "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
  541. "b.gt 1b \n"
  542. : "+r"(src_u), // %0
  543. "+r"(src_v), // %1
  544. "+r"(dst_uv), // %2
  545. "+r"(width) // %3 // Output registers
  546. : // Input registers
  547. : "cc", "memory", "v0", "v1" // Clobber List
  548. );
  549. }
  550. // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
  551. void SplitRGBRow_NEON(const uint8* src_rgb,
  552. uint8* dst_r,
  553. uint8* dst_g,
  554. uint8* dst_b,
  555. int width) {
  556. asm volatile(
  557. "1: \n"
  558. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
  559. "subs %w4, %w4, #16 \n" // 16 processed per loop
  560. "st1 {v0.16b}, [%1], #16 \n" // store R
  561. "st1 {v1.16b}, [%2], #16 \n" // store G
  562. "st1 {v2.16b}, [%3], #16 \n" // store B
  563. "b.gt 1b \n"
  564. : "+r"(src_rgb), // %0
  565. "+r"(dst_r), // %1
  566. "+r"(dst_g), // %2
  567. "+r"(dst_b), // %3
  568. "+r"(width) // %4
  569. : // Input registers
  570. : "cc", "memory", "v0", "v1", "v2" // Clobber List
  571. );
  572. }
  573. // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
  574. void MergeRGBRow_NEON(const uint8* src_r,
  575. const uint8* src_g,
  576. const uint8* src_b,
  577. uint8* dst_rgb,
  578. int width) {
  579. asm volatile(
  580. "1: \n"
  581. "ld1 {v0.16b}, [%0], #16 \n" // load R
  582. "ld1 {v1.16b}, [%1], #16 \n" // load G
  583. "ld1 {v2.16b}, [%2], #16 \n" // load B
  584. "subs %w4, %w4, #16 \n" // 16 processed per loop
  585. "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
  586. "b.gt 1b \n"
  587. : "+r"(src_r), // %0
  588. "+r"(src_g), // %1
  589. "+r"(src_b), // %2
  590. "+r"(dst_rgb), // %3
  591. "+r"(width) // %4
  592. : // Input registers
  593. : "cc", "memory", "v0", "v1", "v2" // Clobber List
  594. );
  595. }
  596. // Copy multiple of 32.
  597. void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  598. asm volatile(
  599. "1: \n"
  600. "ldp q0, q1, [%0], #32 \n"
  601. "subs %w2, %w2, #32 \n" // 32 processed per loop
  602. "stp q0, q1, [%1], #32 \n"
  603. "b.gt 1b \n"
  604. : "+r"(src), // %0
  605. "+r"(dst), // %1
  606. "+r"(count) // %2 // Output registers
  607. : // Input registers
  608. : "cc", "memory", "v0", "v1" // Clobber List
  609. );
  610. }
  611. // SetRow writes 'count' bytes using an 8 bit value repeated.
  612. void SetRow_NEON(uint8* dst, uint8 v8, int count) {
  613. asm volatile(
  614. "dup v0.16b, %w2 \n" // duplicate 16 bytes
  615. "1: \n"
  616. "subs %w1, %w1, #16 \n" // 16 bytes per loop
  617. "st1 {v0.16b}, [%0], #16 \n" // store
  618. "b.gt 1b \n"
  619. : "+r"(dst), // %0
  620. "+r"(count) // %1
  621. : "r"(v8) // %2
  622. : "cc", "memory", "v0");
  623. }
  // Fills dst with the 32-bit value v32 repeated. count is decremented by 4
  // per 16-byte store, i.e. it counts 32-bit elements, not bytes. The loop
  // test follows the store, so at least four elements are written.
  624. void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
  625. asm volatile(
  626. "dup v0.4s, %w2 \n" // duplicate 4 ints
  627. "1: \n"
  628. "subs %w1, %w1, #4 \n" // 4 ints per loop
  629. "st1 {v0.16b}, [%0], #16 \n" // store
  630. "b.gt 1b \n"
  631. : "+r"(dst), // %0
  632. "+r"(count) // %1
  633. : "r"(v32) // %2
  634. : "cc", "memory", "v0");
  635. }
  // Writes the bytes of src to dst in reverse order, 16 bytes per pass.
  // src is first moved to its last 16-byte group (src + width - 16) and then
  // walked backwards via the -16 post-index held in %3; dst advances forwards.
  636. void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  637. asm volatile(
  638. // Start at end of source row.
  639. "add %0, %0, %w2, sxtw \n"
  640. "sub %0, %0, #16 \n"
  641. "1: \n"
  642. "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
  643. "subs %w2, %w2, #16 \n" // 16 pixels per loop.
  644. "rev64 v0.16b, v0.16b \n" // reverse bytes inside each 8-byte half
  645. "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
  646. "st1 {v0.D}[0], [%1], #8 \n" // high half stored first: halves swapped
  647. "b.gt 1b \n"
  648. : "+r"(src), // %0
  649. "+r"(dst), // %1
  650. "+r"(width) // %2
  651. : "r"((ptrdiff_t)-16) // %3
  652. : "cc", "memory", "v0");
  653. }
  // Splits interleaved UV pairs from src_uv into dst_u and dst_v while
  // reversing their order. width counts UV pairs (2 bytes each): src_uv is
  // moved to src_uv + 2 * width - 16 and read backwards 16 bytes (8 pairs)
  // per pass, while both destinations advance forwards by 8 bytes.
  654. void MirrorUVRow_NEON(const uint8* src_uv,
  655. uint8* dst_u,
  656. uint8* dst_v,
  657. int width) {
  658. asm volatile(
  659. // Start at end of source row.
  660. "add %0, %0, %w3, sxtw #1 \n"
  661. "sub %0, %0, #16 \n"
  662. "1: \n"
  663. "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
  664. "subs %w3, %w3, #8 \n" // 8 pixels per loop.
  665. "rev64 v0.8b, v0.8b \n" // reverse the 8 U bytes
  666. "rev64 v1.8b, v1.8b \n" // reverse the 8 V bytes
  667. "st1 {v0.8b}, [%1], #8 \n" // dst += 8
  668. "st1 {v1.8b}, [%2], #8 \n"
  669. "b.gt 1b \n"
  670. : "+r"(src_uv), // %0
  671. "+r"(dst_u), // %1
  672. "+r"(dst_v), // %2
  673. "+r"(width) // %3
  674. : "r"((ptrdiff_t)-16) // %4
  675. : "cc", "memory", "v0", "v1");
  676. }
  // Reverses the order of 4-byte pixels from src into dst, 4 pixels per pass.
  // Bytes inside each pixel keep their order: rev64 on .4s lanes swaps the two
  // 32-bit words of each 64-bit half, and the halves are stored high-first.
  // src starts at src + 4 * width - 16 and steps backwards by 16.
  677. void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  678. asm volatile(
  679. // Start at end of source row.
  680. "add %0, %0, %w2, sxtw #2 \n"
  681. "sub %0, %0, #16 \n"
  682. "1: \n"
  683. "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
  684. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
  685. "rev64 v0.4s, v0.4s \n" // swap the two pixels in each 8-byte half
  686. "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
  687. "st1 {v0.D}[0], [%1], #8 \n" // high half stored first: halves swapped
  688. "b.gt 1b \n"
  689. : "+r"(src), // %0
  690. "+r"(dst), // %1
  691. "+r"(width) // %2
  692. : "r"((ptrdiff_t)-16) // %3
  693. : "cc", "memory", "v0");
  694. }
  // Expands 3-byte pixels to 4-byte pixels by appending a constant 255 as the
  // fourth byte; the three source bytes keep their order. 8 pixels per pass
  // (24 bytes in, 32 bytes out); the loop test follows the store.
  695. void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
  696. asm volatile(
  697. "movi v4.8b, #255 \n" // Alpha
  698. "1: \n"
  699. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
  700. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  701. "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
  702. "b.gt 1b \n"
  703. : "+r"(src_rgb24), // %0
  704. "+r"(dst_argb), // %1
  705. "+r"(width) // %2
  706. :
  707. : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
  708. );
  709. }
  // Expands 3-byte pixels to 4-byte pixels, reversing the order of the three
  // source bytes (r g b -> b g r) and appending a constant 255 alpha byte.
  // The two orr moves exist because st4 needs four consecutive registers.
  // 8 pixels per pass; the loop test follows the store.
  710. void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
  711. asm volatile(
  712. "movi v5.8b, #255 \n" // Alpha
  713. "1: \n"
  714. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
  715. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  716. "orr v3.8b, v1.8b, v1.8b \n" // move g
  717. "orr v4.8b, v0.8b, v0.8b \n" // move r
  718. "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
  719. "b.gt 1b \n"
  720. : "+r"(src_raw), // %0
  721. "+r"(dst_argb), // %1
  722. "+r"(width) // %2
  723. :
  724. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
  725. );
  726. }
  // Reverses the byte order inside each 3-byte pixel (r g b -> b g r).
  // The two orr moves place g and r after b so st3 can use three consecutive
  // registers. 8 pixels (24 bytes) per pass; the loop test follows the store.
  727. void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
  728. asm volatile(
  729. "1: \n"
  730. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
  731. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  732. "orr v3.8b, v1.8b, v1.8b \n" // move g
  733. "orr v4.8b, v0.8b, v0.8b \n" // move r
  734. "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
  735. "b.gt 1b \n"
  736. : "+r"(src_raw), // %0
  737. "+r"(dst_rgb24), // %1
  738. "+r"(width) // %2
  739. :
  740. : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
  741. );
  742. }
  // Asm fragment: unpacks 8 RGB565 pixels held in v0.8h into 8-bit channels.
  // On exit the low 8 bytes of v0 hold B, v1 holds G and v2 holds R. Each
  // 5/6-bit field is widened by shifting it to the top of the byte and OR-ing
  // its own high bits into the vacated low bits. Uses v4 and v6 as scratch.
  743. #define RGB565TOARGB \
  744. "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
  745. "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
  746. "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
  747. "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
  748. "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  749. "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
  750. "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
  751. "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
  752. "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
  753. "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
  754. "dup v2.2D, v0.D[1] \n" /* R */
  // Converts 16-bit RGB565 pixels to 4-byte pixels with a constant 255 alpha.
  // Each pass loads 8 pixels (16 bytes), expands them with RGB565TOARGB into
  // v0/v1/v2 and stores them together with v3 (alpha) as 32 bytes. The loop
  // test follows the store.
  755. void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
  756. asm volatile(
  757. "movi v3.8b, #255 \n" // Alpha
  758. "1: \n"
  759. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
  760. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  761. RGB565TOARGB
  762. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  763. "b.gt 1b \n"
  764. : "+r"(src_rgb565), // %0
  765. "+r"(dst_argb), // %1
  766. "+r"(width) // %2
  767. :
  768. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
  769. );
  770. }
  // Asm fragment: unpacks 8 ARGB1555 pixels held in v0.8h into 8-bit channels.
  // On exit the low 8 bytes of v0 hold B, v1 holds G, the low 8 bytes of v2
  // hold R and v3 holds A. The single alpha bit is spread to a whole byte by
  // the arithmetic shift right of 15. All of v0..v3 are overwritten.
  771. #define ARGB1555TOARGB \
  772. "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
  773. "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
  774. "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
  775. \
  776. "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
  777. "xtn2 v3.16b, v2.8h \n" \
  778. \
  779. "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  780. "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
  781. \
  782. "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
  783. "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  784. "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
  785. \
  786. "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
  787. "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
  788. "dup v1.2D, v0.D[1] \n" /* G: upper half of v0 */ \
  789. "dup v3.2D, v2.D[1] \n" /* A: upper half of v2 */
  790. // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
  // Input is 8 pixels in v0.8h. On exit the low 8 bytes of v0 hold B, v1 holds
  // G and the low 8 bytes of v2 hold R; v3 is used as scratch and does not
  // carry an alpha value afterwards. Not referenced in this part of the file.
  791. #define RGB555TOARGB \
  792. "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
  793. "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
  794. "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
  795. \
  796. "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  797. "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
  798. \
  799. "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
  800. "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  801. "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
  802. \
  803. "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
  804. "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
  805. "dup v1.2D, v0.D[1] \n" /* G */
  // Converts 16-bit ARGB1555 pixels to 4-byte pixels, 8 pixels per pass
  // (16 bytes in, 32 bytes out). Alpha comes from the source's top bit via
  // ARGB1555TOARGB. The loop test follows the store.
  806. void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
  807. uint8* dst_argb,
  808. int width) {
  809. asm volatile(
  // NOTE(review): ARGB1555TOARGB rewrites v3 on every pass before the store,
  // so this initial 255 is never the value that gets written out.
  810. "movi v3.8b, #255 \n" // Alpha
  811. "1: \n"
  812. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
  813. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  814. ARGB1555TOARGB
  815. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  816. // pixels
  817. "b.gt 1b \n"
  818. : "+r"(src_argb1555), // %0
  819. "+r"(dst_argb), // %1
  820. "+r"(width) // %2
  821. :
  822. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  823. );
  824. }
  // Asm fragment: unpacks 8 ARGB4444 pixels held in v0.8h into 8-bit channels.
  // Every 4-bit field is duplicated into both nibbles of its output byte.
  // On exit the low 8 bytes of v0 hold B, v1 holds G, and the low 8 bytes of
  // v2 and v3 hold R and A respectively. All of v0..v3 are overwritten.
  825. #define ARGB4444TOARGB \
  826. "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
  827. "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
  828. "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
  829. "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
  830. "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
  831. "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
  832. "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
  833. "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
  834. "dup v0.2D, v2.D[1] \n" /* B: upper half of v2 */ \
  835. "dup v1.2D, v3.D[1] \n" /* G: upper half of v3 */
  // Converts 16-bit ARGB4444 pixels to 4-byte pixels, 8 pixels per pass
  // (16 bytes in, 32 bytes out), using ARGB4444TOARGB for the expansion.
  // The loop test follows the store.
  836. void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
  837. uint8* dst_argb,
  838. int width) {
  839. asm volatile(
  840. "1: \n"
  841. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
  842. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  843. ARGB4444TOARGB
  844. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  845. // pixels
  846. "b.gt 1b \n"
  847. : "+r"(src_argb4444), // %0
  848. "+r"(dst_argb), // %1
  849. "+r"(width) // %2
  850. :
  // NOTE(review): v4 is listed but neither this asm nor ARGB4444TOARGB
  // touches it; the extra clobber is harmless.
  851. : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
  852. );
  853. }
  // Drops the fourth byte of every 4-byte pixel, keeping the first three in
  // order. 8 pixels per pass (32 bytes in, 24 bytes out); the loop test
  // follows the store.
  854. void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
  855. asm volatile(
  856. "1: \n"
  857. "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
  858. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  859. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
  860. // RGB24.
  861. "b.gt 1b \n"
  862. : "+r"(src_argb), // %0
  863. "+r"(dst_rgb24), // %1
  864. "+r"(width) // %2
  865. :
  866. : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
  867. );
  868. }
  // Drops the alpha byte and reverses the remaining three (b g r -> r g b).
  // The loaded alpha in v4 is overwritten with g so that st3 can store the
  // consecutive registers v3,v4,v5. 8 pixels per pass; the loop test follows
  // the store.
  869. void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
  870. asm volatile(
  871. "1: \n"
  872. "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
  873. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  874. "orr v4.8b, v2.8b, v2.8b \n" // mov g
  875. "orr v5.8b, v1.8b, v1.8b \n" // mov b
  876. "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
  877. "b.gt 1b \n"
  878. : "+r"(src_argb), // %0
  879. "+r"(dst_raw), // %1
  880. "+r"(width) // %2
  881. :
  882. : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
  883. );
  884. }
  // Extracts the even-indexed bytes of src_yuy2 (the Y samples) into dst_y.
  // ld2 de-interleaves 32 source bytes into v0 (even bytes) and v1 (odd
  // bytes); only v0 is stored. 16 pixels per pass; the loop test follows the
  // store.
  885. void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
  886. asm volatile(
  887. "1: \n"
  888. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
  889. "subs %w2, %w2, #16 \n" // 16 processed per loop.
  890. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
  891. "b.gt 1b \n"
  892. : "+r"(src_yuy2), // %0
  893. "+r"(dst_y), // %1
  894. "+r"(width) // %2
  895. :
  896. : "cc", "memory", "v0", "v1" // Clobber List
  897. );
  898. }
  // Extracts the odd-indexed bytes of src_uyvy (the Y samples) into dst_y.
  // ld2 de-interleaves 32 source bytes into v0 (even bytes) and v1 (odd
  // bytes); only v1 is stored. 16 pixels per pass; the loop test follows the
  // store.
  899. void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
  900. asm volatile(
  901. "1: \n"
  902. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
  903. "subs %w2, %w2, #16 \n" // 16 processed per loop.
  904. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
  905. "b.gt 1b \n"
  906. : "+r"(src_uyvy), // %0
  907. "+r"(dst_y), // %1
  908. "+r"(width) // %2
  909. :
  910. : "cc", "memory", "v0", "v1" // Clobber List
  911. );
  912. }
  // Extracts U and V from one row of YUY2 without vertical averaging.
  // ld4 splits each 4-byte group into v0..v3; bytes 1 and 3 of the group
  // (v1, v3) are stored as U and V. Each pass consumes 16 pixels (32 bytes)
  // and writes 8 U and 8 V bytes; the loop test follows the stores.
  913. void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
  914. uint8* dst_u,
  915. uint8* dst_v,
  916. int width) {
  917. asm volatile(
  918. "1: \n"
  919. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
  920. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
  921. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
  922. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
  923. "b.gt 1b \n"
  924. : "+r"(src_yuy2), // %0
  925. "+r"(dst_u), // %1
  926. "+r"(dst_v), // %2
  927. "+r"(width) // %3
  928. :
  929. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  930. );
  931. }
  // Extracts U and V from one row of UYVY without vertical averaging.
  // ld4 splits each 4-byte group into v0..v3; bytes 0 and 2 of the group
  // (v0, v2) are stored as U and V. Each pass consumes 16 pixels (32 bytes)
  // and writes 8 U and 8 V bytes; the loop test follows the stores.
  932. void UYVYToUV422Row_NEON(const uint8* src_uyvy,
  933. uint8* dst_u,
  934. uint8* dst_v,
  935. int width) {
  936. asm volatile(
  937. "1: \n"
  938. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
  939. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
  940. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
  941. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
  942. "b.gt 1b \n"
  943. : "+r"(src_uyvy), // %0
  944. "+r"(dst_u), // %1
  945. "+r"(dst_v), // %2
  946. "+r"(width) // %3
  947. :
  948. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  949. );
  950. }
  // Extracts U and V from YUY2, averaging two rows with rounding (urhadd).
  // The second row starts stride_yuy2 bytes after src_yuy2. Each pass
  // consumes 16 pixels (32 bytes) from both rows and writes 8 U and 8 V
  // bytes; the loop test follows the stores.
  951. void YUY2ToUVRow_NEON(const uint8* src_yuy2,
  952. int stride_yuy2,
  953. uint8* dst_u,
  954. uint8* dst_v,
  955. int width) {
  956. const uint8* src_yuy2b = src_yuy2 + stride_yuy2;  // second row
  957. asm volatile(
  958. "1: \n"
  959. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
  960. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
  961. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
  962. "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
  963. "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
  964. "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
  965. "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
  966. "b.gt 1b \n"
  967. : "+r"(src_yuy2), // %0
  968. "+r"(src_yuy2b), // %1
  969. "+r"(dst_u), // %2
  970. "+r"(dst_v), // %3
  971. "+r"(width) // %4
  972. :
  973. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
  974. "v7" // Clobber List
  975. );
  976. }
  // Extracts U and V from UYVY, averaging two rows with rounding (urhadd).
  // The second row starts stride_uyvy bytes after src_uyvy. Each pass
  // consumes 16 pixels (32 bytes) from both rows and writes 8 U and 8 V
  // bytes; the loop test follows the stores.
  977. void UYVYToUVRow_NEON(const uint8* src_uyvy,
  978. int stride_uyvy,
  979. uint8* dst_u,
  980. uint8* dst_v,
  981. int width) {
  982. const uint8* src_uyvyb = src_uyvy + stride_uyvy;  // second row
  983. asm volatile(
  984. "1: \n"
  985. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
  986. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
  987. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
  988. "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
  989. "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
  990. "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
  991. "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
  992. "b.gt 1b \n"
  993. : "+r"(src_uyvy), // %0
  994. "+r"(src_uyvyb), // %1
  995. "+r"(dst_u), // %2
  996. "+r"(dst_v), // %3
  997. "+r"(width) // %4
  998. :
  999. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
  1000. "v7" // Clobber List
  1001. );
  1002. }
  1003. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  // Reorders bytes inside each 16-byte group (4 pixels) with a table lookup:
  // 16 index bytes are read once from shuffler, and output byte i of a group
  // is the input byte of that group selected by index i. 4 pixels per pass;
  // the loop test follows the store.
  1004. void ARGBShuffleRow_NEON(const uint8* src_argb,
  1005. uint8* dst_argb,
  1006. const uint8* shuffler,
  1007. int width) {
  1008. asm volatile(
  1009. "ld1 {v2.16b}, [%3] \n" // shuffler
  1010. "1: \n"
  1011. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
  1012. "subs %w2, %w2, #4 \n" // 4 processed per loop
  1013. "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
  1014. "st1 {v1.16b}, [%1], #16 \n" // store 4.
  1015. "b.gt 1b \n"
  1016. : "+r"(src_argb), // %0
  1017. "+r"(dst_argb), // %1
  1018. "+r"(width) // %2
  1019. : "r"(shuffler) // %3
  1020. : "cc", "memory", "v0", "v1", "v2" // Clobber List
  1021. );
  1022. }
  // Interleaves planar Y, U and V into packed groups of Y0 U Y1 V.
  // ld2 splits 16 Y bytes into even (v0) and odd (v1) samples; the odd ones
  // are moved to v2 so v1 can take U, and st4 writes v0,v1,v2,v3 interleaved.
  // Each pass consumes 16 Y, 8 U and 8 V and writes 32 bytes; the loop test
  // follows the store.
  1023. void I422ToYUY2Row_NEON(const uint8* src_y,
  1024. const uint8* src_u,
  1025. const uint8* src_v,
  1026. uint8* dst_yuy2,
  1027. int width) {
  1028. asm volatile(
  1029. "1: \n"
  1030. "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
  1031. "orr v2.8b, v1.8b, v1.8b \n" // move odd Ys to v2
  1032. "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
  1033. "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
  1034. "subs %w4, %w4, #16 \n" // 16 pixels
  1035. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
  1036. "b.gt 1b \n"
  1037. : "+r"(src_y), // %0
  1038. "+r"(src_u), // %1
  1039. "+r"(src_v), // %2
  1040. "+r"(dst_yuy2), // %3
  1041. "+r"(width) // %4
  1042. :
  1043. : "cc", "memory", "v0", "v1", "v2", "v3");
  1044. }
  // Interleaves planar Y, U and V into packed groups of U Y0 V Y1.
  // ld2 splits 16 Y bytes into even (v1) and odd (v2) samples; the odd ones
  // are moved to v3 so v2 can take V, and st4 writes v0,v1,v2,v3 interleaved.
  // Each pass consumes 16 Y, 8 U and 8 V and writes 32 bytes; the loop test
  // follows the store.
  1045. void I422ToUYVYRow_NEON(const uint8* src_y,
  1046. const uint8* src_u,
  1047. const uint8* src_v,
  1048. uint8* dst_uyvy,
  1049. int width) {
  1050. asm volatile(
  1051. "1: \n"
  1052. "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
  1053. "orr v3.8b, v2.8b, v2.8b \n" // move odd Ys to v3
  1054. "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
  1055. "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
  1056. "subs %w4, %w4, #16 \n" // 16 pixels
  1057. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
  1058. "b.gt 1b \n"
  1059. : "+r"(src_y), // %0
  1060. "+r"(src_u), // %1
  1061. "+r"(src_v), // %2
  1062. "+r"(dst_uyvy), // %3
  1063. "+r"(width) // %4
  1064. :
  1065. : "cc", "memory", "v0", "v1", "v2", "v3");
  1066. }
  // Converts 4-byte ARGB pixels to 16-bit RGB565, 8 pixels per pass
  // (32 bytes in, 16 bytes out). The channels are loaded into v20..v23 and
  // the packed result is stored from v0; the packing itself is done by the
  // ARGBTORGB565 macro, which is defined outside this part of the file.
  // The loop test follows the store.
  1067. void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
  1068. asm volatile(
  1069. "1: \n"
  1070. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1071. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1072. ARGBTORGB565
  1073. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
  1074. "b.gt 1b \n"
  1075. : "+r"(src_argb), // %0
  1076. "+r"(dst_rgb565), // %1
  1077. "+r"(width) // %2
  1078. :
  1079. : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
  1080. }
  // Converts 4-byte ARGB pixels to 16-bit RGB565 after adding a dither value
  // to the first three channels with unsigned saturation. dither4 is broadcast
  // to every 32-bit lane of v1, so its four bytes repeat across each group of
  // four pixels. 8 pixels per pass (32 bytes in, 16 bytes out); the loop test
  // follows the store. Packing is done by the ARGBTORGB565 macro, which is
  // defined outside this part of the file.
  //
  // src_argb and width are modified by the asm (post-indexed load, subs), so
  // they are declared as read-write "+r" operands; as plain inputs the
  // compiler would be entitled to assume their registers are unchanged.
  // Operand order follows the other row kernels: src=%0, dst=%1, width=%2.
  1081. void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
  1082. uint8* dst_rgb,
  1083. const uint32 dither4,
  1084. int width) {
  1085. asm volatile(
  1086. "dup v1.4s, %w3 \n" // dither4
  1087. "1: \n"
  1088. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1089. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1090. "uqadd v20.8b, v20.8b, v1.8b \n" // saturating add of dither
  1091. "uqadd v21.8b, v21.8b, v1.8b \n"
  1092. "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
  1093. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
  1094. "b.gt 1b \n"
  1095. : "+r"(src_argb), // %0
  1096. "+r"(dst_rgb), // %1
  1097. "+r"(width) // %2
  1098. : "r"(dither4) // %3
  1099. : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
  1100. }
  // Converts 4-byte ARGB pixels to 16-bit ARGB1555, 8 pixels per pass
  // (32 bytes in, 16 bytes out). The channels are loaded into v20..v23 and
  // the packed result is stored from v0; the packing itself is done by the
  // ARGBTOARGB1555 macro, which is defined outside this part of the file.
  // The loop test follows the store.
  1101. void ARGBToARGB1555Row_NEON(const uint8* src_argb,
  1102. uint8* dst_argb1555,
  1103. int width) {
  1104. asm volatile(
  1105. "1: \n"
  1106. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1107. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1108. ARGBTOARGB1555
  1109. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
  1110. // ARGB1555.
  1111. "b.gt 1b \n"
  1112. : "+r"(src_argb), // %0
  1113. "+r"(dst_argb1555), // %1
  1114. "+r"(width) // %2
  1115. :
  1116. : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
  1117. }
  // Converts 4-byte ARGB pixels to 16-bit ARGB4444, 8 pixels per pass
  // (32 bytes in, 16 bytes out). v4 is preloaded with a 0x0f byte mask for
  // the ARGBTOARGB4444 macro, which is defined outside this part of the file
  // and is expected to leave the packed result in v0. The loop test follows
  // the store.
  1118. void ARGBToARGB4444Row_NEON(const uint8* src_argb,
  1119. uint8* dst_argb4444,
  1120. int width) {
  1121. asm volatile(
  1122. "movi v4.16b, #0x0f \n" // bits to clear with
  1123. // vbic.
  1124. "1: \n"
  1125. "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
  1126. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1127. ARGBTOARGB4444
  1128. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
  1129. // ARGB4444.
  1130. "b.gt 1b \n"
  1131. : "+r"(src_argb), // %0
  1132. "+r"(dst_argb4444), // %1
  1133. "+r"(width) // %2
  1134. :
  1135. : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
  1136. }
  // Computes one luma byte per ARGB pixel:
  //   Y = sat8(round((13 * B + 65 * G + 33 * R) / 128) + 16)
  // The weighted sum is built in 16-bit lanes, narrowed with a rounding shift
  // by 7, then 16 is added with unsigned saturation. Alpha is loaded but not
  // used (v3 is reused as the accumulator). 8 pixels per pass; the loop test
  // follows the store.
  1137. void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  1138. asm volatile(
  1139. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1140. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1141. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1142. "movi v7.8b, #16 \n" // Add 16 constant
  1143. "1: \n"
  1144. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1145. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1146. "umull v3.8h, v0.8b, v4.8b \n" // B
  1147. "umlal v3.8h, v1.8b, v5.8b \n" // G
  1148. "umlal v3.8h, v2.8b, v6.8b \n" // R
  1149. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1150. "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
  1151. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1152. "b.gt 1b \n"
  1153. : "+r"(src_argb), // %0
  1154. "+r"(dst_y), // %1
  1155. "+r"(width) // %2
  1156. :
  1157. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  1158. }
  // Copies the fourth byte of every 4-byte pixel in src_argb to dst_a.
  // 16 pixels per pass (64 bytes in, 16 bytes out); the loop test follows the
  // store.
  1159. void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
  1160. asm volatile(
  1161. "1: \n"
  1162. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
  1163. // pixels
  1164. "subs %w2, %w2, #16 \n" // 16 processed per loop
  1165. "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
  1166. "b.gt 1b \n"
  1167. : "+r"(src_argb), // %0
  1168. "+r"(dst_a), // %1
  1169. "+r"(width) // %2
  1170. :
  1171. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  1172. );
  1173. }
  // Computes one luma byte per ARGB pixel with no offset added:
  //   Y = sat8(round((15 * B + 75 * G + 38 * R) / 128))
  // Alpha is loaded but not used (v3 is reused as the accumulator).
  // 8 pixels per pass; the loop test follows the store.
  1174. void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  1175. asm volatile(
  1176. "movi v4.8b, #15 \n" // B * 0.11400 coefficient
  1177. "movi v5.8b, #75 \n" // G * 0.58700 coefficient
  1178. "movi v6.8b, #38 \n" // R * 0.29900 coefficient
  1179. "1: \n"
  1180. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1181. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1182. "umull v3.8h, v0.8b, v4.8b \n" // B
  1183. "umlal v3.8h, v1.8b, v5.8b \n" // G
  1184. "umlal v3.8h, v2.8b, v6.8b \n" // R
  1185. "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
  1186. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1187. "b.gt 1b \n"
  1188. : "+r"(src_argb), // %0
  1189. "+r"(dst_y), // %1
  1190. "+r"(width) // %2
  1191. :
  1192. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  1193. }
  1194. // 8x1 pixels.
  // Computes one U and one V byte per ARGB pixel (no subsampling):
  //   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
  //   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
  // evaluated in 16-bit lanes and narrowed with unsigned saturation.
  // 8 pixels per pass; the loop test follows the stores.
  1195. void ARGBToUV444Row_NEON(const uint8* src_argb,
  1196. uint8* dst_u,
  1197. uint8* dst_v,
  1198. int width) {
  1199. asm volatile(
  1200. "movi v24.8b, #112 \n" // UB / VR 0.875
  1201. // coefficient
  1202. "movi v25.8b, #74 \n" // UG -0.5781 coefficient
  1203. "movi v26.8b, #38 \n" // UR -0.2969 coefficient
  1204. "movi v27.8b, #18 \n" // VB -0.1406 coefficient
  1205. "movi v28.8b, #94 \n" // VG -0.7344 coefficient
  1206. "movi v29.16b,#0x80 \n" // 128.5
  1207. "1: \n"
  1208. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1209. // pixels.
  1210. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  1211. "umull v4.8h, v0.8b, v24.8b \n" // B
  1212. "umlsl v4.8h, v1.8b, v25.8b \n" // G
  1213. "umlsl v4.8h, v2.8b, v26.8b \n" // R
  1214. "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
  1215. "umull v3.8h, v2.8b, v24.8b \n" // R
  1216. "umlsl v3.8h, v1.8b, v28.8b \n" // G
  1217. "umlsl v3.8h, v0.8b, v27.8b \n" // B
  1218. "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
  1219. "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
  1220. "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
  1221. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
  1222. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
  1223. "b.gt 1b \n"
  1224. : "+r"(src_argb), // %0
  1225. "+r"(dst_u), // %1
  1226. "+r"(dst_v), // %2
  1227. "+r"(width) // %3
  1228. :
  1229. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
  1230. "v27", "v28", "v29");
  1231. }
  // Asm fragment: loads the constants RGBTOUV expects into v20..v25.
  // v20..v24 get the 16-bit coefficients 56, 37, 19, 9 and 47 (half of the
  // 8-bit values, because the inputs are 2x the averaged channel); v25 gets
  // 0x80 in every byte, i.e. 0x8080 per 16-bit lane.
  1232. #define RGBTOUV_SETUP_REG \
  1233. "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
  1234. "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
  1235. "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
  1236. "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
  1237. "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
  1238. "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
  1239. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  // Asm fragment taking three .8h registers (QB, QG, QR) and the constants in
  // v20..v25. It leaves 8 U bytes in v0 and 8 V bytes in v1:
  //   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8
  //   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8
  // narrowed with unsigned saturation. v3 and v4 are scratch. All reads of
  // QB/QG/QR happen before v0/v1 are written, so the inputs may be v0..v2.
  1240. // clang-format off
  1241. #define RGBTOUV(QB, QG, QR) \
  1242. "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
  1243. "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
  1244. "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
  1245. "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
  1246. "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
  1247. "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
  1248. "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
  1249. "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
  1250. "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
  1251. "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
  1252. // clang-format on
  1253. // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
  1254. // TODO(fbarchard): consider ptrdiff_t for all strides.
  // Produces 2x2-subsampled U and V from two rows of ARGB. Each pass reads
  // 16 pixels from src_argb and from the row src_stride_argb bytes later,
  // sums every 2x2 block per channel (uaddlp + uadalp), halves the sum with
  // rounding, and feeds the result to RGBTOUV. 8 U and 8 V bytes are written
  // per pass; the loop test follows the stores.
  1255. void ARGBToUVRow_NEON(const uint8* src_argb,
  1256. int src_stride_argb,
  1257. uint8* dst_u,
  1258. uint8* dst_v,
  1259. int width) {
  1260. const uint8* src_argb_1 = src_argb + src_stride_argb;  // second row
  1261. asm volatile (
  1262. RGBTOUV_SETUP_REG
  1263. "1: \n"
  1264. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1265. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  1266. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1267. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  1268. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
  1269. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
  1270. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1271. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
  1272. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1273. "urshr v1.8h, v1.8h, #1 \n"
  1274. "urshr v2.8h, v2.8h, #1 \n"
  1275. "subs %w4, %w4, #16 \n" // 16 wide x 2 rows = 32 processed per loop.
  1276. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1277. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1278. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1279. "b.gt 1b \n"
  1280. : "+r"(src_argb), // %0
  1281. "+r"(src_argb_1), // %1
  1282. "+r"(dst_u), // %2
  1283. "+r"(dst_v), // %3
  1284. "+r"(width) // %4
  1285. :
  1286. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1287. "v20", "v21", "v22", "v23", "v24", "v25"
  1288. );
  1289. }
  1290. // TODO(fbarchard): Subsample match C code.
  // Same structure as ARGBToUVRow_NEON (two rows, 2x2 sums, RGBTOUV) but the
  // coefficients are loaded inline as 63, 42, 21, 10 and 53 instead of using
  // RGBTOUV_SETUP_REG. 16 pixels of each row are consumed and 8 U plus 8 V
  // bytes are written per pass; the loop test follows the stores.
  1291. void ARGBToUVJRow_NEON(const uint8* src_argb,
  1292. int src_stride_argb,
  1293. uint8* dst_u,
  1294. uint8* dst_v,
  1295. int width) {
  1296. const uint8* src_argb_1 = src_argb + src_stride_argb;  // second row
  1297. asm volatile (
  1298. "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
  1299. "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
  1300. "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
  1301. "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
  1302. "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
  1303. "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
  1304. "1: \n"
  1305. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1306. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  1307. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1308. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  1309. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
  1310. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
  1311. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1312. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
  1313. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1314. "urshr v1.8h, v1.8h, #1 \n"
  1315. "urshr v2.8h, v2.8h, #1 \n"
  1316. "subs %w4, %w4, #16 \n" // 16 wide x 2 rows = 32 processed per loop.
  1317. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1318. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1319. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1320. "b.gt 1b \n"
  1321. : "+r"(src_argb), // %0
  1322. "+r"(src_argb_1), // %1
  1323. "+r"(dst_u), // %2
  1324. "+r"(dst_v), // %3
  1325. "+r"(width) // %4
  1326. :
  1327. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1328. "v20", "v21", "v22", "v23", "v24", "v25"
  1329. );
  1330. }
  // Produces 2x2-subsampled U and V from two rows of BGRA. The code treats
  // bytes 3, 2 and 1 of each pixel as B, G and R and ignores byte 0. The
  // channel sums are shuffled into v0 (B), v1 (G), v2 (R) on the way to
  // RGBTOUV. 16 pixels of each row are consumed and 8 U plus 8 V bytes are
  // written per pass; the loop test follows the stores.
  1331. void BGRAToUVRow_NEON(const uint8* src_bgra,
  1332. int src_stride_bgra,
  1333. uint8* dst_u,
  1334. uint8* dst_v,
  1335. int width) {
  1336. const uint8* src_bgra_1 = src_bgra + src_stride_bgra;  // second row
  1337. asm volatile (
  1338. RGBTOUV_SETUP_REG
  1339. "1: \n"
  1340. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1341. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
  1342. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
  1343. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
  1344. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
  1345. "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
  1346. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
  1347. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
  1348. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1349. "urshr v1.8h, v3.8h, #1 \n" // G moves from v3 to v1
  1350. "urshr v2.8h, v2.8h, #1 \n"
  1351. "subs %w4, %w4, #16 \n" // 16 wide x 2 rows = 32 processed per loop.
  1352. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1353. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1354. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1355. "b.gt 1b \n"
  1356. : "+r"(src_bgra), // %0
  1357. "+r"(src_bgra_1), // %1
  1358. "+r"(dst_u), // %2
  1359. "+r"(dst_v), // %3
  1360. "+r"(width) // %4
  1361. :
  1362. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1363. "v20", "v21", "v22", "v23", "v24", "v25"
  1364. );
  1365. }
  // Produces 2x2-subsampled U and V from two rows of ABGR. The code treats
  // bytes 2, 1 and 0 of each pixel as B, G and R and ignores byte 3. After
  // averaging, B is in v0, G in v2 and R in v1, which is the order passed to
  // RGBTOUV. 16 pixels of each row are consumed and 8 U plus 8 V bytes are
  // written per pass; the loop test follows the stores.
  1366. void ABGRToUVRow_NEON(const uint8* src_abgr,
  1367. int src_stride_abgr,
  1368. uint8* dst_u,
  1369. uint8* dst_v,
  1370. int width) {
  1371. const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second row
  1372. asm volatile (
  1373. RGBTOUV_SETUP_REG
  1374. "1: \n"
  1375. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1376. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
  1377. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1378. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
  1379. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
  1380. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
  1381. "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1382. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
  1383. "urshr v0.8h, v3.8h, #1 \n" // 2x average
  1384. "urshr v2.8h, v2.8h, #1 \n"
  1385. "urshr v1.8h, v1.8h, #1 \n"
  1386. "subs %w4, %w4, #16 \n" // 16 wide x 2 rows = 32 processed per loop.
  1387. RGBTOUV(v0.8h, v2.8h, v1.8h)
  1388. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1389. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1390. "b.gt 1b \n"
  1391. : "+r"(src_abgr), // %0
  1392. "+r"(src_abgr_1), // %1
  1393. "+r"(dst_u), // %2
  1394. "+r"(dst_v), // %3
  1395. "+r"(width) // %4
  1396. :
  1397. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1398. "v20", "v21", "v22", "v23", "v24", "v25"
  1399. );
  1400. }
  // Produces 2x2-subsampled U and V from two rows of RGBA. The code treats
  // bytes 1, 2 and 3 of each pixel as B, G and R and ignores byte 0.
  // 16 pixels of each row are consumed and 8 U plus 8 V bytes are written per
  // pass; the loop test follows the stores.
  1401. void RGBAToUVRow_NEON(const uint8* src_rgba,
  1402. int src_stride_rgba,
  1403. uint8* dst_u,
  1404. uint8* dst_v,
  1405. int width) {
  1406. const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second row
  1407. asm volatile (
  1408. RGBTOUV_SETUP_REG
  1409. "1: \n"
  1410. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
  1411. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
  1412. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
  1413. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
  1414. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
  1415. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
  1416. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
  1417. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
  1418. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1419. "urshr v1.8h, v1.8h, #1 \n"
  1420. "urshr v2.8h, v2.8h, #1 \n"
  1421. "subs %w4, %w4, #16 \n" // 16 wide x 2 rows = 32 processed per loop.
  1422. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1423. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1424. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1425. "b.gt 1b \n"
  1426. : "+r"(src_rgba), // %0
  1427. "+r"(src_rgba_1), // %1
  1428. "+r"(dst_u), // %2
  1429. "+r"(dst_v), // %3
  1430. "+r"(width) // %4
  1431. :
  1432. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1433. "v20", "v21", "v22", "v23", "v24", "v25"
  1434. );
  1435. }
  // Produces 2x2-subsampled U and V from two rows of 3-byte RGB24 pixels.
  // The code treats bytes 0, 1 and 2 of each pixel as B, G and R. 16 pixels
  // (48 bytes) of each row are consumed and 8 U plus 8 V bytes are written
  // per pass; the loop test follows the stores.
  1436. void RGB24ToUVRow_NEON(const uint8* src_rgb24,
  1437. int src_stride_rgb24,
  1438. uint8* dst_u,
  1439. uint8* dst_v,
  1440. int width) {
  1441. const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second row
  1442. asm volatile (
  1443. RGBTOUV_SETUP_REG
  1444. "1: \n"
  1445. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
  1446. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  1447. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1448. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  1449. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
  1450. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
  1451. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1452. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
  1453. "urshr v0.8h, v0.8h, #1 \n" // 2x average
  1454. "urshr v1.8h, v1.8h, #1 \n"
  1455. "urshr v2.8h, v2.8h, #1 \n"
  1456. "subs %w4, %w4, #16 \n" // 16 wide x 2 rows = 32 processed per loop.
  1457. RGBTOUV(v0.8h, v1.8h, v2.8h)
  1458. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1459. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1460. "b.gt 1b \n"
  1461. : "+r"(src_rgb24), // %0
  1462. "+r"(src_rgb24_1), // %1
  1463. "+r"(dst_u), // %2
  1464. "+r"(dst_v), // %3
  1465. "+r"(width) // %4
  1466. :
  1467. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1468. "v20", "v21", "v22", "v23", "v24", "v25"
  1469. );
  1470. }
  1471. void RAWToUVRow_NEON(const uint8* src_raw,
  1472. int src_stride_raw,
  1473. uint8* dst_u,
  1474. uint8* dst_v,
  1475. int width) {
  1476. const uint8* src_raw_1 = src_raw + src_stride_raw;
  1477. asm volatile (
  1478. RGBTOUV_SETUP_REG
  1479. "1: \n"
  1480. "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
  1481. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
  1482. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  1483. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
  1484. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
  1485. "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
  1486. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
  1487. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
  1488. "urshr v2.8h, v2.8h, #1 \n" // 2x average
  1489. "urshr v1.8h, v1.8h, #1 \n"
  1490. "urshr v0.8h, v0.8h, #1 \n"
  1491. "subs %w4, %w4, #16 \n" // 32 processed per loop.
  1492. RGBTOUV(v2.8h, v1.8h, v0.8h)
  1493. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1494. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1495. "b.gt 1b \n"
  1496. : "+r"(src_raw), // %0
  1497. "+r"(src_raw_1), // %1
  1498. "+r"(dst_u), // %2
  1499. "+r"(dst_v), // %3
  1500. "+r"(width) // %4
  1501. :
  1502. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  1503. "v20", "v21", "v22", "v23", "v24", "v25"
  1504. );
  1505. }
  1506. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  1507. void RGB565ToUVRow_NEON(const uint8* src_rgb565,
  1508. int src_stride_rgb565,
  1509. uint8* dst_u,
  1510. uint8* dst_v,
  1511. int width) {
  1512. const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  1513. asm volatile(
  1514. "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
  1515. // 2
  1516. "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
  1517. "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
  1518. "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
  1519. "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
  1520. "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
  1521. "1: \n"
  1522. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
  1523. RGB565TOARGB
  1524. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1525. "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1526. "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1527. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
  1528. RGB565TOARGB
  1529. "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1530. "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1531. "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1532. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
  1533. RGB565TOARGB
  1534. "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1535. "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1536. "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1537. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
  1538. RGB565TOARGB
  1539. "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1540. "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1541. "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1542. "ins v16.D[1], v17.D[0] \n"
  1543. "ins v18.D[1], v19.D[0] \n"
  1544. "ins v20.D[1], v21.D[0] \n"
  1545. "urshr v4.8h, v16.8h, #1 \n" // 2x average
  1546. "urshr v5.8h, v18.8h, #1 \n"
  1547. "urshr v6.8h, v20.8h, #1 \n"
  1548. "subs %w4, %w4, #16 \n" // 16 processed per loop.
  1549. "mul v16.8h, v4.8h, v22.8h \n" // B
  1550. "mls v16.8h, v5.8h, v23.8h \n" // G
  1551. "mls v16.8h, v6.8h, v24.8h \n" // R
  1552. "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
  1553. "mul v17.8h, v6.8h, v22.8h \n" // R
  1554. "mls v17.8h, v5.8h, v26.8h \n" // G
  1555. "mls v17.8h, v4.8h, v25.8h \n" // B
  1556. "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
  1557. "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
  1558. "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
  1559. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1560. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1561. "b.gt 1b \n"
  1562. : "+r"(src_rgb565), // %0
  1563. "+r"(src_rgb565_1), // %1
  1564. "+r"(dst_u), // %2
  1565. "+r"(dst_v), // %3
  1566. "+r"(width) // %4
  1567. :
  1568. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
  1569. "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
  1570. "v27");
  1571. }
  1572. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  1573. void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
  1574. int src_stride_argb1555,
  1575. uint8* dst_u,
  1576. uint8* dst_v,
  1577. int width) {
  1578. const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  1579. asm volatile(
  1580. RGBTOUV_SETUP_REG
  1581. "1: \n"
  1582. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
  1583. RGB555TOARGB
  1584. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1585. "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1586. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1587. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
  1588. RGB555TOARGB
  1589. "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1590. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1591. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1592. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
  1593. RGB555TOARGB
  1594. "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1595. "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1596. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1597. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
  1598. RGB555TOARGB
  1599. "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1600. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1601. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1602. "ins v16.D[1], v26.D[0] \n"
  1603. "ins v17.D[1], v27.D[0] \n"
  1604. "ins v18.D[1], v28.D[0] \n"
  1605. "urshr v4.8h, v16.8h, #1 \n" // 2x average
  1606. "urshr v5.8h, v17.8h, #1 \n"
  1607. "urshr v6.8h, v18.8h, #1 \n"
  1608. "subs %w4, %w4, #16 \n" // 16 processed per loop.
  1609. "mul v2.8h, v4.8h, v20.8h \n" // B
  1610. "mls v2.8h, v5.8h, v21.8h \n" // G
  1611. "mls v2.8h, v6.8h, v22.8h \n" // R
  1612. "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
  1613. "mul v3.8h, v6.8h, v20.8h \n" // R
  1614. "mls v3.8h, v5.8h, v24.8h \n" // G
  1615. "mls v3.8h, v4.8h, v23.8h \n" // B
  1616. "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
  1617. "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
  1618. "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
  1619. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1620. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1621. "b.gt 1b \n"
  1622. : "+r"(src_argb1555), // %0
  1623. "+r"(src_argb1555_1), // %1
  1624. "+r"(dst_u), // %2
  1625. "+r"(dst_v), // %3
  1626. "+r"(width) // %4
  1627. :
  1628. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
  1629. "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
  1630. "v28");
  1631. }
  1632. // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
  1633. void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
  1634. int src_stride_argb4444,
  1635. uint8* dst_u,
  1636. uint8* dst_v,
  1637. int width) {
  1638. const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  1639. asm volatile(
  1640. RGBTOUV_SETUP_REG
  1641. "1: \n"
  1642. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
  1643. ARGB4444TOARGB
  1644. "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1645. "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1646. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1647. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
  1648. ARGB4444TOARGB
  1649. "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1650. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1651. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1652. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
  1653. ARGB4444TOARGB
  1654. "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1655. "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1656. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1657. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
  1658. ARGB4444TOARGB
  1659. "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
  1660. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
  1661. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
  1662. "ins v16.D[1], v26.D[0] \n"
  1663. "ins v17.D[1], v27.D[0] \n"
  1664. "ins v18.D[1], v28.D[0] \n"
  1665. "urshr v4.8h, v16.8h, #1 \n" // 2x average
  1666. "urshr v5.8h, v17.8h, #1 \n"
  1667. "urshr v6.8h, v18.8h, #1 \n"
  1668. "subs %w4, %w4, #16 \n" // 16 processed per loop.
  1669. "mul v2.8h, v4.8h, v20.8h \n" // B
  1670. "mls v2.8h, v5.8h, v21.8h \n" // G
  1671. "mls v2.8h, v6.8h, v22.8h \n" // R
  1672. "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
  1673. "mul v3.8h, v6.8h, v20.8h \n" // R
  1674. "mls v3.8h, v5.8h, v24.8h \n" // G
  1675. "mls v3.8h, v4.8h, v23.8h \n" // B
  1676. "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
  1677. "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
  1678. "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
  1679. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
  1680. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
  1681. "b.gt 1b \n"
  1682. : "+r"(src_argb4444), // %0
  1683. "+r"(src_argb4444_1), // %1
  1684. "+r"(dst_u), // %2
  1685. "+r"(dst_v), // %3
  1686. "+r"(width) // %4
  1687. :
  1688. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
  1689. "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
  1690. "v28"
  1691. );
  1692. }
  1693. void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  1694. asm volatile(
  1695. "movi v24.8b, #13 \n" // B * 0.1016 coefficient
  1696. "movi v25.8b, #65 \n" // G * 0.5078 coefficient
  1697. "movi v26.8b, #33 \n" // R * 0.2578 coefficient
  1698. "movi v27.8b, #16 \n" // Add 16 constant
  1699. "1: \n"
  1700. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
  1701. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1702. RGB565TOARGB
  1703. "umull v3.8h, v0.8b, v24.8b \n" // B
  1704. "umlal v3.8h, v1.8b, v25.8b \n" // G
  1705. "umlal v3.8h, v2.8b, v26.8b \n" // R
  1706. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1707. "uqadd v0.8b, v0.8b, v27.8b \n"
  1708. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1709. "b.gt 1b \n"
  1710. : "+r"(src_rgb565), // %0
  1711. "+r"(dst_y), // %1
  1712. "+r"(width) // %2
  1713. :
  1714. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
  1715. "v27");
  1716. }
  1717. void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  1718. asm volatile(
  1719. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1720. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1721. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1722. "movi v7.8b, #16 \n" // Add 16 constant
  1723. "1: \n"
  1724. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
  1725. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1726. ARGB1555TOARGB
  1727. "umull v3.8h, v0.8b, v4.8b \n" // B
  1728. "umlal v3.8h, v1.8b, v5.8b \n" // G
  1729. "umlal v3.8h, v2.8b, v6.8b \n" // R
  1730. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1731. "uqadd v0.8b, v0.8b, v7.8b \n"
  1732. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1733. "b.gt 1b \n"
  1734. : "+r"(src_argb1555), // %0
  1735. "+r"(dst_y), // %1
  1736. "+r"(width) // %2
  1737. :
  1738. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  1739. }
  1740. void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  1741. asm volatile(
  1742. "movi v24.8b, #13 \n" // B * 0.1016 coefficient
  1743. "movi v25.8b, #65 \n" // G * 0.5078 coefficient
  1744. "movi v26.8b, #33 \n" // R * 0.2578 coefficient
  1745. "movi v27.8b, #16 \n" // Add 16 constant
  1746. "1: \n"
  1747. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
  1748. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1749. ARGB4444TOARGB
  1750. "umull v3.8h, v0.8b, v24.8b \n" // B
  1751. "umlal v3.8h, v1.8b, v25.8b \n" // G
  1752. "umlal v3.8h, v2.8b, v26.8b \n" // R
  1753. "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
  1754. "uqadd v0.8b, v0.8b, v27.8b \n"
  1755. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1756. "b.gt 1b \n"
  1757. : "+r"(src_argb4444), // %0
  1758. "+r"(dst_y), // %1
  1759. "+r"(width) // %2
  1760. :
  1761. : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
  1762. }
  1763. void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  1764. asm volatile(
  1765. "movi v4.8b, #33 \n" // R * 0.2578 coefficient
  1766. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1767. "movi v6.8b, #13 \n" // B * 0.1016 coefficient
  1768. "movi v7.8b, #16 \n" // Add 16 constant
  1769. "1: \n"
  1770. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
  1771. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1772. "umull v16.8h, v1.8b, v4.8b \n" // R
  1773. "umlal v16.8h, v2.8b, v5.8b \n" // G
  1774. "umlal v16.8h, v3.8b, v6.8b \n" // B
  1775. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1776. "uqadd v0.8b, v0.8b, v7.8b \n"
  1777. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1778. "b.gt 1b \n"
  1779. : "+r"(src_bgra), // %0
  1780. "+r"(dst_y), // %1
  1781. "+r"(width) // %2
  1782. :
  1783. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1784. }
  1785. void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  1786. asm volatile(
  1787. "movi v4.8b, #33 \n" // R * 0.2578 coefficient
  1788. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1789. "movi v6.8b, #13 \n" // B * 0.1016 coefficient
  1790. "movi v7.8b, #16 \n" // Add 16 constant
  1791. "1: \n"
  1792. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
  1793. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1794. "umull v16.8h, v0.8b, v4.8b \n" // R
  1795. "umlal v16.8h, v1.8b, v5.8b \n" // G
  1796. "umlal v16.8h, v2.8b, v6.8b \n" // B
  1797. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1798. "uqadd v0.8b, v0.8b, v7.8b \n"
  1799. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1800. "b.gt 1b \n"
  1801. : "+r"(src_abgr), // %0
  1802. "+r"(dst_y), // %1
  1803. "+r"(width) // %2
  1804. :
  1805. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1806. }
  1807. void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  1808. asm volatile(
  1809. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1810. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1811. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1812. "movi v7.8b, #16 \n" // Add 16 constant
  1813. "1: \n"
  1814. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
  1815. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1816. "umull v16.8h, v1.8b, v4.8b \n" // B
  1817. "umlal v16.8h, v2.8b, v5.8b \n" // G
  1818. "umlal v16.8h, v3.8b, v6.8b \n" // R
  1819. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1820. "uqadd v0.8b, v0.8b, v7.8b \n"
  1821. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1822. "b.gt 1b \n"
  1823. : "+r"(src_rgba), // %0
  1824. "+r"(dst_y), // %1
  1825. "+r"(width) // %2
  1826. :
  1827. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1828. }
  1829. void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  1830. asm volatile(
  1831. "movi v4.8b, #13 \n" // B * 0.1016 coefficient
  1832. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1833. "movi v6.8b, #33 \n" // R * 0.2578 coefficient
  1834. "movi v7.8b, #16 \n" // Add 16 constant
  1835. "1: \n"
  1836. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
  1837. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1838. "umull v16.8h, v0.8b, v4.8b \n" // B
  1839. "umlal v16.8h, v1.8b, v5.8b \n" // G
  1840. "umlal v16.8h, v2.8b, v6.8b \n" // R
  1841. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1842. "uqadd v0.8b, v0.8b, v7.8b \n"
  1843. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1844. "b.gt 1b \n"
  1845. : "+r"(src_rgb24), // %0
  1846. "+r"(dst_y), // %1
  1847. "+r"(width) // %2
  1848. :
  1849. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1850. }
  1851. void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  1852. asm volatile(
  1853. "movi v4.8b, #33 \n" // R * 0.2578 coefficient
  1854. "movi v5.8b, #65 \n" // G * 0.5078 coefficient
  1855. "movi v6.8b, #13 \n" // B * 0.1016 coefficient
  1856. "movi v7.8b, #16 \n" // Add 16 constant
  1857. "1: \n"
  1858. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
  1859. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1860. "umull v16.8h, v0.8b, v4.8b \n" // B
  1861. "umlal v16.8h, v1.8b, v5.8b \n" // G
  1862. "umlal v16.8h, v2.8b, v6.8b \n" // R
  1863. "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
  1864. "uqadd v0.8b, v0.8b, v7.8b \n"
  1865. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
  1866. "b.gt 1b \n"
  1867. : "+r"(src_raw), // %0
  1868. "+r"(dst_y), // %1
  1869. "+r"(width) // %2
  1870. :
  1871. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
  1872. }
  1873. // Bilinear filter 16x2 -> 16x1
  1874. void InterpolateRow_NEON(uint8* dst_ptr,
  1875. const uint8* src_ptr,
  1876. ptrdiff_t src_stride,
  1877. int dst_width,
  1878. int source_y_fraction) {
  1879. int y1_fraction = source_y_fraction;
  1880. int y0_fraction = 256 - y1_fraction;
  1881. const uint8* src_ptr1 = src_ptr + src_stride;
  1882. asm volatile(
  1883. "cmp %w4, #0 \n"
  1884. "b.eq 100f \n"
  1885. "cmp %w4, #128 \n"
  1886. "b.eq 50f \n"
  1887. "dup v5.16b, %w4 \n"
  1888. "dup v4.16b, %w5 \n"
  1889. // General purpose row blend.
  1890. "1: \n"
  1891. "ld1 {v0.16b}, [%1], #16 \n"
  1892. "ld1 {v1.16b}, [%2], #16 \n"
  1893. "subs %w3, %w3, #16 \n"
  1894. "umull v2.8h, v0.8b, v4.8b \n"
  1895. "umull2 v3.8h, v0.16b, v4.16b \n"
  1896. "umlal v2.8h, v1.8b, v5.8b \n"
  1897. "umlal2 v3.8h, v1.16b, v5.16b \n"
  1898. "rshrn v0.8b, v2.8h, #8 \n"
  1899. "rshrn2 v0.16b, v3.8h, #8 \n"
  1900. "st1 {v0.16b}, [%0], #16 \n"
  1901. "b.gt 1b \n"
  1902. "b 99f \n"
  1903. // Blend 50 / 50.
  1904. "50: \n"
  1905. "ld1 {v0.16b}, [%1], #16 \n"
  1906. "ld1 {v1.16b}, [%2], #16 \n"
  1907. "subs %w3, %w3, #16 \n"
  1908. "urhadd v0.16b, v0.16b, v1.16b \n"
  1909. "st1 {v0.16b}, [%0], #16 \n"
  1910. "b.gt 50b \n"
  1911. "b 99f \n"
  1912. // Blend 100 / 0 - Copy row unchanged.
  1913. "100: \n"
  1914. "ld1 {v0.16b}, [%1], #16 \n"
  1915. "subs %w3, %w3, #16 \n"
  1916. "st1 {v0.16b}, [%0], #16 \n"
  1917. "b.gt 100b \n"
  1918. "99: \n"
  1919. : "+r"(dst_ptr), // %0
  1920. "+r"(src_ptr), // %1
  1921. "+r"(src_ptr1), // %2
  1922. "+r"(dst_width), // %3
  1923. "+r"(y1_fraction), // %4
  1924. "+r"(y0_fraction) // %5
  1925. :
  1926. : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
  1927. }
  1928. // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
  1929. void ARGBBlendRow_NEON(const uint8* src_argb0,
  1930. const uint8* src_argb1,
  1931. uint8* dst_argb,
  1932. int width) {
  1933. asm volatile(
  1934. "subs %w3, %w3, #8 \n"
  1935. "b.lt 89f \n"
  1936. // Blend 8 pixels.
  1937. "8: \n"
  1938. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
  1939. // pixels
  1940. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
  1941. // pixels
  1942. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  1943. "umull v16.8h, v4.8b, v3.8b \n" // db * a
  1944. "umull v17.8h, v5.8b, v3.8b \n" // dg * a
  1945. "umull v18.8h, v6.8b, v3.8b \n" // dr * a
  1946. "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
  1947. "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
  1948. "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
  1949. "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
  1950. "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
  1951. "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
  1952. "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
  1953. "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
  1954. "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
  1955. "movi v3.8b, #255 \n" // a = 255
  1956. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  1957. // pixels
  1958. "b.ge 8b \n"
  1959. "89: \n"
  1960. "adds %w3, %w3, #8-1 \n"
  1961. "b.lt 99f \n"
  1962. // Blend 1 pixels.
  1963. "1: \n"
  1964. "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
  1965. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
  1966. "subs %w3, %w3, #1 \n" // 1 processed per loop.
  1967. "umull v16.8h, v4.8b, v3.8b \n" // db * a
  1968. "umull v17.8h, v5.8b, v3.8b \n" // dg * a
  1969. "umull v18.8h, v6.8b, v3.8b \n" // dr * a
  1970. "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
  1971. "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
  1972. "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
  1973. "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
  1974. "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
  1975. "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
  1976. "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
  1977. "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
  1978. "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
  1979. "movi v3.8b, #255 \n" // a = 255
  1980. "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
  1981. "b.ge 1b \n"
  1982. "99: \n"
  1983. : "+r"(src_argb0), // %0
  1984. "+r"(src_argb1), // %1
  1985. "+r"(dst_argb), // %2
  1986. "+r"(width) // %3
  1987. :
  1988. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
  1989. "v17", "v18");
  1990. }
  1991. // Attenuate 8 pixels at a time.
  1992. void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  1993. asm volatile(
  1994. // Attenuate 8 pixels.
  1995. "1: \n"
  1996. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  1997. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  1998. "umull v4.8h, v0.8b, v3.8b \n" // b * a
  1999. "umull v5.8h, v1.8b, v3.8b \n" // g * a
  2000. "umull v6.8h, v2.8b, v3.8b \n" // r * a
  2001. "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
  2002. "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
  2003. "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
  2004. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
  2005. // pixels
  2006. "b.gt 1b \n"
  2007. : "+r"(src_argb), // %0
  2008. "+r"(dst_argb), // %1
  2009. "+r"(width) // %2
  2010. :
  2011. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  2012. }
  2013. // Quantize 8 ARGB pixels (32 bytes).
  2014. // dst = (dst * scale >> 16) * interval_size + interval_offset;
  2015. void ARGBQuantizeRow_NEON(uint8* dst_argb,
  2016. int scale,
  2017. int interval_size,
  2018. int interval_offset,
  2019. int width) {
  2020. asm volatile(
  2021. "dup v4.8h, %w2 \n"
  2022. "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
  2023. "dup v5.8h, %w3 \n" // interval multiply.
  2024. "dup v6.8h, %w4 \n" // interval add
  2025. // 8 pixel loop.
  2026. "1: \n"
  2027. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
  2028. "subs %w1, %w1, #8 \n" // 8 processed per loop.
  2029. "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
  2030. "uxtl v1.8h, v1.8b \n"
  2031. "uxtl v2.8h, v2.8b \n"
  2032. "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
  2033. "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
  2034. "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
  2035. "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
  2036. "mul v1.8h, v1.8h, v5.8h \n" // g
  2037. "mul v2.8h, v2.8h, v5.8h \n" // r
  2038. "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
  2039. "add v1.8h, v1.8h, v6.8h \n" // g
  2040. "add v2.8h, v2.8h, v6.8h \n" // r
  2041. "uqxtn v0.8b, v0.8h \n"
  2042. "uqxtn v1.8b, v1.8h \n"
  2043. "uqxtn v2.8b, v2.8h \n"
  2044. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
  2045. "b.gt 1b \n"
  2046. : "+r"(dst_argb), // %0
  2047. "+r"(width) // %1
  2048. : "r"(scale), // %2
  2049. "r"(interval_size), // %3
  2050. "r"(interval_offset) // %4
  2051. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  2052. }
  2053. // Shade 8 pixels at a time by specified value.
  2054. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
  2055. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
  2056. void ARGBShadeRow_NEON(const uint8* src_argb,
  2057. uint8* dst_argb,
  2058. int width,
  2059. uint32 value) {
  2060. asm volatile(
  2061. "dup v0.4s, %w3 \n" // duplicate scale value.
  2062. "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
  2063. "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
  2064. // 8 pixel loop.
  2065. "1: \n"
  2066. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
  2067. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2068. "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
  2069. "uxtl v5.8h, v5.8b \n"
  2070. "uxtl v6.8h, v6.8b \n"
  2071. "uxtl v7.8h, v7.8b \n"
  2072. "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
  2073. "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
  2074. "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
  2075. "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
  2076. "uqxtn v4.8b, v4.8h \n"
  2077. "uqxtn v5.8b, v5.8h \n"
  2078. "uqxtn v6.8b, v6.8h \n"
  2079. "uqxtn v7.8b, v7.8h \n"
  2080. "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
  2081. "b.gt 1b \n"
  2082. : "+r"(src_argb), // %0
  2083. "+r"(dst_argb), // %1
  2084. "+r"(width) // %2
  2085. : "r"(value) // %3
  2086. : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
  2087. }
  2088. // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
  2089. // Similar to ARGBToYJ but stores ARGB.
  2090. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
  2091. void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  2092. asm volatile(
  2093. "movi v24.8b, #15 \n" // B * 0.11400 coefficient
  2094. "movi v25.8b, #75 \n" // G * 0.58700 coefficient
  2095. "movi v26.8b, #38 \n" // R * 0.29900 coefficient
  2096. "1: \n"
  2097. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2098. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2099. "umull v4.8h, v0.8b, v24.8b \n" // B
  2100. "umlal v4.8h, v1.8b, v25.8b \n" // G
  2101. "umlal v4.8h, v2.8b, v26.8b \n" // R
  2102. "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
  2103. "orr v1.8b, v0.8b, v0.8b \n" // G
  2104. "orr v2.8b, v0.8b, v0.8b \n" // R
  2105. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
  2106. "b.gt 1b \n"
  2107. : "+r"(src_argb), // %0
  2108. "+r"(dst_argb), // %1
  2109. "+r"(width) // %2
  2110. :
  2111. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
  2112. }
  2113. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  2114. // b = (r * 35 + g * 68 + b * 17) >> 7
  2115. // g = (r * 45 + g * 88 + b * 22) >> 7
  2116. // r = (r * 50 + g * 98 + b * 24) >> 7
  2117. void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  2118. asm volatile(
  2119. "movi v20.8b, #17 \n" // BB coefficient
  2120. "movi v21.8b, #68 \n" // BG coefficient
  2121. "movi v22.8b, #35 \n" // BR coefficient
  2122. "movi v24.8b, #22 \n" // GB coefficient
  2123. "movi v25.8b, #88 \n" // GG coefficient
  2124. "movi v26.8b, #45 \n" // GR coefficient
  2125. "movi v28.8b, #24 \n" // BB coefficient
  2126. "movi v29.8b, #98 \n" // BG coefficient
  2127. "movi v30.8b, #50 \n" // BR coefficient
  2128. "1: \n"
  2129. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
  2130. "subs %w1, %w1, #8 \n" // 8 processed per loop.
  2131. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
  2132. "umlal v4.8h, v1.8b, v21.8b \n" // G
  2133. "umlal v4.8h, v2.8b, v22.8b \n" // R
  2134. "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
  2135. "umlal v5.8h, v1.8b, v25.8b \n" // G
  2136. "umlal v5.8h, v2.8b, v26.8b \n" // R
  2137. "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
  2138. "umlal v6.8h, v1.8b, v29.8b \n" // G
  2139. "umlal v6.8h, v2.8b, v30.8b \n" // R
  2140. "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
  2141. "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
  2142. "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
  2143. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
  2144. "b.gt 1b \n"
  2145. : "+r"(dst_argb), // %0
  2146. "+r"(width) // %1
  2147. :
  2148. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
  2149. "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
  2150. }
  2151. // Tranform 8 ARGB pixels (32 bytes) with color matrix.
  2152. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
  2153. // needs to saturate. Consider doing a non-saturating version.
  2154. void ARGBColorMatrixRow_NEON(const uint8* src_argb,
  2155. uint8* dst_argb,
  2156. const int8* matrix_argb,
  2157. int width) {
  2158. asm volatile(
  2159. "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
  2160. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
  2161. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
  2162. "1: \n"
  2163. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
  2164. "subs %w2, %w2, #8 \n" // 8 processed per loop.
  2165. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
  2166. "uxtl v17.8h, v17.8b \n" // g
  2167. "uxtl v18.8h, v18.8b \n" // r
  2168. "uxtl v19.8h, v19.8b \n" // a
  2169. "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
  2170. "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
  2171. "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
  2172. "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
  2173. "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
  2174. "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
  2175. "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
  2176. "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
  2177. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2178. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2179. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2180. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2181. "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
  2182. "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
  2183. "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
  2184. "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
  2185. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2186. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2187. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2188. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2189. "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
  2190. "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
  2191. "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
  2192. "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
  2193. "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
  2194. "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
  2195. "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
  2196. "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
  2197. "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
  2198. "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
  2199. "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
  2200. "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
  2201. "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
  2202. "b.gt 1b \n"
  2203. : "+r"(src_argb), // %0
  2204. "+r"(dst_argb), // %1
  2205. "+r"(width) // %2
  2206. : "r"(matrix_argb) // %3
  2207. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
  2208. "v17", "v18", "v19", "v22", "v23", "v24", "v25");
  2209. }
  2210. // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
  2211. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  2212. void ARGBMultiplyRow_NEON(const uint8* src_argb0,
  2213. const uint8* src_argb1,
  2214. uint8* dst_argb,
  2215. int width) {
  2216. asm volatile(
  2217. // 8 pixel loop.
  2218. "1: \n"
  2219. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2220. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
  2221. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2222. "umull v0.8h, v0.8b, v4.8b \n" // multiply B
  2223. "umull v1.8h, v1.8b, v5.8b \n" // multiply G
  2224. "umull v2.8h, v2.8b, v6.8b \n" // multiply R
  2225. "umull v3.8h, v3.8b, v7.8b \n" // multiply A
  2226. "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
  2227. "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
  2228. "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
  2229. "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
  2230. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2231. "b.gt 1b \n"
  2232. : "+r"(src_argb0), // %0
  2233. "+r"(src_argb1), // %1
  2234. "+r"(dst_argb), // %2
  2235. "+r"(width) // %3
  2236. :
  2237. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2238. }
  2239. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  2240. void ARGBAddRow_NEON(const uint8* src_argb0,
  2241. const uint8* src_argb1,
  2242. uint8* dst_argb,
  2243. int width) {
  2244. asm volatile(
  2245. // 8 pixel loop.
  2246. "1: \n"
  2247. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2248. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
  2249. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2250. "uqadd v0.8b, v0.8b, v4.8b \n"
  2251. "uqadd v1.8b, v1.8b, v5.8b \n"
  2252. "uqadd v2.8b, v2.8b, v6.8b \n"
  2253. "uqadd v3.8b, v3.8b, v7.8b \n"
  2254. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2255. "b.gt 1b \n"
  2256. : "+r"(src_argb0), // %0
  2257. "+r"(src_argb1), // %1
  2258. "+r"(dst_argb), // %2
  2259. "+r"(width) // %3
  2260. :
  2261. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2262. }
  2263. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  2264. void ARGBSubtractRow_NEON(const uint8* src_argb0,
  2265. const uint8* src_argb1,
  2266. uint8* dst_argb,
  2267. int width) {
  2268. asm volatile(
  2269. // 8 pixel loop.
  2270. "1: \n"
  2271. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
  2272. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
  2273. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2274. "uqsub v0.8b, v0.8b, v4.8b \n"
  2275. "uqsub v1.8b, v1.8b, v5.8b \n"
  2276. "uqsub v2.8b, v2.8b, v6.8b \n"
  2277. "uqsub v3.8b, v3.8b, v7.8b \n"
  2278. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2279. "b.gt 1b \n"
  2280. : "+r"(src_argb0), // %0
  2281. "+r"(src_argb1), // %1
  2282. "+r"(dst_argb), // %2
  2283. "+r"(width) // %3
  2284. :
  2285. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2286. }
  2287. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  2288. // A = 255
  2289. // R = Sobel
  2290. // G = Sobel
  2291. // B = Sobel
  2292. void SobelRow_NEON(const uint8* src_sobelx,
  2293. const uint8* src_sobely,
  2294. uint8* dst_argb,
  2295. int width) {
  2296. asm volatile(
  2297. "movi v3.8b, #255 \n" // alpha
  2298. // 8 pixel loop.
  2299. "1: \n"
  2300. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
  2301. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
  2302. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2303. "uqadd v0.8b, v0.8b, v1.8b \n" // add
  2304. "orr v1.8b, v0.8b, v0.8b \n"
  2305. "orr v2.8b, v0.8b, v0.8b \n"
  2306. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2307. "b.gt 1b \n"
  2308. : "+r"(src_sobelx), // %0
  2309. "+r"(src_sobely), // %1
  2310. "+r"(dst_argb), // %2
  2311. "+r"(width) // %3
  2312. :
  2313. : "cc", "memory", "v0", "v1", "v2", "v3");
  2314. }
  2315. // Adds Sobel X and Sobel Y and stores Sobel into plane.
  2316. void SobelToPlaneRow_NEON(const uint8* src_sobelx,
  2317. const uint8* src_sobely,
  2318. uint8* dst_y,
  2319. int width) {
  2320. asm volatile(
  2321. // 16 pixel loop.
  2322. "1: \n"
  2323. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
  2324. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
  2325. "subs %w3, %w3, #16 \n" // 16 processed per loop.
  2326. "uqadd v0.16b, v0.16b, v1.16b \n" // add
  2327. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
  2328. "b.gt 1b \n"
  2329. : "+r"(src_sobelx), // %0
  2330. "+r"(src_sobely), // %1
  2331. "+r"(dst_y), // %2
  2332. "+r"(width) // %3
  2333. :
  2334. : "cc", "memory", "v0", "v1");
  2335. }
  2336. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  2337. // A = 255
  2338. // R = Sobel X
  2339. // G = Sobel
  2340. // B = Sobel Y
  2341. void SobelXYRow_NEON(const uint8* src_sobelx,
  2342. const uint8* src_sobely,
  2343. uint8* dst_argb,
  2344. int width) {
  2345. asm volatile(
  2346. "movi v3.8b, #255 \n" // alpha
  2347. // 8 pixel loop.
  2348. "1: \n"
  2349. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
  2350. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
  2351. "subs %w3, %w3, #8 \n" // 8 processed per loop.
  2352. "uqadd v1.8b, v0.8b, v2.8b \n" // add
  2353. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
  2354. "b.gt 1b \n"
  2355. : "+r"(src_sobelx), // %0
  2356. "+r"(src_sobely), // %1
  2357. "+r"(dst_argb), // %2
  2358. "+r"(width) // %3
  2359. :
  2360. : "cc", "memory", "v0", "v1", "v2", "v3");
  2361. }
  2362. // SobelX as a matrix is
  2363. // -1 0 1
  2364. // -2 0 2
  2365. // -1 0 1
  2366. void SobelXRow_NEON(const uint8* src_y0,
  2367. const uint8* src_y1,
  2368. const uint8* src_y2,
  2369. uint8* dst_sobelx,
  2370. int width) {
  2371. asm volatile(
  2372. "1: \n"
  2373. "ld1 {v0.8b}, [%0],%5 \n" // top
  2374. "ld1 {v1.8b}, [%0],%6 \n"
  2375. "usubl v0.8h, v0.8b, v1.8b \n"
  2376. "ld1 {v2.8b}, [%1],%5 \n" // center * 2
  2377. "ld1 {v3.8b}, [%1],%6 \n"
  2378. "usubl v1.8h, v2.8b, v3.8b \n"
  2379. "add v0.8h, v0.8h, v1.8h \n"
  2380. "add v0.8h, v0.8h, v1.8h \n"
  2381. "ld1 {v2.8b}, [%2],%5 \n" // bottom
  2382. "ld1 {v3.8b}, [%2],%6 \n"
  2383. "subs %w4, %w4, #8 \n" // 8 pixels
  2384. "usubl v1.8h, v2.8b, v3.8b \n"
  2385. "add v0.8h, v0.8h, v1.8h \n"
  2386. "abs v0.8h, v0.8h \n"
  2387. "uqxtn v0.8b, v0.8h \n"
  2388. "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
  2389. "b.gt 1b \n"
  2390. : "+r"(src_y0), // %0
  2391. "+r"(src_y1), // %1
  2392. "+r"(src_y2), // %2
  2393. "+r"(dst_sobelx), // %3
  2394. "+r"(width) // %4
  2395. : "r"(2LL), // %5
  2396. "r"(6LL) // %6
  2397. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  2398. );
  2399. }
  2400. // SobelY as a matrix is
  2401. // -1 -2 -1
  2402. // 0 0 0
  2403. // 1 2 1
  2404. void SobelYRow_NEON(const uint8* src_y0,
  2405. const uint8* src_y1,
  2406. uint8* dst_sobely,
  2407. int width) {
  2408. asm volatile(
  2409. "1: \n"
  2410. "ld1 {v0.8b}, [%0],%4 \n" // left
  2411. "ld1 {v1.8b}, [%1],%4 \n"
  2412. "usubl v0.8h, v0.8b, v1.8b \n"
  2413. "ld1 {v2.8b}, [%0],%4 \n" // center * 2
  2414. "ld1 {v3.8b}, [%1],%4 \n"
  2415. "usubl v1.8h, v2.8b, v3.8b \n"
  2416. "add v0.8h, v0.8h, v1.8h \n"
  2417. "add v0.8h, v0.8h, v1.8h \n"
  2418. "ld1 {v2.8b}, [%0],%5 \n" // right
  2419. "ld1 {v3.8b}, [%1],%5 \n"
  2420. "subs %w3, %w3, #8 \n" // 8 pixels
  2421. "usubl v1.8h, v2.8b, v3.8b \n"
  2422. "add v0.8h, v0.8h, v1.8h \n"
  2423. "abs v0.8h, v0.8h \n"
  2424. "uqxtn v0.8b, v0.8h \n"
  2425. "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
  2426. "b.gt 1b \n"
  2427. : "+r"(src_y0), // %0
  2428. "+r"(src_y1), // %1
  2429. "+r"(dst_sobely), // %2
  2430. "+r"(width) // %3
  2431. : "r"(1LL), // %4
  2432. "r"(6LL) // %5
  2433. : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  2434. );
  2435. }
  2436. // Caveat - rounds float to half float whereas scaling version truncates.
  2437. void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  2438. asm volatile(
  2439. "1: \n"
  2440. "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
  2441. "subs %w2, %w2, #8 \n" // 8 pixels per loop
  2442. "uxtl v2.4s, v1.4h \n" // 8 int's
  2443. "uxtl2 v3.4s, v1.8h \n"
  2444. "scvtf v2.4s, v2.4s \n" // 8 floats
  2445. "scvtf v3.4s, v3.4s \n"
  2446. "fcvtn v1.4h, v2.4s \n" // 8 half floats
  2447. "fcvtn2 v1.8h, v3.4s \n"
  2448. "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
  2449. "b.gt 1b \n"
  2450. : "+r"(src), // %0
  2451. "+r"(dst), // %1
  2452. "+r"(width) // %2
  2453. :
  2454. : "cc", "memory", "v1", "v2", "v3");
  2455. }
  2456. void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  2457. asm volatile(
  2458. "1: \n"
  2459. "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
  2460. "subs %w2, %w2, #8 \n" // 8 pixels per loop
  2461. "uxtl v2.4s, v1.4h \n" // 8 int's
  2462. "uxtl2 v3.4s, v1.8h \n"
  2463. "scvtf v2.4s, v2.4s \n" // 8 floats
  2464. "scvtf v3.4s, v3.4s \n"
  2465. "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
  2466. "fmul v3.4s, v3.4s, %3.s[0] \n"
  2467. "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
  2468. "uqshrn2 v1.8h, v3.4s, #13 \n"
  2469. "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
  2470. "b.gt 1b \n"
  2471. : "+r"(src), // %0
  2472. "+r"(dst), // %1
  2473. "+r"(width) // %2
  2474. : "w"(scale * 1.9259299444e-34f) // %3
  2475. : "cc", "memory", "v1", "v2", "v3");
  2476. }
  2477. float ScaleMaxSamples_NEON(const float* src,
  2478. float* dst,
  2479. float scale,
  2480. int width) {
  2481. float fmax;
  2482. asm volatile(
  2483. "movi v5.4s, #0 \n" // max
  2484. "movi v6.4s, #0 \n"
  2485. "1: \n"
  2486. "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
  2487. "subs %w2, %w2, #8 \n" // 8 processed per loop
  2488. "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
  2489. "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
  2490. "fmax v5.4s, v5.4s, v1.4s \n" // max
  2491. "fmax v6.4s, v6.4s, v2.4s \n"
  2492. "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
  2493. "b.gt 1b \n"
  2494. "fmax v5.4s, v5.4s, v6.4s \n" // max
  2495. "fmaxv %s3, v5.4s \n" // signed max acculator
  2496. : "+r"(src), // %0
  2497. "+r"(dst), // %1
  2498. "+r"(width), // %2
  2499. "=w"(fmax) // %3
  2500. : "w"(scale) // %4
  2501. : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  2502. return fmax;
  2503. }
  2504. float ScaleSumSamples_NEON(const float* src,
  2505. float* dst,
  2506. float scale,
  2507. int width) {
  2508. float fsum;
  2509. asm volatile(
  2510. "movi v5.4s, #0 \n" // max
  2511. "movi v6.4s, #0 \n" // max
  2512. "1: \n"
  2513. "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
  2514. "subs %w2, %w2, #8 \n" // 8 processed per loop
  2515. "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
  2516. "fmul v4.4s, v2.4s, %4.s[0] \n"
  2517. "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
  2518. "fmla v6.4s, v2.4s, v2.4s \n"
  2519. "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
  2520. "b.gt 1b \n"
  2521. "faddp v5.4s, v5.4s, v6.4s \n"
  2522. "faddp v5.4s, v5.4s, v5.4s \n"
  2523. "faddp %3.4s, v5.4s, v5.4s \n" // sum
  2524. : "+r"(src), // %0
  2525. "+r"(dst), // %1
  2526. "+r"(width), // %2
  2527. "=w"(fsum) // %3
  2528. : "w"(scale) // %4
  2529. : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  2530. return fsum;
  2531. }
  2532. void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
  2533. asm volatile(
  2534. "1: \n"
  2535. "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
  2536. "subs %w2, %w2, #8 \n" // 8 processed per loop
  2537. "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
  2538. "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
  2539. "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
  2540. "b.gt 1b \n"
  2541. : "+r"(src), // %0
  2542. "+r"(dst), // %1
  2543. "+r"(width) // %2
  2544. : "w"(scale) // %3
  2545. : "cc", "memory", "v1", "v2");
  2546. }
  2547. // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
  2548. void GaussCol_NEON(const uint16* src0,
  2549. const uint16* src1,
  2550. const uint16* src2,
  2551. const uint16* src3,
  2552. const uint16* src4,
  2553. uint32* dst,
  2554. int width) {
  2555. asm volatile(
  2556. "movi v6.8h, #4 \n" // constant 4
  2557. "movi v7.8h, #6 \n" // constant 6
  2558. "1: \n"
  2559. "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
  2560. "ld1 {v2.8h}, [%4], #16 \n"
  2561. "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
  2562. "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
  2563. "ld1 {v2.8h}, [%1], #16 \n"
  2564. "umlal v0.4s, v2.4h, v6.4h \n" // * 4
  2565. "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
  2566. "ld1 {v2.8h}, [%2], #16 \n"
  2567. "umlal v0.4s, v2.4h, v7.4h \n" // * 6
  2568. "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
  2569. "ld1 {v2.8h}, [%3], #16 \n"
  2570. "umlal v0.4s, v2.4h, v6.4h \n" // * 4
  2571. "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
  2572. "subs %w6, %w6, #8 \n" // 8 processed per loop
  2573. "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
  2574. "b.gt 1b \n"
  2575. : "+r"(src0), // %0
  2576. "+r"(src1), // %1
  2577. "+r"(src2), // %2
  2578. "+r"(src3), // %3
  2579. "+r"(src4), // %4
  2580. "+r"(dst), // %5
  2581. "+r"(width) // %6
  2582. :
  2583. : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
  2584. }
  2585. // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
  2586. void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
  2587. const uint32* src1 = src + 1;
  2588. const uint32* src2 = src + 2;
  2589. const uint32* src3 = src + 3;
  2590. asm volatile(
  2591. "movi v6.4s, #4 \n" // constant 4
  2592. "movi v7.4s, #6 \n" // constant 6
  2593. "1: \n"
  2594. "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
  2595. "add v0.4s, v0.4s, v1.4s \n" // * 1
  2596. "add v1.4s, v1.4s, v2.4s \n" // * 1
  2597. "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
  2598. "mla v0.4s, v2.4s, v7.4s \n" // * 6
  2599. "mla v1.4s, v3.4s, v7.4s \n" // * 6
  2600. "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
  2601. "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
  2602. "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
  2603. "add v3.4s, v3.4s, v5.4s \n"
  2604. "mla v0.4s, v2.4s, v6.4s \n" // * 4
  2605. "mla v1.4s, v3.4s, v6.4s \n" // * 4
  2606. "subs %w5, %w5, #8 \n" // 8 processed per loop
  2607. "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
  2608. "uqrshrn2 v0.8h, v1.4s, #8 \n"
  2609. "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
  2610. "b.gt 1b \n"
  2611. : "+r"(src), // %0
  2612. "+r"(src1), // %1
  2613. "+r"(src2), // %2
  2614. "+r"(src3), // %3
  2615. "+r"(dst), // %4
  2616. "+r"(width) // %5
  2617. : "r"(32LL) // %6
  2618. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
  2619. }
  2620. #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
  2621. #ifdef __cplusplus
  2622. } // extern "C"
  2623. } // namespace libyuv
  2624. #endif