12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786 |
- #include "libyuv/row.h"
- #ifdef __cplusplus
- namespace libyuv {
- extern "C" {
- #endif
- #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
- #define READYUV422 /* Loads 8 bytes from [%0] into v0 and 4 bytes each from [%1] and [%2] into v1; all three pointers are post-incremented. In the callers in this file %0/%1/%2 are src_y/src_u/src_v. */ \
- "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes, %0 += 8 */ \
- "ld1 {v1.s}[0], [%1], #4 \n" /* v1 bytes 0..3 = 4 U bytes, %1 += 4 */ \
- "ld1 {v1.s}[1], [%2], #4 \n" /* v1 bytes 4..7 = 4 V bytes, %2 += 4 */
- #define READYUV444 /* Loads 8 bytes each from [%0], [%1], [%2] and averages adjacent pairs of the [%1]/[%2] bytes so v1 ends up in the same 4+4 layout READYUV422 produces. */ \
- "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes, %0 += 8 */ \
- "ld1 {v1.d}[0], [%1], #8 \n" /* v1 low half = 8 U bytes, %1 += 8 */ \
- "ld1 {v1.d}[1], [%2], #8 \n" /* v1 high half = 8 V bytes, %2 += 8 */ \
- "uaddlp v1.8h, v1.16b \n" /* sum each adjacent byte pair into a 16-bit lane: 4 U sums, 4 V sums */ \
- "rshrn v1.8b, v1.8h, #1 \n" /* rounding >> 1 and narrow: v1.8b = 4 averaged U then 4 averaged V */
- #define READYUV400 /* Loads 8 bytes from [%0] into v0 and fills v1 with the constant 128 instead of reading any U/V data (presumably the zero-chroma value - confirm against the constants in use). */ \
- "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes, %0 += 8 */ \
- "movi v1.8b , #128 \n" /* v1 = 128 in every byte */
- #define READNV12 /* Loads 8 bytes from [%0] into v0 and 8 interleaved bytes from [%1]; v1 receives the 4 even-indexed bytes followed by the 4 odd-indexed bytes. Clobbers v2, v3. */ \
- "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes, %0 += 8 */ \
- "ld1 {v2.8b}, [%1], #8 \n" /* v2 = 8 interleaved chroma bytes, %1 += 8 */ \
- "uzp1 v1.8b, v2.8b, v2.8b \n" /* v1 = even-indexed bytes of v2 (first byte of each pair) */ \
- "uzp2 v3.8b, v2.8b, v2.8b \n" /* v3 = odd-indexed bytes of v2 (second byte of each pair) */ \
- "ins v1.s[1], v3.s[0] \n" /* v1 = 4 even bytes, then 4 odd bytes */
- #define READNV21 /* Same as READNV12 but with the pair order swapped: v1 receives the 4 odd-indexed bytes followed by the 4 even-indexed bytes of the 8 bytes at [%1]. Clobbers v2, v3. */ \
- "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes, %0 += 8 */ \
- "ld1 {v2.8b}, [%1], #8 \n" /* v2 = 8 interleaved chroma bytes, %1 += 8 */ \
- "uzp1 v3.8b, v2.8b, v2.8b \n" /* v3 = even-indexed bytes of v2 */ \
- "uzp2 v1.8b, v2.8b, v2.8b \n" /* v1 = odd-indexed bytes of v2 */ \
- "ins v1.s[1], v3.s[0] \n" /* v1 = 4 odd bytes, then 4 even bytes */
- #define READYUY2 /* Loads 16 packed bytes from [%0]: even-indexed bytes go to v0, and the odd-indexed bytes are split again so v1 holds 4 bytes from positions 1,5,9,13 then 4 bytes from positions 3,7,11,15. Clobbers v3. */ \
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" /* v0 = even bytes (Y), v1 = odd bytes (alternating U,V), %0 += 16 */ \
- "uzp2 v3.8b, v1.8b, v1.8b \n" /* v3 = odd elements of v1 (V) */ \
- "uzp1 v1.8b, v1.8b, v1.8b \n" /* v1 = even elements of v1 (U) */ \
- "ins v1.s[1], v3.s[0] \n" /* v1 = 4 U then 4 V */
- #define READUYVY /* Loads 16 packed bytes from [%0]: odd-indexed bytes go to v0, and the even-indexed bytes are split again so v1 holds 4 bytes from positions 0,4,8,12 then 4 bytes from positions 2,6,10,14. Clobbers v2, v3. */ \
- "ld2 {v2.8b, v3.8b}, [%0], #16 \n" /* v2 = even bytes (alternating U,V), v3 = odd bytes (Y), %0 += 16 */ \
- "orr v0.8b, v3.8b, v3.8b \n" /* v0 = copy of v3 (Y) */ \
- "uzp1 v1.8b, v2.8b, v2.8b \n" /* v1 = even elements of v2 (U) */ \
- "uzp2 v3.8b, v2.8b, v2.8b \n" /* v3 = odd elements of v2 (V) */ \
- "ins v1.s[1], v3.s[0] \n" /* v1 = 4 U then 4 V */
- #define YUVTORGB_SETUP /* Loads the conversion constants used by YUVTORGB into v24-v31. NOTE(review): the first two loads post-increment the register bound to %[kUVBiasBGR], so that operand is modified (advanced by 4 bytes) by this macro; v31 is written and must appear in the user's clobber list. */ \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" /* v24 = 1st 16-bit bias replicated to 8 lanes (added to B in YUVTORGB) */ \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" /* v25 = 2nd 16-bit bias (added to G) */ \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" /* v26 = 3rd 16-bit bias (added to R) */ \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" /* v31 = first 32-bit word at kYToRgb replicated to 4 lanes */ \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" /* de-interleave 16-bit elements: even -> v27, odd -> v28 */ \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" /* de-interleave 16-bit elements: even -> v29, odd -> v30 */
- #define YUVTORGB(vR, vG, vB) /* In: v0.8b = 8 Y bytes, v1.8b = 4 U then 4 V bytes, v24-v31 from YUVTORGB_SETUP. Out: vR/vG/vB (.8b) = 8 unsigned-saturated bytes each. Also overwrites v0-v3 and v5-v7. */ \
- "uxtl v0.8h, v0.8b \n" /* widen Y to 16 bits */ \
- "shll v2.8h, v1.8b, #8 \n" /* v2 = U/V bytes << 8 in 16-bit lanes */ \
- "ushll2 v3.4s, v0.8h, #0 \n" /* Y[4..7] widened to 32 bits */ \
- "ushll v0.4s, v0.4h, #0 \n" /* Y[0..3] widened to 32 bits */ \
- "mul v3.4s, v3.4s, v31.4s \n" /* Y[4..7] * kYToRgb */ \
- "mul v0.4s, v0.4s, v31.4s \n" /* Y[0..3] * kYToRgb */ \
- "sqshrun v0.4h, v0.4s, #16 \n" /* >> 16, narrow with unsigned saturation */ \
- "sqshrun2 v0.8h, v3.4s, #16 \n" /* v0.8h = 8 scaled Y values */ \
- "uaddw v1.8h, v2.8h, v1.8b \n" /* v1 lane = (x << 8) + x: every U/V byte now appears twice in adjacent bytes */ \
- "mov v2.d[0], v1.d[1] \n" /* v2 low half = the 8 duplicated V bytes */ \
- "uxtl v2.8h, v2.8b \n" /* V widened to 16 bits, one value per pixel */ \
- "uxtl v1.8h, v1.8b \n" /* U widened to 16 bits, one value per pixel */ \
- "mul v3.8h, v1.8h, v27.8h \n" /* U * v27 (added to B below) */ \
- "mul v5.8h, v1.8h, v29.8h \n" /* U * v29 */ \
- "mul v6.8h, v2.8h, v30.8h \n" /* V * v30 */ \
- "mul v7.8h, v2.8h, v28.8h \n" /* V * v28 (added to R below) */ \
- "sqadd v6.8h, v6.8h, v5.8h \n" /* v6 = U * v29 + V * v30 (subtracted from G below) */ \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* vB = v24 + Y */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* vG = v25 + Y */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* vR = v26 + Y */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* vB += U * v27 */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* vG -= v6 */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* vR += V * v28 */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* vB = saturate_u8(vB >> 6) */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* vG = saturate_u8(vG >> 6) */ \
- "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* vR = saturate_u8(vR >> 6) */
- // Converts a row of planar Y/U/V, with one U and one V byte per pixel, to
- // 4-byte pixels stored as B, G, R, 255.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void I444ToARGBRow_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READYUV444
-     YUVTORGB(v22, v21, v20)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(src_u),     // %1
-       "+r"(src_v),     // %2
-       "+r"(dst_argb),  // %3
-       "+r"(width),     // %4
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of planar Y/U/V, with one U and one V byte per 2 pixels, to
- // 4-byte pixels stored as B, G, R, 255.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void I422ToARGBRow_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READYUV422
-     YUVTORGB(v22, v21, v20)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(src_u),     // %1
-       "+r"(src_v),     // %2
-       "+r"(dst_argb),  // %3
-       "+r"(width),     // %4
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of planar Y/U/V (one U and one V byte per 2 pixels) plus a
- // separate alpha plane to 4-byte pixels stored as B, G, R, A, where A is
- // copied unchanged from src_a.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              const uint8* src_a,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "1: \n"
-     READYUV422
-     YUVTORGB(v22, v21, v20)
-     "ld1 {v23.8b}, [%3], #8 \n"  // 8 alpha bytes from src_a
-     "subs %w5, %w5, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(src_u),     // %1
-       "+r"(src_v),     // %2
-       "+r"(src_a),     // %3
-       "+r"(dst_argb),  // %4
-       "+r"(width),     // %5
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of planar Y/U/V, with one U and one V byte per 2 pixels, to
- // 4-byte pixels stored as 255, B, G, R.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void I422ToRGBARow_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgba,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v20.8b, #255 \n"  // constant 1st byte of every pixel
-     "1: \n"
-     READYUV422
-     YUVTORGB(v23, v22, v21)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(src_u),     // %1
-       "+r"(src_v),     // %2
-       "+r"(dst_rgba),  // %3
-       "+r"(width),     // %4
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of planar Y/U/V, with one U and one V byte per 2 pixels, to
- // 3-byte pixels stored as B, G, R.
- // Each loop iteration handles 8 pixels (24 output bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void I422ToRGB24Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb24,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "1: \n"
-     READYUV422
-     YUVTORGB(v22, v21, v20)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),      // %0
-       "+r"(src_u),      // %1
-       "+r"(src_v),      // %2
-       "+r"(dst_rgb24),  // %3
-       "+r"(width),      // %4
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- #define ARGBTORGB565 /* Packs 8-bit v22 (R), v21 (G), v20 (B) into 8 16-bit values in v0.8h: bits 15..11 from v22, 10..5 from v21, 4..0 from v20. Overwrites v20 and v21. */ \
- "shll v0.8h, v22.8b, #8 \n" /* v0 = R << 8 */ \
- "shll v21.8h, v21.8b, #8 \n" /* v21 = G << 8 */ \
- "shll v20.8h, v20.8b, #8 \n" /* v20 = B << 8 */ \
- "sri v0.8h, v21.8h, #5 \n" /* keep top 5 bits of v0, insert G >> 5 below them */ \
- "sri v0.8h, v20.8h, #11 \n" /* keep top 11 bits of v0, insert B >> 11 below them */
- // Converts a row of planar Y/U/V, with one U and one V byte per 2 pixels, to
- // 16-bit pixels packed by ARGBTORGB565 (R in bits 15..11, G in 10..5, B in
- // 4..0).
- // Each loop iteration handles 8 pixels (16 output bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void I422ToRGB565Row_NEON(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_rgb565,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile(
-     YUVTORGB_SETUP
-     "1: \n"
-     READYUV422
-     YUVTORGB(v22, v21, v20)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     ARGBTORGB565
-     "st1 {v0.8h}, [%3], #16 \n"  // store 8 packed 16-bit pixels
-     "b.gt 1b \n"
-     : "+r"(src_y),       // %0
-       "+r"(src_u),       // %1
-       "+r"(src_v),       // %2
-       "+r"(dst_rgb565),  // %3
-       "+r"(width),       // %4
-       [kUVBiasBGR] "+r"(uv_bias)
-     : [kUVToRB] "r"(&yuvconstants->kUVToRB),
-       [kUVToG] "r"(&yuvconstants->kUVToG),
-       [kYToRgb] "r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31");
- }
- #define ARGBTOARGB1555 /* Packs 8-bit v23 (A), v22 (R), v21 (G), v20 (B) into 8 16-bit values in v0.8h: bit 15 from v23, bits 14..10 from v22, 9..5 from v21, 4..0 from v20. Overwrites v20-v22; v23 is left unchanged. */ \
- "shll v0.8h, v23.8b, #8 \n" /* v0 = A << 8 */ \
- "shll v22.8h, v22.8b, #8 \n" /* v22 = R << 8 */ \
- "shll v21.8h, v21.8b, #8 \n" /* v21 = G << 8 */ \
- "shll v20.8h, v20.8b, #8 \n" /* v20 = B << 8 */ \
- "sri v0.8h, v22.8h, #1 \n" /* keep top bit of v0, insert R >> 1 below it */ \
- "sri v0.8h, v21.8h, #6 \n" /* keep top 6 bits, insert G >> 6 below them */ \
- "sri v0.8h, v20.8h, #11 \n" /* keep top 11 bits, insert B >> 11 below them */
- // Converts a row of planar Y/U/V, with one U and one V byte per 2 pixels, to
- // 16-bit pixels packed by ARGBTOARGB1555 with the top (alpha) bit always set.
- // Each loop iteration handles 8 pixels (16 output bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void I422ToARGB1555Row_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb1555,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile(
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // alpha source; ARGBTOARGB1555 does not modify v23
-     "1: \n"
-     READYUV422
-     YUVTORGB(v22, v21, v20)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     ARGBTOARGB1555
-     "st1 {v0.8h}, [%3], #16 \n"  // store 8 packed 16-bit pixels
-     "b.gt 1b \n"
-     : "+r"(src_y),         // %0
-       "+r"(src_u),         // %1
-       "+r"(src_v),         // %2
-       "+r"(dst_argb1555),  // %3
-       "+r"(width),         // %4
-       [kUVBiasBGR] "+r"(uv_bias)
-     : [kUVToRB] "r"(&yuvconstants->kUVToRB),
-       [kUVToG] "r"(&yuvconstants->kUVToG),
-       [kYToRgb] "r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31");
- }
- #define ARGBTOARGB4444 /* Packs the high nibbles of v20 (B), v21 (G), v22 (R), v23 (A) into 8 two-byte pixels in v0: first byte = G nibble (high) | B nibble (low), second byte = A nibble (high) | R nibble (low). Requires v4 = 0x0f in every byte. Overwrites v1, v20-v23. */ \
- \
- "ushr v20.8b, v20.8b, #4 \n" /* B >> 4 */ \
- "bic v21.8b, v21.8b, v4.8b \n" /* G & ~v4 */ \
- "ushr v22.8b, v22.8b, #4 \n" /* R >> 4 */ \
- "bic v23.8b, v23.8b, v4.8b \n" /* A & ~v4 */ \
- "orr v0.8b, v20.8b, v21.8b \n" /* v0 = B | G bytes */ \
- "orr v1.8b, v22.8b, v23.8b \n" /* v1 = R | A bytes */ \
- "zip1 v0.16b, v0.16b, v1.16b \n" /* interleave: v0 byte, v1 byte per pixel */
- // Converts a row of planar Y/U/V, with one U and one V byte per 2 pixels, to
- // 2-byte pixels packed by ARGBTOARGB4444 with the alpha nibble always 0xf.
- // Each loop iteration handles 8 pixels (16 output bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void I422ToARGB4444Row_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb4444,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v4.16b, #0x0f \n"  // nibble mask used by ARGBTOARGB4444
-     "1: \n"
-     READYUV422
-     YUVTORGB(v22, v21, v20)
-     "subs %w4, %w4, #8 \n"  // 8 pixels per iteration
-     "movi v23.8b, #255 \n"  // reset each iteration: ARGBTOARGB4444 overwrites v23
-     ARGBTOARGB4444
-     "st1 {v0.8h}, [%3], #16 \n"  // store 8 packed 2-byte pixels
-     "b.gt 1b \n"
-     : "+r"(src_y),         // %0
-       "+r"(src_u),         // %1
-       "+r"(src_v),         // %2
-       "+r"(dst_argb4444),  // %3
-       "+r"(width),         // %4
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of Y-only data to 4-byte pixels stored as B, G, R, 255,
- // running the same YUVTORGB math with U/V fixed at 128 (READYUV400) and the
- // constants taken from kYuvI601Constants.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &kYuvI601Constants.kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READYUV400
-     YUVTORGB(v22, v21, v20)
-     "subs %w2, %w2, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(dst_argb),  // %1
-       "+r"(width),     // %2
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
-       [kUVToG]"r"(&kYuvI601Constants.kUVToG),
-       [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {  // Expands each source byte y into the 4 bytes y, y, y, 255; 8 pixels per iteration, at least one iteration always runs.
- asm volatile(
- "movi v23.8b, #255 \n"  // v23 = 255, stored as the 4th byte of each pixel
- "1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"  // load 8 bytes from src_y, src_y += 8
- "orr v21.8b, v20.8b, v20.8b \n"  // v21 = copy of v20
- "orr v22.8b, v20.8b, v20.8b \n"  // v22 = copy of v20
- "subs %w2, %w2, #8 \n"  // width -= 8, sets flags for b.gt
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // store 8 interleaved 4-byte pixels, dst_argb += 32
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_y),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v20", "v21", "v22", "v23");
- }
- // Converts a row of Y plus an interleaved chroma plane (READNV12: first byte
- // of each pair is used as U, second as V; one pair per 2 pixels) to 4-byte
- // pixels stored as B, G, R, 255.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void NV12ToARGBRow_NEON(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READNV12
-     YUVTORGB(v22, v21, v20)
-     "subs %w3, %w3, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(src_uv),    // %1
-       "+r"(dst_argb),  // %2
-       "+r"(width),     // %3
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of Y plus an interleaved chroma plane (READNV21: second
- // byte of each pair is used as U, first as V; one pair per 2 pixels) to
- // 4-byte pixels stored as B, G, R, 255.
- // Each loop iteration handles 8 pixels; the loop repeats while width > 0
- // after the decrement, so at least 8 pixels are always read and written.
- void NV21ToARGBRow_NEON(const uint8* src_y,
-                         const uint8* src_vu,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READNV21
-     YUVTORGB(v22, v21, v20)
-     "subs %w3, %w3, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_y),     // %0
-       "+r"(src_vu),    // %1
-       "+r"(dst_argb),  // %2
-       "+r"(width),     // %3
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of Y plus an interleaved chroma plane (READNV12 layout) to
- // 16-bit pixels packed by ARGBTORGB565 (R in bits 15..11, G in 10..5, B in
- // 4..0).
- // Each loop iteration handles 8 pixels (16 output bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void NV12ToRGB565Row_NEON(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_rgb565,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile(
-     YUVTORGB_SETUP
-     "1: \n"
-     READNV12
-     YUVTORGB(v22, v21, v20)
-     "subs %w3, %w3, #8 \n"  // 8 pixels per iteration
-     ARGBTORGB565
-     "st1 {v0.8h}, [%2], 16 \n"  // store 8 packed 16-bit pixels
-     "b.gt 1b \n"
-     : "+r"(src_y),       // %0
-       "+r"(src_uv),      // %1
-       "+r"(dst_rgb565),  // %2
-       "+r"(width),       // %3
-       [kUVBiasBGR] "+r"(uv_bias)
-     : [kUVToRB] "r"(&yuvconstants->kUVToRB),
-       [kUVToG] "r"(&yuvconstants->kUVToG),
-       [kYToRgb] "r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31");
- }
- // Converts a row of packed 4:2:2 data in the READYUY2 layout (Y at even byte
- // offsets, U/V alternating at odd offsets) to 4-byte pixels stored as
- // B, G, R, 255.
- // Each loop iteration handles 8 pixels (16 input bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READYUY2
-     YUVTORGB(v22, v21, v20)
-     "subs %w2, %w2, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_yuy2),  // %0
-       "+r"(dst_argb),  // %1
-       "+r"(width),     // %2
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- // Converts a row of packed 4:2:2 data in the READUYVY layout (Y at odd byte
- // offsets, U/V alternating at even offsets) to 4-byte pixels stored as
- // B, G, R, 255.
- // Each loop iteration handles 8 pixels (16 input bytes); the loop repeats
- // while width > 0 after the decrement, so at least 8 pixels are always
- // read and written.
- void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-   // YUVTORGB_SETUP post-increments this pointer, so it is passed as a
-   // read-write operand on a local copy rather than as an input-only operand.
-   const void* uv_bias = &yuvconstants->kUVBiasBGR;
-   asm volatile (
-     YUVTORGB_SETUP
-     "movi v23.8b, #255 \n"  // constant 4th byte of every pixel
-     "1: \n"
-     READUYVY
-     YUVTORGB(v22, v21, v20)
-     "subs %w2, %w2, #8 \n"  // 8 pixels per iteration
-     "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
-     "b.gt 1b \n"
-     : "+r"(src_uyvy),  // %0
-       "+r"(dst_argb),  // %1
-       "+r"(width),     // %2
-       [kUVBiasBGR]"+r"(uv_bias)
-     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-       [kUVToG]"r"(&yuvconstants->kUVToG),
-       [kYToRgb]"r"(&yuvconstants->kYToRgb)
-     // v31 is loaded by YUVTORGB_SETUP and read by YUVTORGB.
-     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-       "v31"
-   );
- }
- void SplitUVRow_NEON(const uint8* src_uv,  // De-interleaves byte pairs: even-indexed bytes go to dst_u, odd-indexed bytes to dst_v.
- uint8* dst_u,
- uint8* dst_v,
- int width) {  // width counts pairs; 16 pairs per iteration, at least one iteration always runs
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 32 bytes: even -> v0, odd -> v1; src_uv += 32
- "subs %w3, %w3, #16 \n"  // width -= 16
- "st1 {v0.16b}, [%1], #16 \n"  // store 16 bytes to dst_u
- "st1 {v1.16b}, [%2], #16 \n"  // store 16 bytes to dst_v
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_uv),  // %0
- "+r"(dst_u),  // %1
- "+r"(dst_v),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1"
- );
- }
- void MergeUVRow_NEON(const uint8* src_u,  // Interleaves two byte rows: output is src_u[0], src_v[0], src_u[1], src_v[1], ...
- const uint8* src_v,
- uint8* dst_uv,
- int width) {  // width counts pairs; 16 pairs per iteration, at least one iteration always runs
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 16 bytes from src_u
- "ld1 {v1.16b}, [%1], #16 \n"  // load 16 bytes from src_v
- "subs %w3, %w3, #16 \n"  // width -= 16
- "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // store 32 interleaved bytes to dst_uv
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_u),  // %0
- "+r"(src_v),  // %1
- "+r"(dst_uv),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1"
- );
- }
- void SplitRGBRow_NEON(const uint8* src_rgb,  // De-interleaves 3-byte pixels: byte 0 of each pixel goes to dst_r, byte 1 to dst_g, byte 2 to dst_b.
- uint8* dst_r,
- uint8* dst_g,
- uint8* dst_b,
- int width) {  // width counts pixels; 16 pixels per iteration, at least one iteration always runs
- asm volatile(
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 48 bytes, de-interleaved by 3; src_rgb += 48
- "subs %w4, %w4, #16 \n"  // width -= 16
- "st1 {v0.16b}, [%1], #16 \n"  // store to dst_r
- "st1 {v1.16b}, [%2], #16 \n"  // store to dst_g
- "st1 {v2.16b}, [%3], #16 \n"  // store to dst_b
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_rgb),  // %0
- "+r"(dst_r),  // %1
- "+r"(dst_g),  // %2
- "+r"(dst_b),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2"
- );
- }
- void MergeRGBRow_NEON(const uint8* src_r,  // Interleaves three byte rows into 3-byte pixels: src_r[i], src_g[i], src_b[i].
- const uint8* src_g,
- const uint8* src_b,
- uint8* dst_rgb,
- int width) {  // width counts pixels; 16 pixels per iteration, at least one iteration always runs
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 16 bytes from src_r
- "ld1 {v1.16b}, [%1], #16 \n"  // load 16 bytes from src_g
- "ld1 {v2.16b}, [%2], #16 \n"  // load 16 bytes from src_b
- "subs %w4, %w4, #16 \n"  // width -= 16
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 48 interleaved bytes to dst_rgb
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_r),  // %0
- "+r"(src_g),  // %1
- "+r"(src_b),  // %2
- "+r"(dst_rgb),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2"
- );
- }
- void CopyRow_NEON(const uint8* src, uint8* dst, int count) {  // Copies bytes from src to dst, 32 bytes per iteration; at least one iteration always runs, so count is effectively rounded up to a multiple of 32.
- asm volatile(
- "1: \n"
- "ldp q0, q1, [%0], #32 \n"  // load 32 bytes, src += 32
- "subs %w2, %w2, #32 \n"  // count -= 32
- "stp q0, q1, [%1], #32 \n"  // store 32 bytes, dst += 32
- "b.gt 1b \n"  // repeat while count > 0
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(count)  // %2
- :
- : "cc", "memory", "v0", "v1"
- );
- }
- void SetRow_NEON(uint8* dst, uint8 v8, int count) {  // Fills dst with the byte v8, 16 bytes per iteration; at least one iteration always runs.
- asm volatile(
- "dup v0.16b, %w2 \n"  // replicate v8 into all 16 bytes of v0
- "1: \n"
- "subs %w1, %w1, #16 \n"  // count -= 16
- "st1 {v0.16b}, [%0], #16 \n"  // store 16 bytes, dst += 16
- "b.gt 1b \n"  // repeat while count > 0
- : "+r"(dst),  // %0
- "+r"(count)  // %1
- : "r"(v8)  // %2
- : "cc", "memory", "v0");
- }
- void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {  // Fills dst with the 32-bit value v32; count is in 32-bit units, 4 units (16 bytes) per iteration, at least one iteration always runs.
- asm volatile(
- "dup v0.4s, %w2 \n"  // replicate v32 into all 4 32-bit lanes of v0
- "1: \n"
- "subs %w1, %w1, #4 \n"  // count -= 4
- "st1 {v0.16b}, [%0], #16 \n"  // store 16 bytes, dst += 16
- "b.gt 1b \n"  // repeat while count > 0
- : "+r"(dst),  // %0
- "+r"(count)  // %1
- : "r"(v32)  // %2
- : "cc", "memory", "v0");
- }
- void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {  // Writes the bytes of src to dst in reverse order, reading src backwards from its end; 16 bytes per iteration, at least one iteration always runs.
- asm volatile(
-
- "add %0, %0, %w2, sxtw \n"  // src += width (sign-extended)
- "sub %0, %0, #16 \n"  // src now points at the last 16 bytes of the row
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n"  // load 16 bytes, then src += %3 (-16)
- "subs %w2, %w2, #16 \n"  // width -= 16
- "rev64 v0.16b, v0.16b \n"  // reverse the bytes inside each 8-byte half
- "st1 {v0.D}[1], [%1], #8 \n"  // store the high half first...
- "st1 {v0.D}[0], [%1], #8 \n"  // ...then the low half: full 16-byte reversal
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width)  // %2
- : "r"((ptrdiff_t)-16)  // %3: backward step for src
- : "cc", "memory", "v0");
- }
- void MirrorUVRow_NEON(const uint8* src_uv,  // Reads interleaved byte pairs backwards from the end of src_uv and writes the first bytes of the pairs to dst_u and the second bytes to dst_v, each in reversed order.
- uint8* dst_u,
- uint8* dst_v,
- int width) {  // width counts pairs; 8 pairs per iteration, at least one iteration always runs
- asm volatile(
-
- "add %0, %0, %w3, sxtw #1 \n"  // src_uv += width * 2
- "sub %0, %0, #16 \n"  // src_uv now points at the last 8 pairs
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n"  // load 16 bytes: even -> v0, odd -> v1; then src_uv += %4 (-16)
- "subs %w3, %w3, #8 \n"  // width -= 8
- "rev64 v0.8b, v0.8b \n"  // reverse the 8 bytes of v0
- "rev64 v1.8b, v1.8b \n"  // reverse the 8 bytes of v1
- "st1 {v0.8b}, [%1], #8 \n"  // store to dst_u
- "st1 {v1.8b}, [%2], #8 \n"  // store to dst_v
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_uv),  // %0
- "+r"(dst_u),  // %1
- "+r"(dst_v),  // %2
- "+r"(width)  // %3
- : "r"((ptrdiff_t)-16)  // %4: backward step for src_uv
- : "cc", "memory", "v0", "v1");
- }
- void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {  // Writes the 4-byte pixels of src to dst in reverse pixel order (bytes inside a pixel keep their order); 4 pixels per iteration, at least one iteration always runs.
- asm volatile(
-
- "add %0, %0, %w2, sxtw #2 \n"  // src += width * 4
- "sub %0, %0, #16 \n"  // src now points at the last 4 pixels
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n"  // load 16 bytes, then src += %3 (-16)
- "subs %w2, %w2, #4 \n"  // width -= 4
- "rev64 v0.4s, v0.4s \n"  // swap the two 32-bit words inside each 8-byte half
- "st1 {v0.D}[1], [%1], #8 \n"  // store the high half first...
- "st1 {v0.D}[0], [%1], #8 \n"  // ...then the low half: 4 pixels reversed
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width)  // %2
- : "r"((ptrdiff_t)-16)  // %3: backward step for src
- : "cc", "memory", "v0");
- }
- void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {  // Expands 3-byte pixels to 4-byte pixels by appending a 255 byte; the 3 source bytes keep their order. 8 pixels per iteration, at least one iteration always runs.
- asm volatile(
- "movi v4.8b, #255 \n"  // v4 = 255, the appended 4th byte
- "1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels de-interleaved by 3
- "subs %w2, %w2, #8 \n"  // width -= 8
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 interleaved 4-byte pixels
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_rgb24),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4"
- );
- }
- void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {  // Expands 3-byte pixels to 4-byte pixels: the 3 source bytes are written in reversed order, followed by 255. 8 pixels per iteration, at least one iteration always runs.
- asm volatile(
- "movi v5.8b, #255 \n"  // v5 = 255, the appended 4th byte
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels de-interleaved by 3
- "subs %w2, %w2, #8 \n"  // width -= 8
- "orr v3.8b, v1.8b, v1.8b \n"  // v3 = source byte 1 (st4 needs consecutive registers)
- "orr v4.8b, v0.8b, v0.8b \n"  // v4 = source byte 0
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store byte 2, byte 1, byte 0, 255
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_raw),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
- );
- }
- void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {  // Reverses the byte order inside every 3-byte pixel. 8 pixels per iteration, at least one iteration always runs.
- asm volatile(
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels de-interleaved by 3
- "subs %w2, %w2, #8 \n"  // width -= 8
- "orr v3.8b, v1.8b, v1.8b \n"  // v3 = source byte 1 (st3 needs consecutive registers)
- "orr v4.8b, v0.8b, v0.8b \n"  // v4 = source byte 0
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store byte 2, byte 1, byte 0
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_raw),  // %0
- "+r"(dst_rgb24),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4"
- );
- }
- #define RGB565TOARGB /* In: v0.8h = 8 16-bit pixels (bits 15..11, 10..5, 4..0 as packed by ARGBTORGB565). Out: v0.8b = 4..0 field, v1.8b = 10..5 field, v2.8b = 15..11 field, each widened to 8 bits by repeating its top bits in the low bits. Clobbers v4, v6. */ \
- "shrn v6.8b, v0.8h, #5 \n" /* low byte of (pixel >> 5): 6-bit G in bits 5..0 */ \
- "shl v6.8b, v6.8b, #2 \n" /* G moved to bits 7..2, other bits dropped */ \
- "ushr v4.8b, v6.8b, #6 \n" /* top 2 bits of G */ \
- "orr v1.8b, v4.8b, v6.8b \n" /* v1 = 8-bit G */ \
- "xtn v2.8b, v0.8h \n" /* v2 low half = low byte of pixel (B in bits 4..0) */ \
- "ushr v0.8h, v0.8h, #11 \n" /* v0 = 5-bit R */ \
- "xtn2 v2.16b,v0.8h \n" /* v2 high half = 5-bit R */ \
- "shl v2.16b, v2.16b, #3 \n" /* B and R moved to bits 7..3 */ \
- "ushr v0.16b, v2.16b, #5 \n" /* top 3 bits of B and R */ \
- "orr v0.16b, v0.16b, v2.16b \n" /* v0 low half = 8-bit B, high half = 8-bit R */ \
- "dup v2.2D, v0.D[1] \n" /* v2 = 8-bit R */
- void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {  // Expands 16-bit pixels via RGB565TOARGB to 4-byte pixels stored as B, G, R, 255. 8 pixels per iteration, at least one iteration always runs.
- asm volatile(
- "movi v3.8b, #255 \n"  // v3 = 255, the 4th byte of every pixel
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 8 16-bit pixels
- "subs %w2, %w2, #8 \n"  // width -= 8
- RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 interleaved 4-byte pixels
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_rgb565),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"
- );
- }
- #define ARGB1555TOARGB /* In: v0.8h = 8 16-bit pixels (bit 15, bits 14..10, 9..5, 4..0 as packed by ARGBTOARGB1555). Out: v0.8b = 4..0 field (B), v1.8b = 9..5 field (G), v2.8b = 14..10 field (R), each widened to 8 bits; v3.8b = 0xff where bit 15 is set, else 0. */ \
- "ushr v2.8h, v0.8h, #10 \n" /* bit 15 and the 5-bit R field */ \
- "shl v2.8h, v2.8h, #3 \n" /* R moved to bits 7..3 of the low byte */ \
- "xtn v3.8b, v2.8h \n" /* v3 low half = R << 3 */ \
- \
- "sshr v2.8h, v0.8h, #15 \n" /* arithmetic shift: all ones if bit 15 set, else 0 */ \
- "xtn2 v3.16b, v2.8h \n" /* v3 high half = 0xff or 0x00 (alpha) */ \
- \
- "xtn v2.8b, v0.8h \n" /* v2 low half = low byte of pixel (B in bits 4..0) */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* v2 high half = low byte of (pixel >> 5) (G in bits 4..0) */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* top 3 bits of R (low half) and of alpha (high half) */ \
- "shl v0.16b, v2.16b, #3 \n" /* B and G moved to bits 7..3 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* top 3 bits of B and G */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* v0 low half = 8-bit B, high half = 8-bit G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* v2 low half = 8-bit R, high half = alpha */ \
- "dup v1.2D, v0.D[1] \n" /* v1 = 8-bit G */ \
- "dup v3.2D, v2.D[1] \n" /* v3 = alpha */
- #define RGB555TOARGB /* Same field expansion as ARGB1555TOARGB but without producing alpha. In: v0.8h = 8 16-bit pixels. Out: v0.8b = 4..0 field, v1.8b = 9..5 field, v2.8b = 14..10 field, each widened to 8 bits. v3 is used as scratch and does not hold alpha afterwards. */ \
- "ushr v2.8h, v0.8h, #10 \n" /* bit 15 and the 5-bit 14..10 field */ \
- "shl v2.8h, v2.8h, #3 \n" /* field moved to bits 7..3 of the low byte */ \
- "xtn v3.8b, v2.8h \n" /* v3 low half = field << 3 */ \
- \
- "xtn v2.8b, v0.8h \n" /* v2 low half = low byte of pixel (4..0 field) */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* v2 high half = low byte of (pixel >> 5) (9..5 field) */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* top 3 bits of the 14..10 field */ \
- "shl v0.16b, v2.16b, #3 \n" /* 4..0 and 9..5 fields moved to bits 7..3 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* their top 3 bits */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* v0 low half = 8-bit 4..0 field, high half = 8-bit 9..5 field */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* v2 low half = 8-bit 14..10 field */ \
- "dup v1.2D, v0.D[1] \n" /* v1 = 8-bit 9..5 field */
- void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,  // Expands 16-bit pixels via ARGB1555TOARGB to 4-byte pixels stored as B, G, R, A. 8 pixels per iteration, at least one iteration always runs.
- uint8* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n"  // NOTE(review): v3 is overwritten by ARGB1555TOARGB before it is stored, so this value appears unused
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 8 16-bit pixels
- "subs %w2, %w2, #8 \n"  // width -= 8
- ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 interleaved 4-byte pixels
-
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_argb1555),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
- }
- #define ARGB4444TOARGB /* In: v0.8h = 8 two-byte pixels of four nibbles each. Out: v0.8b / v1.8b = low / high nibble of the first byte, v2.8b / v3.8b = low / high nibble of the second byte, each widened to 8 bits by duplicating the nibble (B, G, R, A for data packed by ARGBTOARGB4444). */ \
- "shrn v1.8b, v0.8h, #8 \n" /* v1 low half = second (high) byte of each pixel */ \
- "xtn2 v1.16b, v0.8h \n" /* v1 high half = first (low) byte of each pixel */ \
- "shl v2.16b, v1.16b, #4 \n" /* low nibbles moved to bits 7..4 */ \
- "ushr v3.16b, v1.16b, #4 \n" /* high nibbles moved to bits 3..0 */ \
- "ushr v0.16b, v2.16b, #4 \n" /* low nibbles back in bits 3..0 */ \
- "shl v1.16b, v3.16b, #4 \n" /* high nibbles back in bits 7..4 */ \
- "orr v2.16b, v0.16b, v2.16b \n" /* low nibbles duplicated: low half = R, high half = B */ \
- "orr v3.16b, v1.16b, v3.16b \n" /* high nibbles duplicated: low half = A, high half = G */ \
- "dup v0.2D, v2.D[1] \n" /* v0 = B */ \
- "dup v1.2D, v3.D[1] \n" /* v1 = G */
- void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,  // Expands 2-byte, 4-nibble pixels via ARGB4444TOARGB to 4-byte pixels. 8 pixels per iteration, at least one iteration always runs.
- uint8* dst_argb,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 8 two-byte pixels
- "subs %w2, %w2, #8 \n"  // width -= 8
- ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 interleaved 4-byte pixels
-
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_argb4444),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4"
- );
- }
- void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {  // Drops the 4th byte of every 4-byte pixel, keeping the first 3 bytes in order. 8 pixels per iteration, at least one iteration always runs.
- asm volatile(
- "1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 pixels de-interleaved by 4
- "subs %w2, %w2, #8 \n"  // width -= 8
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store the first 3 bytes of each pixel
-
- "b.gt 1b \n"  // repeat while width > 0
- : "+r"(src_argb),  // %0
- "+r"(dst_rgb24),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4"
- );
- }
- void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {  // Drops the 4th byte of every 4-byte pixel and reverses the order of the other three; 8 pixels per loop pass.
- asm volatile(
- "1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "orr v4.8b, v2.8b, v2.8b \n"  // v4 = copy of byte-1 plane (overwrites the 4th plane)
- "orr v5.8b, v1.8b, v1.8b \n"  // v5 = copy of byte-0 plane
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store planes in order byte 2, byte 1, byte 0
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_raw),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5"
- );
- }
- void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {  // Copies the even-indexed source bytes to dst_y; 16 output bytes per loop pass.
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // split 32 bytes into even (v0) and odd (v1) bytes
- "subs %w2, %w2, #16 \n"  // 16 pixels consumed per pass
- "st1 {v0.16b}, [%1], #16 \n"  // store the even bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_yuy2),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1"
- );
- }
- void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {  // Copies the odd-indexed source bytes to dst_y; 16 output bytes per loop pass.
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // split 32 bytes into even (v0) and odd (v1) bytes
- "subs %w2, %w2, #16 \n"  // 16 pixels consumed per pass
- "st1 {v1.16b}, [%1], #16 \n"  // store the odd bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_uyvy),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1"
- );
- }
- void YUY2ToUV422Row_NEON(const uint8* src_yuy2,  // From every 4-byte group, writes byte 1 to dst_u and byte 3 to dst_v; 8 groups (width -= 16) per loop pass.
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 groups into 4 byte planes
- "subs %w3, %w3, #16 \n"  // 16 pixels consumed per pass
- "st1 {v1.8b}, [%1], #8 \n"  // byte-1 plane -> dst_u
- "st1 {v3.8b}, [%2], #8 \n"  // byte-3 plane -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_yuy2),  // %0
- "+r"(dst_u),  // %1
- "+r"(dst_v),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
- }
- void UYVYToUV422Row_NEON(const uint8* src_uyvy,  // From every 4-byte group, writes byte 0 to dst_u and byte 2 to dst_v; 8 groups (width -= 16) per loop pass.
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 groups into 4 byte planes
- "subs %w3, %w3, #16 \n"  // 16 pixels consumed per pass
- "st1 {v0.8b}, [%1], #8 \n"  // byte-0 plane -> dst_u
- "st1 {v2.8b}, [%2], #8 \n"  // byte-2 plane -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_uyvy),  // %0
- "+r"(dst_u),  // %1
- "+r"(dst_v),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
- }
- void YUY2ToUVRow_NEON(const uint8* src_yuy2,  // Like the UV422 variant, but each output byte is the rounded average of this row and the row stride_yuy2 bytes further on.
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_yuy2b = src_yuy2 + stride_yuy2;  // second source row
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // first row: 8 groups -> 4 byte planes
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // second row, same layout
- "urhadd v1.8b, v1.8b, v5.8b \n"  // rounded average of the byte-1 planes
- "urhadd v3.8b, v3.8b, v7.8b \n"  // rounded average of the byte-3 planes
- "st1 {v1.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v3.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_yuy2),  // %0
- "+r"(src_yuy2b),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v7"
- );
- }
- void UYVYToUVRow_NEON(const uint8* src_uyvy,  // Like the UV422 variant, but each output byte is the rounded average of this row and the row stride_uyvy bytes further on.
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_uyvyb = src_uyvy + stride_uyvy;  // second source row
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // first row: 8 groups -> 4 byte planes
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // second row, same layout
- "urhadd v0.8b, v0.8b, v4.8b \n"  // rounded average of the byte-0 planes
- "urhadd v2.8b, v2.8b, v6.8b \n"  // rounded average of the byte-2 planes
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v2.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_uyvy),  // %0
- "+r"(src_uyvyb),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v7"
- );
- }
- void ARGBShuffleRow_NEON(const uint8* src_argb,  // Permutes each 16-byte group of the source using the 16 byte indices read from shuffler; 4 pixels per loop pass.
- uint8* dst_argb,
- const uint8* shuffler,
- int width) {
- asm volatile(
- "ld1 {v2.16b}, [%3] \n"  // load the 16 index bytes once
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 16 source bytes (4 pixels)
- "subs %w2, %w2, #4 \n"  // 4 pixels consumed per pass
- "tbl v1.16b, {v0.16b}, v2.16b \n"  // v1[i] = v0[v2[i]]
- "st1 {v1.16b}, [%1], #16 \n"  // store the permuted bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- : "r"(shuffler)  // %3
- : "cc", "memory", "v0", "v1", "v2"
- );
- }
- void I422ToYUY2Row_NEON(const uint8* src_y,  // Interleaves 16 src_y bytes with 8 src_u and 8 src_v bytes per loop pass as y0,u,y1,v groups.
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n"  // 16 Y bytes split into even (v0) and odd (v1)
- "orr v2.8b, v1.8b, v1.8b \n"  // move the odd Y bytes to v2, freeing v1
- "ld1 {v1.8b}, [%1], #8 \n"  // 8 bytes from src_u
- "ld1 {v3.8b}, [%2], #8 \n"  // 8 bytes from src_v
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // store interleaved: even Y, U, odd Y, V
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_y),  // %0
- "+r"(src_u),  // %1
- "+r"(src_v),  // %2
- "+r"(dst_yuy2),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
- }
- void I422ToUYVYRow_NEON(const uint8* src_y,  // Interleaves 16 src_y bytes with 8 src_u and 8 src_v bytes per loop pass as u,y0,v,y1 groups.
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n"  // 16 Y bytes split into even (v1) and odd (v2)
- "orr v3.8b, v2.8b, v2.8b \n"  // move the odd Y bytes to v3, freeing v2
- "ld1 {v0.8b}, [%1], #8 \n"  // 8 bytes from src_u
- "ld1 {v2.8b}, [%2], #8 \n"  // 8 bytes from src_v
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // store interleaved: U, even Y, V, odd Y
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_y),  // %0
- "+r"(src_u),  // %1
- "+r"(src_v),  // %2
- "+r"(dst_uyvy),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
- }
- void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {  // Packs 8 four-byte pixels into 16 output bytes per loop pass via ARGBTORGB565.
- asm volatile(
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // de-interleave 8 pixels into v20..v23
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- ARGBTORGB565  // macro defined earlier in the file; result is taken from v0
- "st1 {v0.16b}, [%1], #16 \n"  // store 16 bytes (8 output pixels)
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_rgb565),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
- }
- // Packs 8 four-byte pixels into 16 output bytes per loop pass via
- // ARGBTORGB565, after a saturating add of the dither4 bytes to the first
- // three channel planes. Byte (i % 4) of dither4 is applied to pixel i.
- //
- // src_argb and width are read-write operands: the asm post-increments the
- // pointer and decrements the counter, and GCC extended asm does not permit
- // modifying input-only operands. Operand numbering matches
- // ARGBToRGB565Row_NEON.
- void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
- int width) {
- asm volatile(
- "dup v1.4s, %w3 \n"  // replicate dither4 into every 32-bit lane
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // de-interleave 8 pixels
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "uqadd v20.8b, v20.8b, v1.8b \n"  // saturating add of the dither bytes
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n"
- ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n"  // store 16 bytes (8 output pixels)
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_rgb),  // %1
- "+r"(width)  // %2
- : "r"(dither4)  // %3
- : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
- }
- void ARGBToARGB1555Row_NEON(const uint8* src_argb,  // Packs 8 four-byte pixels into 16 output bytes per loop pass via ARGBTOARGB1555.
- uint8* dst_argb1555,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // de-interleave 8 pixels into v20..v23
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- ARGBTOARGB1555  // macro defined earlier in the file; result is taken from v0
- "st1 {v0.16b}, [%1], #16 \n"  // store 16 bytes (8 output pixels)
- 
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_argb1555),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
- }
- void ARGBToARGB4444Row_NEON(const uint8* src_argb,  // Packs 8 four-byte pixels into 16 output bytes per loop pass via ARGBTOARGB4444.
- uint8* dst_argb4444,
- int width) {
- asm volatile(
- "movi v4.16b, #0x0f \n"  // low-nibble mask; presumably consumed by ARGBTOARGB4444 (macro not shown here)
- 
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // de-interleave 8 pixels into v20..v23
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- ARGBTOARGB4444  // macro defined earlier in the file; result is taken from v0
- "st1 {v0.16b}, [%1], #16 \n"  // store 16 bytes (8 output pixels)
- 
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_argb4444),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
- }
- void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {  // Per pixel: y = sat8(round((13*b0 + 65*b1 + 33*b2) / 128)) + 16 (saturating), b0..b2 = first three bytes; 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #13 \n"  // weight for byte 0
- "movi v5.8b, #65 \n"  // weight for byte 1
- "movi v6.8b, #33 \n"  // weight for byte 2
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v3.8h, v0.8b, v4.8b \n"  // 16-bit weighted sum (4th plane in v3 is overwritten)
- "umlal v3.8h, v1.8b, v5.8b \n"
- "umlal v3.8h, v2.8b, v6.8b \n"
- "sqrshrun v0.8b, v3.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
- }
- void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {  // Copies the 4th byte of every 4-byte pixel to dst_a; 16 pixels per loop pass.
- asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // de-interleave 16 pixels into 4 byte planes
- 
- "subs %w2, %w2, #16 \n"  // 16 pixels consumed per pass
- "st1 {v3.16b}, [%1], #16 \n"  // store the byte-3 plane
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_a),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
- }
- void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {  // Per pixel: y = sat8(round((15*b0 + 75*b1 + 38*b2) / 128)), no offset; 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #15 \n"  // weight for byte 0
- "movi v5.8b, #75 \n"  // weight for byte 1
- "movi v6.8b, #38 \n"  // weight for byte 2
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v3.8h, v0.8b, v4.8b \n"  // 16-bit weighted sum (4th plane in v3 is overwritten)
- "umlal v3.8h, v1.8b, v5.8b \n"
- "umlal v3.8h, v2.8b, v6.8b \n"
- "sqrshrun v0.8b, v3.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
- }
- void ARGBToUV444Row_NEON(const uint8* src_argb,  // Writes one dst_u and one dst_v byte per source pixel (no subsampling); 8 pixels per loop pass.
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- asm volatile(
- "movi v24.8b, #112 \n"  // positive weight used by both outputs
- 
- "movi v25.8b, #74 \n"  // u: weight subtracted for byte 1
- "movi v26.8b, #38 \n"  // u: weight subtracted for byte 2
- "movi v27.8b, #18 \n"  // v: weight subtracted for byte 0
- "movi v28.8b, #94 \n"  // v: weight subtracted for byte 1
- "movi v29.16b,#0x80 \n"  // 0x8080 per 16-bit lane: bias added before the >> 8
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- 
- "subs %w3, %w3, #8 \n"  // 8 pixels consumed per pass
- "umull v4.8h, v0.8b, v24.8b \n"  // u = 112*b0 - 74*b1 - 38*b2 + 0x8080
- "umlsl v4.8h, v1.8b, v25.8b \n"
- "umlsl v4.8h, v2.8b, v26.8b \n"
- "add v4.8h, v4.8h, v29.8h \n"
- "umull v3.8h, v2.8b, v24.8b \n"  // v = 112*b2 - 94*b1 - 18*b0 + 0x8080
- "umlsl v3.8h, v1.8b, v28.8b \n"
- "umlsl v3.8h, v0.8b, v27.8b \n"
- "add v3.8h, v3.8h, v29.8h \n"
- "uqshrn v0.8b, v4.8h, #8 \n"  // >> 8 with unsigned saturation to 8 bits
- "uqshrn v1.8b, v3.8h, #8 \n"
- "st1 {v0.8b}, [%1], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%2], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_u),  // %1
- "+r"(dst_v),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
- "v27", "v28", "v29");
- }
- #define RGBTOUV_SETUP_REG /* Loads the 16-bit constants that RGBTOUV reads from v20..v25. */ \
- "movi v20.8h, #56, lsl #0 \n" /* positive weight shared by both outputs */ \
- "movi v21.8h, #37, lsl #0 \n" /* first output: weight subtracted for QG */ \
- "movi v22.8h, #19, lsl #0 \n" /* first output: weight subtracted for QR */ \
- "movi v23.8h, #9, lsl #0 \n" /* second output: weight subtracted for QB */ \
- "movi v24.8h, #47, lsl #0 \n" /* second output: weight subtracted for QG */ \
- "movi v25.16b, #0x80 \n" /* 0x8080 per 16-bit lane: bias added before the >> 8 */
- #define RGBTOUV(QB, QG, QR) /* From three 16-bit channel vectors: v0.8b = sat8((QB*v20 - QG*v21 - QR*v22 + v25) >> 8), v1.8b = sat8((QR*v20 - QG*v24 - QB*v23 + v25) >> 8). v3 and v4 are scratch. */ \
- "mul v3.8h, " #QB ",v20.8h \n" /* first output accumulates in v3 */ \
- "mul v4.8h, " #QR ",v20.8h \n" /* second output accumulates in v4 */ \
- "mls v3.8h, " #QG ",v21.8h \n" \
- "mls v4.8h, " #QG ",v24.8h \n" \
- "mls v3.8h, " #QR ",v22.8h \n" \
- "mls v4.8h, " #QB ",v23.8h \n" \
- "add v3.8h, v3.8h, v25.8h \n" /* add the bias */ \
- "add v4.8h, v4.8h, v25.8h \n" \
- "uqshrn v0.8b, v3.8h, #8 \n" /* >> 8 with unsigned saturation to 8 bits */ \
- "uqshrn v1.8b, v4.8h, #8 \n"
- void ARGBToUVRow_NEON(const uint8* src_argb,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 pixels of two rows (2x2 pixel blocks summed per channel).
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // first row: 16 pixels -> 4 byte planes
- "uaddlp v0.8h, v0.16b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v1.8h, v1.16b \n"
- "uaddlp v2.8h, v2.16b \n"
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // second row, same layout
- "uadalp v0.8h, v4.16b \n"  // accumulate the second row's pair sums
- "uadalp v1.8h, v5.16b \n"
- "uadalp v2.8h, v6.16b \n"
- "urshr v0.8h, v0.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v0.8h, v1.8h, v2.8h)  // QB = byte 0, QG = byte 1, QR = byte 2
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(src_argb_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void ARGBToUVJRow_NEON(const uint8* src_argb,  // Same data flow as ARGBToUVRow_NEON, but loads its own weights (63, 42, 21, 10, 53) instead of RGBTOUV_SETUP_REG.
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row
- asm volatile (
- "movi v20.8h, #63, lsl #0 \n"  // positive weight shared by both outputs
- "movi v21.8h, #42, lsl #0 \n"  // first output: weight subtracted for QG
- "movi v22.8h, #21, lsl #0 \n"  // first output: weight subtracted for QR
- "movi v23.8h, #10, lsl #0 \n"  // second output: weight subtracted for QB
- "movi v24.8h, #53, lsl #0 \n"  // second output: weight subtracted for QG
- "movi v25.16b, #0x80 \n"  // 0x8080 per 16-bit lane: bias added before the >> 8
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // first row: 16 pixels -> 4 byte planes
- "uaddlp v0.8h, v0.16b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v1.8h, v1.16b \n"
- "uaddlp v2.8h, v2.16b \n"
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // second row, same layout
- "uadalp v0.8h, v4.16b \n"  // accumulate the second row's pair sums
- "uadalp v1.8h, v5.16b \n"
- "uadalp v2.8h, v6.16b \n"
- "urshr v0.8h, v0.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v0.8h, v1.8h, v2.8h)  // QB = byte 0, QG = byte 1, QR = byte 2
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(src_argb_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void BGRAToUVRow_NEON(const uint8* src_bgra,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 pixels of two rows; channels are taken from bytes 3, 2, 1 of each pixel.
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_bgra_1 = src_bgra + src_stride_bgra;  // second source row
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // first row: 16 pixels -> 4 byte planes
- "uaddlp v0.8h, v3.16b \n"  // pair sums of byte 3 (byte-0 plane is discarded)
- "uaddlp v3.8h, v2.16b \n"  // pair sums of byte 2
- "uaddlp v2.8h, v1.16b \n"  // pair sums of byte 1
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // second row, same layout
- "uadalp v0.8h, v7.16b \n"  // accumulate the second row's pair sums
- "uadalp v3.8h, v6.16b \n"
- "uadalp v2.8h, v5.16b \n"
- "urshr v0.8h, v0.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v1.8h, v3.8h, #1 \n"  // byte-2 result moves to v1
- "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v0.8h, v1.8h, v2.8h)  // QB = byte 3, QG = byte 2, QR = byte 1
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_bgra),  // %0
- "+r"(src_bgra_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void ABGRToUVRow_NEON(const uint8* src_abgr,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 pixels of two rows; channels are taken from bytes 2, 1, 0 of each pixel.
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second source row
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // first row: 16 pixels -> 4 byte planes
- "uaddlp v3.8h, v2.16b \n"  // pair sums of byte 2 (byte-3 plane is discarded)
- "uaddlp v2.8h, v1.16b \n"  // pair sums of byte 1
- "uaddlp v1.8h, v0.16b \n"  // pair sums of byte 0
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // second row, same layout
- "uadalp v3.8h, v6.16b \n"  // accumulate the second row's pair sums
- "uadalp v2.8h, v5.16b \n"
- "uadalp v1.8h, v4.16b \n"
- "urshr v0.8h, v3.8h, #1 \n"  // rounding halve; byte-2 result moves to v0
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v0.8h, v2.8h, v1.8h)  // QB = byte 2, QG = byte 1, QR = byte 0
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_abgr),  // %0
- "+r"(src_abgr_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void RGBAToUVRow_NEON(const uint8* src_rgba,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 pixels of two rows; channels are taken from bytes 1, 2, 3 of each pixel.
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second source row
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // first row: 16 pixels -> 4 byte planes
- "uaddlp v0.8h, v1.16b \n"  // pair sums of byte 1 (byte-0 plane is discarded)
- "uaddlp v1.8h, v2.16b \n"  // pair sums of byte 2
- "uaddlp v2.8h, v3.16b \n"  // pair sums of byte 3
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // second row, same layout
- "uadalp v0.8h, v5.16b \n"  // accumulate the second row's pair sums
- "uadalp v1.8h, v6.16b \n"
- "uadalp v2.8h, v7.16b \n"
- "urshr v0.8h, v0.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v0.8h, v1.8h, v2.8h)  // QB = byte 1, QG = byte 2, QR = byte 3
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_rgba),  // %0
- "+r"(src_rgba_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void RGB24ToUVRow_NEON(const uint8* src_rgb24,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 three-byte pixels of two rows.
- int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second source row
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // first row: 16 pixels -> 3 byte planes
- "uaddlp v0.8h, v0.16b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v1.8h, v1.16b \n"
- "uaddlp v2.8h, v2.16b \n"
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // second row, same layout
- "uadalp v0.8h, v4.16b \n"  // accumulate the second row's pair sums
- "uadalp v1.8h, v5.16b \n"
- "uadalp v2.8h, v6.16b \n"
- "urshr v0.8h, v0.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v0.8h, v1.8h, v2.8h)  // QB = byte 0, QG = byte 1, QR = byte 2
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_rgb24),  // %0
- "+r"(src_rgb24_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void RAWToUVRow_NEON(const uint8* src_raw,  // Same as the RGB24 variant, but with the channel roles of byte 0 and byte 2 swapped.
- int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_raw_1 = src_raw + src_stride_raw;  // second source row
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // first row: 16 pixels -> 3 byte planes
- "uaddlp v2.8h, v2.16b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v1.8h, v1.16b \n"
- "uaddlp v0.8h, v0.16b \n"
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // second row, same layout
- "uadalp v2.8h, v6.16b \n"  // accumulate the second row's pair sums
- "uadalp v1.8h, v5.16b \n"
- "uadalp v0.8h, v4.16b \n"
- "urshr v2.8h, v2.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- RGBTOUV(v2.8h, v1.8h, v0.8h)  // QB = byte 2, QG = byte 1, QR = byte 0
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_raw),  // %0
- "+r"(src_raw_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
- }
- void RGB565ToUVRow_NEON(const uint8* src_rgb565,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 16-bit pixels of two rows, expanded with RGB565TOARGB.
- int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second source row
- asm volatile(
- "movi v22.8h, #56, lsl #0 \n"  // positive weight shared by both outputs
- 
- "movi v23.8h, #37, lsl #0 \n"  // u: weight subtracted for v5
- "movi v24.8h, #19, lsl #0 \n"  // u: weight subtracted for v6
- "movi v25.8h, #9 , lsl #0 \n"  // v: weight subtracted for v4
- "movi v26.8h, #47, lsl #0 \n"  // v: weight subtracted for v5
- "movi v27.16b, #0x80 \n"  // 0x8080 per 16-bit lane: bias added before the >> 8
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // first row, pixels 0..7
- RGB565TOARGB  // macro defined earlier in the file; planes are read from v0, v1, v2
- "uaddlp v16.4h, v0.8b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v18.4h, v1.8b \n"
- "uaddlp v20.4h, v2.8b \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // first row, pixels 8..15
- RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n"
- "uaddlp v19.4h, v1.8b \n"
- "uaddlp v21.4h, v2.8b \n"
- "ld1 {v0.16b}, [%1], #16 \n"  // second row, pixels 0..7
- RGB565TOARGB
- "uadalp v16.4h, v0.8b \n"  // accumulate the second row's pair sums
- "uadalp v18.4h, v1.8b \n"
- "uadalp v20.4h, v2.8b \n"
- "ld1 {v0.16b}, [%1], #16 \n"  // second row, pixels 8..15
- RGB565TOARGB
- "uadalp v17.4h, v0.8b \n"
- "uadalp v19.4h, v1.8b \n"
- "uadalp v21.4h, v2.8b \n"
- "ins v16.D[1], v17.D[0] \n"  // join the two 4-lane halves into 8 lanes
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "mul v16.8h, v4.8h, v22.8h \n"  // u = 56*v4 - 37*v5 - 19*v6 + 0x8080
- "mls v16.8h, v5.8h, v23.8h \n"
- "mls v16.8h, v6.8h, v24.8h \n"
- "add v16.8h, v16.8h, v27.8h \n"
- "mul v17.8h, v6.8h, v22.8h \n"  // v = 56*v6 - 47*v5 - 9*v4 + 0x8080
- "mls v17.8h, v5.8h, v26.8h \n"
- "mls v17.8h, v4.8h, v25.8h \n"
- "add v17.8h, v17.8h, v27.8h \n"
- "uqshrn v0.8b, v16.8h, #8 \n"  // >> 8 with unsigned saturation to 8 bits
- "uqshrn v1.8b, v17.8h, #8 \n"
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_rgb565),  // %0
- "+r"(src_rgb565_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
- }
- void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 16-bit pixels of two rows, expanded with RGB555TOARGB.
- int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // second source row
- asm volatile(
- RGBTOUV_SETUP_REG  // weights in v20..v24, bias in v25
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // first row, pixels 0..7
- RGB555TOARGB  // three 8-bit planes in v0, v1, v2
- "uaddlp v16.4h, v0.8b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v17.4h, v1.8b \n"
- "uaddlp v18.4h, v2.8b \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // first row, pixels 8..15
- RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n"
- "uaddlp v27.4h, v1.8b \n"
- "uaddlp v28.4h, v2.8b \n"
- "ld1 {v0.16b}, [%1], #16 \n"  // second row, pixels 0..7
- RGB555TOARGB
- "uadalp v16.4h, v0.8b \n"  // accumulate the second row's pair sums
- "uadalp v17.4h, v1.8b \n"
- "uadalp v18.4h, v2.8b \n"
- "ld1 {v0.16b}, [%1], #16 \n"  // second row, pixels 8..15
- RGB555TOARGB
- "uadalp v26.4h, v0.8b \n"
- "uadalp v27.4h, v1.8b \n"
- "uadalp v28.4h, v2.8b \n"
- "ins v16.D[1], v26.D[0] \n"  // join the two 4-lane halves into 8 lanes
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "mul v2.8h, v4.8h, v20.8h \n"  // u = v4*v20 - v5*v21 - v6*v22 + v25
- "mls v2.8h, v5.8h, v21.8h \n"
- "mls v2.8h, v6.8h, v22.8h \n"
- "add v2.8h, v2.8h, v25.8h \n"
- "mul v3.8h, v6.8h, v20.8h \n"  // v = v6*v20 - v5*v24 - v4*v23 + v25
- "mls v3.8h, v5.8h, v24.8h \n"
- "mls v3.8h, v4.8h, v23.8h \n"
- "add v3.8h, v3.8h, v25.8h \n"
- "uqshrn v0.8b, v2.8h, #8 \n"  // >> 8 with unsigned saturation to 8 bits
- "uqshrn v1.8b, v3.8h, #8 \n"
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb1555),  // %0
- "+r"(src_argb1555_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
- "v28");
- }
- void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,  // Produces 8 dst_u and 8 dst_v bytes per loop pass from 16 16-bit pixels of two rows, expanded with ARGB4444TOARGB.
- int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // second source row
- asm volatile(
- RGBTOUV_SETUP_REG  // weights in v20..v24, bias in v25
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // first row, pixels 0..7
- ARGB4444TOARGB  // one 8-bit plane per nibble in v0..v3; v0, v1, v2 are used below
- "uaddlp v16.4h, v0.8b \n"  // add horizontally adjacent byte pairs -> 16-bit
- "uaddlp v17.4h, v1.8b \n"
- "uaddlp v18.4h, v2.8b \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // first row, pixels 8..15
- ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n"
- "uaddlp v27.4h, v1.8b \n"
- "uaddlp v28.4h, v2.8b \n"
- "ld1 {v0.16b}, [%1], #16 \n"  // second row, pixels 0..7
- ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n"  // accumulate the second row's pair sums
- "uadalp v17.4h, v1.8b \n"
- "uadalp v18.4h, v2.8b \n"
- "ld1 {v0.16b}, [%1], #16 \n"  // second row, pixels 8..15
- ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n"
- "uadalp v27.4h, v1.8b \n"
- "uadalp v28.4h, v2.8b \n"
- "ins v16.D[1], v26.D[0] \n"  // join the two 4-lane halves into 8 lanes
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n"  // rounding halve of each 2x2 sum
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n"  // 16 pixels consumed per pass
- "mul v2.8h, v4.8h, v20.8h \n"  // u = v4*v20 - v5*v21 - v6*v22 + v25
- "mls v2.8h, v5.8h, v21.8h \n"
- "mls v2.8h, v6.8h, v22.8h \n"
- "add v2.8h, v2.8h, v25.8h \n"
- "mul v3.8h, v6.8h, v20.8h \n"  // v = v6*v20 - v5*v24 - v4*v23 + v25
- "mls v3.8h, v5.8h, v24.8h \n"
- "mls v3.8h, v4.8h, v23.8h \n"
- "add v3.8h, v3.8h, v25.8h \n"
- "uqshrn v0.8b, v2.8h, #8 \n"  // >> 8 with unsigned saturation to 8 bits
- "uqshrn v1.8b, v3.8h, #8 \n"
- "st1 {v0.8b}, [%2], #8 \n"  // -> dst_u
- "st1 {v1.8b}, [%3], #8 \n"  // -> dst_v
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb4444),  // %0
- "+r"(src_argb4444_1),  // %1
- "+r"(dst_u),  // %2
- "+r"(dst_v),  // %3
- "+r"(width)  // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
- "v28"
- );
- }
- void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {  // Expands 8 16-bit pixels per loop pass with RGB565TOARGB, then y = sat8(round((13*v0 + 65*v1 + 33*v2) / 128)) + 16 (saturating).
- asm volatile(
- "movi v24.8b, #13 \n"  // weight for plane v0
- "movi v25.8b, #65 \n"  // weight for plane v1
- "movi v26.8b, #33 \n"  // weight for plane v2
- "movi v27.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 16 source bytes (8 pixels)
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- RGB565TOARGB  // macro defined earlier in the file; planes are read from v0, v1, v2
- "umull v3.8h, v0.8b, v24.8b \n"  // 16-bit weighted sum
- "umlal v3.8h, v1.8b, v25.8b \n"
- "umlal v3.8h, v2.8b, v26.8b \n"
- "sqrshrun v0.8b, v3.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v27.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_rgb565),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
- "v27");
- }
- void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {  // Expands 8 16-bit pixels per loop pass with ARGB1555TOARGB, then y = sat8(round((13*v0 + 65*v1 + 33*v2) / 128)) + 16 (saturating).
- asm volatile(
- "movi v4.8b, #13 \n"  // weight for plane v0
- "movi v5.8b, #65 \n"  // weight for plane v1
- "movi v6.8b, #33 \n"  // weight for plane v2
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 16 source bytes (8 pixels)
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- ARGB1555TOARGB  // expands v0 into byte planes v0..v3
- "umull v3.8h, v0.8b, v4.8b \n"  // 16-bit weighted sum (plane in v3 is overwritten)
- "umlal v3.8h, v1.8b, v5.8b \n"
- "umlal v3.8h, v2.8b, v6.8b \n"
- "sqrshrun v0.8b, v3.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb1555),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
- }
- void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {  // Expands 8 16-bit pixels per loop pass with ARGB4444TOARGB, then y = sat8(round((13*v0 + 65*v1 + 33*v2) / 128)) + 16 (saturating).
- asm volatile(
- "movi v24.8b, #13 \n"  // weight for plane v0
- "movi v25.8b, #65 \n"  // weight for plane v1
- "movi v26.8b, #33 \n"  // weight for plane v2
- "movi v27.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // load 16 source bytes (8 pixels)
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- ARGB4444TOARGB  // one 8-bit plane per nibble in v0..v3
- "umull v3.8h, v0.8b, v24.8b \n"  // 16-bit weighted sum (plane in v3 is overwritten)
- "umlal v3.8h, v1.8b, v25.8b \n"
- "umlal v3.8h, v2.8b, v26.8b \n"
- "sqrshrun v0.8b, v3.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v27.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb4444),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
- }
- void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {  // Per pixel: y = sat8(round((33*b1 + 65*b2 + 13*b3) / 128)) + 16 (saturating); byte 0 is ignored; 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #33 \n"  // weight for byte 1
- "movi v5.8b, #65 \n"  // weight for byte 2
- "movi v6.8b, #13 \n"  // weight for byte 3
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v16.8h, v1.8b, v4.8b \n"  // 16-bit weighted sum
- "umlal v16.8h, v2.8b, v5.8b \n"
- "umlal v16.8h, v3.8b, v6.8b \n"
- "sqrshrun v0.8b, v16.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_bgra),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
- }
- void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {  // Per pixel: y = sat8(round((33*b0 + 65*b1 + 13*b2) / 128)) + 16 (saturating); byte 3 is ignored; 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #33 \n"  // weight for byte 0
- "movi v5.8b, #65 \n"  // weight for byte 1
- "movi v6.8b, #13 \n"  // weight for byte 2
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v16.8h, v0.8b, v4.8b \n"  // 16-bit weighted sum
- "umlal v16.8h, v1.8b, v5.8b \n"
- "umlal v16.8h, v2.8b, v6.8b \n"
- "sqrshrun v0.8b, v16.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_abgr),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
- }
- void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {  // Per pixel: y = sat8(round((13*b1 + 65*b2 + 33*b3) / 128)) + 16 (saturating); byte 0 is ignored; 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #13 \n"  // weight for byte 1
- "movi v5.8b, #65 \n"  // weight for byte 2
- "movi v6.8b, #33 \n"  // weight for byte 3
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v16.8h, v1.8b, v4.8b \n"  // 16-bit weighted sum
- "umlal v16.8h, v2.8b, v5.8b \n"
- "umlal v16.8h, v3.8b, v6.8b \n"
- "sqrshrun v0.8b, v16.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_rgba),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
- }
- void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {  // Per 3-byte pixel: y = sat8(round((13*b0 + 65*b1 + 33*b2) / 128)) + 16 (saturating); 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #13 \n"  // weight for byte 0
- "movi v5.8b, #65 \n"  // weight for byte 1
- "movi v6.8b, #33 \n"  // weight for byte 2
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // de-interleave 8 pixels into 3 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v16.8h, v0.8b, v4.8b \n"  // 16-bit weighted sum
- "umlal v16.8h, v1.8b, v5.8b \n"
- "umlal v16.8h, v2.8b, v6.8b \n"
- "sqrshrun v0.8b, v16.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_rgb24),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
- }
- void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {  // Per 3-byte pixel: y = sat8(round((33*b0 + 65*b1 + 13*b2) / 128)) + 16 (saturating); 8 pixels per loop pass.
- asm volatile(
- "movi v4.8b, #33 \n"  // weight for byte 0
- "movi v5.8b, #65 \n"  // weight for byte 1
- "movi v6.8b, #13 \n"  // weight for byte 2
- "movi v7.8b, #16 \n"  // offset added after scaling
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // de-interleave 8 pixels into 3 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v16.8h, v0.8b, v4.8b \n"  // 16-bit weighted sum
- "umlal v16.8h, v1.8b, v5.8b \n"
- "umlal v16.8h, v2.8b, v6.8b \n"
- "sqrshrun v0.8b, v16.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
- "st1 {v0.8b}, [%1], #8 \n"  // store 8 output bytes
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_raw),  // %0
- "+r"(dst_y),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
- }
- // Blends two rows, 16 bytes per loop pass. With f = source_y_fraction and
- // the second row starting src_stride bytes after src_ptr:
- //   f == 0   : dst = first row (plain copy)
- //   f == 128 : dst = rounded average of the two rows
- //   otherwise: dst = (row0 * (256 - f) + row1 * f + 128) >> 8
- // The general path writes v2 as well as v3, so both are listed as clobbers;
- // omitting v2 would let the compiler keep a live value in it across the asm.
- void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
- asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"  // fraction 0: copy path
- "cmp %w4, #128 \n"
- "b.eq 50f \n"  // fraction 128: average path
- "dup v5.16b, %w4 \n"  // weight of the second row in every byte
- "dup v4.16b, %w5 \n"  // weight of the first row in every byte
- 
- "1: \n"  // general blend
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"  // 16 bytes consumed per pass
- "umull v2.8h, v0.8b, v4.8b \n"  // row0 * y0, low and high halves
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"  // + row1 * y1
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"  // rounding >> 8, narrow to bytes
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
- 
- "50: \n"  // 50/50 blend
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"  // rounded average
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
- 
- "100: \n"  // copy of the first row
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
- "99: \n"
- : "+r"(dst_ptr),  // %0
- "+r"(src_ptr),  // %1
- "+r"(src_ptr1),  // %2
- "+r"(dst_width),  // %3
- "+r"(y1_fraction),  // %4
- "+r"(y0_fraction)  // %5
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
- }
- void ARGBBlendRow_NEON(const uint8* src_argb0,  // Per channel (bytes 0..2): dst = sat(src0 + (src1 - round(src1 * a0 / 256))), a0 = byte 3 of the src_argb0 pixel; dst byte 3 is set to 255. Processes 8 pixels at a time, then single pixels.
- const uint8* src_argb1,
- uint8* dst_argb,
- int width) {
- asm volatile(
- "subs %w3, %w3, #8 \n"  // fewer than 8 pixels? go straight to the 1-pixel loop
- "b.lt 89f \n"
- 
- "8: \n"  // 8 pixels per pass
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src_argb0 planes; v3 = its byte-3 plane
- 
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // src_argb1 planes
- 
- "subs %w3, %w3, #8 \n"
- "umull v16.8h, v4.8b, v3.8b \n"  // src1 channel * a0
- "umull v17.8h, v5.8b, v3.8b \n"
- "umull v18.8h, v6.8b, v3.8b \n"
- "uqrshrn v16.8b, v16.8h, #8 \n"  // rounding >> 8, saturate to 8 bits
- "uqrshrn v17.8b, v17.8h, #8 \n"
- "uqrshrn v18.8b, v18.8h, #8 \n"
- "uqsub v4.8b, v4.8b, v16.8b \n"  // src1 minus its a0-scaled part
- "uqsub v5.8b, v5.8b, v17.8b \n"
- "uqsub v6.8b, v6.8b, v18.8b \n"
- "uqadd v0.8b, v0.8b, v4.8b \n"  // saturating add onto src0
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "movi v3.8b, #255 \n"  // output byte 3 = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- 
- "b.ge 8b \n"  // another full group of 8 remains
- "89: \n"
- "adds %w3, %w3, #8-1 \n"  // width = leftover pixels - 1
- "b.lt 99f \n"  // nothing left
- 
- "1: \n"  // 1 pixel per pass, using lane 0 only
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"
- "subs %w3, %w3, #1 \n"
- "umull v16.8h, v4.8b, v3.8b \n"  // same arithmetic as the 8-pixel loop
- "umull v17.8h, v5.8b, v3.8b \n"
- "umull v18.8h, v6.8b, v3.8b \n"
- "uqrshrn v16.8b, v16.8h, #8 \n"
- "uqrshrn v17.8b, v17.8h, #8 \n"
- "uqrshrn v18.8b, v18.8h, #8 \n"
- "uqsub v4.8b, v4.8b, v16.8b \n"
- "uqsub v5.8b, v5.8b, v17.8b \n"
- "uqsub v6.8b, v6.8b, v18.8b \n"
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "movi v3.8b, #255 \n"
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store lane 0 only (one pixel)
- "b.ge 1b \n"
- "99: \n"
- : "+r"(src_argb0),  // %0
- "+r"(src_argb1),  // %1
- "+r"(dst_argb),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18");
- }
- void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {  // Multiplies bytes 0..2 of each pixel by byte 3 (rounding >> 8); byte 3 is copied unchanged; 8 pixels per loop pass.
- asm volatile(
- 
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v4.8h, v0.8b, v3.8b \n"  // channel * byte 3, 16-bit
- "umull v5.8h, v1.8b, v3.8b \n"
- "umull v6.8h, v2.8b, v3.8b \n"
- "uqrshrn v0.8b, v4.8h, #8 \n"  // rounding >> 8, saturate to 8 bits
- "uqrshrn v1.8b, v5.8h, #8 \n"
- "uqrshrn v2.8b, v6.8h, #8 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // re-interleave and store
- 
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
- }
- void ARGBQuantizeRow_NEON(uint8* dst_argb,  // In place, for bytes 0..2 of each pixel: c = sat8(((c * scale) >> 16, approximately) * interval_size + interval_offset); byte 3 is unchanged; 8 pixels per loop pass.
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "dup v4.8h, %w2 \n"  // scale in every 16-bit lane
- "ushr v4.8h, v4.8h, #1 \n"  // halved, because sqdmulh doubles the product
- "dup v5.8h, %w3 \n"  // interval_size
- "dup v6.8h, %w4 \n"  // interval_offset
- 
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 pixels without advancing (stored back below)
- "subs %w1, %w1, #8 \n"  // 8 pixels consumed per pass
- "uxtl v0.8h, v0.8b \n"  // widen channels to 16 bits
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n"  // high half of 2 * c * (scale / 2)
- "sqdmulh v1.8h, v1.8h, v4.8h \n"
- "sqdmulh v2.8h, v2.8h, v4.8h \n"
- "mul v0.8h, v0.8h, v5.8h \n"  // * interval_size
- "mul v1.8h, v1.8h, v5.8h \n"
- "mul v2.8h, v2.8h, v5.8h \n"
- "add v0.8h, v0.8h, v6.8h \n"  // + interval_offset
- "add v1.8h, v1.8h, v6.8h \n"
- "add v2.8h, v2.8h, v6.8h \n"
- "uqxtn v0.8b, v0.8h \n"  // narrow with unsigned saturation
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // write back in place and advance
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(dst_argb),  // %0
- "+r"(width)  // %1
- : "r"(scale),  // %2
- "r"(interval_size),  // %3
- "r"(interval_offset)  // %4
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
- }
- void ARGBShadeRow_NEON(const uint8* src_argb,  // Scales byte k of each pixel by byte k of value (k = 0..3); 8 pixels per loop pass.
- uint8* dst_argb,
- int width,
- uint32 value) {
- asm volatile(
- "dup v0.4s, %w3 \n"  // value in every 32-bit lane
- "zip1 v0.8b, v0.8b, v0.8b \n"  // double each byte: 16-bit lane k = byte k repeated twice
- "ushr v0.8h, v0.8h, #1 \n"  // halved, because sqrdmulh doubles the product
- 
- "1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "uxtl v4.8h, v4.8b \n"  // widen channels to 16 bits
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n"  // rounded high half of 2 * channel * scale lane
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n"
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n"
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n"
- "uqxtn v4.8b, v4.8h \n"  // narrow with unsigned saturation
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // re-interleave and store
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- : "r"(value)  // %3
- : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
- }
- void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {  // Replaces bytes 0..2 of each pixel with g = sat8(round((15*b0 + 75*b1 + 38*b2) / 128)); byte 3 is copied unchanged; 8 pixels per loop pass.
- asm volatile(
- "movi v24.8b, #15 \n"  // weight for byte 0
- "movi v25.8b, #75 \n"  // weight for byte 1
- "movi v26.8b, #38 \n"  // weight for byte 2
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // de-interleave 8 pixels into 4 byte planes
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per pass
- "umull v4.8h, v0.8b, v24.8b \n"  // 16-bit weighted sum
- "umlal v4.8h, v1.8b, v25.8b \n"
- "umlal v4.8h, v2.8b, v26.8b \n"
- "sqrshrun v0.8b, v4.8h, #7 \n"  // rounding >> 7, saturate to 8 bits
- "orr v1.8b, v0.8b, v0.8b \n"  // copy the gray value into the other two channels
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // re-interleave and store
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(src_argb),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
- }
- void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {  // In place: each of bytes 0..2 becomes a saturated weighted sum of the original bytes 0..2, shifted right by 7; byte 3 is unchanged; 8 pixels per loop pass.
- asm volatile(
- "movi v20.8b, #17 \n"  // new byte 0 = (17*b0 + 68*b1 + 35*b2) >> 7
- "movi v21.8b, #68 \n"
- "movi v22.8b, #35 \n"
- "movi v24.8b, #22 \n"  // new byte 1 = (22*b0 + 88*b1 + 45*b2) >> 7
- "movi v25.8b, #88 \n"
- "movi v26.8b, #45 \n"
- "movi v28.8b, #24 \n"  // new byte 2 = (24*b0 + 98*b1 + 50*b2) >> 7
- "movi v29.8b, #98 \n"
- "movi v30.8b, #50 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 pixels without advancing (stored back below)
- "subs %w1, %w1, #8 \n"  // 8 pixels consumed per pass
- "umull v4.8h, v0.8b, v20.8b \n"  // weighted sum for byte 0
- "umlal v4.8h, v1.8b, v21.8b \n"
- "umlal v4.8h, v2.8b, v22.8b \n"
- "umull v5.8h, v0.8b, v24.8b \n"  // weighted sum for byte 1
- "umlal v5.8h, v1.8b, v25.8b \n"
- "umlal v5.8h, v2.8b, v26.8b \n"
- "umull v6.8h, v0.8b, v28.8b \n"  // weighted sum for byte 2
- "umlal v6.8h, v1.8b, v29.8b \n"
- "umlal v6.8h, v2.8b, v30.8b \n"
- "uqshrn v0.8b, v4.8h, #7 \n"  // >> 7 with unsigned saturation to 8 bits
- "uqshrn v1.8b, v5.8h, #7 \n"
- "uqshrn v2.8b, v6.8h, #7 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // write back in place and advance
- "b.gt 1b \n"  // repeat while width is still positive
- : "+r"(dst_argb),  // %0
- "+r"(width)  // %1
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
- }
- void ARGBColorMatrixRow_NEON(const uint8* src_argb,  // Applies a 4x4 signed matrix to every 4-byte pixel; 8 pixels per iteration.
- uint8* dst_argb,  // Output row; 32 bytes written per iteration.
- const int8* matrix_argb,  // 16 signed coefficients are read; output channel k uses entries 4k..4k+3.
- int width) {  // Pixel count; the loop body always runs at least once.
- asm volatile(
- "ld1 {v2.16b}, [%3] \n"  // Load all 16 matrix coefficients.
- "sxtl v0.8h, v2.8b \n"  // Sign-extend coefficients 0..7 to 16 bits.
- "sxtl2 v1.8h, v2.16b \n"  // Sign-extend coefficients 8..15 to 16 bits.
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // Load 8 pixels, de-interleaved by channel.
- "subs %w2, %w2, #8 \n"  // 8 pixels consumed per iteration.
- "uxtl v16.8h, v16.8b \n"  // Widen input channel 0 to 16 bits.
- "uxtl v17.8h, v17.8b \n"  // Widen input channel 1.
- "uxtl v18.8h, v18.8b \n"  // Widen input channel 2.
- "uxtl v19.8h, v19.8b \n"  // Widen input channel 3.
- "mul v22.8h, v16.8h, v0.h[0] \n"  // out0 = in0 * m[0]
- "mul v23.8h, v16.8h, v0.h[4] \n"  // out1 = in0 * m[4]
- "mul v24.8h, v16.8h, v1.h[0] \n"  // out2 = in0 * m[8]
- "mul v25.8h, v16.8h, v1.h[4] \n"  // out3 = in0 * m[12]
- "mul v4.8h, v17.8h, v0.h[1] \n"  // in1 * m[1]
- "mul v5.8h, v17.8h, v0.h[5] \n"  // in1 * m[5]
- "mul v6.8h, v17.8h, v1.h[1] \n"  // in1 * m[9]
- "mul v7.8h, v17.8h, v1.h[5] \n"  // in1 * m[13]
- "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate with signed saturation.
- "sqadd v23.8h, v23.8h, v5.8h \n"
- "sqadd v24.8h, v24.8h, v6.8h \n"
- "sqadd v25.8h, v25.8h, v7.8h \n"
- "mul v4.8h, v18.8h, v0.h[2] \n"  // in2 * m[2]
- "mul v5.8h, v18.8h, v0.h[6] \n"  // in2 * m[6]
- "mul v6.8h, v18.8h, v1.h[2] \n"  // in2 * m[10]
- "mul v7.8h, v18.8h, v1.h[6] \n"  // in2 * m[14]
- "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate with signed saturation.
- "sqadd v23.8h, v23.8h, v5.8h \n"
- "sqadd v24.8h, v24.8h, v6.8h \n"
- "sqadd v25.8h, v25.8h, v7.8h \n"
- "mul v4.8h, v19.8h, v0.h[3] \n"  // in3 * m[3]
- "mul v5.8h, v19.8h, v0.h[7] \n"  // in3 * m[7]
- "mul v6.8h, v19.8h, v1.h[3] \n"  // in3 * m[11]
- "mul v7.8h, v19.8h, v1.h[7] \n"  // in3 * m[15]
- "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate with signed saturation.
- "sqadd v23.8h, v23.8h, v5.8h \n"
- "sqadd v24.8h, v24.8h, v6.8h \n"
- "sqadd v25.8h, v25.8h, v7.8h \n"
- "sqshrun v16.8b, v22.8h, #6 \n"  // Shift right by 6 and saturate the signed sum to unsigned 8 bits.
- "sqshrun v17.8b, v23.8h, #6 \n"
- "sqshrun v18.8b, v24.8h, #6 \n"
- "sqshrun v19.8b, v25.8h, #6 \n"
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // Store 8 re-interleaved pixels.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_argb),  // %0
- "+r"(dst_argb),  // %1
- "+r"(width)  // %2
- : "r"(matrix_argb)  // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",  // NOTE(review): v3 is declared clobbered but is not referenced by the asm body.
- "v17", "v18", "v19", "v22", "v23", "v24", "v25");
- }
- void ARGBMultiplyRow_NEON(const uint8* src_argb0,  // Per-byte product of two rows: dst = (a * b + 128) >> 8. 8 pixels per iteration.
- const uint8* src_argb1,  // Second input row.
- uint8* dst_argb,  // Output row; 32 bytes written per iteration.
- int width) {  // Pixel count; the loop body always runs at least once.
- asm volatile(
-
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // Load 8 pixels from the first row.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // Load 8 pixels from the second row.
- "subs %w3, %w3, #8 \n"  // 8 pixels consumed per iteration.
- "umull v0.8h, v0.8b, v4.8b \n"  // Widening multiply, channel 0.
- "umull v1.8h, v1.8b, v5.8b \n"  // Channel 1.
- "umull v2.8h, v2.8b, v6.8b \n"  // Channel 2.
- "umull v3.8h, v3.8b, v7.8b \n"  // Channel 3.
- "rshrn v0.8b, v0.8h, #8 \n"  // Rounding shift right by 8, narrowed to 8 bits.
- "rshrn v1.8b, v1.8h, #8 \n"
- "rshrn v2.8b, v2.8h, #8 \n"
- "rshrn v3.8b, v3.8h, #8 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // Store 8 pixels.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_argb0),  // %0
- "+r"(src_argb1),  // %1
- "+r"(dst_argb),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
- }
- void ARGBAddRow_NEON(const uint8* src_argb0,  // Per-byte saturating sum of two rows. 8 pixels (32 bytes) per iteration.
- const uint8* src_argb1,  // Second input row.
- uint8* dst_argb,  // Output row.
- int width) {  // Pixel count; the loop body always runs at least once.
- asm volatile(
-
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // Load 8 pixels from the first row.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // Load 8 pixels from the second row.
- "subs %w3, %w3, #8 \n"  // 8 pixels consumed per iteration.
- "uqadd v0.8b, v0.8b, v4.8b \n"  // Unsigned saturating add (clamps at 255), channel 0.
- "uqadd v1.8b, v1.8b, v5.8b \n"  // Channel 1.
- "uqadd v2.8b, v2.8b, v6.8b \n"  // Channel 2.
- "uqadd v3.8b, v3.8b, v7.8b \n"  // Channel 3.
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // Store 8 pixels.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_argb0),  // %0
- "+r"(src_argb1),  // %1
- "+r"(dst_argb),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
- }
- void ARGBSubtractRow_NEON(const uint8* src_argb0,  // Per-byte saturating difference: dst = src_argb0 - src_argb1. 8 pixels per iteration.
- const uint8* src_argb1,  // Row that is subtracted.
- uint8* dst_argb,  // Output row.
- int width) {  // Pixel count; the loop body always runs at least once.
- asm volatile(
-
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // Load 8 pixels from the first row.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // Load 8 pixels from the second row.
- "subs %w3, %w3, #8 \n"  // 8 pixels consumed per iteration.
- "uqsub v0.8b, v0.8b, v4.8b \n"  // Unsigned saturating subtract (clamps at 0), channel 0.
- "uqsub v1.8b, v1.8b, v5.8b \n"  // Channel 1.
- "uqsub v2.8b, v2.8b, v6.8b \n"  // Channel 2.
- "uqsub v3.8b, v3.8b, v7.8b \n"  // Channel 3.
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // Store 8 pixels.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_argb0),  // %0
- "+r"(src_argb1),  // %1
- "+r"(dst_argb),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
- }
- void SobelRow_NEON(const uint8* src_sobelx,  // Writes 4-byte pixels (s, s, s, 255) where s = saturating sobelx + sobely. 8 pixels per iteration.
- const uint8* src_sobely,  // Second input plane, one byte per pixel.
- uint8* dst_argb,  // Output row; 32 bytes written per iteration.
- int width) {  // Pixel count; the loop body always runs at least once.
- asm volatile(
- "movi v3.8b, #255 \n"  // Constant 4th byte of every output pixel.
-
- "1: \n"
- "ld1 {v0.8b}, [%0], #8 \n"  // Load 8 sobelx bytes.
- "ld1 {v1.8b}, [%1], #8 \n"  // Load 8 sobely bytes.
- "subs %w3, %w3, #8 \n"  // 8 pixels consumed per iteration.
- "uqadd v0.8b, v0.8b, v1.8b \n"  // Saturating sum of the two inputs.
- "orr v1.8b, v0.8b, v0.8b \n"  // Replicate the sum into byte 1.
- "orr v2.8b, v0.8b, v0.8b \n"  // Replicate the sum into byte 2.
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // Store 8 interleaved pixels.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_sobelx),  // %0
- "+r"(src_sobely),  // %1
- "+r"(dst_argb),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
- }
- void SobelToPlaneRow_NEON(const uint8* src_sobelx,  // Writes the saturating byte-wise sum of two planes to a single plane. 16 bytes per iteration.
- const uint8* src_sobely,  // Second input plane.
- uint8* dst_y,  // Output plane; 16 bytes written per iteration.
- int width) {  // Byte count; the loop body always runs at least once.
- asm volatile(
-
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"  // Load 16 sobelx bytes.
- "ld1 {v1.16b}, [%1], #16 \n"  // Load 16 sobely bytes.
- "subs %w3, %w3, #16 \n"  // 16 pixels consumed per iteration.
- "uqadd v0.16b, v0.16b, v1.16b \n"  // Unsigned saturating add.
- "st1 {v0.16b}, [%2], #16 \n"  // Store 16 result bytes.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_sobelx),  // %0
- "+r"(src_sobely),  // %1
- "+r"(dst_y),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1");
- }
- void SobelXYRow_NEON(const uint8* src_sobelx,  // Writes 4-byte pixels (sobely, saturating sobelx + sobely, sobelx, 255). 8 pixels per iteration.
- const uint8* src_sobely,  // Second input plane, one byte per pixel.
- uint8* dst_argb,  // Output row; 32 bytes written per iteration.
- int width) {  // Pixel count; the loop body always runs at least once.
- asm volatile(
- "movi v3.8b, #255 \n"  // Constant 4th byte of every output pixel.
-
- "1: \n"
- "ld1 {v2.8b}, [%0], #8 \n"  // Load 8 sobelx bytes; they become output byte 2.
- "ld1 {v0.8b}, [%1], #8 \n"  // Load 8 sobely bytes; they become output byte 0.
- "subs %w3, %w3, #8 \n"  // 8 pixels consumed per iteration.
- "uqadd v1.8b, v0.8b, v2.8b \n"  // Output byte 1 = saturating sum of both inputs.
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // Store 8 interleaved pixels.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_sobelx),  // %0
- "+r"(src_sobely),  // %1
- "+r"(dst_argb),  // %2
- "+r"(width)  // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
- }
- void SobelXRow_NEON(const uint8* src_y0,  // dst[i] = min(255, |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|). 8 outputs per iteration.
- const uint8* src_y1,  // Middle row; its difference is added twice.
- const uint8* src_y2,  // Third row.
- uint8* dst_sobelx,  // Output plane; 8 bytes written per iteration.
- int width) {  // Output count; the loop body always runs at least once.
- asm volatile(
- "1: \n"
- "ld1 {v0.8b}, [%0],%5 \n"  // Row 0, columns i..i+7; advance by 2.
- "ld1 {v1.8b}, [%0],%6 \n"  // Row 0, columns i+2..i+9; advance by 6 (8 total). Reads 2 bytes past the 8 outputs.
- "usubl v0.8h, v0.8b, v1.8b \n"  // Widening difference for row 0.
- "ld1 {v2.8b}, [%1],%5 \n"  // Row 1, columns i..i+7.
- "ld1 {v3.8b}, [%1],%6 \n"  // Row 1, columns i+2..i+9.
- "usubl v1.8h, v2.8b, v3.8b \n"  // Widening difference for row 1.
- "add v0.8h, v0.8h, v1.8h \n"  // Row 1 difference is added twice (weight 2).
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n"  // Row 2, columns i..i+7.
- "ld1 {v3.8b}, [%2],%6 \n"  // Row 2, columns i+2..i+9.
- "subs %w4, %w4, #8 \n"  // 8 outputs produced per iteration.
- "usubl v1.8h, v2.8b, v3.8b \n"  // Widening difference for row 2.
- "add v0.8h, v0.8h, v1.8h \n"  // Add row 2 difference (weight 1).
- "abs v0.8h, v0.8h \n"  // Absolute value of the signed sum.
- "uqxtn v0.8b, v0.8h \n"  // Narrow to 8 bits with unsigned saturation.
- "st1 {v0.8b}, [%3], #8 \n"  // Store 8 results.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_y0),  // %0
- "+r"(src_y1),  // %1
- "+r"(src_y2),  // %2
- "+r"(dst_sobelx),  // %3
- "+r"(width)  // %4
- : "r"(2LL),  // %5: first post-increment
- "r"(6LL)  // %6: second post-increment, completing an 8-byte step
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
- }
- void SobelYRow_NEON(const uint8* src_y0,  // dst[i] = min(255, |(y0[i]-y1[i]) + 2*(y0[i+1]-y1[i+1]) + (y0[i+2]-y1[i+2])|). 8 outputs per iteration.
- const uint8* src_y1,  // Row subtracted from src_y0.
- uint8* dst_sobely,  // Output plane; 8 bytes written per iteration.
- int width) {  // Output count; the loop body always runs at least once.
- asm volatile(
- "1: \n"
- "ld1 {v0.8b}, [%0],%4 \n"  // Row 0, columns i..i+7; advance by 1.
- "ld1 {v1.8b}, [%1],%4 \n"  // Row 1, columns i..i+7; advance by 1.
- "usubl v0.8h, v0.8b, v1.8b \n"  // Widening difference at column offset 0.
- "ld1 {v2.8b}, [%0],%4 \n"  // Row 0, columns i+1..i+8; advance by 1.
- "ld1 {v3.8b}, [%1],%4 \n"  // Row 1, columns i+1..i+8; advance by 1.
- "usubl v1.8h, v2.8b, v3.8b \n"  // Widening difference at column offset 1.
- "add v0.8h, v0.8h, v1.8h \n"  // Centre difference is added twice (weight 2).
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n"  // Row 0, columns i+2..i+9; advance by 6 (8 total). Reads 2 bytes past the 8 outputs.
- "ld1 {v3.8b}, [%1],%5 \n"  // Row 1, columns i+2..i+9; advance by 6 (8 total).
- "subs %w3, %w3, #8 \n"  // 8 outputs produced per iteration.
- "usubl v1.8h, v2.8b, v3.8b \n"  // Widening difference at column offset 2.
- "add v0.8h, v0.8h, v1.8h \n"  // Add it with weight 1.
- "abs v0.8h, v0.8h \n"  // Absolute value of the signed sum.
- "uqxtn v0.8b, v0.8h \n"  // Narrow to 8 bits with unsigned saturation.
- "st1 {v0.8b}, [%2], #8 \n"  // Store 8 results.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src_y0),  // %0
- "+r"(src_y1),  // %1
- "+r"(dst_sobely),  // %2
- "+r"(width)  // %3
- : "r"(1LL),  // %4: single-byte post-increment
- "r"(6LL)  // %5: final post-increment, completing an 8-byte step
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
- }
- void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {  // Converts 16-bit integers to half-precision floats with no scaling; the float parameter is unnamed and unused. 8 values per iteration.
- asm volatile(
- "1: \n"
- "ld1 {v1.16b}, [%0], #16 \n"  // Load 8 16-bit values.
- "subs %w2, %w2, #8 \n"  // 8 values consumed per iteration.
- "uxtl v2.4s, v1.4h \n"  // Zero-extend the low 4 values to 32 bits.
- "uxtl2 v3.4s, v1.8h \n"  // Zero-extend the high 4 values to 32 bits.
- "scvtf v2.4s, v2.4s \n"  // Integer to single-precision float.
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n"  // Narrow float32 to float16 (low half).
- "fcvtn2 v1.8h, v3.4s \n"  // Narrow float32 to float16 (high half).
- "st1 {v1.16b}, [%1], #16 \n"  // Store 8 half floats.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width)  // %2
- :
- : "cc", "memory", "v1", "v2", "v3");
- }
- void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {  // Converts 16-bit integers, multiplied by 'scale', to 16-bit half-float bit patterns. 8 values per iteration.
- asm volatile(
- "1: \n"
- "ld1 {v1.16b}, [%0], #16 \n"  // Load 8 16-bit values.
- "subs %w2, %w2, #8 \n"  // 8 values consumed per iteration.
- "uxtl v2.4s, v1.4h \n"  // Zero-extend the low 4 values to 32 bits.
- "uxtl2 v3.4s, v1.8h \n"  // Zero-extend the high 4 values to 32 bits.
- "scvtf v2.4s, v2.4s \n"  // Integer to single-precision float.
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n"  // Multiply by the pre-adjusted scale (see operand %3).
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n"  // Treat the float bits as an integer: shift right 13 and saturate to 16 bits.
- "uqshrn2 v1.8h, v3.4s, #13 \n"  // Same for the high 4 values.
- "st1 {v1.16b}, [%1], #16 \n"  // Store 8 results.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width)  // %2
- : "w"(scale * 1.9259299444e-34f)  // %3: the constant is ~2^-112; presumably rebiases the float32 exponent (bias 127) to the float16 bias (15) before the bit shift - confirm.
- : "cc", "memory", "v1", "v2", "v3");
- }
- float ScaleMaxSamples_NEON(const float* src,  // Writes src * scale to dst and returns the maximum of the unscaled src values (never below 0). 8 floats per iteration.
- float* dst,  // Output samples; 8 floats written per iteration.
- float scale,  // Multiplier applied to every sample.
- int width) {  // Sample count; the loop body always runs at least once.
- float fmax;  // Receives the reduced maximum from the asm block.
- asm volatile(
- "movi v5.4s, #0 \n"  // Running maximum, lanes 0..3; starts at 0 so the result is never negative.
- "movi v6.4s, #0 \n"  // Running maximum, lanes 4..7.
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // Load 8 samples.
- "subs %w2, %w2, #8 \n"  // 8 samples consumed per iteration.
- "fmul v3.4s, v1.4s, %4.s[0] \n"  // Scaled copy, low 4.
- "fmul v4.4s, v2.4s, %4.s[0] \n"  // Scaled copy, high 4.
- "fmax v5.4s, v5.4s, v1.4s \n"  // Track the maximum of the unscaled inputs.
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n"  // Store 8 scaled samples.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- "fmax v5.4s, v5.4s, v6.4s \n"  // Merge the two accumulators.
- "fmaxv %s3, v5.4s \n"  // Reduce across lanes into the scalar result.
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width),  // %2
- "=w"(fmax)  // %3
- : "w"(scale)  // %4
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
- return fmax;
- }
- float ScaleSumSamples_NEON(const float* src,  // Writes src * scale to dst and returns the sum of squares of the unscaled src values. 8 floats per iteration.
- float* dst,  // Output samples; 8 floats written per iteration.
- float scale,  // Multiplier applied to every sample.
- int width) {  // Sample count; the loop body always runs at least once.
- float fsum;  // Receives lane 0 of the final pairwise sum.
- asm volatile(
- "movi v5.4s, #0 \n"  // Sum-of-squares accumulator, lanes 0..3.
- "movi v6.4s, #0 \n"  // Sum-of-squares accumulator, lanes 4..7.
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // Load 8 samples.
- "subs %w2, %w2, #8 \n"  // 8 samples consumed per iteration.
- "fmul v3.4s, v1.4s, %4.s[0] \n"  // Scaled copy, low 4.
- "fmul v4.4s, v2.4s, %4.s[0] \n"  // Scaled copy, high 4.
- "fmla v5.4s, v1.4s, v1.4s \n"  // Accumulate src * src (not a plain sum, despite the function name).
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n"  // Store 8 scaled samples.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- "faddp v5.4s, v5.4s, v6.4s \n"  // Pairwise add: 8 partial sums become 4.
- "faddp v5.4s, v5.4s, v5.4s \n"  // 4 become 2 (duplicated across the vector).
- "faddp %3.4s, v5.4s, v5.4s \n"  // Every lane now holds the total; lane 0 is read back as fsum.
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width),  // %2
- "=w"(fsum)  // %3
- : "w"(scale)  // %4
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
- return fsum;
- }
- void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {  // dst[i] = src[i] * scale. 8 floats per iteration; the loop body always runs at least once.
- asm volatile(
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // Load 8 samples.
- "subs %w2, %w2, #8 \n"  // 8 samples consumed per iteration.
- "fmul v1.4s, v1.4s, %3.s[0] \n"  // Multiply the low 4 by scale.
- "fmul v2.4s, v2.4s, %3.s[0] \n"  // Multiply the high 4 by scale.
- "st1 {v1.4s, v2.4s}, [%1], #32 \n"  // Store 8 scaled samples.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src),  // %0
- "+r"(dst),  // %1
- "+r"(width)  // %2
- : "w"(scale)  // %3
- : "cc", "memory", "v1", "v2");
- }
- void GaussCol_NEON(const uint16* src0,  // dst[i] = src0[i] + 4*src1[i] + 6*src2[i] + 4*src3[i] + src4[i], widened to 32 bits. 8 values per iteration.
- const uint16* src1,  // Weight 4.
- const uint16* src2,  // Weight 6.
- const uint16* src3,  // Weight 4.
- const uint16* src4,  // Weight 1.
- uint32* dst,  // Output; 8 32-bit sums written per iteration.
- int width) {  // Value count; the loop body always runs at least once.
- asm volatile(
- "movi v6.8h, #4 \n"  // Weight for src1 and src3.
- "movi v7.8h, #6 \n"  // Weight for src2.
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n"  // Load 8 values from src0.
- "ld1 {v2.8h}, [%4], #16 \n"  // Load 8 values from src4.
- "uaddl v0.4s, v1.4h, v2.4h \n"  // acc(low 4)  = src0 + src4, widened to 32 bits.
- "uaddl2 v1.4s, v1.8h, v2.8h \n"  // acc(high 4) = src0 + src4.
- "ld1 {v2.8h}, [%1], #16 \n"  // Load 8 values from src1.
- "umlal v0.4s, v2.4h, v6.4h \n"  // acc += src1 * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n"
- "ld1 {v2.8h}, [%2], #16 \n"  // Load 8 values from src2.
- "umlal v0.4s, v2.4h, v7.4h \n"  // acc += src2 * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n"
- "ld1 {v2.8h}, [%3], #16 \n"  // Load 8 values from src3.
- "umlal v0.4s, v2.4h, v6.4h \n"  // acc += src3 * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n"
- "subs %w6, %w6, #8 \n"  // 8 values consumed per iteration.
- "st1 {v0.4s,v1.4s}, [%5], #32 \n"  // Store 8 32-bit sums.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src0),  // %0
- "+r"(src1),  // %1
- "+r"(src2),  // %2
- "+r"(src3),  // %3
- "+r"(src4),  // %4
- "+r"(dst),  // %5
- "+r"(width)  // %6
- :
- : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
- }
- void GaussRow_NEON(const uint32* src, uint16* dst, int width) {  // dst[i] = sat16((src[i] + 4*src[i+1] + 6*src[i+2] + 4*src[i+3] + src[i+4] + 128) >> 8). 8 outputs per iteration.
- const uint32* src1 = src + 1;  // Tap at offset 1 (weight 4).
- const uint32* src2 = src + 2;  // Tap at offset 2 (weight 6).
- const uint32* src3 = src + 3;  // Tap at offset 3 (weight 4).
- asm volatile(
- "movi v6.4s, #4 \n"  // Weight for the offset-1 and offset-3 taps.
- "movi v7.4s, #6 \n"  // Weight for the centre tap.
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n"  // Load src[0..11] but advance by only 32 bytes (8 elements); reads 4 elements past the 8 outputs.
- "add v0.4s, v0.4s, v1.4s \n"  // Outputs 0..3: src[i] + src[i+4].
- "add v1.4s, v1.4s, v2.4s \n"  // Outputs 4..7: src[i] + src[i+4].
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"  // Load 8 centre taps (offset 2).
- "mla v0.4s, v2.4s, v7.4s \n"  // acc += centre * 6
- "mla v1.4s, v3.4s, v7.4s \n"
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"  // Load 8 offset-1 taps.
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"  // Load 8 offset-3 taps.
- "add v2.4s, v2.4s, v4.4s \n"  // Combine the two weight-4 taps before multiplying.
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n"  // acc += (tap1 + tap3) * 4
- "mla v1.4s, v3.4s, v6.4s \n"
- "subs %w5, %w5, #8 \n"  // 8 outputs produced per iteration.
- "uqrshrn v0.4h, v0.4s, #8 \n"  // Rounding shift right by 8, saturating narrow to 16 bits (low 4).
- "uqrshrn2 v0.8h, v1.4s, #8 \n"  // Same for the high 4.
- "st1 {v0.8h}, [%4], #16 \n"  // Store 8 16-bit results.
- "b.gt 1b \n"  // Loop while the remaining width is > 0.
- : "+r"(src),  // %0
- "+r"(src1),  // %1
- "+r"(src2),  // %2
- "+r"(src3),  // %3
- "+r"(dst),  // %4
- "+r"(width)  // %5
- : "r"(32LL)  // %6: byte stride for the 3-register load of src
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
- }
- #endif
- #ifdef __cplusplus
- }
- }
- #endif
|