// row_gcc.cc

/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
    13, 65, 33, 0, 13, 65, 33, 0};
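// With ARGB stored as B,G,R,A bytes this gives an approximately BT.601
// limited-range luma: Y = ((13*B + 65*G + 33*R) >> 7) + 16 (the shift and
// the +16 come from ARGBToYRow_SSSE3/AVX2 below).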
// JPeg full range.
static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
    15, 75, 38, 0, 15, 75, 38, 0};
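// Full-range variant used by the *J rows: YJ = (15*B + 75*G + 38*R + 64) >> 7,
// rounded via kAddYJ64 and with no +16 offset.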
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
    112, -74, -38, 0, 112, -74, -38, 0};
static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
    127, -84, -43, 0, 127, -84, -43, 0};
static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
    -18, -94, 112, 0, -18, -94, 112, 0};
static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
    -20, -107, 127, 0, -20, -107, 127, 0};
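// Applied with pmaddubsw to B,G,R,A bytes, these give
// U = ((112*B - 74*G - 38*R) >> 8) + 128 and
// V = ((-18*B - 94*G + 112*R) >> 8) + 128.
// The *J tables are the full-range (JPeg) equivalents; those rows fold the
// +128 bias and rounding into a single add of 0x8080 before the shift.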
// Constants for BGRA
static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
    0, 33, 65, 13, 0, 33, 65, 13};
static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
    0, -38, -74, 112, 0, -38, -74, 112};
static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
    0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR
static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
    33, 65, 13, 0, 33, 65, 13, 0};
static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
    -38, -74, 112, 0, -38, -74, 112, 0};
static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
    112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
    0, 13, 65, 33, 0, 13, 65, 33};
static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
    0, 112, -74, -38, 0, 112, -74, -38};
static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
    0, -18, -94, 112, 0, -18, -94, 112};
static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
    16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
    0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
    6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
    8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
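// pshufb treats any index with the high bit set as "write zero", so the 128u
// entries above clear the output bytes that have no source byte.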
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
    11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
    5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
    10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
    4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
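// YUY2 stores bytes as Y0,U,Y1,V and UYVY as U,Y0,V,Y1, so the Y tables pick
// even (YUY2) or odd (UYVY) bytes and the UV tables pick the opposite ones.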
// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_J400TOARGBROW_SSE2
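// Expands each J400 (Y-only) byte into a grey ARGB pixel: Y is replicated into
// the B, G and R bytes and alpha is forced to 0xff via the 0xff000000 mask.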
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "m"(kShuffleMaskRGB24ToARGB) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "m"(kShuffleMaskRAWToARGB) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
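// Converts 8 RAW (BGR) pixels (24 bytes) per iteration: three overlapping
// 16-byte loads are shuffled through the kShuffleMaskRAWToRGB24_* tables above
// and written back as three 8-byte stores.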
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"
    "movdqa %5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x18,0) ",%0 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw), // %0
    "+r"(dst_rgb24), // %1
    "+r"(width) // %2
  : "m"(kShuffleMaskRAWToRGB24_0), // %3
    "m"(kShuffleMaskRAWToRGB24_1), // %4
    "m"(kShuffleMaskRAWToRGB24_2) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
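// RGB565ToARGBRow_SSE2 widens the packed fields by replicating their top bits:
// pmulhuw by 0x0108 turns a 5-bit value v into (v << 3) | (v >> 2), and
// pmulhuw by 0x2080 turns the 6-bit green field into (v << 2) | (v >> 4).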
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  : "m"(kShuffleMaskARGBToRGB24) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  : "m"(kShuffleMaskARGBToRAW) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
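// ARGBToRGB565Row_SSE2 builds each 565 pixel in the low word of its dword
// (B>>3 in bits 0..4, G>>5 in bits 5..10, R via pslld/psrad in bits 11..15),
// then narrows with packssdw; the arithmetic shift sign-extends the upper
// half so the signed saturation keeps the 16-bit value intact.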
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
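// The dither variants replicate each of the four dither4 bytes across one
// ARGB pixel and add them with unsigned saturation (paddusb/vpaddusb) before
// the pixels are truncated to 565.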
void ARGBToRGB565DitherRow_SSE2(const uint8* src,
    uint8* dst,
    const uint32 dither4,
    int width) {
  asm volatile(
    "movd %3,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm6 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "punpcklwd %%xmm6,%%xmm6 \n"
    "punpckhwd %%xmm7,%%xmm7 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "paddusb %%xmm6,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  : "m"(dither4) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
    uint8* dst,
    const uint32 dither4,
    int width) {
  asm volatile(
    "vbroadcastss %3,%%xmm6 \n"
    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
    "vpermq $0xd8,%%ymm6,%%ymm6 \n"
    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
    "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
    "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
    "vpslld $0x5,%%ymm4,%%ymm4 \n"
    "vpslld $0xb,%%ymm3,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
    "vpsrld $0x5,%%ymm0,%%ymm2 \n"
    "vpsrld $0x3,%%ymm0,%%ymm1 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    "vpand %%ymm4,%%ymm2,%%ymm2 \n"
    "vpand %%ymm3,%%ymm1,%%ymm1 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpor %%ymm2,%%ymm1,%%ymm1 \n"
    "vpor %%ymm1,%%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "lea 0x20(%0),%0 \n"
    "vmovdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  : "m"(dither4) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  : "m"(kARGBToY), // %3
    "m"(kAddY16) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
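// vphaddw and vpackuswb operate within 128-bit lanes, so a vpermd with this
// pattern restores the natural pixel order after packing (the
// "mutates"/"unmutate" comments below).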
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  : "m"(kARGBToY), // %3
    "m"(kAddY16), // %4
    "m"(kPermdARGBToY_AVX) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64), // %4
    "m"(kPermdARGBToY_AVX) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYJROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3
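// Each 2x2 block of ARGB pixels is reduced to one U,V pair: pavgb against the
// second row (src_stride_argb) averages vertically, the shufps/pavgb pair
// averages horizontally, and the averaged pixels then go through the
// kARGBToU/kARGBToV tables as described above.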
void ARGBToUVRow_SSSE3(const uint8* src_argb0,
    int src_stride_argb,
    uint8* dst_u,
    uint8* dst_v,
    int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToV), // %5
    "m"(kARGBToU), // %6
    "m"(kAddUV128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8* src_argb0,
    int src_stride_argb,
    uint8* dst_u,
    uint8* dst_v,
    int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128), // %5
    "m"(kARGBToV), // %6
    "m"(kARGBToU), // %7
    "m"(kShufARGBToUV_AVX) // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8* src_argb0,
    int src_stride_argb,
    uint8* dst_u,
    uint8* dst_v,
    int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128), // %5
    "m"(kARGBToVJ), // %6
    "m"(kARGBToUJ), // %7
    "m"(kShufARGBToUV_AVX) // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVJROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
    int src_stride_argb,
    uint8* dst_u,
    uint8* dst_v,
    int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1021. "pavgb %%xmm7,%%xmm0 \n"
  1022. "movdqa %%xmm2,%%xmm7 \n"
  1023. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1024. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1025. "pavgb %%xmm7,%%xmm2 \n"
  1026. "movdqa %%xmm0,%%xmm1 \n"
  1027. "movdqa %%xmm2,%%xmm6 \n"
  1028. "pmaddubsw %%xmm4,%%xmm0 \n"
  1029. "pmaddubsw %%xmm4,%%xmm2 \n"
  1030. "pmaddubsw %%xmm3,%%xmm1 \n"
  1031. "pmaddubsw %%xmm3,%%xmm6 \n"
  1032. "phaddw %%xmm2,%%xmm0 \n"
  1033. "phaddw %%xmm6,%%xmm1 \n"
  1034. "paddw %%xmm5,%%xmm0 \n"
  1035. "paddw %%xmm5,%%xmm1 \n"
  1036. "psraw $0x8,%%xmm0 \n"
  1037. "psraw $0x8,%%xmm1 \n"
  1038. "packsswb %%xmm1,%%xmm0 \n"
  1039. "movlps %%xmm0," MEMACCESS(1) " \n"
  1040. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1041. "lea " MEMLEA(0x8,1) ",%1 \n"
  1042. "sub $0x10,%3 \n"
  1043. "jg 1b \n"
  1044. : "+r"(src_argb0), // %0
  1045. "+r"(dst_u), // %1
  1046. "+r"(dst_v), // %2
  1047. "+rm"(width) // %3
  1048. : "r"((intptr_t)(src_stride_argb)), // %4
  1049. "m"(kARGBToVJ), // %5
  1050. "m"(kARGBToUJ), // %6
  1051. "m"(kAddUVJ128) // %7
  1052. : "memory", "cc", NACL_R14
  1053. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1054. );
  1055. }
  1056. #endif // HAS_ARGBTOUVJROW_SSSE3
  1057. #ifdef HAS_ARGBTOUV444ROW_SSSE3
  1058. void ARGBToUV444Row_SSSE3(const uint8* src_argb,
  1059. uint8* dst_u,
  1060. uint8* dst_v,
  1061. int width) {
  1062. asm volatile (
  1063. "movdqa %4,%%xmm3 \n"
  1064. "movdqa %5,%%xmm4 \n"
  1065. "movdqa %6,%%xmm5 \n"
  1066. "sub %1,%2 \n"
  1067. LABELALIGN
  1068. "1: \n"
  1069. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1070. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1071. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1072. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1073. "pmaddubsw %%xmm4,%%xmm0 \n"
  1074. "pmaddubsw %%xmm4,%%xmm1 \n"
  1075. "pmaddubsw %%xmm4,%%xmm2 \n"
  1076. "pmaddubsw %%xmm4,%%xmm6 \n"
  1077. "phaddw %%xmm1,%%xmm0 \n"
  1078. "phaddw %%xmm6,%%xmm2 \n"
  1079. "psraw $0x8,%%xmm0 \n"
  1080. "psraw $0x8,%%xmm2 \n"
  1081. "packsswb %%xmm2,%%xmm0 \n"
  1082. "paddb %%xmm5,%%xmm0 \n"
  1083. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1084. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1085. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1086. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1087. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1088. "pmaddubsw %%xmm3,%%xmm0 \n"
  1089. "pmaddubsw %%xmm3,%%xmm1 \n"
  1090. "pmaddubsw %%xmm3,%%xmm2 \n"
  1091. "pmaddubsw %%xmm3,%%xmm6 \n"
  1092. "phaddw %%xmm1,%%xmm0 \n"
  1093. "phaddw %%xmm6,%%xmm2 \n"
  1094. "psraw $0x8,%%xmm0 \n"
  1095. "psraw $0x8,%%xmm2 \n"
  1096. "packsswb %%xmm2,%%xmm0 \n"
  1097. "paddb %%xmm5,%%xmm0 \n"
  1098. "lea " MEMLEA(0x40,0) ",%0 \n"
  1099. MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
  1100. "lea " MEMLEA(0x10,1) ",%1 \n"
  1101. "sub $0x10,%3 \n"
  1102. "jg 1b \n"
  1103. : "+r"(src_argb), // %0
  1104. "+r"(dst_u), // %1
  1105. "+r"(dst_v), // %2
  1106. "+rm"(width) // %3
  1107. : "m"(kARGBToV), // %4
  1108. "m"(kARGBToU), // %5
  1109. "m"(kAddUV128) // %6
  1110. : "memory", "cc", NACL_R14
  1111. "xmm0", "xmm1", "xmm2", "xmm6"
  1112. );
  1113. }
  1114. #endif // HAS_ARGBTOUV444ROW_SSSE3
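// For comparison with the subsampled sketch above: the 444 variant emits one
// U and one V per input pixel with no 2x2 averaging.  Again illustrative only
// and with nominal coefficients; the SSSE3 row takes its constants from
// kARGBToU / kARGBToV / kAddUV128.
static inline void ARGBToUV444Row_ScalarSketch(const unsigned char* src_argb,
                                               unsigned char* dst_u,
                                               unsigned char* dst_v,
                                               int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    dst_u[x] = (unsigned char)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    dst_v[x] = (unsigned char)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    src_argb += 4;
  }
}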
  1115. void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  1116. asm volatile (
  1117. "movdqa %4,%%xmm5 \n"
  1118. "movdqa %3,%%xmm4 \n"
  1119. LABELALIGN
  1120. "1: \n"
  1121. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1122. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1123. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1124. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1125. "pmaddubsw %%xmm4,%%xmm0 \n"
  1126. "pmaddubsw %%xmm4,%%xmm1 \n"
  1127. "pmaddubsw %%xmm4,%%xmm2 \n"
  1128. "pmaddubsw %%xmm4,%%xmm3 \n"
  1129. "lea " MEMLEA(0x40,0) ",%0 \n"
  1130. "phaddw %%xmm1,%%xmm0 \n"
  1131. "phaddw %%xmm3,%%xmm2 \n"
  1132. "psrlw $0x7,%%xmm0 \n"
  1133. "psrlw $0x7,%%xmm2 \n"
  1134. "packuswb %%xmm2,%%xmm0 \n"
  1135. "paddb %%xmm5,%%xmm0 \n"
  1136. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1137. "lea " MEMLEA(0x10,1) ",%1 \n"
  1138. "sub $0x10,%2 \n"
  1139. "jg 1b \n"
  1140. : "+r"(src_bgra), // %0
  1141. "+r"(dst_y), // %1
  1142. "+r"(width) // %2
  1143. : "m"(kBGRAToY), // %3
  1144. "m"(kAddY16) // %4
  1145. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1146. );
  1147. }
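// The Y rows (BGRAToYRow above, ABGRToYRow and RGBAToYRow below) all compute
// the same weighted sum; only the coefficient order in kBGRAToY / kABGRToY /
// kRGBAToY changes with the byte layout.  An illustrative scalar sketch of
// the math, using nominal BT.601-style coefficients (0x1080 folds the +16
// bias and the rounding term); the SIMD rows reach the same result with
// scaled constants, a >>7 shift and a separate kAddY16 bias, so rounding may
// differ slightly.
static inline unsigned char RGBToY_Sketch(int r, int g, int b) {
  return (unsigned char)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}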
  1148. void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
  1149. int src_stride_bgra,
  1150. uint8* dst_u,
  1151. uint8* dst_v,
  1152. int width) {
  1153. asm volatile (
  1154. "movdqa %5,%%xmm3 \n"
  1155. "movdqa %6,%%xmm4 \n"
  1156. "movdqa %7,%%xmm5 \n"
  1157. "sub %1,%2 \n"
  1158. LABELALIGN
  1159. "1: \n"
  1160. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1161. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1162. "pavgb %%xmm7,%%xmm0 \n"
  1163. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1164. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1165. "pavgb %%xmm7,%%xmm1 \n"
  1166. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1167. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1168. "pavgb %%xmm7,%%xmm2 \n"
  1169. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1170. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1171. "pavgb %%xmm7,%%xmm6 \n"
  1172. "lea " MEMLEA(0x40,0) ",%0 \n"
  1173. "movdqa %%xmm0,%%xmm7 \n"
  1174. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1175. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1176. "pavgb %%xmm7,%%xmm0 \n"
  1177. "movdqa %%xmm2,%%xmm7 \n"
  1178. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1179. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1180. "pavgb %%xmm7,%%xmm2 \n"
  1181. "movdqa %%xmm0,%%xmm1 \n"
  1182. "movdqa %%xmm2,%%xmm6 \n"
  1183. "pmaddubsw %%xmm4,%%xmm0 \n"
  1184. "pmaddubsw %%xmm4,%%xmm2 \n"
  1185. "pmaddubsw %%xmm3,%%xmm1 \n"
  1186. "pmaddubsw %%xmm3,%%xmm6 \n"
  1187. "phaddw %%xmm2,%%xmm0 \n"
  1188. "phaddw %%xmm6,%%xmm1 \n"
  1189. "psraw $0x8,%%xmm0 \n"
  1190. "psraw $0x8,%%xmm1 \n"
  1191. "packsswb %%xmm1,%%xmm0 \n"
  1192. "paddb %%xmm5,%%xmm0 \n"
  1193. "movlps %%xmm0," MEMACCESS(1) " \n"
  1194. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1195. "lea " MEMLEA(0x8,1) ",%1 \n"
  1196. "sub $0x10,%3 \n"
  1197. "jg 1b \n"
  1198. : "+r"(src_bgra0), // %0
  1199. "+r"(dst_u), // %1
  1200. "+r"(dst_v), // %2
  1201. "+rm"(width) // %3
  1202. : "r"((intptr_t)(src_stride_bgra)), // %4
  1203. "m"(kBGRAToV), // %5
  1204. "m"(kBGRAToU), // %6
  1205. "m"(kAddUV128) // %7
  1206. : "memory", "cc", NACL_R14
  1207. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1208. );
  1209. }
  1210. void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  1211. asm volatile (
  1212. "movdqa %4,%%xmm5 \n"
  1213. "movdqa %3,%%xmm4 \n"
  1214. LABELALIGN
  1215. "1: \n"
  1216. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1217. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1218. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1219. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1220. "pmaddubsw %%xmm4,%%xmm0 \n"
  1221. "pmaddubsw %%xmm4,%%xmm1 \n"
  1222. "pmaddubsw %%xmm4,%%xmm2 \n"
  1223. "pmaddubsw %%xmm4,%%xmm3 \n"
  1224. "lea " MEMLEA(0x40,0) ",%0 \n"
  1225. "phaddw %%xmm1,%%xmm0 \n"
  1226. "phaddw %%xmm3,%%xmm2 \n"
  1227. "psrlw $0x7,%%xmm0 \n"
  1228. "psrlw $0x7,%%xmm2 \n"
  1229. "packuswb %%xmm2,%%xmm0 \n"
  1230. "paddb %%xmm5,%%xmm0 \n"
  1231. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1232. "lea " MEMLEA(0x10,1) ",%1 \n"
  1233. "sub $0x10,%2 \n"
  1234. "jg 1b \n"
  1235. : "+r"(src_abgr), // %0
  1236. "+r"(dst_y), // %1
  1237. "+r"(width) // %2
  1238. : "m"(kABGRToY), // %3
  1239. "m"(kAddY16) // %4
  1240. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1241. );
  1242. }
  1243. void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  1244. asm volatile (
  1245. "movdqa %4,%%xmm5 \n"
  1246. "movdqa %3,%%xmm4 \n"
  1247. LABELALIGN
  1248. "1: \n"
  1249. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1250. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1251. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1252. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1253. "pmaddubsw %%xmm4,%%xmm0 \n"
  1254. "pmaddubsw %%xmm4,%%xmm1 \n"
  1255. "pmaddubsw %%xmm4,%%xmm2 \n"
  1256. "pmaddubsw %%xmm4,%%xmm3 \n"
  1257. "lea " MEMLEA(0x40,0) ",%0 \n"
  1258. "phaddw %%xmm1,%%xmm0 \n"
  1259. "phaddw %%xmm3,%%xmm2 \n"
  1260. "psrlw $0x7,%%xmm0 \n"
  1261. "psrlw $0x7,%%xmm2 \n"
  1262. "packuswb %%xmm2,%%xmm0 \n"
  1263. "paddb %%xmm5,%%xmm0 \n"
  1264. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1265. "lea " MEMLEA(0x10,1) ",%1 \n"
  1266. "sub $0x10,%2 \n"
  1267. "jg 1b \n"
  1268. : "+r"(src_rgba), // %0
  1269. "+r"(dst_y), // %1
  1270. "+r"(width) // %2
  1271. : "m"(kRGBAToY), // %3
  1272. "m"(kAddY16) // %4
  1273. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1274. );
  1275. }
  1276. void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
  1277. int src_stride_abgr,
  1278. uint8* dst_u,
  1279. uint8* dst_v,
  1280. int width) {
  1281. asm volatile (
  1282. "movdqa %5,%%xmm3 \n"
  1283. "movdqa %6,%%xmm4 \n"
  1284. "movdqa %7,%%xmm5 \n"
  1285. "sub %1,%2 \n"
  1286. LABELALIGN
  1287. "1: \n"
  1288. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1289. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1290. "pavgb %%xmm7,%%xmm0 \n"
  1291. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1292. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1293. "pavgb %%xmm7,%%xmm1 \n"
  1294. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1295. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1296. "pavgb %%xmm7,%%xmm2 \n"
  1297. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1298. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1299. "pavgb %%xmm7,%%xmm6 \n"
  1300. "lea " MEMLEA(0x40,0) ",%0 \n"
  1301. "movdqa %%xmm0,%%xmm7 \n"
  1302. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1303. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1304. "pavgb %%xmm7,%%xmm0 \n"
  1305. "movdqa %%xmm2,%%xmm7 \n"
  1306. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1307. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1308. "pavgb %%xmm7,%%xmm2 \n"
  1309. "movdqa %%xmm0,%%xmm1 \n"
  1310. "movdqa %%xmm2,%%xmm6 \n"
  1311. "pmaddubsw %%xmm4,%%xmm0 \n"
  1312. "pmaddubsw %%xmm4,%%xmm2 \n"
  1313. "pmaddubsw %%xmm3,%%xmm1 \n"
  1314. "pmaddubsw %%xmm3,%%xmm6 \n"
  1315. "phaddw %%xmm2,%%xmm0 \n"
  1316. "phaddw %%xmm6,%%xmm1 \n"
  1317. "psraw $0x8,%%xmm0 \n"
  1318. "psraw $0x8,%%xmm1 \n"
  1319. "packsswb %%xmm1,%%xmm0 \n"
  1320. "paddb %%xmm5,%%xmm0 \n"
  1321. "movlps %%xmm0," MEMACCESS(1) " \n"
  1322. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1323. "lea " MEMLEA(0x8,1) ",%1 \n"
  1324. "sub $0x10,%3 \n"
  1325. "jg 1b \n"
  1326. : "+r"(src_abgr0), // %0
  1327. "+r"(dst_u), // %1
  1328. "+r"(dst_v), // %2
  1329. "+rm"(width) // %3
  1330. : "r"((intptr_t)(src_stride_abgr)), // %4
  1331. "m"(kABGRToV), // %5
  1332. "m"(kABGRToU), // %6
  1333. "m"(kAddUV128) // %7
  1334. : "memory", "cc", NACL_R14
  1335. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1336. );
  1337. }
  1338. void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
  1339. int src_stride_rgba,
  1340. uint8* dst_u,
  1341. uint8* dst_v,
  1342. int width) {
  1343. asm volatile (
  1344. "movdqa %5,%%xmm3 \n"
  1345. "movdqa %6,%%xmm4 \n"
  1346. "movdqa %7,%%xmm5 \n"
  1347. "sub %1,%2 \n"
  1348. LABELALIGN
  1349. "1: \n"
  1350. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1351. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1352. "pavgb %%xmm7,%%xmm0 \n"
  1353. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1354. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1355. "pavgb %%xmm7,%%xmm1 \n"
  1356. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1357. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1358. "pavgb %%xmm7,%%xmm2 \n"
  1359. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1360. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1361. "pavgb %%xmm7,%%xmm6 \n"
  1362. "lea " MEMLEA(0x40,0) ",%0 \n"
  1363. "movdqa %%xmm0,%%xmm7 \n"
  1364. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1365. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1366. "pavgb %%xmm7,%%xmm0 \n"
  1367. "movdqa %%xmm2,%%xmm7 \n"
  1368. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1369. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1370. "pavgb %%xmm7,%%xmm2 \n"
  1371. "movdqa %%xmm0,%%xmm1 \n"
  1372. "movdqa %%xmm2,%%xmm6 \n"
  1373. "pmaddubsw %%xmm4,%%xmm0 \n"
  1374. "pmaddubsw %%xmm4,%%xmm2 \n"
  1375. "pmaddubsw %%xmm3,%%xmm1 \n"
  1376. "pmaddubsw %%xmm3,%%xmm6 \n"
  1377. "phaddw %%xmm2,%%xmm0 \n"
  1378. "phaddw %%xmm6,%%xmm1 \n"
  1379. "psraw $0x8,%%xmm0 \n"
  1380. "psraw $0x8,%%xmm1 \n"
  1381. "packsswb %%xmm1,%%xmm0 \n"
  1382. "paddb %%xmm5,%%xmm0 \n"
  1383. "movlps %%xmm0," MEMACCESS(1) " \n"
  1384. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1385. "lea " MEMLEA(0x8,1) ",%1 \n"
  1386. "sub $0x10,%3 \n"
  1387. "jg 1b \n"
  1388. : "+r"(src_rgba0), // %0
  1389. "+r"(dst_u), // %1
  1390. "+r"(dst_v), // %2
  1391. "+rm"(width) // %3
  1392. : "r"((intptr_t)(src_stride_rgba)), // %4
  1393. "m"(kRGBAToV), // %5
  1394. "m"(kRGBAToU), // %6
  1395. "m"(kAddUV128) // %7
  1396. : "memory", "cc", NACL_R14
  1397. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1398. );
  1399. }
  1400. #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
  1401. // Read 8 UV from 444
  1402. #define READYUV444 \
  1403. "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1404. MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1405. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1406. "punpcklbw %%xmm1,%%xmm0 \n" \
  1407. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1408. "punpcklbw %%xmm4,%%xmm4 \n" \
  1409. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1410. // Read 4 UV from 422, upsample to 8 UV
  1411. #define READYUV422 \
  1412. "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1413. MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1414. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1415. "punpcklbw %%xmm1,%%xmm0 \n" \
  1416. "punpcklwd %%xmm0,%%xmm0 \n" \
  1417. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1418. "punpcklbw %%xmm4,%%xmm4 \n" \
  1419. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1420. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  1421. #define READYUVA422 \
  1422. "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1423. MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1424. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1425. "punpcklbw %%xmm1,%%xmm0 \n" \
  1426. "punpcklwd %%xmm0,%%xmm0 \n" \
  1427. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1428. "punpcklbw %%xmm4,%%xmm4 \n" \
  1429. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
  1430. "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  1431. "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
  1432. // Read 4 UV from NV12, upsample to 8 UV
  1433. #define READNV12 \
  1434. "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  1435. "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
  1436. "punpcklwd %%xmm0,%%xmm0 \n" \
  1437. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1438. "punpcklbw %%xmm4,%%xmm4 \n" \
  1439. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1440. // Read 4 VU from NV21, upsample to 8 UV
  1441. #define READNV21 \
  1442. "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  1443. "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
  1444. "pshufb %[kShuffleNV21], %%xmm0 \n" \
  1445. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1446. "punpcklbw %%xmm4,%%xmm4 \n" \
  1447. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 YUY2 groups (8 pixels): 8 Y, and upsample the 4 UV pairs to 8 UV.
  1449. #define READYUY2 \
  1450. "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
  1451. "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  1452. "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
  1453. "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  1454. "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
// Read 4 UYVY groups (8 pixels): 8 Y, and upsample the 4 UV pairs to 8 UV.
  1456. #define READUYVY \
  1457. "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
  1458. "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  1459. "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
  1460. "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  1461. "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
  1462. #if defined(__x86_64__)
  1463. #define YUVTORGB_SETUP(yuvconstants) \
  1464. "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
  1465. "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
  1466. "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
  1467. "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
  1468. "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
  1469. "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
  1470. "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
  1471. // Convert 8 pixels: 8 UV and 8 Y
  1472. #define YUVTORGB(yuvconstants) \
  1473. "movdqa %%xmm0,%%xmm1 \n" \
  1474. "movdqa %%xmm0,%%xmm2 \n" \
  1475. "movdqa %%xmm0,%%xmm3 \n" \
  1476. "movdqa %%xmm11,%%xmm0 \n" \
  1477. "pmaddubsw %%xmm8,%%xmm1 \n" \
  1478. "psubw %%xmm1,%%xmm0 \n" \
  1479. "movdqa %%xmm12,%%xmm1 \n" \
  1480. "pmaddubsw %%xmm9,%%xmm2 \n" \
  1481. "psubw %%xmm2,%%xmm1 \n" \
  1482. "movdqa %%xmm13,%%xmm2 \n" \
  1483. "pmaddubsw %%xmm10,%%xmm3 \n" \
  1484. "psubw %%xmm3,%%xmm2 \n" \
  1485. "pmulhuw %%xmm14,%%xmm4 \n" \
  1486. "paddsw %%xmm4,%%xmm0 \n" \
  1487. "paddsw %%xmm4,%%xmm1 \n" \
  1488. "paddsw %%xmm4,%%xmm2 \n" \
  1489. "psraw $0x6,%%xmm0 \n" \
  1490. "psraw $0x6,%%xmm1 \n" \
  1491. "psraw $0x6,%%xmm2 \n" \
  1492. "packuswb %%xmm0,%%xmm0 \n" \
  1493. "packuswb %%xmm1,%%xmm1 \n" \
  1494. "packuswb %%xmm2,%%xmm2 \n"
  1495. #define YUVTORGB_REGS \
  1496. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1497. #else
  1498. #define YUVTORGB_SETUP(yuvconstants)
  1499. // Convert 8 pixels: 8 UV and 8 Y
  1500. #define YUVTORGB(yuvconstants) \
  1501. "movdqa %%xmm0,%%xmm1 \n" \
  1502. "movdqa %%xmm0,%%xmm2 \n" \
  1503. "movdqa %%xmm0,%%xmm3 \n" \
  1504. "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
  1505. "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
  1506. "psubw %%xmm1,%%xmm0 \n" \
  1507. "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
  1508. "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
  1509. "psubw %%xmm2,%%xmm1 \n" \
  1510. "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
  1511. "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
  1512. "psubw %%xmm3,%%xmm2 \n" \
  1513. "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
  1514. "paddsw %%xmm4,%%xmm0 \n" \
  1515. "paddsw %%xmm4,%%xmm1 \n" \
  1516. "paddsw %%xmm4,%%xmm2 \n" \
  1517. "psraw $0x6,%%xmm0 \n" \
  1518. "psraw $0x6,%%xmm1 \n" \
  1519. "psraw $0x6,%%xmm2 \n" \
  1520. "packuswb %%xmm0,%%xmm0 \n" \
  1521. "packuswb %%xmm1,%%xmm1 \n" \
  1522. "packuswb %%xmm2,%%xmm2 \n"
  1523. #define YUVTORGB_REGS
  1524. #endif
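// A scalar sketch of the per-pixel fixed-point math performed by the YUVTORGB
// macros above, illustrative only.  UB/UG/VG/VR, the per-channel biases and
// YG stand in for values taken from struct YuvConstants (the exact table
// layout is not reproduced here); in the SIMD code the final 0..255 clamp is
// provided by packuswb.
static inline unsigned char YuvClamp_Sketch(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static inline void YuvPixel_Sketch(int y, int u, int v,
                                   int UB, int UG, int VG, int VR,
                                   int BB, int BG, int BR, int YG,
                                   unsigned char* b, unsigned char* g,
                                   unsigned char* r) {
  int y1 = (y * 0x0101 * YG) >> 16;  // pmulhuw of the duplicated Y byte.
  *b = YuvClamp_Sketch((BB - u * UB + y1) >> 6);
  *g = YuvClamp_Sketch((BG - (u * UG + v * VG) + y1) >> 6);
  *r = YuvClamp_Sketch((BR - v * VR + y1) >> 6);
}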
  1525. // Store 8 ARGB values.
  1526. #define STOREARGB \
  1527. "punpcklbw %%xmm1,%%xmm0 \n" \
  1528. "punpcklbw %%xmm5,%%xmm2 \n" \
  1529. "movdqa %%xmm0,%%xmm1 \n" \
  1530. "punpcklwd %%xmm2,%%xmm0 \n" \
  1531. "punpckhwd %%xmm2,%%xmm1 \n" \
  1532. "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
  1533. "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
  1534. "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
  1535. // Store 8 RGBA values.
  1536. #define STORERGBA \
  1537. "pcmpeqb %%xmm5,%%xmm5 \n" \
  1538. "punpcklbw %%xmm2,%%xmm1 \n" \
  1539. "punpcklbw %%xmm0,%%xmm5 \n" \
  1540. "movdqa %%xmm5,%%xmm0 \n" \
  1541. "punpcklwd %%xmm1,%%xmm5 \n" \
  1542. "punpckhwd %%xmm1,%%xmm0 \n" \
  1543. "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
  1544. "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
  1545. "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
  1546. void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
  1547. const uint8* u_buf,
  1548. const uint8* v_buf,
  1549. uint8* dst_argb,
  1550. const struct YuvConstants* yuvconstants,
  1551. int width) {
  1552. asm volatile (
  1553. YUVTORGB_SETUP(yuvconstants)
  1554. "sub %[u_buf],%[v_buf] \n"
  1555. "pcmpeqb %%xmm5,%%xmm5 \n"
  1556. LABELALIGN
  1557. "1: \n"
  1558. READYUV444
  1559. YUVTORGB(yuvconstants)
  1560. STOREARGB
  1561. "sub $0x8,%[width] \n"
  1562. "jg 1b \n"
  1563. : [y_buf]"+r"(y_buf), // %[y_buf]
  1564. [u_buf]"+r"(u_buf), // %[u_buf]
  1565. [v_buf]"+r"(v_buf), // %[v_buf]
  1566. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1567. [width]"+rm"(width) // %[width]
  1568. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1569. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1570. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1571. );
  1572. }
  1573. void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
  1574. const uint8* u_buf,
  1575. const uint8* v_buf,
  1576. uint8* dst_rgb24,
  1577. const struct YuvConstants* yuvconstants,
  1578. int width) {
  1579. asm volatile (
  1580. YUVTORGB_SETUP(yuvconstants)
  1581. "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  1582. "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  1583. "sub %[u_buf],%[v_buf] \n"
  1584. LABELALIGN
  1585. "1: \n"
  1586. READYUV422
  1587. YUVTORGB(yuvconstants)
  1588. "punpcklbw %%xmm1,%%xmm0 \n"
  1589. "punpcklbw %%xmm2,%%xmm2 \n"
  1590. "movdqa %%xmm0,%%xmm1 \n"
  1591. "punpcklwd %%xmm2,%%xmm0 \n"
  1592. "punpckhwd %%xmm2,%%xmm1 \n"
  1593. "pshufb %%xmm5,%%xmm0 \n"
  1594. "pshufb %%xmm6,%%xmm1 \n"
  1595. "palignr $0xc,%%xmm0,%%xmm1 \n"
  1596. "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
  1597. "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
  1598. "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
  1599. "subl $0x8,%[width] \n"
  1600. "jg 1b \n"
  1601. : [y_buf]"+r"(y_buf), // %[y_buf]
  1602. [u_buf]"+r"(u_buf), // %[u_buf]
  1603. [v_buf]"+r"(v_buf), // %[v_buf]
  1604. [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
  1605. #if defined(__i386__)
  1606. [width]"+m"(width) // %[width]
  1607. #else
  1608. [width]"+rm"(width) // %[width]
  1609. #endif
  1610. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1611. [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  1612. [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  1613. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1614. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1615. );
  1616. }
  1617. void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
  1618. const uint8* u_buf,
  1619. const uint8* v_buf,
  1620. uint8* dst_argb,
  1621. const struct YuvConstants* yuvconstants,
  1622. int width) {
  1623. asm volatile (
  1624. YUVTORGB_SETUP(yuvconstants)
  1625. "sub %[u_buf],%[v_buf] \n"
  1626. "pcmpeqb %%xmm5,%%xmm5 \n"
  1627. LABELALIGN
  1628. "1: \n"
  1629. READYUV422
  1630. YUVTORGB(yuvconstants)
  1631. STOREARGB
  1632. "sub $0x8,%[width] \n"
  1633. "jg 1b \n"
  1634. : [y_buf]"+r"(y_buf), // %[y_buf]
  1635. [u_buf]"+r"(u_buf), // %[u_buf]
  1636. [v_buf]"+r"(v_buf), // %[v_buf]
  1637. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1638. [width]"+rm"(width) // %[width]
  1639. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1640. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1641. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1642. );
  1643. }
  1644. #ifdef HAS_I422ALPHATOARGBROW_SSSE3
  1645. void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
  1646. const uint8* u_buf,
  1647. const uint8* v_buf,
  1648. const uint8* a_buf,
  1649. uint8* dst_argb,
  1650. const struct YuvConstants* yuvconstants,
  1651. int width) {
  1652. // clang-format off
  1653. asm volatile (
  1654. YUVTORGB_SETUP(yuvconstants)
  1655. "sub %[u_buf],%[v_buf] \n"
  1656. LABELALIGN
  1657. "1: \n"
  1658. READYUVA422
  1659. YUVTORGB(yuvconstants)
  1660. STOREARGB
  1661. "subl $0x8,%[width] \n"
  1662. "jg 1b \n"
  1663. : [y_buf]"+r"(y_buf), // %[y_buf]
  1664. [u_buf]"+r"(u_buf), // %[u_buf]
  1665. [v_buf]"+r"(v_buf), // %[v_buf]
  1666. [a_buf]"+r"(a_buf), // %[a_buf]
  1667. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1668. #if defined(__i386__)
  1669. [width]"+m"(width) // %[width]
  1670. #else
  1671. [width]"+rm"(width) // %[width]
  1672. #endif
  1673. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1674. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1675. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1676. );
  1677. // clang-format on
  1678. }
  1679. #endif // HAS_I422ALPHATOARGBROW_SSSE3
  1680. void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
  1681. const uint8* uv_buf,
  1682. uint8* dst_argb,
  1683. const struct YuvConstants* yuvconstants,
  1684. int width) {
  1685. // clang-format off
  1686. asm volatile (
  1687. YUVTORGB_SETUP(yuvconstants)
  1688. "pcmpeqb %%xmm5,%%xmm5 \n"
  1689. LABELALIGN
  1690. "1: \n"
  1691. READNV12
  1692. YUVTORGB(yuvconstants)
  1693. STOREARGB
  1694. "sub $0x8,%[width] \n"
  1695. "jg 1b \n"
  1696. : [y_buf]"+r"(y_buf), // %[y_buf]
  1697. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  1698. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1699. [width]"+rm"(width) // %[width]
  1700. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1701. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1702. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1703. );
  1704. // clang-format on
  1705. }
  1706. void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
  1707. const uint8* vu_buf,
  1708. uint8* dst_argb,
  1709. const struct YuvConstants* yuvconstants,
  1710. int width) {
  1711. // clang-format off
  1712. asm volatile (
  1713. YUVTORGB_SETUP(yuvconstants)
  1714. "pcmpeqb %%xmm5,%%xmm5 \n"
  1715. LABELALIGN
  1716. "1: \n"
  1717. READNV21
  1718. YUVTORGB(yuvconstants)
  1719. STOREARGB
  1720. "sub $0x8,%[width] \n"
  1721. "jg 1b \n"
  1722. : [y_buf]"+r"(y_buf), // %[y_buf]
  1723. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  1724. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1725. [width]"+rm"(width) // %[width]
  1726. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1727. [kShuffleNV21]"m"(kShuffleNV21)
  1728. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1729. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1730. );
  1731. // clang-format on
  1732. }
  1733. void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
  1734. uint8* dst_argb,
  1735. const struct YuvConstants* yuvconstants,
  1736. int width) {
  1737. // clang-format off
  1738. asm volatile (
  1739. YUVTORGB_SETUP(yuvconstants)
  1740. "pcmpeqb %%xmm5,%%xmm5 \n"
  1741. LABELALIGN
  1742. "1: \n"
  1743. READYUY2
  1744. YUVTORGB(yuvconstants)
  1745. STOREARGB
  1746. "sub $0x8,%[width] \n"
  1747. "jg 1b \n"
  1748. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  1749. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1750. [width]"+rm"(width) // %[width]
  1751. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1752. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  1753. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  1754. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1755. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1756. );
  1757. // clang-format on
  1758. }
  1759. void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
  1760. uint8* dst_argb,
  1761. const struct YuvConstants* yuvconstants,
  1762. int width) {
  1763. // clang-format off
  1764. asm volatile (
  1765. YUVTORGB_SETUP(yuvconstants)
  1766. "pcmpeqb %%xmm5,%%xmm5 \n"
  1767. LABELALIGN
  1768. "1: \n"
  1769. READUYVY
  1770. YUVTORGB(yuvconstants)
  1771. STOREARGB
  1772. "sub $0x8,%[width] \n"
  1773. "jg 1b \n"
  1774. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  1775. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1776. [width]"+rm"(width) // %[width]
  1777. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1778. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  1779. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  1780. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1781. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1782. );
  1783. // clang-format on
  1784. }
  1785. void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
  1786. const uint8* u_buf,
  1787. const uint8* v_buf,
  1788. uint8* dst_rgba,
  1789. const struct YuvConstants* yuvconstants,
  1790. int width) {
  1791. asm volatile (
  1792. YUVTORGB_SETUP(yuvconstants)
  1793. "sub %[u_buf],%[v_buf] \n"
  1794. "pcmpeqb %%xmm5,%%xmm5 \n"
  1795. LABELALIGN
  1796. "1: \n"
  1797. READYUV422
  1798. YUVTORGB(yuvconstants)
  1799. STORERGBA
  1800. "sub $0x8,%[width] \n"
  1801. "jg 1b \n"
  1802. : [y_buf]"+r"(y_buf), // %[y_buf]
  1803. [u_buf]"+r"(u_buf), // %[u_buf]
  1804. [v_buf]"+r"(v_buf), // %[v_buf]
  1805. [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
  1806. [width]"+rm"(width) // %[width]
  1807. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1808. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1809. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1810. );
  1811. }
  1812. #endif // HAS_I422TOARGBROW_SSSE3
  1813. // Read 16 UV from 444
  1814. #define READYUV444_AVX2 \
  1815. "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1816. MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1817. "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
  1818. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1819. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  1820. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1821. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1822. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1823. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1824. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1825. // Read 8 UV from 422, upsample to 16 UV.
  1826. #define READYUV422_AVX2 \
  1827. "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1828. MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1829. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1830. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1831. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1832. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1833. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1834. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1835. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1836. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1837. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  1838. #define READYUVA422_AVX2 \
  1839. "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1840. MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1841. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1842. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1843. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1844. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1845. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1846. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1847. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1848. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
  1849. "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  1850. "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  1851. "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
  1852. // Read 8 UV from NV12, upsample to 16 UV.
  1853. #define READNV12_AVX2 \
  1854. "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  1855. "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
  1856. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1857. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1858. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1859. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1860. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1861. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1862. // Read 8 VU from NV21, upsample to 16 UV.
  1863. #define READNV21_AVX2 \
  1864. "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  1865. "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
  1866. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1867. "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  1868. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1869. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1870. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1871. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1872. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  1873. #define READYUY2_AVX2 \
  1874. "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
  1875. "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  1876. "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
  1877. "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  1878. "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
  1879. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  1880. #define READUYVY_AVX2 \
  1881. "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
  1882. "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  1883. "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
  1884. "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  1885. "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
  1886. #if defined(__x86_64__)
  1887. #define YUVTORGB_SETUP_AVX2(yuvconstants) \
  1888. "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
  1889. "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
  1890. "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
  1891. "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
  1892. "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
  1893. "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
  1894. "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
  1895. #define YUVTORGB_AVX2(yuvconstants) \
  1896. "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  1897. "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  1898. "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  1899. "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  1900. "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  1901. "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  1902. "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  1903. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  1904. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  1905. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  1906. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  1907. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  1908. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  1909. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  1910. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  1911. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  1912. #define YUVTORGB_REGS_AVX2 \
  1913. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1914. #else // Convert 16 pixels: 16 UV and 16 Y.
  1915. #define YUVTORGB_SETUP_AVX2(yuvconstants)
  1916. #define YUVTORGB_AVX2(yuvconstants) \
  1917. "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
  1918. "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
  1919. "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
  1920. "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
  1921. "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
  1922. "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
  1923. "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
  1924. "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
  1925. "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
  1926. "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
  1927. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  1928. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  1929. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  1930. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  1931. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  1932. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  1933. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  1934. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  1935. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  1936. #define YUVTORGB_REGS_AVX2
  1937. #endif
  1938. // Store 16 ARGB values.
  1939. #define STOREARGB_AVX2 \
  1940. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1941. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1942. "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  1943. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  1944. "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  1945. "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  1946. "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
  1947. "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
  1948. "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
  1949. #ifdef HAS_I444TOARGBROW_AVX2
  1950. // 16 pixels
  1951. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  1952. void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
  1953. const uint8* u_buf,
  1954. const uint8* v_buf,
  1955. uint8* dst_argb,
  1956. const struct YuvConstants* yuvconstants,
  1957. int width) {
  1958. asm volatile (
  1959. YUVTORGB_SETUP_AVX2(yuvconstants)
  1960. "sub %[u_buf],%[v_buf] \n"
  1961. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  1962. LABELALIGN
  1963. "1: \n"
  1964. READYUV444_AVX2
  1965. YUVTORGB_AVX2(yuvconstants)
  1966. STOREARGB_AVX2
  1967. "sub $0x10,%[width] \n"
  1968. "jg 1b \n"
  1969. "vzeroupper \n"
  1970. : [y_buf]"+r"(y_buf), // %[y_buf]
  1971. [u_buf]"+r"(u_buf), // %[u_buf]
  1972. [v_buf]"+r"(v_buf), // %[v_buf]
  1973. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1974. [width]"+rm"(width) // %[width]
  1975. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1976. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  1977. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1978. );
  1979. }
  1980. #endif // HAS_I444TOARGBROW_AVX2
  1981. #if defined(HAS_I422TOARGBROW_AVX2)
  1982. // 16 pixels
  1983. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  1984. void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
  1985. const uint8* u_buf,
  1986. const uint8* v_buf,
  1987. uint8* dst_argb,
  1988. const struct YuvConstants* yuvconstants,
  1989. int width) {
  1990. asm volatile (
  1991. YUVTORGB_SETUP_AVX2(yuvconstants)
  1992. "sub %[u_buf],%[v_buf] \n"
  1993. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  1994. LABELALIGN
  1995. "1: \n"
  1996. READYUV422_AVX2
  1997. YUVTORGB_AVX2(yuvconstants)
  1998. STOREARGB_AVX2
  1999. "sub $0x10,%[width] \n"
  2000. "jg 1b \n"
  2001. "vzeroupper \n"
  2002. : [y_buf]"+r"(y_buf), // %[y_buf]
  2003. [u_buf]"+r"(u_buf), // %[u_buf]
  2004. [v_buf]"+r"(v_buf), // %[v_buf]
  2005. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2006. [width]"+rm"(width) // %[width]
  2007. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2008. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2009. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2010. );
  2011. }
  2012. #endif // HAS_I422TOARGBROW_AVX2
  2013. #if defined(HAS_I422ALPHATOARGBROW_AVX2)
  2014. // 16 pixels
  2015. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  2016. void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
  2017. const uint8* u_buf,
  2018. const uint8* v_buf,
  2019. const uint8* a_buf,
  2020. uint8* dst_argb,
  2021. const struct YuvConstants* yuvconstants,
  2022. int width) {
  2023. // clang-format off
  2024. asm volatile (
  2025. YUVTORGB_SETUP_AVX2(yuvconstants)
  2026. "sub %[u_buf],%[v_buf] \n"
  2027. LABELALIGN
  2028. "1: \n"
  2029. READYUVA422_AVX2
  2030. YUVTORGB_AVX2(yuvconstants)
  2031. STOREARGB_AVX2
  2032. "subl $0x10,%[width] \n"
  2033. "jg 1b \n"
  2034. "vzeroupper \n"
  2035. : [y_buf]"+r"(y_buf), // %[y_buf]
  2036. [u_buf]"+r"(u_buf), // %[u_buf]
  2037. [v_buf]"+r"(v_buf), // %[v_buf]
  2038. [a_buf]"+r"(a_buf), // %[a_buf]
  2039. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2040. #if defined(__i386__)
  2041. [width]"+m"(width) // %[width]
  2042. #else
  2043. [width]"+rm"(width) // %[width]
  2044. #endif
  2045. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2046. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2047. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2048. );
  2049. // clang-format on
  2050. }
  2051. #endif // HAS_I422ALPHATOARGBROW_AVX2
  2052. #if defined(HAS_I422TORGBAROW_AVX2)
  2053. // 16 pixels
  2054. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2055. void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
  2056. const uint8* u_buf,
  2057. const uint8* v_buf,
  2058. uint8* dst_argb,
  2059. const struct YuvConstants* yuvconstants,
  2060. int width) {
  2061. asm volatile (
  2062. YUVTORGB_SETUP_AVX2(yuvconstants)
  2063. "sub %[u_buf],%[v_buf] \n"
  2064. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2065. LABELALIGN
  2066. "1: \n"
  2067. READYUV422_AVX2
  2068. YUVTORGB_AVX2(yuvconstants)
  2069. // Step 3: Weave into RGBA
  2070. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  2071. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2072. "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
  2073. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2074. "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
  2075. "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
  2076. "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
  2077. "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
  2078. "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
  2079. "sub $0x10,%[width] \n"
  2080. "jg 1b \n"
  2081. "vzeroupper \n"
  2082. : [y_buf]"+r"(y_buf), // %[y_buf]
  2083. [u_buf]"+r"(u_buf), // %[u_buf]
  2084. [v_buf]"+r"(v_buf), // %[v_buf]
  2085. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2086. [width]"+rm"(width) // %[width]
  2087. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2088. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2089. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2090. );
  2091. }
  2092. #endif // HAS_I422TORGBAROW_AVX2
  2093. #if defined(HAS_NV12TOARGBROW_AVX2)
  2094. // 16 pixels.
  2095. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2096. void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
  2097. const uint8* uv_buf,
  2098. uint8* dst_argb,
  2099. const struct YuvConstants* yuvconstants,
  2100. int width) {
  2101. // clang-format off
  2102. asm volatile (
  2103. YUVTORGB_SETUP_AVX2(yuvconstants)
  2104. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2105. LABELALIGN
  2106. "1: \n"
  2107. READNV12_AVX2
  2108. YUVTORGB_AVX2(yuvconstants)
  2109. STOREARGB_AVX2
  2110. "sub $0x10,%[width] \n"
  2111. "jg 1b \n"
  2112. "vzeroupper \n"
  2113. : [y_buf]"+r"(y_buf), // %[y_buf]
  2114. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2115. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2116. [width]"+rm"(width) // %[width]
  2117. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2118. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2119. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2120. );
  2121. // clang-format on
  2122. }
  2123. #endif // HAS_NV12TOARGBROW_AVX2
  2124. #if defined(HAS_NV21TOARGBROW_AVX2)
  2125. // 16 pixels.
  2126. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2127. void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
  2128. const uint8* vu_buf,
  2129. uint8* dst_argb,
  2130. const struct YuvConstants* yuvconstants,
  2131. int width) {
  2132. // clang-format off
  2133. asm volatile (
  2134. YUVTORGB_SETUP_AVX2(yuvconstants)
  2135. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2136. LABELALIGN
  2137. "1: \n"
  2138. READNV21_AVX2
  2139. YUVTORGB_AVX2(yuvconstants)
  2140. STOREARGB_AVX2
  2141. "sub $0x10,%[width] \n"
  2142. "jg 1b \n"
  2143. "vzeroupper \n"
  2144. : [y_buf]"+r"(y_buf), // %[y_buf]
  2145. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2146. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2147. [width]"+rm"(width) // %[width]
  2148. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2149. [kShuffleNV21]"m"(kShuffleNV21)
  2150. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2151. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2152. );
  2153. // clang-format on
  2154. }
  2155. #endif // HAS_NV21TOARGBROW_AVX2
  2156. #if defined(HAS_YUY2TOARGBROW_AVX2)
  2157. // 16 pixels.
  2158. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2159. void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
  2160. uint8* dst_argb,
  2161. const struct YuvConstants* yuvconstants,
  2162. int width) {
  2163. // clang-format off
  2164. asm volatile (
  2165. YUVTORGB_SETUP_AVX2(yuvconstants)
  2166. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2167. LABELALIGN
  2168. "1: \n"
  2169. READYUY2_AVX2
  2170. YUVTORGB_AVX2(yuvconstants)
  2171. STOREARGB_AVX2
  2172. "sub $0x10,%[width] \n"
  2173. "jg 1b \n"
  2174. "vzeroupper \n"
  2175. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2176. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2177. [width]"+rm"(width) // %[width]
  2178. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2179. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2180. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2181. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2182. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2183. );
  2184. // clang-format on
  2185. }
  2186. #endif // HAS_YUY2TOARGBROW_AVX2
  2187. #if defined(HAS_UYVYTOARGBROW_AVX2)
  2188. // 16 pixels.
  2189. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2190. void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
  2191. uint8* dst_argb,
  2192. const struct YuvConstants* yuvconstants,
  2193. int width) {
  2194. // clang-format off
  2195. asm volatile (
  2196. YUVTORGB_SETUP_AVX2(yuvconstants)
  2197. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2198. LABELALIGN
  2199. "1: \n"
  2200. READUYVY_AVX2
  2201. YUVTORGB_AVX2(yuvconstants)
  2202. STOREARGB_AVX2
  2203. "sub $0x10,%[width] \n"
  2204. "jg 1b \n"
  2205. "vzeroupper \n"
  2206. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2207. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2208. [width]"+rm"(width) // %[width]
  2209. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2210. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2211. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2212. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2213. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2214. );
  2215. // clang-format on
  2216. }
  2217. #endif // HAS_UYVYTOARGBROW_AVX2
  2218. #ifdef HAS_I400TOARGBROW_SSE2
  2219. void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  2220. asm volatile (
  2221. "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
  2222. "movd %%eax,%%xmm2 \n"
  2223. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  2224. "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
  2225. "movd %%eax,%%xmm3 \n"
  2226. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  2227. "pcmpeqb %%xmm4,%%xmm4 \n"
  2228. "pslld $0x18,%%xmm4 \n"
  2229. LABELALIGN
  2230. "1: \n"
  2231. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2232. "movq " MEMACCESS(0) ",%%xmm0 \n"
  2233. "lea " MEMLEA(0x8,0) ",%0 \n"
  2234. "punpcklbw %%xmm0,%%xmm0 \n"
  2235. "pmulhuw %%xmm2,%%xmm0 \n"
  2236. "psubusw %%xmm3,%%xmm0 \n"
  2237. "psrlw $6, %%xmm0 \n"
  2238. "packuswb %%xmm0,%%xmm0 \n"
  2239. // Step 2: Weave into ARGB
  2240. "punpcklbw %%xmm0,%%xmm0 \n"
  2241. "movdqa %%xmm0,%%xmm1 \n"
  2242. "punpcklwd %%xmm0,%%xmm0 \n"
  2243. "punpckhwd %%xmm1,%%xmm1 \n"
  2244. "por %%xmm4,%%xmm0 \n"
  2245. "por %%xmm4,%%xmm1 \n"
  2246. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2247. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  2248. "lea " MEMLEA(0x20,1) ",%1 \n"
  2249. "sub $0x8,%2 \n"
  2250. "jg 1b \n"
  2251. : "+r"(y_buf), // %0
  2252. "+r"(dst_argb), // %1
  2253. "+rm"(width) // %2
  2254. :
  2255. : "memory", "cc", "eax"
  2256. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2257. );
  2258. }
  2259. #endif // HAS_I400TOARGBROW_SSE2
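// Scalar sketch of the I400 expansion above, illustrative only: grey Y is
// scaled by 1.164 with a -16 offset in fixed point, replicated to B/G/R, and
// alpha is forced to 255.  18997 and 1160 are the same constants the SSE2 and
// AVX2 rows load into their multiplier and bias registers.
static inline void I400ToARGBPixel_Sketch(unsigned char y,
                                          unsigned char* dst_argb) {
  int g = (y * 0x0101 * 18997) >> 16;       // pmulhuw of the duplicated Y byte.
  g = (g > 1160) ? ((g - 1160) >> 6) : 0;   // psubusw saturates at zero.
  if (g > 255) g = 255;                     // packuswb clamps.
  dst_argb[0] = dst_argb[1] = dst_argb[2] = (unsigned char)g;
  dst_argb[3] = 255;
}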
  2260. #ifdef HAS_I400TOARGBROW_AVX2
  2261. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// Note: AVX2 vpunpcklbw interleaves within each 128-bit lane (mutating the
// element order) and vpackuswb also packs per lane, which undoes that
// mutation, so no extra permute is needed between them.
  2263. void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  2264. asm volatile (
  2265. "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
  2266. "vmovd %%eax,%%xmm2 \n"
  2267. "vbroadcastss %%xmm2,%%ymm2 \n"
  2268. "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
  2269. "vmovd %%eax,%%xmm3 \n"
  2270. "vbroadcastss %%xmm3,%%ymm3 \n"
  2271. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
  2272. "vpslld $0x18,%%ymm4,%%ymm4 \n"
  2273. LABELALIGN
  2274. "1: \n"
  2275. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2276. "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
  2277. "lea " MEMLEA(0x10,0) ",%0 \n"
  2278. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2279. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  2280. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  2281. "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
  2282. "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
  2283. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  2284. "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
  2285. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2286. "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
  2287. "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
  2288. "vpor %%ymm4,%%ymm0,%%ymm0 \n"
  2289. "vpor %%ymm4,%%ymm1,%%ymm1 \n"
  2290. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2291. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  2292. "lea " MEMLEA(0x40,1) ",%1 \n"
  2293. "sub $0x10,%2 \n"
  2294. "jg 1b \n"
  2295. "vzeroupper \n"
  2296. : "+r"(y_buf), // %0
  2297. "+r"(dst_argb), // %1
  2298. "+rm"(width) // %2
  2299. :
  2300. : "memory", "cc", "eax"
  2301. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2302. );
  2303. }
  2304. #endif // HAS_I400TOARGBROW_AVX2
  2305. #ifdef HAS_MIRRORROW_SSSE3
  2306. // Shuffle table for reversing the bytes.
  2307. static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
  2308. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2309. void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  2310. intptr_t temp_width = (intptr_t)(width);
  2311. asm volatile (
  2312. "movdqa %3,%%xmm5 \n"
  2313. LABELALIGN
  2314. "1: \n"
  2315. MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
  2316. "pshufb %%xmm5,%%xmm0 \n"
  2317. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2318. "lea " MEMLEA(0x10,1) ",%1 \n"
  2319. "sub $0x10,%2 \n"
  2320. "jg 1b \n"
  2321. : "+r"(src), // %0
  2322. "+r"(dst), // %1
  2323. "+r"(temp_width) // %2
  2324. : "m"(kShuffleMirror) // %3
  2325. : "memory", "cc", NACL_R14
  2326. "xmm0", "xmm5"
  2327. );
  2328. }
  2329. #endif // HAS_MIRRORROW_SSSE3
  2330. #ifdef HAS_MIRRORROW_AVX2
  2331. void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2332. intptr_t temp_width = (intptr_t)(width);
  2333. asm volatile (
  2334. "vbroadcastf128 %3,%%ymm5 \n"
  2335. LABELALIGN
  2336. "1: \n"
  2337. MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
  2338. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  2339. "vpermq $0x4e,%%ymm0,%%ymm0 \n"
  2340. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2341. "lea " MEMLEA(0x20,1) ",%1 \n"
  2342. "sub $0x20,%2 \n"
  2343. "jg 1b \n"
  2344. "vzeroupper \n"
  2345. : "+r"(src), // %0
  2346. "+r"(dst), // %1
  2347. "+r"(temp_width) // %2
  2348. : "m"(kShuffleMirror) // %3
  2349. : "memory", "cc", NACL_R14
  2350. "xmm0", "xmm5"
  2351. );
  2352. }
  2353. #endif // HAS_MIRRORROW_AVX2
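// Scalar equivalent of the mirror rows above, illustrative only: the row is
// copied to the destination in reverse byte order.
static inline void MirrorRow_ScalarSketch(const unsigned char* src,
                                          unsigned char* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}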
  2354. #ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing and de-interleaving the UV channels (U to the
// low 8 bytes, V to the high 8 bytes).
  2356. static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
  2357. 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
  2358. void MirrorUVRow_SSSE3(const uint8* src,
  2359. uint8* dst_u,
  2360. uint8* dst_v,
  2361. int width) {
  2362. intptr_t temp_width = (intptr_t)(width);
  2363. asm volatile (
  2364. "movdqa %4,%%xmm1 \n"
  2365. "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
  2366. "sub %1,%2 \n"
  2367. LABELALIGN
  2368. "1: \n"
  2369. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2370. "lea " MEMLEA(-0x10,0) ",%0 \n"
  2371. "pshufb %%xmm1,%%xmm0 \n"
  2372. "movlpd %%xmm0," MEMACCESS(1) " \n"
  2373. MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
  2374. "lea " MEMLEA(0x8,1) ",%1 \n"
  2375. "sub $8,%3 \n"
  2376. "jg 1b \n"
  2377. : "+r"(src), // %0
  2378. "+r"(dst_u), // %1
  2379. "+r"(dst_v), // %2
  2380. "+r"(temp_width) // %3
  2381. : "m"(kShuffleMirrorUV) // %4
  2382. : "memory", "cc", NACL_R14
  2383. "xmm0", "xmm1"
  2384. );
  2385. }
  2386. #endif // HAS_MIRRORUVROW_SSSE3
  2387. #ifdef HAS_ARGBMIRRORROW_SSE2
  2388. void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  2389. intptr_t temp_width = (intptr_t)(width);
  2390. asm volatile (
  2391. "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
  2392. LABELALIGN
  2393. "1: \n"
  2394. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2395. "pshufd $0x1b,%%xmm0,%%xmm0 \n"
  2396. "lea " MEMLEA(-0x10,0) ",%0 \n"
  2397. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2398. "lea " MEMLEA(0x10,1) ",%1 \n"
  2399. "sub $0x4,%2 \n"
  2400. "jg 1b \n"
  2401. : "+r"(src), // %0
  2402. "+r"(dst), // %1
  2403. "+r"(temp_width) // %2
  2404. :
  2405. : "memory", "cc"
  2406. , "xmm0"
  2407. );
  2408. }
  2409. #endif // HAS_ARGBMIRRORROW_SSE2
  2410. #ifdef HAS_ARGBMIRRORROW_AVX2
// Permute table (vpermd indices) for reversing whole 32-bit ARGB pixels.
  2412. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2413. void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2414. intptr_t temp_width = (intptr_t)(width);
  2415. asm volatile (
  2416. "vmovdqu %3,%%ymm5 \n"
  2417. LABELALIGN
  2418. "1: \n"
  2419. VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
  2420. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2421. "lea " MEMLEA(0x20,1) ",%1 \n"
  2422. "sub $0x8,%2 \n"
  2423. "jg 1b \n"
  2424. "vzeroupper \n"
  2425. : "+r"(src), // %0
  2426. "+r"(dst), // %1
  2427. "+r"(temp_width) // %2
  2428. : "m"(kARGBShuffleMirror_AVX2) // %3
  2429. : "memory", "cc", NACL_R14
  2430. "xmm0", "xmm5"
  2431. );
  2432. }
  2433. #endif // HAS_ARGBMIRRORROW_AVX2
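// Unlike MirrorRow, ARGBMirrorRow reverses whole 4-byte pixels rather than
// individual bytes.  Illustrative scalar equivalent:
static inline void ARGBMirrorRow_ScalarSketch(const unsigned char* src,
                                              unsigned char* dst, int width) {
  for (int x = 0; x < width; ++x) {
    const unsigned char* p = src + (width - 1 - x) * 4;
    dst[4 * x + 0] = p[0];
    dst[4 * x + 1] = p[1];
    dst[4 * x + 2] = p[2];
    dst[4 * x + 3] = p[3];
  }
}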
  2434. #ifdef HAS_SPLITUVROW_AVX2
  2435. void SplitUVRow_AVX2(const uint8* src_uv,
  2436. uint8* dst_u,
  2437. uint8* dst_v,
  2438. int width) {
  2439. asm volatile (
  2440. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2441. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  2442. "sub %1,%2 \n"
  2443. LABELALIGN
  2444. "1: \n"
  2445. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2446. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  2447. "lea " MEMLEA(0x40,0) ",%0 \n"
  2448. "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
  2449. "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
  2450. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  2451. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  2452. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  2453. "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
  2454. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2455. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2456. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2457. MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
  2458. "lea " MEMLEA(0x20,1) ",%1 \n"
  2459. "sub $0x20,%3 \n"
  2460. "jg 1b \n"
  2461. "vzeroupper \n"
  2462. : "+r"(src_uv), // %0
  2463. "+r"(dst_u), // %1
  2464. "+r"(dst_v), // %2
  2465. "+r"(width) // %3
  2466. :
  2467. : "memory", "cc", NACL_R14
  2468. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2469. );
  2470. }
  2471. #endif // HAS_SPLITUVROW_AVX2
  2472. #ifdef HAS_SPLITUVROW_SSE2
  2473. void SplitUVRow_SSE2(const uint8* src_uv,
  2474. uint8* dst_u,
  2475. uint8* dst_v,
  2476. int width) {
  2477. asm volatile (
  2478. "pcmpeqb %%xmm5,%%xmm5 \n"
  2479. "psrlw $0x8,%%xmm5 \n"
  2480. "sub %1,%2 \n"
  2481. LABELALIGN
  2482. "1: \n"
  2483. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2484. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2485. "lea " MEMLEA(0x20,0) ",%0 \n"
  2486. "movdqa %%xmm0,%%xmm2 \n"
  2487. "movdqa %%xmm1,%%xmm3 \n"
  2488. "pand %%xmm5,%%xmm0 \n"
  2489. "pand %%xmm5,%%xmm1 \n"
  2490. "packuswb %%xmm1,%%xmm0 \n"
  2491. "psrlw $0x8,%%xmm2 \n"
  2492. "psrlw $0x8,%%xmm3 \n"
  2493. "packuswb %%xmm3,%%xmm2 \n"
  2494. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2495. MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
  2496. "lea " MEMLEA(0x10,1) ",%1 \n"
  2497. "sub $0x10,%3 \n"
  2498. "jg 1b \n"
  2499. : "+r"(src_uv), // %0
  2500. "+r"(dst_u), // %1
  2501. "+r"(dst_v), // %2
  2502. "+r"(width) // %3
  2503. :
  2504. : "memory", "cc", NACL_R14
  2505. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2506. );
  2507. }
  2508. #endif // HAS_SPLITUVROW_SSE2
  2509. #ifdef HAS_MERGEUVROW_AVX2
  2510. void MergeUVRow_AVX2(const uint8* src_u,
  2511. const uint8* src_v,
  2512. uint8* dst_uv,
  2513. int width) {
  2514. asm volatile (
  2515. "sub %0,%1 \n"
  2516. LABELALIGN
  2517. "1: \n"
  2518. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2519. MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
  2520. "lea " MEMLEA(0x20,0) ",%0 \n"
  2521. "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
  2522. "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
  2523. "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
  2524. "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
  2525. "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
  2526. "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
  2527. "lea " MEMLEA(0x40,2) ",%2 \n"
  2528. "sub $0x20,%3 \n"
  2529. "jg 1b \n"
  2530. "vzeroupper \n"
  2531. : "+r"(src_u), // %0
  2532. "+r"(src_v), // %1
  2533. "+r"(dst_uv), // %2
  2534. "+r"(width) // %3
  2535. :
  2536. : "memory", "cc", NACL_R14
  2537. "xmm0", "xmm1", "xmm2"
  2538. );
  2539. }
  2540. #endif // HAS_MERGEUVROW_AVX2
  2541. #ifdef HAS_MERGEUVROW_SSE2
  2542. void MergeUVRow_SSE2(const uint8* src_u,
  2543. const uint8* src_v,
  2544. uint8* dst_uv,
  2545. int width) {
  2546. asm volatile (
  2547. "sub %0,%1 \n"
  2548. LABELALIGN
  2549. "1: \n"
  2550. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2551. MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
  2552. "lea " MEMLEA(0x10,0) ",%0 \n"
  2553. "movdqa %%xmm0,%%xmm2 \n"
  2554. "punpcklbw %%xmm1,%%xmm0 \n"
  2555. "punpckhbw %%xmm1,%%xmm2 \n"
  2556. "movdqu %%xmm0," MEMACCESS(2) " \n"
  2557. "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
  2558. "lea " MEMLEA(0x20,2) ",%2 \n"
  2559. "sub $0x10,%3 \n"
  2560. "jg 1b \n"
  2561. : "+r"(src_u), // %0
  2562. "+r"(src_v), // %1
  2563. "+r"(dst_uv), // %2
  2564. "+r"(width) // %3
  2565. :
  2566. : "memory", "cc", NACL_R14
  2567. "xmm0", "xmm1", "xmm2"
  2568. );
  2569. }
  2570. #endif // HAS_MERGEUVROW_SSE2
// Use scale to convert lsb formats to msb, depending on how many bits there are:
  2572. // 128 = 9 bits
  2573. // 64 = 10 bits
  2574. // 16 = 12 bits
  2575. // 1 = 16 bits
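// Illustrative sketch (not part of libyuv): the scale values above are just
// 1 << (16 - bits), so multiplying an lsb-justified sample by the scale moves
// it into the most significant bits of the 16 bit word; the vpmullw in the
// loops below does the same thing 16 lanes at a time.
static inline uint16 ScaleToMsb_Sketch(uint16 v, int bits) {
  return (uint16)(v * (1 << (16 - bits)));  // e.g. a 10 bit sample: v * 64.
}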
  2576. #ifdef HAS_MERGEUVROW_16_AVX2
  2577. void MergeUVRow_16_AVX2(const uint16* src_u,
  2578. const uint16* src_v,
  2579. uint16* dst_uv,
  2580. int scale,
  2581. int width) {
  2582. // clang-format off
  2583. asm volatile (
  2584. "vmovd %4,%%xmm3 \n"
  2585. "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
  2586. "vbroadcastss %%xmm3,%%ymm3 \n"
  2587. "sub %0,%1 \n"
  2588. // 16 pixels per loop.
  2589. LABELALIGN
  2590. "1: \n"
  2591. "vmovdqu (%0),%%ymm0 \n"
  2592. "vmovdqu (%0,%1,1),%%ymm1 \n"
  2593. "add $0x20,%0 \n"
  2594. "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
  2595. "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
  2596. "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
  2597. "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
  2598. "vextractf128 $0x0,%%ymm2,(%2) \n"
  2599. "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
  2600. "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
  2601. "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
  2602. "add $0x40,%2 \n"
  2603. "sub $0x10,%3 \n"
  2604. "jg 1b \n"
  2605. "vzeroupper \n"
  2606. : "+r"(src_u), // %0
  2607. "+r"(src_v), // %1
  2608. "+r"(dst_uv), // %2
  2609. "+r"(width) // %3
  2610. : "r"(scale) // %4
  2611. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  2612. // clang-format on
  2613. }
#endif // HAS_MERGEUVROW_16_AVX2
  2615. #ifdef HAS_MULTIPLYROW_16_AVX2
  2616. void MultiplyRow_16_AVX2(const uint16* src_y,
  2617. uint16* dst_y,
  2618. int scale,
  2619. int width) {
  2620. // clang-format off
  2621. asm volatile (
  2622. "vmovd %3,%%xmm3 \n"
  2623. "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
  2624. "vbroadcastss %%xmm3,%%ymm3 \n"
  2625. "sub %0,%1 \n"
// 32 pixels per loop.
  2627. LABELALIGN
  2628. "1: \n"
  2629. "vmovdqu (%0),%%ymm0 \n"
  2630. "vmovdqu 0x20(%0),%%ymm1 \n"
  2631. "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
  2632. "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
  2633. "vmovdqu %%ymm0,(%0,%1) \n"
  2634. "vmovdqu %%ymm1,0x20(%0,%1) \n"
  2635. "add $0x40,%0 \n"
  2636. "sub $0x20,%2 \n"
  2637. "jg 1b \n"
  2638. "vzeroupper \n"
  2639. : "+r"(src_y), // %0
  2640. "+r"(dst_y), // %1
  2641. "+r"(width) // %2
  2642. : "r"(scale) // %3
  2643. : "memory", "cc", "xmm0", "xmm1", "xmm3");
  2644. // clang-format on
  2645. }
  2646. #endif // HAS_MULTIPLYROW_16_AVX2
  2647. #ifdef HAS_SPLITRGBROW_SSSE3
  2648. // Shuffle table for converting RGB to Planar.
  2649. static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
  2650. 128u, 128u, 128u, 128u, 128u, 128u,
  2651. 128u, 128u, 128u, 128u};
  2652. static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
  2653. 2u, 5u, 8u, 11u, 14u, 128u,
  2654. 128u, 128u, 128u, 128u};
  2655. static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
  2656. 128u, 128u, 128u, 128u, 128u, 1u,
  2657. 4u, 7u, 10u, 13u};
  2658. static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
  2659. 128u, 128u, 128u, 128u, 128u, 128u,
  2660. 128u, 128u, 128u, 128u};
  2661. static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
  2662. 3u, 6u, 9u, 12u, 15u, 128u,
  2663. 128u, 128u, 128u, 128u};
  2664. static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
  2665. 128u, 128u, 128u, 128u, 128u, 2u,
  2666. 5u, 8u, 11u, 14u};
  2667. static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
  2668. 128u, 128u, 128u, 128u, 128u, 128u,
  2669. 128u, 128u, 128u, 128u};
  2670. static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
  2671. 4u, 7u, 10u, 13u, 128u, 128u,
  2672. 128u, 128u, 128u, 128u};
  2673. static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
  2674. 128u, 128u, 128u, 128u, 0u, 3u,
  2675. 6u, 9u, 12u, 15u};
  2676. void SplitRGBRow_SSSE3(const uint8* src_rgb,
  2677. uint8* dst_r,
  2678. uint8* dst_g,
  2679. uint8* dst_b,
  2680. int width) {
  2681. asm volatile (
  2682. LABELALIGN
  2683. "1: \n"
  2684. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2685. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2686. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  2687. "pshufb %5, %%xmm0 \n"
  2688. "pshufb %6, %%xmm1 \n"
  2689. "pshufb %7, %%xmm2 \n"
  2690. "por %%xmm1,%%xmm0 \n"
  2691. "por %%xmm2,%%xmm0 \n"
  2692. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2693. "lea " MEMLEA(0x10,1) ",%1 \n"
  2694. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2695. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2696. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  2697. "pshufb %8, %%xmm0 \n"
  2698. "pshufb %9, %%xmm1 \n"
  2699. "pshufb %10, %%xmm2 \n"
  2700. "por %%xmm1,%%xmm0 \n"
  2701. "por %%xmm2,%%xmm0 \n"
  2702. "movdqu %%xmm0," MEMACCESS(2) " \n"
  2703. "lea " MEMLEA(0x10,2) ",%2 \n"
  2704. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2705. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2706. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  2707. "pshufb %11, %%xmm0 \n"
  2708. "pshufb %12, %%xmm1 \n"
  2709. "pshufb %13, %%xmm2 \n"
  2710. "por %%xmm1,%%xmm0 \n"
  2711. "por %%xmm2,%%xmm0 \n"
  2712. "movdqu %%xmm0," MEMACCESS(3) " \n"
  2713. "lea " MEMLEA(0x10,3) ",%3 \n"
  2714. "lea " MEMLEA(0x30,0) ",%0 \n"
  2715. "sub $0x10,%4 \n"
  2716. "jg 1b \n"
  2717. : "+r"(src_rgb), // %0
  2718. "+r"(dst_r), // %1
  2719. "+r"(dst_g), // %2
  2720. "+r"(dst_b), // %3
  2721. "+r"(width) // %4
  2722. : "m"(kShuffleMaskRGBToR0), // %5
  2723. "m"(kShuffleMaskRGBToR1), // %6
  2724. "m"(kShuffleMaskRGBToR2), // %7
  2725. "m"(kShuffleMaskRGBToG0), // %8
  2726. "m"(kShuffleMaskRGBToG1), // %9
  2727. "m"(kShuffleMaskRGBToG2), // %10
  2728. "m"(kShuffleMaskRGBToB0), // %11
  2729. "m"(kShuffleMaskRGBToB1), // %12
  2730. "m"(kShuffleMaskRGBToB2) // %13
  2731. : "memory", "cc", NACL_R14
  2732. "xmm0", "xmm1", "xmm2"
  2733. );
  2734. }
  2735. #endif // HAS_SPLITRGBROW_SSSE3
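// Scalar sketch (illustrative helper, not part of libyuv) of what the nine
// shuffle masks above accomplish: every third byte of the packed RGB row is
// gathered into its own plane.  MergeRGBRow_SSSE3 below performs the inverse.
static inline void SplitRGBRow_Sketch(const uint8* src_rgb,
                                      uint8* dst_r,
                                      uint8* dst_g,
                                      uint8* dst_b,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    dst_r[i] = src_rgb[3 * i + 0];
    dst_g[i] = src_rgb[3 * i + 1];
    dst_b[i] = src_rgb[3 * i + 2];
  }
}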
  2736. #ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
  2738. static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
  2739. 2u, 128u, 128u, 3u, 128u, 128u,
  2740. 4u, 128u, 128u, 5u};
  2741. static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
  2742. 128u, 2u, 128u, 128u, 3u, 128u,
  2743. 128u, 4u, 128u, 128u};
  2744. static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
  2745. 128u, 128u, 2u, 128u, 128u, 3u,
  2746. 128u, 128u, 4u, 128u};
  2747. static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
  2748. 7u, 128u, 128u, 8u, 128u, 128u,
  2749. 9u, 128u, 128u, 10u};
  2750. static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
  2751. 128u, 7u, 128u, 128u, 8u, 128u,
  2752. 128u, 9u, 128u, 128u};
  2753. static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
  2754. 128u, 128u, 8u, 128u, 128u, 9u,
  2755. 128u, 128u, 10u, 128u};
  2756. static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
  2757. 12u, 128u, 128u, 13u, 128u, 128u,
  2758. 14u, 128u, 128u, 15u};
  2759. static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
  2760. 128u, 13u, 128u, 128u, 14u, 128u,
  2761. 128u, 15u, 128u, 128u};
  2762. static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
  2763. 128u, 128u, 13u, 128u, 128u, 14u,
  2764. 128u, 128u, 15u, 128u};
  2765. void MergeRGBRow_SSSE3(const uint8* src_r,
  2766. const uint8* src_g,
  2767. const uint8* src_b,
  2768. uint8* dst_rgb,
  2769. int width) {
  2770. asm volatile (
  2771. LABELALIGN
  2772. "1: \n"
  2773. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2774. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  2775. "movdqu " MEMACCESS(2) ",%%xmm2 \n"
  2776. "pshufb %5, %%xmm0 \n"
  2777. "pshufb %6, %%xmm1 \n"
  2778. "pshufb %7, %%xmm2 \n"
  2779. "por %%xmm1,%%xmm0 \n"
  2780. "por %%xmm2,%%xmm0 \n"
  2781. "movdqu %%xmm0," MEMACCESS(3) " \n"
  2782. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2783. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  2784. "movdqu " MEMACCESS(2) ",%%xmm2 \n"
  2785. "pshufb %8, %%xmm0 \n"
  2786. "pshufb %9, %%xmm1 \n"
  2787. "pshufb %10, %%xmm2 \n"
  2788. "por %%xmm1,%%xmm0 \n"
  2789. "por %%xmm2,%%xmm0 \n"
  2790. "movdqu %%xmm0," MEMACCESS2(16, 3) " \n"
  2791. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2792. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  2793. "movdqu " MEMACCESS(2) ",%%xmm2 \n"
  2794. "pshufb %11, %%xmm0 \n"
  2795. "pshufb %12, %%xmm1 \n"
  2796. "pshufb %13, %%xmm2 \n"
  2797. "por %%xmm1,%%xmm0 \n"
  2798. "por %%xmm2,%%xmm0 \n"
  2799. "movdqu %%xmm0," MEMACCESS2(32, 3) " \n"
  2800. "lea " MEMLEA(0x10,0) ",%0 \n"
  2801. "lea " MEMLEA(0x10,1) ",%1 \n"
  2802. "lea " MEMLEA(0x10,2) ",%2 \n"
  2803. "lea " MEMLEA(0x30,3) ",%3 \n"
  2804. "sub $0x10,%4 \n"
  2805. "jg 1b \n"
  2806. : "+r"(src_r), // %0
  2807. "+r"(src_g), // %1
  2808. "+r"(src_b), // %2
  2809. "+r"(dst_rgb), // %3
  2810. "+r"(width) // %4
  2811. : "m"(kShuffleMaskRToRGB0), // %5
  2812. "m"(kShuffleMaskGToRGB0), // %6
  2813. "m"(kShuffleMaskBToRGB0), // %7
  2814. "m"(kShuffleMaskRToRGB1), // %8
  2815. "m"(kShuffleMaskGToRGB1), // %9
  2816. "m"(kShuffleMaskBToRGB1), // %10
  2817. "m"(kShuffleMaskRToRGB2), // %11
  2818. "m"(kShuffleMaskGToRGB2), // %12
  2819. "m"(kShuffleMaskBToRGB2) // %13
  2820. : "memory", "cc", NACL_R14
  2821. "xmm0", "xmm1", "xmm2"
  2822. );
  2823. }
  2824. #endif // HAS_MERGERGBROW_SSSE3
  2825. #ifdef HAS_COPYROW_SSE2
  2826. void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  2827. asm volatile (
  2828. "test $0xf,%0 \n"
  2829. "jne 2f \n"
  2830. "test $0xf,%1 \n"
  2831. "jne 2f \n"
  2832. LABELALIGN
  2833. "1: \n"
  2834. "movdqa " MEMACCESS(0) ",%%xmm0 \n"
  2835. "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2836. "lea " MEMLEA(0x20,0) ",%0 \n"
  2837. "movdqa %%xmm0," MEMACCESS(1) " \n"
  2838. "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
  2839. "lea " MEMLEA(0x20,1) ",%1 \n"
  2840. "sub $0x20,%2 \n"
  2841. "jg 1b \n"
  2842. "jmp 9f \n"
  2843. LABELALIGN
  2844. "2: \n"
  2845. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2846. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2847. "lea " MEMLEA(0x20,0) ",%0 \n"
  2848. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2849. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  2850. "lea " MEMLEA(0x20,1) ",%1 \n"
  2851. "sub $0x20,%2 \n"
  2852. "jg 2b \n"
  2853. "9: \n"
  2854. : "+r"(src), // %0
  2855. "+r"(dst), // %1
  2856. "+r"(count) // %2
  2857. :
  2858. : "memory", "cc"
  2859. , "xmm0", "xmm1"
  2860. );
  2861. }
  2862. #endif // HAS_COPYROW_SSE2
  2863. #ifdef HAS_COPYROW_AVX
  2864. void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  2865. asm volatile (
  2866. LABELALIGN
  2867. "1: \n"
  2868. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2869. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  2870. "lea " MEMLEA(0x40,0) ",%0 \n"
  2871. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2872. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  2873. "lea " MEMLEA(0x40,1) ",%1 \n"
  2874. "sub $0x40,%2 \n"
  2875. "jg 1b \n"
  2876. : "+r"(src), // %0
  2877. "+r"(dst), // %1
  2878. "+r"(count) // %2
  2879. :
  2880. : "memory", "cc"
  2881. , "xmm0", "xmm1"
  2882. );
  2883. }
  2884. #endif // HAS_COPYROW_AVX
  2885. #ifdef HAS_COPYROW_ERMS
// Width can be any multiple of 1 (no alignment or multiple-of-16 requirement).
  2887. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  2888. size_t width_tmp = (size_t)(width);
  2889. asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
  2890. : "+S"(src), // %0
  2891. "+D"(dst), // %1
  2892. "+c"(width_tmp) // %2
  2893. :
  2894. : "memory", "cc");
  2895. }
  2896. #endif // HAS_COPYROW_ERMS
  2897. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  2898. // width in pixels
  2899. void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  2900. asm volatile (
  2901. "pcmpeqb %%xmm0,%%xmm0 \n"
  2902. "pslld $0x18,%%xmm0 \n"
  2903. "pcmpeqb %%xmm1,%%xmm1 \n"
  2904. "psrld $0x8,%%xmm1 \n"
  2905. LABELALIGN
  2906. "1: \n"
  2907. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  2908. "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
  2909. "lea " MEMLEA(0x20,0) ",%0 \n"
  2910. "movdqu " MEMACCESS(1) ",%%xmm4 \n"
  2911. "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
  2912. "pand %%xmm0,%%xmm2 \n"
  2913. "pand %%xmm0,%%xmm3 \n"
  2914. "pand %%xmm1,%%xmm4 \n"
  2915. "pand %%xmm1,%%xmm5 \n"
  2916. "por %%xmm4,%%xmm2 \n"
  2917. "por %%xmm5,%%xmm3 \n"
  2918. "movdqu %%xmm2," MEMACCESS(1) " \n"
  2919. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  2920. "lea " MEMLEA(0x20,1) ",%1 \n"
  2921. "sub $0x8,%2 \n"
  2922. "jg 1b \n"
  2923. : "+r"(src), // %0
  2924. "+r"(dst), // %1
  2925. "+r"(width) // %2
  2926. :
  2927. : "memory", "cc"
  2928. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2929. );
  2930. }
  2931. #endif // HAS_ARGBCOPYALPHAROW_SSE2
  2932. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  2933. // width in pixels
  2934. void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  2935. asm volatile (
  2936. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  2937. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  2938. LABELALIGN
  2939. "1: \n"
  2940. "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
  2941. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
  2942. "lea " MEMLEA(0x40,0) ",%0 \n"
  2943. "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
  2944. "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
  2945. "vmovdqu %%ymm1," MEMACCESS(1) " \n"
  2946. "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
  2947. "lea " MEMLEA(0x40,1) ",%1 \n"
  2948. "sub $0x10,%2 \n"
  2949. "jg 1b \n"
  2950. "vzeroupper \n"
  2951. : "+r"(src), // %0
  2952. "+r"(dst), // %1
  2953. "+r"(width) // %2
  2954. :
  2955. : "memory", "cc"
  2956. , "xmm0", "xmm1", "xmm2"
  2957. );
  2958. }
  2959. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  2960. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  2961. // width in pixels
  2962. void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  2963. asm volatile (
  2964. LABELALIGN
  2965. "1: \n"
  2966. "movdqu " MEMACCESS(0) ", %%xmm0 \n"
  2967. "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
  2968. "lea " MEMLEA(0x20, 0) ", %0 \n"
  2969. "psrld $0x18, %%xmm0 \n"
  2970. "psrld $0x18, %%xmm1 \n"
  2971. "packssdw %%xmm1, %%xmm0 \n"
  2972. "packuswb %%xmm0, %%xmm0 \n"
  2973. "movq %%xmm0," MEMACCESS(1) " \n"
  2974. "lea " MEMLEA(0x8, 1) ", %1 \n"
  2975. "sub $0x8, %2 \n"
  2976. "jg 1b \n"
  2977. : "+r"(src_argb), // %0
  2978. "+r"(dst_a), // %1
  2979. "+rm"(width) // %2
  2980. :
  2981. : "memory", "cc"
  2982. , "xmm0", "xmm1"
  2983. );
  2984. }
  2985. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  2986. #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
  2987. static const uvec8 kShuffleAlphaShort_AVX2 = {
  2988. 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
  2989. 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
  2990. void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
  2991. asm volatile (
  2992. "vmovdqa %3,%%ymm4 \n"
  2993. "vbroadcastf128 %4,%%ymm5 \n"
  2994. LABELALIGN
  2995. "1: \n"
  2996. "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
  2997. "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
  2998. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
  2999. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  3000. "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
  3001. "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
  3002. "lea " MEMLEA(0x80, 0) ", %0 \n"
  3003. "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
  3004. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  3005. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  3006. "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
  3007. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  3008. "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
  3009. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3010. "lea " MEMLEA(0x20,1) ",%1 \n"
  3011. "sub $0x20, %2 \n"
  3012. "jg 1b \n"
  3013. "vzeroupper \n"
  3014. : "+r"(src_argb), // %0
  3015. "+r"(dst_a), // %1
  3016. "+rm"(width) // %2
  3017. : "m"(kPermdARGBToY_AVX), // %3
  3018. "m"(kShuffleAlphaShort_AVX2) // %4
  3019. : "memory", "cc"
  3020. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3021. );
  3022. }
  3023. #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
  3024. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3025. // width in pixels
  3026. void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3027. asm volatile (
  3028. "pcmpeqb %%xmm0,%%xmm0 \n"
  3029. "pslld $0x18,%%xmm0 \n"
  3030. "pcmpeqb %%xmm1,%%xmm1 \n"
  3031. "psrld $0x8,%%xmm1 \n"
  3032. LABELALIGN
  3033. "1: \n"
  3034. "movq " MEMACCESS(0) ",%%xmm2 \n"
  3035. "lea " MEMLEA(0x8,0) ",%0 \n"
  3036. "punpcklbw %%xmm2,%%xmm2 \n"
  3037. "punpckhwd %%xmm2,%%xmm3 \n"
  3038. "punpcklwd %%xmm2,%%xmm2 \n"
  3039. "movdqu " MEMACCESS(1) ",%%xmm4 \n"
  3040. "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
  3041. "pand %%xmm0,%%xmm2 \n"
  3042. "pand %%xmm0,%%xmm3 \n"
  3043. "pand %%xmm1,%%xmm4 \n"
  3044. "pand %%xmm1,%%xmm5 \n"
  3045. "por %%xmm4,%%xmm2 \n"
  3046. "por %%xmm5,%%xmm3 \n"
  3047. "movdqu %%xmm2," MEMACCESS(1) " \n"
  3048. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  3049. "lea " MEMLEA(0x20,1) ",%1 \n"
  3050. "sub $0x8,%2 \n"
  3051. "jg 1b \n"
  3052. : "+r"(src), // %0
  3053. "+r"(dst), // %1
  3054. "+r"(width) // %2
  3055. :
  3056. : "memory", "cc"
  3057. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3058. );
  3059. }
  3060. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3061. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3062. // width in pixels
  3063. void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3064. asm volatile (
  3065. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  3066. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  3067. LABELALIGN
  3068. "1: \n"
  3069. "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
  3070. "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
  3071. "lea " MEMLEA(0x10,0) ",%0 \n"
  3072. "vpslld $0x18,%%ymm1,%%ymm1 \n"
  3073. "vpslld $0x18,%%ymm2,%%ymm2 \n"
  3074. "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
  3075. "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
  3076. "vmovdqu %%ymm1," MEMACCESS(1) " \n"
  3077. "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
  3078. "lea " MEMLEA(0x40,1) ",%1 \n"
  3079. "sub $0x10,%2 \n"
  3080. "jg 1b \n"
  3081. "vzeroupper \n"
  3082. : "+r"(src), // %0
  3083. "+r"(dst), // %1
  3084. "+r"(width) // %2
  3085. :
  3086. : "memory", "cc"
  3087. , "xmm0", "xmm1", "xmm2"
  3088. );
  3089. }
  3090. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3091. #ifdef HAS_SETROW_X86
  3092. void SetRow_X86(uint8* dst, uint8 v8, int width) {
  3093. size_t width_tmp = (size_t)(width >> 2);
  3094. const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
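// For example, v8 = 0x5a gives v32 = 0x5a5a5a5a, which rep stosl then stores 4 bytes per iteration.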
  3095. asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
  3096. : "+D"(dst), // %0
  3097. "+c"(width_tmp) // %1
  3098. : "a"(v32) // %2
  3099. : "memory", "cc");
  3100. }
  3101. void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  3102. size_t width_tmp = (size_t)(width);
  3103. asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n"
  3104. : "+D"(dst), // %0
  3105. "+c"(width_tmp) // %1
  3106. : "a"(v8) // %2
  3107. : "memory", "cc");
  3108. }
  3109. void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  3110. size_t width_tmp = (size_t)(width);
  3111. asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
  3112. : "+D"(dst_argb), // %0
  3113. "+c"(width_tmp) // %1
  3114. : "a"(v32) // %2
  3115. : "memory", "cc");
  3116. }
  3117. #endif // HAS_SETROW_X86
  3118. #ifdef HAS_YUY2TOYROW_SSE2
  3119. void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  3120. asm volatile (
  3121. "pcmpeqb %%xmm5,%%xmm5 \n"
  3122. "psrlw $0x8,%%xmm5 \n"
  3123. LABELALIGN
  3124. "1: \n"
  3125. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3126. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3127. "lea " MEMLEA(0x20,0) ",%0 \n"
  3128. "pand %%xmm5,%%xmm0 \n"
  3129. "pand %%xmm5,%%xmm1 \n"
  3130. "packuswb %%xmm1,%%xmm0 \n"
  3131. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3132. "lea " MEMLEA(0x10,1) ",%1 \n"
  3133. "sub $0x10,%2 \n"
  3134. "jg 1b \n"
  3135. : "+r"(src_yuy2), // %0
  3136. "+r"(dst_y), // %1
  3137. "+r"(width) // %2
  3138. :
  3139. : "memory", "cc"
  3140. , "xmm0", "xmm1", "xmm5"
  3141. );
  3142. }
  3143. void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
  3144. int stride_yuy2,
  3145. uint8* dst_u,
  3146. uint8* dst_v,
  3147. int width) {
  3148. asm volatile (
  3149. "pcmpeqb %%xmm5,%%xmm5 \n"
  3150. "psrlw $0x8,%%xmm5 \n"
  3151. "sub %1,%2 \n"
  3152. LABELALIGN
  3153. "1: \n"
  3154. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3155. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3156. MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
  3157. MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
  3158. "lea " MEMLEA(0x20,0) ",%0 \n"
  3159. "pavgb %%xmm2,%%xmm0 \n"
  3160. "pavgb %%xmm3,%%xmm1 \n"
  3161. "psrlw $0x8,%%xmm0 \n"
  3162. "psrlw $0x8,%%xmm1 \n"
  3163. "packuswb %%xmm1,%%xmm0 \n"
  3164. "movdqa %%xmm0,%%xmm1 \n"
  3165. "pand %%xmm5,%%xmm0 \n"
  3166. "packuswb %%xmm0,%%xmm0 \n"
  3167. "psrlw $0x8,%%xmm1 \n"
  3168. "packuswb %%xmm1,%%xmm1 \n"
  3169. "movq %%xmm0," MEMACCESS(1) " \n"
  3170. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3171. "lea " MEMLEA(0x8,1) ",%1 \n"
  3172. "sub $0x10,%3 \n"
  3173. "jg 1b \n"
  3174. : "+r"(src_yuy2), // %0
  3175. "+r"(dst_u), // %1
  3176. "+r"(dst_v), // %2
  3177. "+r"(width) // %3
  3178. : "r"((intptr_t)(stride_yuy2)) // %4
  3179. : "memory", "cc", NACL_R14
  3180. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3181. );
  3182. }
  3183. void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  3184. uint8* dst_u,
  3185. uint8* dst_v,
  3186. int width) {
  3187. asm volatile (
  3188. "pcmpeqb %%xmm5,%%xmm5 \n"
  3189. "psrlw $0x8,%%xmm5 \n"
  3190. "sub %1,%2 \n"
  3191. LABELALIGN
  3192. "1: \n"
  3193. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3194. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3195. "lea " MEMLEA(0x20,0) ",%0 \n"
  3196. "psrlw $0x8,%%xmm0 \n"
  3197. "psrlw $0x8,%%xmm1 \n"
  3198. "packuswb %%xmm1,%%xmm0 \n"
  3199. "movdqa %%xmm0,%%xmm1 \n"
  3200. "pand %%xmm5,%%xmm0 \n"
  3201. "packuswb %%xmm0,%%xmm0 \n"
  3202. "psrlw $0x8,%%xmm1 \n"
  3203. "packuswb %%xmm1,%%xmm1 \n"
  3204. "movq %%xmm0," MEMACCESS(1) " \n"
  3205. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3206. "lea " MEMLEA(0x8,1) ",%1 \n"
  3207. "sub $0x10,%3 \n"
  3208. "jg 1b \n"
  3209. : "+r"(src_yuy2), // %0
  3210. "+r"(dst_u), // %1
  3211. "+r"(dst_v), // %2
  3212. "+r"(width) // %3
  3213. :
  3214. : "memory", "cc", NACL_R14
  3215. "xmm0", "xmm1", "xmm5"
  3216. );
  3217. }
  3218. void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
  3219. asm volatile (
  3220. LABELALIGN
  3221. "1: \n"
  3222. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3223. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3224. "lea " MEMLEA(0x20,0) ",%0 \n"
  3225. "psrlw $0x8,%%xmm0 \n"
  3226. "psrlw $0x8,%%xmm1 \n"
  3227. "packuswb %%xmm1,%%xmm0 \n"
  3228. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3229. "lea " MEMLEA(0x10,1) ",%1 \n"
  3230. "sub $0x10,%2 \n"
  3231. "jg 1b \n"
  3232. : "+r"(src_uyvy), // %0
  3233. "+r"(dst_y), // %1
  3234. "+r"(width) // %2
  3235. :
  3236. : "memory", "cc"
  3237. , "xmm0", "xmm1"
  3238. );
  3239. }
  3240. void UYVYToUVRow_SSE2(const uint8* src_uyvy,
  3241. int stride_uyvy,
  3242. uint8* dst_u,
  3243. uint8* dst_v,
  3244. int width) {
  3245. asm volatile (
  3246. "pcmpeqb %%xmm5,%%xmm5 \n"
  3247. "psrlw $0x8,%%xmm5 \n"
  3248. "sub %1,%2 \n"
  3249. LABELALIGN
  3250. "1: \n"
  3251. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3252. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3253. MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
  3254. MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
  3255. "lea " MEMLEA(0x20,0) ",%0 \n"
  3256. "pavgb %%xmm2,%%xmm0 \n"
  3257. "pavgb %%xmm3,%%xmm1 \n"
  3258. "pand %%xmm5,%%xmm0 \n"
  3259. "pand %%xmm5,%%xmm1 \n"
  3260. "packuswb %%xmm1,%%xmm0 \n"
  3261. "movdqa %%xmm0,%%xmm1 \n"
  3262. "pand %%xmm5,%%xmm0 \n"
  3263. "packuswb %%xmm0,%%xmm0 \n"
  3264. "psrlw $0x8,%%xmm1 \n"
  3265. "packuswb %%xmm1,%%xmm1 \n"
  3266. "movq %%xmm0," MEMACCESS(1) " \n"
  3267. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3268. "lea " MEMLEA(0x8,1) ",%1 \n"
  3269. "sub $0x10,%3 \n"
  3270. "jg 1b \n"
  3271. : "+r"(src_uyvy), // %0
  3272. "+r"(dst_u), // %1
  3273. "+r"(dst_v), // %2
  3274. "+r"(width) // %3
  3275. : "r"((intptr_t)(stride_uyvy)) // %4
  3276. : "memory", "cc", NACL_R14
  3277. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3278. );
  3279. }
  3280. void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  3281. uint8* dst_u,
  3282. uint8* dst_v,
  3283. int width) {
  3284. asm volatile (
  3285. "pcmpeqb %%xmm5,%%xmm5 \n"
  3286. "psrlw $0x8,%%xmm5 \n"
  3287. "sub %1,%2 \n"
  3288. LABELALIGN
  3289. "1: \n"
  3290. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3291. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3292. "lea " MEMLEA(0x20,0) ",%0 \n"
  3293. "pand %%xmm5,%%xmm0 \n"
  3294. "pand %%xmm5,%%xmm1 \n"
  3295. "packuswb %%xmm1,%%xmm0 \n"
  3296. "movdqa %%xmm0,%%xmm1 \n"
  3297. "pand %%xmm5,%%xmm0 \n"
  3298. "packuswb %%xmm0,%%xmm0 \n"
  3299. "psrlw $0x8,%%xmm1 \n"
  3300. "packuswb %%xmm1,%%xmm1 \n"
  3301. "movq %%xmm0," MEMACCESS(1) " \n"
  3302. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3303. "lea " MEMLEA(0x8,1) ",%1 \n"
  3304. "sub $0x10,%3 \n"
  3305. "jg 1b \n"
  3306. : "+r"(src_uyvy), // %0
  3307. "+r"(dst_u), // %1
  3308. "+r"(dst_v), // %2
  3309. "+r"(width) // %3
  3310. :
  3311. : "memory", "cc", NACL_R14
  3312. "xmm0", "xmm1", "xmm5"
  3313. );
  3314. }
  3315. #endif // HAS_YUY2TOYROW_SSE2
  3316. #ifdef HAS_YUY2TOYROW_AVX2
  3317. void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  3318. asm volatile (
  3319. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3320. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3321. LABELALIGN
  3322. "1: \n"
  3323. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3324. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3325. "lea " MEMLEA(0x40,0) ",%0 \n"
  3326. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3327. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3328. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3329. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3330. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3331. "lea " MEMLEA(0x20,1) ",%1 \n"
  3332. "sub $0x20,%2 \n"
  3333. "jg 1b \n"
  3334. "vzeroupper \n"
  3335. : "+r"(src_yuy2), // %0
  3336. "+r"(dst_y), // %1
  3337. "+r"(width) // %2
  3338. :
  3339. : "memory", "cc"
  3340. , "xmm0", "xmm1", "xmm5"
  3341. );
  3342. }
  3343. void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
  3344. int stride_yuy2,
  3345. uint8* dst_u,
  3346. uint8* dst_v,
  3347. int width) {
  3348. asm volatile (
  3349. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3350. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3351. "sub %1,%2 \n"
  3352. LABELALIGN
  3353. "1: \n"
  3354. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3355. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3356. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  3357. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  3358. "lea " MEMLEA(0x40,0) ",%0 \n"
  3359. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3360. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3361. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3362. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3363. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3364. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3365. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3366. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3367. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3368. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3369. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3370. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3371. "lea " MEMLEA(0x10,1) ",%1 \n"
  3372. "sub $0x20,%3 \n"
  3373. "jg 1b \n"
  3374. "vzeroupper \n"
  3375. : "+r"(src_yuy2), // %0
  3376. "+r"(dst_u), // %1
  3377. "+r"(dst_v), // %2
  3378. "+r"(width) // %3
  3379. : "r"((intptr_t)(stride_yuy2)) // %4
  3380. : "memory", "cc", NACL_R14
  3381. "xmm0", "xmm1", "xmm5"
  3382. );
  3383. }
  3384. void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  3385. uint8* dst_u,
  3386. uint8* dst_v,
  3387. int width) {
  3388. asm volatile (
  3389. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3390. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3391. "sub %1,%2 \n"
  3392. LABELALIGN
  3393. "1: \n"
  3394. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3395. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3396. "lea " MEMLEA(0x40,0) ",%0 \n"
  3397. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3398. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3399. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3400. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3401. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3402. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3403. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3404. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3405. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3406. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3407. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3408. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3409. "lea " MEMLEA(0x10,1) ",%1 \n"
  3410. "sub $0x20,%3 \n"
  3411. "jg 1b \n"
  3412. "vzeroupper \n"
  3413. : "+r"(src_yuy2), // %0
  3414. "+r"(dst_u), // %1
  3415. "+r"(dst_v), // %2
  3416. "+r"(width) // %3
  3417. :
  3418. : "memory", "cc", NACL_R14
  3419. "xmm0", "xmm1", "xmm5"
  3420. );
  3421. }
  3422. void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  3423. asm volatile (
  3424. LABELALIGN
  3425. "1: \n"
  3426. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3427. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3428. "lea " MEMLEA(0x40,0) ",%0 \n"
  3429. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3430. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3431. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3432. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3433. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3434. "lea " MEMLEA(0x20,1) ",%1 \n"
  3435. "sub $0x20,%2 \n"
  3436. "jg 1b \n"
  3437. "vzeroupper \n"
  3438. : "+r"(src_uyvy), // %0
  3439. "+r"(dst_y), // %1
  3440. "+r"(width) // %2
  3441. :
  3442. : "memory", "cc"
  3443. , "xmm0", "xmm1", "xmm5"
  3444. );
  3445. }
  3446. void UYVYToUVRow_AVX2(const uint8* src_uyvy,
  3447. int stride_uyvy,
  3448. uint8* dst_u,
  3449. uint8* dst_v,
  3450. int width) {
  3451. asm volatile (
  3452. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3453. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3454. "sub %1,%2 \n"
  3455. LABELALIGN
  3456. "1: \n"
  3457. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3458. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3459. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  3460. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  3461. "lea " MEMLEA(0x40,0) ",%0 \n"
  3462. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3463. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3464. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3465. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3466. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3467. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3468. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3469. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3470. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3471. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3472. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3473. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3474. "lea " MEMLEA(0x10,1) ",%1 \n"
  3475. "sub $0x20,%3 \n"
  3476. "jg 1b \n"
  3477. "vzeroupper \n"
  3478. : "+r"(src_uyvy), // %0
  3479. "+r"(dst_u), // %1
  3480. "+r"(dst_v), // %2
  3481. "+r"(width) // %3
  3482. : "r"((intptr_t)(stride_uyvy)) // %4
  3483. : "memory", "cc", NACL_R14
  3484. "xmm0", "xmm1", "xmm5"
  3485. );
  3486. }
  3487. void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  3488. uint8* dst_u,
  3489. uint8* dst_v,
  3490. int width) {
  3491. asm volatile (
  3492. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3493. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3494. "sub %1,%2 \n"
  3495. LABELALIGN
  3496. "1: \n"
  3497. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3498. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3499. "lea " MEMLEA(0x40,0) ",%0 \n"
  3500. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3501. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3502. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3503. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3504. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3505. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3506. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3507. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3508. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3509. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3510. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3511. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3512. "lea " MEMLEA(0x10,1) ",%1 \n"
  3513. "sub $0x20,%3 \n"
  3514. "jg 1b \n"
  3515. "vzeroupper \n"
  3516. : "+r"(src_uyvy), // %0
  3517. "+r"(dst_u), // %1
  3518. "+r"(dst_v), // %2
  3519. "+r"(width) // %3
  3520. :
  3521. : "memory", "cc", NACL_R14
  3522. "xmm0", "xmm1", "xmm5"
  3523. );
  3524. }
  3525. #endif // HAS_YUY2TOYROW_AVX2
  3526. #ifdef HAS_ARGBBLENDROW_SSSE3
  3527. // Shuffle table for isolating alpha.
  3528. static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3529. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 4 pixels at a time, then 1 pixel at a time for the remainder.
  3531. void ARGBBlendRow_SSSE3(const uint8* src_argb0,
  3532. const uint8* src_argb1,
  3533. uint8* dst_argb,
  3534. int width) {
  3535. asm volatile (
  3536. "pcmpeqb %%xmm7,%%xmm7 \n"
  3537. "psrlw $0xf,%%xmm7 \n"
  3538. "pcmpeqb %%xmm6,%%xmm6 \n"
  3539. "psrlw $0x8,%%xmm6 \n"
  3540. "pcmpeqb %%xmm5,%%xmm5 \n"
  3541. "psllw $0x8,%%xmm5 \n"
  3542. "pcmpeqb %%xmm4,%%xmm4 \n"
  3543. "pslld $0x18,%%xmm4 \n"
  3544. "sub $0x4,%3 \n"
  3545. "jl 49f \n"
  3546. // 4 pixel loop.
  3547. LABELALIGN
  3548. "40: \n"
  3549. "movdqu " MEMACCESS(0) ",%%xmm3 \n"
  3550. "lea " MEMLEA(0x10,0) ",%0 \n"
  3551. "movdqa %%xmm3,%%xmm0 \n"
  3552. "pxor %%xmm4,%%xmm3 \n"
  3553. "movdqu " MEMACCESS(1) ",%%xmm2 \n"
  3554. "pshufb %4,%%xmm3 \n"
  3555. "pand %%xmm6,%%xmm2 \n"
  3556. "paddw %%xmm7,%%xmm3 \n"
  3557. "pmullw %%xmm3,%%xmm2 \n"
  3558. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  3559. "lea " MEMLEA(0x10,1) ",%1 \n"
  3560. "psrlw $0x8,%%xmm1 \n"
  3561. "por %%xmm4,%%xmm0 \n"
  3562. "pmullw %%xmm3,%%xmm1 \n"
  3563. "psrlw $0x8,%%xmm2 \n"
  3564. "paddusb %%xmm2,%%xmm0 \n"
  3565. "pand %%xmm5,%%xmm1 \n"
  3566. "paddusb %%xmm1,%%xmm0 \n"
  3567. "movdqu %%xmm0," MEMACCESS(2) " \n"
  3568. "lea " MEMLEA(0x10,2) ",%2 \n"
  3569. "sub $0x4,%3 \n"
  3570. "jge 40b \n"
  3571. "49: \n"
  3572. "add $0x3,%3 \n"
  3573. "jl 99f \n"
  3574. // 1 pixel loop.
  3575. "91: \n"
  3576. "movd " MEMACCESS(0) ",%%xmm3 \n"
  3577. "lea " MEMLEA(0x4,0) ",%0 \n"
  3578. "movdqa %%xmm3,%%xmm0 \n"
  3579. "pxor %%xmm4,%%xmm3 \n"
  3580. "movd " MEMACCESS(1) ",%%xmm2 \n"
  3581. "pshufb %4,%%xmm3 \n"
  3582. "pand %%xmm6,%%xmm2 \n"
  3583. "paddw %%xmm7,%%xmm3 \n"
  3584. "pmullw %%xmm3,%%xmm2 \n"
  3585. "movd " MEMACCESS(1) ",%%xmm1 \n"
  3586. "lea " MEMLEA(0x4,1) ",%1 \n"
  3587. "psrlw $0x8,%%xmm1 \n"
  3588. "por %%xmm4,%%xmm0 \n"
  3589. "pmullw %%xmm3,%%xmm1 \n"
  3590. "psrlw $0x8,%%xmm2 \n"
  3591. "paddusb %%xmm2,%%xmm0 \n"
  3592. "pand %%xmm5,%%xmm1 \n"
  3593. "paddusb %%xmm1,%%xmm0 \n"
  3594. "movd %%xmm0," MEMACCESS(2) " \n"
  3595. "lea " MEMLEA(0x4,2) ",%2 \n"
  3596. "sub $0x1,%3 \n"
  3597. "jge 91b \n"
  3598. "99: \n"
  3599. : "+r"(src_argb0), // %0
  3600. "+r"(src_argb1), // %1
  3601. "+r"(dst_argb), // %2
  3602. "+r"(width) // %3
  3603. : "m"(kShuffleAlpha) // %4
  3604. : "memory", "cc"
  3605. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3606. );
  3607. }
  3608. #endif // HAS_ARGBBLENDROW_SSSE3
  3609. #ifdef HAS_BLENDPLANEROW_SSSE3
  3610. // Blend 8 pixels at a time.
// unsigned version of math
// =((src0 * alpha) + (src1 * (255 - alpha)) + 255) / 256
// signed version of math, used below after biasing the sources by 128 so that
// pmaddubsw can be used
// =(((src0 - 128) * alpha) + ((src1 - 128) * (255 - alpha)) + 32768 + 127) / 256
// A scalar sketch of this math follows the function.
  3615. void BlendPlaneRow_SSSE3(const uint8* src0,
  3616. const uint8* src1,
  3617. const uint8* alpha,
  3618. uint8* dst,
  3619. int width) {
  3620. asm volatile(
  3621. "pcmpeqb %%xmm5,%%xmm5 \n"
  3622. "psllw $0x8,%%xmm5 \n"
  3623. "mov $0x80808080,%%eax \n"
  3624. "movd %%eax,%%xmm6 \n"
  3625. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  3626. "mov $0x807f807f,%%eax \n"
  3627. "movd %%eax,%%xmm7 \n"
  3628. "pshufd $0x0,%%xmm7,%%xmm7 \n"
  3629. "sub %2,%0 \n"
  3630. "sub %2,%1 \n"
  3631. "sub %2,%3 \n"
  3632. // 8 pixel loop.
  3633. LABELALIGN
  3634. "1: \n"
  3635. "movq (%2),%%xmm0 \n"
  3636. "punpcklbw %%xmm0,%%xmm0 \n"
  3637. "pxor %%xmm5,%%xmm0 \n"
  3638. "movq (%0,%2,1),%%xmm1 \n"
  3639. "movq (%1,%2,1),%%xmm2 \n"
  3640. "punpcklbw %%xmm2,%%xmm1 \n"
  3641. "psubb %%xmm6,%%xmm1 \n"
  3642. "pmaddubsw %%xmm1,%%xmm0 \n"
  3643. "paddw %%xmm7,%%xmm0 \n"
  3644. "psrlw $0x8,%%xmm0 \n"
  3645. "packuswb %%xmm0,%%xmm0 \n"
  3646. "movq %%xmm0,(%3,%2,1) \n"
  3647. "lea 0x8(%2),%2 \n"
  3648. "sub $0x8,%4 \n"
  3649. "jg 1b \n"
  3650. : "+r"(src0), // %0
  3651. "+r"(src1), // %1
  3652. "+r"(alpha), // %2
  3653. "+r"(dst), // %3
  3654. "+rm"(width) // %4
  3655. ::"memory",
  3656. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
  3657. }
  3658. #endif // HAS_BLENDPLANEROW_SSSE3
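// Scalar sketch (illustrative helper, not part of libyuv) of the blend math in
// the comments above: dst = (src0 * alpha + src1 * (255 - alpha) + 255) / 256.
// The signed, 128-biased form used by the SIMD paths is algebraically
// identical; the bias only exists so pmaddubsw can be applied.
static inline void BlendPlaneRow_Sketch(const uint8* src0,
                                        const uint8* src1,
                                        const uint8* alpha,
                                        uint8* dst,
                                        int width) {
  for (int i = 0; i < width; ++i) {
    uint32 a = alpha[i];
    dst[i] = (uint8)((src0[i] * a + src1[i] * (255 - a) + 255) >> 8);
  }
}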
  3659. #ifdef HAS_BLENDPLANEROW_AVX2
  3660. // Blend 32 pixels at a time.
// Same math as BlendPlaneRow_SSSE3 and the scalar sketch above:
// unsigned: =((src0 * alpha) + (src1 * (255 - alpha)) + 255) / 256
// signed:   =(((src0 - 128) * alpha) + ((src1 - 128) * (255 - alpha)) + 32768 + 127) / 256
  3665. void BlendPlaneRow_AVX2(const uint8* src0,
  3666. const uint8* src1,
  3667. const uint8* alpha,
  3668. uint8* dst,
  3669. int width) {
  3670. asm volatile(
  3671. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3672. "vpsllw $0x8,%%ymm5,%%ymm5 \n"
  3673. "mov $0x80808080,%%eax \n"
  3674. "vmovd %%eax,%%xmm6 \n"
  3675. "vbroadcastss %%xmm6,%%ymm6 \n"
  3676. "mov $0x807f807f,%%eax \n"
  3677. "vmovd %%eax,%%xmm7 \n"
  3678. "vbroadcastss %%xmm7,%%ymm7 \n"
  3679. "sub %2,%0 \n"
  3680. "sub %2,%1 \n"
  3681. "sub %2,%3 \n"
  3682. // 32 pixel loop.
  3683. LABELALIGN
  3684. "1: \n"
  3685. "vmovdqu (%2),%%ymm0 \n"
  3686. "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
  3687. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  3688. "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
  3689. "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
  3690. "vmovdqu (%0,%2,1),%%ymm1 \n"
  3691. "vmovdqu (%1,%2,1),%%ymm2 \n"
  3692. "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
  3693. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  3694. "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
  3695. "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
  3696. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  3697. "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
  3698. "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
  3699. "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
  3700. "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
  3701. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3702. "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
  3703. "vmovdqu %%ymm0,(%3,%2,1) \n"
  3704. "lea 0x20(%2),%2 \n"
  3705. "sub $0x20,%4 \n"
  3706. "jg 1b \n"
  3707. "vzeroupper \n"
  3708. : "+r"(src0), // %0
  3709. "+r"(src1), // %1
  3710. "+r"(alpha), // %2
  3711. "+r"(dst), // %3
  3712. "+rm"(width) // %4
  3713. ::"memory",
  3714. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  3715. "xmm7");
  3716. }
  3717. #endif // HAS_BLENDPLANEROW_AVX2
  3718. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3719. // Shuffle table duplicating alpha
  3720. static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
  3721. 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
  3722. static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3723. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
  3724. // Attenuate 4 pixels at a time.
  3725. void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3726. asm volatile (
  3727. "pcmpeqb %%xmm3,%%xmm3 \n"
  3728. "pslld $0x18,%%xmm3 \n"
  3729. "movdqa %3,%%xmm4 \n"
  3730. "movdqa %4,%%xmm5 \n"
  3731. // 4 pixel loop.
  3732. LABELALIGN
  3733. "1: \n"
  3734. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3735. "pshufb %%xmm4,%%xmm0 \n"
  3736. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3737. "punpcklbw %%xmm1,%%xmm1 \n"
  3738. "pmulhuw %%xmm1,%%xmm0 \n"
  3739. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3740. "pshufb %%xmm5,%%xmm1 \n"
  3741. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3742. "punpckhbw %%xmm2,%%xmm2 \n"
  3743. "pmulhuw %%xmm2,%%xmm1 \n"
  3744. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3745. "lea " MEMLEA(0x10,0) ",%0 \n"
  3746. "pand %%xmm3,%%xmm2 \n"
  3747. "psrlw $0x8,%%xmm0 \n"
  3748. "psrlw $0x8,%%xmm1 \n"
  3749. "packuswb %%xmm1,%%xmm0 \n"
  3750. "por %%xmm2,%%xmm0 \n"
  3751. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3752. "lea " MEMLEA(0x10,1) ",%1 \n"
  3753. "sub $0x4,%2 \n"
  3754. "jg 1b \n"
  3755. : "+r"(src_argb), // %0
  3756. "+r"(dst_argb), // %1
  3757. "+r"(width) // %2
  3758. : "m"(kShuffleAlpha0), // %3
  3759. "m"(kShuffleAlpha1) // %4
  3760. : "memory", "cc"
  3761. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3762. );
  3763. }
  3764. #endif // HAS_ARGBATTENUATEROW_SSSE3
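// Scalar sketch (illustrative helper, not part of libyuv) of the attenuation
// done by the SSSE3/AVX2 paths: each color channel is multiplied by alpha.
// The SIMD code duplicates each byte into a 16 bit word (v * 257) and uses
// pmulhuw followed by a shift, i.e. (c * 257 * a * 257) >> 24, which closely
// approximates c * a / 255.  The alpha channel itself is passed through.
static inline uint8 Attenuate_Sketch(uint8 c, uint8 a) {
  return (uint8)(((uint32)(c * 257) * (uint32)(a * 257)) >> 24);
}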
  3765. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3766. // Shuffle table duplicating alpha.
  3767. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  3768. 128u, 128u, 14u, 15u, 14u, 15u,
  3769. 14u, 15u, 128u, 128u};
  3770. // Attenuate 8 pixels at a time.
  3771. void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  3772. asm volatile (
  3773. "vbroadcastf128 %3,%%ymm4 \n"
  3774. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3775. "vpslld $0x18,%%ymm5,%%ymm5 \n"
  3776. "sub %0,%1 \n"
  3777. // 8 pixel loop.
  3778. LABELALIGN
  3779. "1: \n"
  3780. "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
  3781. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  3782. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  3783. "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
  3784. "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
  3785. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3786. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  3787. "vpand %%ymm5,%%ymm6,%%ymm6 \n"
  3788. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3789. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3790. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3791. "vpor %%ymm6,%%ymm0,%%ymm0 \n"
  3792. MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
  3793. "lea " MEMLEA(0x20,0) ",%0 \n"
  3794. "sub $0x8,%2 \n"
  3795. "jg 1b \n"
  3796. "vzeroupper \n"
  3797. : "+r"(src_argb), // %0
  3798. "+r"(dst_argb), // %1
  3799. "+r"(width) // %2
  3800. : "m"(kShuffleAlpha_AVX2) // %3
  3801. : "memory", "cc"
  3802. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  3803. );
  3804. }
  3805. #endif // HAS_ARGBATTENUATEROW_AVX2
  3806. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  3807. // Unattenuate 4 pixels at a time.
  3808. void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
  3809. uint8* dst_argb,
  3810. int width) {
  3811. uintptr_t alpha;
  3812. asm volatile (
  3813. // 4 pixel loop.
  3814. LABELALIGN
  3815. "1: \n"
  3816. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3817. "movzb " MEMACCESS2(0x03,0) ",%3 \n"
  3818. "punpcklbw %%xmm0,%%xmm0 \n"
  3819. MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
  3820. "movzb " MEMACCESS2(0x07,0) ",%3 \n"
  3821. MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
  3822. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3823. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3824. "movlhps %%xmm3,%%xmm2 \n"
  3825. "pmulhuw %%xmm2,%%xmm0 \n"
  3826. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3827. "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
  3828. "punpckhbw %%xmm1,%%xmm1 \n"
  3829. MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
  3830. "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
  3831. MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
  3832. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3833. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3834. "movlhps %%xmm3,%%xmm2 \n"
  3835. "pmulhuw %%xmm2,%%xmm1 \n"
  3836. "lea " MEMLEA(0x10,0) ",%0 \n"
  3837. "packuswb %%xmm1,%%xmm0 \n"
  3838. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3839. "lea " MEMLEA(0x10,1) ",%1 \n"
  3840. "sub $0x4,%2 \n"
  3841. "jg 1b \n"
  3842. : "+r"(src_argb), // %0
  3843. "+r"(dst_argb), // %1
  3844. "+r"(width), // %2
  3845. "=&r"(alpha) // %3
  3846. : "r"(fixed_invtbl8) // %4
  3847. : "memory", "cc", NACL_R14
  3848. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3849. );
  3850. }
  3851. #endif // HAS_ARGBUNATTENUATEROW_SSE2
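// Scalar sketch (illustrative helper, not part of libyuv) of unattenuation:
// each color channel is divided by alpha to undo premultiplication.  The SIMD
// paths avoid the division by multiplying with a fixed point reciprocal looked
// up in fixed_invtbl8, but conceptually:
static inline uint8 Unattenuate_Sketch(uint8 c, uint8 a) {
  uint32 v = a ? ((uint32)c * 255 + a / 2) / a : c;
  return (uint8)(v > 255 ? 255 : v);
}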
  3852. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  3853. // Shuffle table duplicating alpha.
  3854. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  3855. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  3856. // Unattenuate 8 pixels at a time.
  3857. void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
  3858. uint8* dst_argb,
  3859. int width) {
  3860. uintptr_t alpha;
  3861. asm volatile (
  3862. "sub %0,%1 \n"
  3863. "vbroadcastf128 %5,%%ymm5 \n"
  3864. // 8 pixel loop.
  3865. LABELALIGN
  3866. "1: \n"
  3867. // replace VPGATHER
  3868. "movzb " MEMACCESS2(0x03,0) ",%3 \n"
  3869. MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
  3870. "movzb " MEMACCESS2(0x07,0) ",%3 \n"
  3871. MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
  3872. "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
  3873. "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
  3874. MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
  3875. "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
  3876. MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
  3877. "movzb " MEMACCESS2(0x13,0) ",%3 \n"
  3878. "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
  3879. MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
  3880. "movzb " MEMACCESS2(0x17,0) ",%3 \n"
  3881. MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
  3882. "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
  3883. "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
  3884. MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
  3885. "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
  3886. MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
  3887. "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
  3888. "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
  3889. "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
  3890. "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
  3891. // end of VPGATHER
  3892. "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
  3893. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  3894. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  3895. "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
  3896. "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
  3897. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  3898. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  3899. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3900. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  3901. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3902. MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
  3903. "lea " MEMLEA(0x20,0) ",%0 \n"
  3904. "sub $0x8,%2 \n"
  3905. "jg 1b \n"
  3906. "vzeroupper \n"
  3907. : "+r"(src_argb), // %0
  3908. "+r"(dst_argb), // %1
  3909. "+r"(width), // %2
  3910. "=&r"(alpha) // %3
  3911. : "r"(fixed_invtbl8), // %4
  3912. "m"(kUnattenShuffleAlpha_AVX2) // %5
  3913. : "memory", "cc", NACL_R14
  3914. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3915. );
  3916. }
  3917. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  3918. #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
  3920. void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3921. asm volatile (
  3922. "movdqa %3,%%xmm4 \n"
  3923. "movdqa %4,%%xmm5 \n"
  3924. // 8 pixel loop.
  3925. LABELALIGN
  3926. "1: \n"
  3927. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3928. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3929. "pmaddubsw %%xmm4,%%xmm0 \n"
  3930. "pmaddubsw %%xmm4,%%xmm1 \n"
  3931. "phaddw %%xmm1,%%xmm0 \n"
  3932. "paddw %%xmm5,%%xmm0 \n"
  3933. "psrlw $0x7,%%xmm0 \n"
  3934. "packuswb %%xmm0,%%xmm0 \n"
  3935. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3936. "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
  3937. "lea " MEMLEA(0x20,0) ",%0 \n"
  3938. "psrld $0x18,%%xmm2 \n"
  3939. "psrld $0x18,%%xmm3 \n"
  3940. "packuswb %%xmm3,%%xmm2 \n"
  3941. "packuswb %%xmm2,%%xmm2 \n"
  3942. "movdqa %%xmm0,%%xmm3 \n"
  3943. "punpcklbw %%xmm0,%%xmm0 \n"
  3944. "punpcklbw %%xmm2,%%xmm3 \n"
  3945. "movdqa %%xmm0,%%xmm1 \n"
  3946. "punpcklwd %%xmm3,%%xmm0 \n"
  3947. "punpckhwd %%xmm3,%%xmm1 \n"
  3948. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3949. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  3950. "lea " MEMLEA(0x20,1) ",%1 \n"
  3951. "sub $0x8,%2 \n"
  3952. "jg 1b \n"
  3953. : "+r"(src_argb), // %0
  3954. "+r"(dst_argb), // %1
  3955. "+r"(width) // %2
  3956. : "m"(kARGBToYJ), // %3
  3957. "m"(kAddYJ64) // %4
  3958. : "memory", "cc"
  3959. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3960. );
  3961. }
  3962. #endif // HAS_ARGBGRAYROW_SSSE3
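// Scalar sketch (illustrative helper, not part of libyuv) of the gray
// conversion above, assuming the full range BT.601 luma coefficients that
// kARGBToYJ / kAddYJ64 encode (15, 75, 38 for B, G, R, with rounding by 64).
static inline uint8 ARGBToGrayY_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
}
// The Y value is then written to B, G and R while alpha is copied through.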
  3963. #ifdef HAS_ARGBSEPIAROW_SSSE3
  3964. // b = (r * 35 + g * 68 + b * 17) >> 7
  3965. // g = (r * 45 + g * 88 + b * 22) >> 7
  3966. // r = (r * 50 + g * 98 + b * 24) >> 7
  3967. // Constant for ARGB color to sepia tone
  3968. static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  3969. 17, 68, 35, 0, 17, 68, 35, 0};
  3970. static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  3971. 22, 88, 45, 0, 22, 88, 45, 0};
  3972. static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  3973. 24, 98, 50, 0, 24, 98, 50, 0};
  3974. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  3975. void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  3976. asm volatile (
  3977. "movdqa %2,%%xmm2 \n"
  3978. "movdqa %3,%%xmm3 \n"
  3979. "movdqa %4,%%xmm4 \n"
  3980. // 8 pixel loop.
  3981. LABELALIGN
  3982. "1: \n"
  3983. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3984. "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
  3985. "pmaddubsw %%xmm2,%%xmm0 \n"
  3986. "pmaddubsw %%xmm2,%%xmm6 \n"
  3987. "phaddw %%xmm6,%%xmm0 \n"
  3988. "psrlw $0x7,%%xmm0 \n"
  3989. "packuswb %%xmm0,%%xmm0 \n"
  3990. "movdqu " MEMACCESS(0) ",%%xmm5 \n"
  3991. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3992. "pmaddubsw %%xmm3,%%xmm5 \n"
  3993. "pmaddubsw %%xmm3,%%xmm1 \n"
  3994. "phaddw %%xmm1,%%xmm5 \n"
  3995. "psrlw $0x7,%%xmm5 \n"
  3996. "packuswb %%xmm5,%%xmm5 \n"
  3997. "punpcklbw %%xmm5,%%xmm0 \n"
  3998. "movdqu " MEMACCESS(0) ",%%xmm5 \n"
  3999. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  4000. "pmaddubsw %%xmm4,%%xmm5 \n"
  4001. "pmaddubsw %%xmm4,%%xmm1 \n"
  4002. "phaddw %%xmm1,%%xmm5 \n"
  4003. "psrlw $0x7,%%xmm5 \n"
  4004. "packuswb %%xmm5,%%xmm5 \n"
  4005. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  4006. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  4007. "psrld $0x18,%%xmm6 \n"
  4008. "psrld $0x18,%%xmm1 \n"
  4009. "packuswb %%xmm1,%%xmm6 \n"
  4010. "packuswb %%xmm6,%%xmm6 \n"
  4011. "punpcklbw %%xmm6,%%xmm5 \n"
  4012. "movdqa %%xmm0,%%xmm1 \n"
  4013. "punpcklwd %%xmm5,%%xmm0 \n"
  4014. "punpckhwd %%xmm5,%%xmm1 \n"
  4015. "movdqu %%xmm0," MEMACCESS(0) " \n"
  4016. "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
  4017. "lea " MEMLEA(0x20,0) ",%0 \n"
  4018. "sub $0x8,%1 \n"
  4019. "jg 1b \n"
  4020. : "+r"(dst_argb), // %0
  4021. "+r"(width) // %1
  4022. : "m"(kARGBToSepiaB), // %2
  4023. "m"(kARGBToSepiaG), // %3
  4024. "m"(kARGBToSepiaR) // %4
  4025. : "memory", "cc"
  4026. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  4027. );
  4028. }
  4029. #endif // HAS_ARGBSEPIAROW_SSSE3
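// Scalar sketch (illustrative helper, not part of libyuv) of the sepia formula
// in the comments above; the packuswb in the SIMD code provides the clamp to 255.
static inline void ARGBSepiaPixel_Sketch(uint8* p /* B,G,R,A */) {
  int b = p[0], g = p[1], r = p[2];
  int sb = (17 * b + 68 * g + 35 * r) >> 7;
  int sg = (22 * b + 88 * g + 45 * r) >> 7;
  int sr = (24 * b + 98 * g + 50 * r) >> 7;
  p[0] = (uint8)(sb > 255 ? 255 : sb);
  p[1] = (uint8)(sg > 255 ? 255 : sg);
  p[2] = (uint8)(sr > 255 ? 255 : sr);  // Alpha (p[3]) is left unchanged.
}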
  4030. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
  4032. // Same as Sepia except matrix is provided.
  4033. void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
  4034. uint8* dst_argb,
  4035. const int8* matrix_argb,
  4036. int width) {
  4037. asm volatile (
  4038. "movdqu " MEMACCESS(3) ",%%xmm5 \n"
  4039. "pshufd $0x00,%%xmm5,%%xmm2 \n"
  4040. "pshufd $0x55,%%xmm5,%%xmm3 \n"
  4041. "pshufd $0xaa,%%xmm5,%%xmm4 \n"
  4042. "pshufd $0xff,%%xmm5,%%xmm5 \n"
  4043. // 8 pixel loop.
  4044. LABELALIGN
  4045. "1: \n"
  4046. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4047. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  4048. "pmaddubsw %%xmm2,%%xmm0 \n"
  4049. "pmaddubsw %%xmm2,%%xmm7 \n"
  4050. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  4051. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  4052. "pmaddubsw %%xmm3,%%xmm6 \n"
  4053. "pmaddubsw %%xmm3,%%xmm1 \n"
  4054. "phaddsw %%xmm7,%%xmm0 \n"
  4055. "phaddsw %%xmm1,%%xmm6 \n"
  4056. "psraw $0x6,%%xmm0 \n"
  4057. "psraw $0x6,%%xmm6 \n"
  4058. "packuswb %%xmm0,%%xmm0 \n"
  4059. "packuswb %%xmm6,%%xmm6 \n"
  4060. "punpcklbw %%xmm6,%%xmm0 \n"
  4061. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  4062. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  4063. "pmaddubsw %%xmm4,%%xmm1 \n"
  4064. "pmaddubsw %%xmm4,%%xmm7 \n"
  4065. "phaddsw %%xmm7,%%xmm1 \n"
  4066. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  4067. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  4068. "pmaddubsw %%xmm5,%%xmm6 \n"
  4069. "pmaddubsw %%xmm5,%%xmm7 \n"
  4070. "phaddsw %%xmm7,%%xmm6 \n"
  4071. "psraw $0x6,%%xmm1 \n"
  4072. "psraw $0x6,%%xmm6 \n"
  4073. "packuswb %%xmm1,%%xmm1 \n"
  4074. "packuswb %%xmm6,%%xmm6 \n"
  4075. "punpcklbw %%xmm6,%%xmm1 \n"
  4076. "movdqa %%xmm0,%%xmm6 \n"
  4077. "punpcklwd %%xmm1,%%xmm0 \n"
  4078. "punpckhwd %%xmm1,%%xmm6 \n"
  4079. "movdqu %%xmm0," MEMACCESS(1) " \n"
  4080. "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
  4081. "lea " MEMLEA(0x20,0) ",%0 \n"
  4082. "lea " MEMLEA(0x20,1) ",%1 \n"
  4083. "sub $0x8,%2 \n"
  4084. "jg 1b \n"
  4085. : "+r"(src_argb), // %0
  4086. "+r"(dst_argb), // %1
  4087. "+r"(width) // %2
  4088. : "r"(matrix_argb) // %3
  4089. : "memory", "cc"
  4090. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  4091. );
  4092. }
  4093. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
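// Illustrative scalar sketch of the color-matrix kernel above (hypothetical
// helper, not libyuv's C fallback): every output channel, alpha included, is
// a signed 6-bit fixed-point dot product of the source BGRA bytes with one
// row of matrix_argb (4 signed bytes per output channel), clamped to 0..255.
static inline void ARGBColorMatrixRow_ScalarSketch(const uint8* src_argb,
                                                   uint8* dst_argb,
                                                   const int8* matrix_argb,
                                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {  // Outputs B, G, R, A in turn.
      const int8* m = matrix_argb + ch * 4;
      const int v = (src_argb[0] * m[0] + src_argb[1] * m[1] +
                     src_argb[2] * m[2] + src_argb[3] * m[3]) >> 6;  // psraw
      dst_argb[ch] = v < 0 ? 0 : (v > 255 ? 255 : (uint8)v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}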
  4094. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4095. // Quantize 4 ARGB pixels (16 bytes).
  4096. void ARGBQuantizeRow_SSE2(uint8* dst_argb,
  4097. int scale,
  4098. int interval_size,
  4099. int interval_offset,
  4100. int width) {
  4101. asm volatile (
  4102. "movd %2,%%xmm2 \n"
  4103. "movd %3,%%xmm3 \n"
  4104. "movd %4,%%xmm4 \n"
  4105. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4106. "pshufd $0x44,%%xmm2,%%xmm2 \n"
  4107. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4108. "pshufd $0x44,%%xmm3,%%xmm3 \n"
  4109. "pshuflw $0x40,%%xmm4,%%xmm4 \n"
  4110. "pshufd $0x44,%%xmm4,%%xmm4 \n"
  4111. "pxor %%xmm5,%%xmm5 \n"
  4112. "pcmpeqb %%xmm6,%%xmm6 \n"
  4113. "pslld $0x18,%%xmm6 \n"
  4114. // 4 pixel loop.
  4115. LABELALIGN
  4116. "1: \n"
  4117. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4118. "punpcklbw %%xmm5,%%xmm0 \n"
  4119. "pmulhuw %%xmm2,%%xmm0 \n"
  4120. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  4121. "punpckhbw %%xmm5,%%xmm1 \n"
  4122. "pmulhuw %%xmm2,%%xmm1 \n"
  4123. "pmullw %%xmm3,%%xmm0 \n"
  4124. "movdqu " MEMACCESS(0) ",%%xmm7 \n"
  4125. "pmullw %%xmm3,%%xmm1 \n"
  4126. "pand %%xmm6,%%xmm7 \n"
  4127. "paddw %%xmm4,%%xmm0 \n"
  4128. "paddw %%xmm4,%%xmm1 \n"
  4129. "packuswb %%xmm1,%%xmm0 \n"
  4130. "por %%xmm7,%%xmm0 \n"
  4131. "movdqu %%xmm0," MEMACCESS(0) " \n"
  4132. "lea " MEMLEA(0x10,0) ",%0 \n"
  4133. "sub $0x4,%1 \n"
  4134. "jg 1b \n"
  4135. : "+r"(dst_argb), // %0
  4136. "+r"(width) // %1
  4137. : "r"(scale), // %2
  4138. "r"(interval_size), // %3
  4139. "r"(interval_offset) // %4
  4140. : "memory", "cc"
  4141. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  4142. );
  4143. }
  4144. #endif // HAS_ARGBQUANTIZEROW_SSE2
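// Illustrative scalar sketch of the quantize kernel above (hypothetical
// helper): B, G and R are posterized as ((v * scale) >> 16) * interval_size +
// interval_offset; alpha is masked off and OR'ed back in, i.e. left unchanged.
static inline void ARGBQuantizeRow_ScalarSketch(uint8* dst_argb,
                                                int scale,
                                                int interval_size,
                                                int interval_offset,
                                                int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 3; ++ch) {  // B, G, R only.
      const int v =
          ((dst_argb[ch] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[ch] = v > 255 ? 255 : (uint8)v;
    }
    dst_argb += 4;  // Alpha byte untouched.
  }
}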
  4145. #ifdef HAS_ARGBSHADEROW_SSE2
  4146. // Shade 4 pixels at a time by specified value.
  4147. void ARGBShadeRow_SSE2(const uint8* src_argb,
  4148. uint8* dst_argb,
  4149. int width,
  4150. uint32 value) {
  4151. asm volatile (
  4152. "movd %3,%%xmm2 \n"
  4153. "punpcklbw %%xmm2,%%xmm2 \n"
  4154. "punpcklqdq %%xmm2,%%xmm2 \n"
  4155. // 4 pixel loop.
  4156. LABELALIGN
  4157. "1: \n"
  4158. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4159. "lea " MEMLEA(0x10,0) ",%0 \n"
  4160. "movdqa %%xmm0,%%xmm1 \n"
  4161. "punpcklbw %%xmm0,%%xmm0 \n"
  4162. "punpckhbw %%xmm1,%%xmm1 \n"
  4163. "pmulhuw %%xmm2,%%xmm0 \n"
  4164. "pmulhuw %%xmm2,%%xmm1 \n"
  4165. "psrlw $0x8,%%xmm0 \n"
  4166. "psrlw $0x8,%%xmm1 \n"
  4167. "packuswb %%xmm1,%%xmm0 \n"
  4168. "movdqu %%xmm0," MEMACCESS(1) " \n"
  4169. "lea " MEMLEA(0x10,1) ",%1 \n"
  4170. "sub $0x4,%2 \n"
  4171. "jg 1b \n"
  4172. : "+r"(src_argb), // %0
  4173. "+r"(dst_argb), // %1
  4174. "+r"(width) // %2
  4175. : "r"(value) // %3
  4176. : "memory", "cc"
  4177. , "xmm0", "xmm1", "xmm2"
  4178. );
  4179. }
  4180. #endif // HAS_ARGBSHADEROW_SSE2
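// Illustrative scalar sketch of the shade kernel above (hypothetical helper).
// Both the source byte and the matching byte of 'value' are widened to 16 bits
// as v * 0x0101 (punpcklbw x,x), combined with pmulhuw (high 16 bits of the
// product) and shifted right 8 more, which is roughly src * value / 256 with
// 255 * 255 mapping back to 255.
static inline void ARGBShadeRow_ScalarSketch(const uint8* src_argb,
                                             uint8* dst_argb,
                                             int width,
                                             uint32 value) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      const uint32 s = src_argb[ch] * 0x0101u;
      const uint32 v = ((value >> (ch * 8)) & 0xff) * 0x0101u;
      dst_argb[ch] = (uint8)(((s * v) >> 16) >> 8);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}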
  4181. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  4182. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  4183. void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
  4184. const uint8* src_argb1,
  4185. uint8* dst_argb,
  4186. int width) {
  4187. asm volatile (
  4188. "pxor %%xmm5,%%xmm5 \n"
  4189. // 4 pixel loop.
  4190. LABELALIGN
  4191. "1: \n"
  4192. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4193. "lea " MEMLEA(0x10,0) ",%0 \n"
  4194. "movdqu " MEMACCESS(1) ",%%xmm2 \n"
  4195. "lea " MEMLEA(0x10,1) ",%1 \n"
  4196. "movdqu %%xmm0,%%xmm1 \n"
  4197. "movdqu %%xmm2,%%xmm3 \n"
  4198. "punpcklbw %%xmm0,%%xmm0 \n"
  4199. "punpckhbw %%xmm1,%%xmm1 \n"
  4200. "punpcklbw %%xmm5,%%xmm2 \n"
  4201. "punpckhbw %%xmm5,%%xmm3 \n"
  4202. "pmulhuw %%xmm2,%%xmm0 \n"
  4203. "pmulhuw %%xmm3,%%xmm1 \n"
  4204. "packuswb %%xmm1,%%xmm0 \n"
  4205. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4206. "lea " MEMLEA(0x10,2) ",%2 \n"
  4207. "sub $0x4,%3 \n"
  4208. "jg 1b \n"
  4209. : "+r"(src_argb0), // %0
  4210. "+r"(src_argb1), // %1
  4211. "+r"(dst_argb), // %2
  4212. "+r"(width) // %3
  4213. :
  4214. : "memory", "cc"
  4215. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4216. );
  4217. }
  4218. #endif // HAS_ARGBMULTIPLYROW_SSE2
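// Illustrative scalar sketch of the multiply kernel above (hypothetical
// helper): every byte of the two rows, alpha included, is combined as
// (a * 0x0101 * b) >> 16, an approximation of a * b / 255.
static inline void ARGBMultiplyRow_ScalarSketch(const uint8* src_argb0,
                                                const uint8* src_argb1,
                                                uint8* dst_argb,
                                                int width) {
  for (int i = 0; i < width * 4; ++i) {
    const uint32 a = src_argb0[i] * 0x0101u;  // punpcklbw x,x widening.
    const uint32 b = src_argb1[i];            // Zero-extended.
    dst_argb[i] = (uint8)((a * b) >> 16);     // pmulhuw.
  }
}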
  4219. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  4220. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  4221. void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
  4222. const uint8* src_argb1,
  4223. uint8* dst_argb,
  4224. int width) {
  4225. asm volatile (
  4226. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
4227. // 8 pixel loop.
  4228. LABELALIGN
  4229. "1: \n"
  4230. "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
  4231. "lea " MEMLEA(0x20,0) ",%0 \n"
  4232. "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
  4233. "lea " MEMLEA(0x20,1) ",%1 \n"
  4234. "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
  4235. "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
  4236. "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
  4237. "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
  4238. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4239. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4240. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4241. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4242. "lea " MEMLEA(0x20,2) ",%2 \n"
  4243. "sub $0x8,%3 \n"
  4244. "jg 1b \n"
  4245. "vzeroupper \n"
  4246. : "+r"(src_argb0), // %0
  4247. "+r"(src_argb1), // %1
  4248. "+r"(dst_argb), // %2
  4249. "+r"(width) // %3
  4250. :
  4251. : "memory", "cc"
  4252. #if defined(__AVX2__)
  4253. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4254. #endif
  4255. );
  4256. }
  4257. #endif // HAS_ARGBMULTIPLYROW_AVX2
  4258. #ifdef HAS_ARGBADDROW_SSE2
  4259. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  4260. void ARGBAddRow_SSE2(const uint8* src_argb0,
  4261. const uint8* src_argb1,
  4262. uint8* dst_argb,
  4263. int width) {
  4264. asm volatile (
  4265. // 4 pixel loop.
  4266. LABELALIGN
  4267. "1: \n"
  4268. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4269. "lea " MEMLEA(0x10,0) ",%0 \n"
  4270. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  4271. "lea " MEMLEA(0x10,1) ",%1 \n"
  4272. "paddusb %%xmm1,%%xmm0 \n"
  4273. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4274. "lea " MEMLEA(0x10,2) ",%2 \n"
  4275. "sub $0x4,%3 \n"
  4276. "jg 1b \n"
  4277. : "+r"(src_argb0), // %0
  4278. "+r"(src_argb1), // %1
  4279. "+r"(dst_argb), // %2
  4280. "+r"(width) // %3
  4281. :
  4282. : "memory", "cc"
  4283. , "xmm0", "xmm1"
  4284. );
  4285. }
  4286. #endif // HAS_ARGBADDROW_SSE2
  4287. #ifdef HAS_ARGBADDROW_AVX2
4288. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  4289. void ARGBAddRow_AVX2(const uint8* src_argb0,
  4290. const uint8* src_argb1,
  4291. uint8* dst_argb,
  4292. int width) {
  4293. asm volatile (
4294. // 8 pixel loop.
  4295. LABELALIGN
  4296. "1: \n"
  4297. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  4298. "lea " MEMLEA(0x20,0) ",%0 \n"
  4299. "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
  4300. "lea " MEMLEA(0x20,1) ",%1 \n"
  4301. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4302. "lea " MEMLEA(0x20,2) ",%2 \n"
  4303. "sub $0x8,%3 \n"
  4304. "jg 1b \n"
  4305. "vzeroupper \n"
  4306. : "+r"(src_argb0), // %0
  4307. "+r"(src_argb1), // %1
  4308. "+r"(dst_argb), // %2
  4309. "+r"(width) // %3
  4310. :
  4311. : "memory", "cc"
  4312. , "xmm0"
  4313. );
  4314. }
  4315. #endif // HAS_ARGBADDROW_AVX2
  4316. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  4317. // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
  4318. void ARGBSubtractRow_SSE2(const uint8* src_argb0,
  4319. const uint8* src_argb1,
  4320. uint8* dst_argb,
  4321. int width) {
  4322. asm volatile (
  4323. // 4 pixel loop.
  4324. LABELALIGN
  4325. "1: \n"
  4326. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4327. "lea " MEMLEA(0x10,0) ",%0 \n"
  4328. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  4329. "lea " MEMLEA(0x10,1) ",%1 \n"
  4330. "psubusb %%xmm1,%%xmm0 \n"
  4331. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4332. "lea " MEMLEA(0x10,2) ",%2 \n"
  4333. "sub $0x4,%3 \n"
  4334. "jg 1b \n"
  4335. : "+r"(src_argb0), // %0
  4336. "+r"(src_argb1), // %1
  4337. "+r"(dst_argb), // %2
  4338. "+r"(width) // %3
  4339. :
  4340. : "memory", "cc"
  4341. , "xmm0", "xmm1"
  4342. );
  4343. }
  4344. #endif // HAS_ARGBSUBTRACTROW_SSE2
  4345. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  4346. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  4347. void ARGBSubtractRow_AVX2(const uint8* src_argb0,
  4348. const uint8* src_argb1,
  4349. uint8* dst_argb,
  4350. int width) {
  4351. asm volatile (
4352. // 8 pixel loop.
  4353. LABELALIGN
  4354. "1: \n"
  4355. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  4356. "lea " MEMLEA(0x20,0) ",%0 \n"
  4357. "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
  4358. "lea " MEMLEA(0x20,1) ",%1 \n"
  4359. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4360. "lea " MEMLEA(0x20,2) ",%2 \n"
  4361. "sub $0x8,%3 \n"
  4362. "jg 1b \n"
  4363. "vzeroupper \n"
  4364. : "+r"(src_argb0), // %0
  4365. "+r"(src_argb1), // %1
  4366. "+r"(dst_argb), // %2
  4367. "+r"(width) // %3
  4368. :
  4369. : "memory", "cc"
  4370. , "xmm0"
  4371. );
  4372. }
  4373. #endif // HAS_ARGBSUBTRACTROW_AVX2
  4374. #ifdef HAS_SOBELXROW_SSE2
  4375. // SobelX as a matrix is
  4376. // -1 0 1
  4377. // -2 0 2
  4378. // -1 0 1
  4379. void SobelXRow_SSE2(const uint8* src_y0,
  4380. const uint8* src_y1,
  4381. const uint8* src_y2,
  4382. uint8* dst_sobelx,
  4383. int width) {
  4384. asm volatile (
  4385. "sub %0,%1 \n"
  4386. "sub %0,%2 \n"
  4387. "sub %0,%3 \n"
  4388. "pxor %%xmm5,%%xmm5 \n"
  4389. // 8 pixel loop.
  4390. LABELALIGN
  4391. "1: \n"
  4392. "movq " MEMACCESS(0) ",%%xmm0 \n"
  4393. "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
  4394. "punpcklbw %%xmm5,%%xmm0 \n"
  4395. "punpcklbw %%xmm5,%%xmm1 \n"
  4396. "psubw %%xmm1,%%xmm0 \n"
  4397. MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
  4398. MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
  4399. "punpcklbw %%xmm5,%%xmm1 \n"
  4400. "punpcklbw %%xmm5,%%xmm2 \n"
  4401. "psubw %%xmm2,%%xmm1 \n"
  4402. MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
  4403. MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
  4404. "punpcklbw %%xmm5,%%xmm2 \n"
  4405. "punpcklbw %%xmm5,%%xmm3 \n"
  4406. "psubw %%xmm3,%%xmm2 \n"
  4407. "paddw %%xmm2,%%xmm0 \n"
  4408. "paddw %%xmm1,%%xmm0 \n"
  4409. "paddw %%xmm1,%%xmm0 \n"
  4410. "pxor %%xmm1,%%xmm1 \n"
  4411. "psubw %%xmm0,%%xmm1 \n"
  4412. "pmaxsw %%xmm1,%%xmm0 \n"
  4413. "packuswb %%xmm0,%%xmm0 \n"
  4414. MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
  4415. "lea " MEMLEA(0x8,0) ",%0 \n"
  4416. "sub $0x8,%4 \n"
  4417. "jg 1b \n"
  4418. : "+r"(src_y0), // %0
  4419. "+r"(src_y1), // %1
  4420. "+r"(src_y2), // %2
  4421. "+r"(dst_sobelx), // %3
  4422. "+r"(width) // %4
  4423. :
  4424. : "memory", "cc", NACL_R14
  4425. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4426. );
  4427. }
  4428. #endif // HAS_SOBELXROW_SSE2
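// Illustrative scalar sketch of the SobelX kernel above (hypothetical helper):
// the horizontal differences (column i minus column i+2) from the three input
// rows are combined with weights 1, 2, 1, and the absolute value is stored,
// saturated to 255. Like the SIMD code it reads 2 bytes past 'width'.
static inline void SobelXRow_ScalarSketch(const uint8* src_y0,
                                          const uint8* src_y1,
                                          const uint8* src_y2,
                                          uint8* dst_sobelx,
                                          int width) {
  for (int i = 0; i < width; ++i) {
    const int a = src_y0[i] - src_y0[i + 2];
    const int b = src_y1[i] - src_y1[i + 2];
    const int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + c + 2 * b;
    if (sobel < 0) sobel = -sobel;  // pmaxsw against the negated sum.
    dst_sobelx[i] = sobel > 255 ? 255 : (uint8)sobel;
  }
}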
  4429. #ifdef HAS_SOBELYROW_SSE2
  4430. // SobelY as a matrix is
  4431. // -1 -2 -1
  4432. // 0 0 0
  4433. // 1 2 1
  4434. void SobelYRow_SSE2(const uint8* src_y0,
  4435. const uint8* src_y1,
  4436. uint8* dst_sobely,
  4437. int width) {
  4438. asm volatile (
  4439. "sub %0,%1 \n"
  4440. "sub %0,%2 \n"
  4441. "pxor %%xmm5,%%xmm5 \n"
  4442. // 8 pixel loop.
  4443. LABELALIGN
  4444. "1: \n"
  4445. "movq " MEMACCESS(0) ",%%xmm0 \n"
  4446. MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
  4447. "punpcklbw %%xmm5,%%xmm0 \n"
  4448. "punpcklbw %%xmm5,%%xmm1 \n"
  4449. "psubw %%xmm1,%%xmm0 \n"
  4450. "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
  4451. MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
  4452. "punpcklbw %%xmm5,%%xmm1 \n"
  4453. "punpcklbw %%xmm5,%%xmm2 \n"
  4454. "psubw %%xmm2,%%xmm1 \n"
  4455. "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
  4456. MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
  4457. "punpcklbw %%xmm5,%%xmm2 \n"
  4458. "punpcklbw %%xmm5,%%xmm3 \n"
  4459. "psubw %%xmm3,%%xmm2 \n"
  4460. "paddw %%xmm2,%%xmm0 \n"
  4461. "paddw %%xmm1,%%xmm0 \n"
  4462. "paddw %%xmm1,%%xmm0 \n"
  4463. "pxor %%xmm1,%%xmm1 \n"
  4464. "psubw %%xmm0,%%xmm1 \n"
  4465. "pmaxsw %%xmm1,%%xmm0 \n"
  4466. "packuswb %%xmm0,%%xmm0 \n"
  4467. MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
  4468. "lea " MEMLEA(0x8,0) ",%0 \n"
  4469. "sub $0x8,%3 \n"
  4470. "jg 1b \n"
  4471. : "+r"(src_y0), // %0
  4472. "+r"(src_y1), // %1
  4473. "+r"(dst_sobely), // %2
  4474. "+r"(width) // %3
  4475. :
  4476. : "memory", "cc", NACL_R14
  4477. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4478. );
  4479. }
  4480. #endif // HAS_SOBELYROW_SSE2
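// Illustrative scalar sketch of the SobelY kernel above (hypothetical helper):
// the vertical differences at columns i, i+1 and i+2 between the two input
// rows are combined with weights 1, 2, 1; the absolute value is saturated to
// 255. Like the SIMD code it reads 2 bytes past 'width'.
static inline void SobelYRow_ScalarSketch(const uint8* src_y0,
                                          const uint8* src_y1,
                                          uint8* dst_sobely,
                                          int width) {
  for (int i = 0; i < width; ++i) {
    const int a = src_y0[i + 0] - src_y1[i + 0];
    const int b = src_y0[i + 1] - src_y1[i + 1];
    const int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + c + 2 * b;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = sobel > 255 ? 255 : (uint8)sobel;
  }
}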
  4481. #ifdef HAS_SOBELROW_SSE2
  4482. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  4483. // A = 255
  4484. // R = Sobel
  4485. // G = Sobel
  4486. // B = Sobel
  4487. void SobelRow_SSE2(const uint8* src_sobelx,
  4488. const uint8* src_sobely,
  4489. uint8* dst_argb,
  4490. int width) {
  4491. asm volatile (
  4492. "sub %0,%1 \n"
  4493. "pcmpeqb %%xmm5,%%xmm5 \n"
  4494. "pslld $0x18,%%xmm5 \n"
4495. // 16 pixel loop.
  4496. LABELALIGN
  4497. "1: \n"
  4498. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4499. MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
  4500. "lea " MEMLEA(0x10,0) ",%0 \n"
  4501. "paddusb %%xmm1,%%xmm0 \n"
  4502. "movdqa %%xmm0,%%xmm2 \n"
  4503. "punpcklbw %%xmm0,%%xmm2 \n"
  4504. "punpckhbw %%xmm0,%%xmm0 \n"
  4505. "movdqa %%xmm2,%%xmm1 \n"
  4506. "punpcklwd %%xmm2,%%xmm1 \n"
  4507. "punpckhwd %%xmm2,%%xmm2 \n"
  4508. "por %%xmm5,%%xmm1 \n"
  4509. "por %%xmm5,%%xmm2 \n"
  4510. "movdqa %%xmm0,%%xmm3 \n"
  4511. "punpcklwd %%xmm0,%%xmm3 \n"
  4512. "punpckhwd %%xmm0,%%xmm0 \n"
  4513. "por %%xmm5,%%xmm3 \n"
  4514. "por %%xmm5,%%xmm0 \n"
  4515. "movdqu %%xmm1," MEMACCESS(2) " \n"
  4516. "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
  4517. "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
  4518. "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
  4519. "lea " MEMLEA(0x40,2) ",%2 \n"
  4520. "sub $0x10,%3 \n"
  4521. "jg 1b \n"
  4522. : "+r"(src_sobelx), // %0
  4523. "+r"(src_sobely), // %1
  4524. "+r"(dst_argb), // %2
  4525. "+r"(width) // %3
  4526. :
  4527. : "memory", "cc", NACL_R14
  4528. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4529. );
  4530. }
  4531. #endif // HAS_SOBELROW_SSE2
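// Illustrative scalar sketch of the Sobel kernel above (hypothetical helper):
// the X and Y magnitudes are added with unsigned saturation (paddusb) and the
// result is replicated into B, G and R with alpha forced to 255.
static inline void SobelRow_ScalarSketch(const uint8* src_sobelx,
                                         const uint8* src_sobely,
                                         uint8* dst_argb,
                                         int width) {
  for (int i = 0; i < width; ++i) {
    const int s = src_sobelx[i] + src_sobely[i];
    const uint8 v = s > 255 ? 255 : (uint8)s;
    dst_argb[0] = v;    // B
    dst_argb[1] = v;    // G
    dst_argb[2] = v;    // R
    dst_argb[3] = 255;  // A
    dst_argb += 4;
  }
}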
  4532. #ifdef HAS_SOBELTOPLANEROW_SSE2
  4533. // Adds Sobel X and Sobel Y and stores Sobel into a plane.
  4534. void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
  4535. const uint8* src_sobely,
  4536. uint8* dst_y,
  4537. int width) {
  4538. asm volatile (
  4539. "sub %0,%1 \n"
  4540. "pcmpeqb %%xmm5,%%xmm5 \n"
  4541. "pslld $0x18,%%xmm5 \n"
4542. // 16 pixel loop.
  4543. LABELALIGN
  4544. "1: \n"
  4545. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4546. MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
  4547. "lea " MEMLEA(0x10,0) ",%0 \n"
  4548. "paddusb %%xmm1,%%xmm0 \n"
  4549. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4550. "lea " MEMLEA(0x10,2) ",%2 \n"
  4551. "sub $0x10,%3 \n"
  4552. "jg 1b \n"
  4553. : "+r"(src_sobelx), // %0
  4554. "+r"(src_sobely), // %1
  4555. "+r"(dst_y), // %2
  4556. "+r"(width) // %3
  4557. :
  4558. : "memory", "cc", NACL_R14
  4559. "xmm0", "xmm1"
  4560. );
  4561. }
  4562. #endif // HAS_SOBELTOPLANEROW_SSE2
  4563. #ifdef HAS_SOBELXYROW_SSE2
  4564. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  4565. // A = 255
  4566. // R = Sobel X
  4567. // G = Sobel
  4568. // B = Sobel Y
  4569. void SobelXYRow_SSE2(const uint8* src_sobelx,
  4570. const uint8* src_sobely,
  4571. uint8* dst_argb,
  4572. int width) {
  4573. asm volatile (
  4574. "sub %0,%1 \n"
  4575. "pcmpeqb %%xmm5,%%xmm5 \n"
4576. // 16 pixel loop.
  4577. LABELALIGN
  4578. "1: \n"
  4579. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4580. MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
  4581. "lea " MEMLEA(0x10,0) ",%0 \n"
  4582. "movdqa %%xmm0,%%xmm2 \n"
  4583. "paddusb %%xmm1,%%xmm2 \n"
  4584. "movdqa %%xmm0,%%xmm3 \n"
  4585. "punpcklbw %%xmm5,%%xmm3 \n"
  4586. "punpckhbw %%xmm5,%%xmm0 \n"
  4587. "movdqa %%xmm1,%%xmm4 \n"
  4588. "punpcklbw %%xmm2,%%xmm4 \n"
  4589. "punpckhbw %%xmm2,%%xmm1 \n"
  4590. "movdqa %%xmm4,%%xmm6 \n"
  4591. "punpcklwd %%xmm3,%%xmm6 \n"
  4592. "punpckhwd %%xmm3,%%xmm4 \n"
  4593. "movdqa %%xmm1,%%xmm7 \n"
  4594. "punpcklwd %%xmm0,%%xmm7 \n"
  4595. "punpckhwd %%xmm0,%%xmm1 \n"
  4596. "movdqu %%xmm6," MEMACCESS(2) " \n"
  4597. "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
  4598. "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
  4599. "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
  4600. "lea " MEMLEA(0x40,2) ",%2 \n"
  4601. "sub $0x10,%3 \n"
  4602. "jg 1b \n"
  4603. : "+r"(src_sobelx), // %0
  4604. "+r"(src_sobely), // %1
  4605. "+r"(dst_argb), // %2
  4606. "+r"(width) // %3
  4607. :
  4608. : "memory", "cc", NACL_R14
  4609. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  4610. );
  4611. }
  4612. #endif // HAS_SOBELXYROW_SSE2
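// Illustrative scalar sketch of the SobelXY kernel above (hypothetical
// helper), matching the channel layout documented before the function:
// B = Sobel Y, G = saturated Sobel X + Y, R = Sobel X, A = 255.
static inline void SobelXYRow_ScalarSketch(const uint8* src_sobelx,
                                           const uint8* src_sobely,
                                           uint8* dst_argb,
                                           int width) {
  for (int i = 0; i < width; ++i) {
    const int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];             // B
    dst_argb[1] = s > 255 ? 255 : (uint8)s;  // G
    dst_argb[2] = src_sobelx[i];             // R
    dst_argb[3] = 255;                       // A
    dst_argb += 4;
  }
}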
  4613. #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  4614. // Creates a table of cumulative sums where each value is a sum of all values
  4615. // above and to the left of the value, inclusive of the value.
  4616. void ComputeCumulativeSumRow_SSE2(const uint8* row,
  4617. int32* cumsum,
  4618. const int32* previous_cumsum,
  4619. int width) {
  4620. asm volatile (
  4621. "pxor %%xmm0,%%xmm0 \n"
  4622. "pxor %%xmm1,%%xmm1 \n"
  4623. "sub $0x4,%3 \n"
  4624. "jl 49f \n"
  4625. "test $0xf,%1 \n"
  4626. "jne 49f \n"
  4627. // 4 pixel loop.
  4628. LABELALIGN
  4629. "40: \n"
  4630. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  4631. "lea " MEMLEA(0x10,0) ",%0 \n"
  4632. "movdqa %%xmm2,%%xmm4 \n"
  4633. "punpcklbw %%xmm1,%%xmm2 \n"
  4634. "movdqa %%xmm2,%%xmm3 \n"
  4635. "punpcklwd %%xmm1,%%xmm2 \n"
  4636. "punpckhwd %%xmm1,%%xmm3 \n"
  4637. "punpckhbw %%xmm1,%%xmm4 \n"
  4638. "movdqa %%xmm4,%%xmm5 \n"
  4639. "punpcklwd %%xmm1,%%xmm4 \n"
  4640. "punpckhwd %%xmm1,%%xmm5 \n"
  4641. "paddd %%xmm2,%%xmm0 \n"
  4642. "movdqu " MEMACCESS(2) ",%%xmm2 \n"
  4643. "paddd %%xmm0,%%xmm2 \n"
  4644. "paddd %%xmm3,%%xmm0 \n"
  4645. "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
  4646. "paddd %%xmm0,%%xmm3 \n"
  4647. "paddd %%xmm4,%%xmm0 \n"
  4648. "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
  4649. "paddd %%xmm0,%%xmm4 \n"
  4650. "paddd %%xmm5,%%xmm0 \n"
  4651. "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
  4652. "lea " MEMLEA(0x40,2) ",%2 \n"
  4653. "paddd %%xmm0,%%xmm5 \n"
  4654. "movdqu %%xmm2," MEMACCESS(1) " \n"
  4655. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  4656. "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
  4657. "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
  4658. "lea " MEMLEA(0x40,1) ",%1 \n"
  4659. "sub $0x4,%3 \n"
  4660. "jge 40b \n"
  4661. "49: \n"
  4662. "add $0x3,%3 \n"
  4663. "jl 19f \n"
  4664. // 1 pixel loop.
  4665. LABELALIGN
  4666. "10: \n"
  4667. "movd " MEMACCESS(0) ",%%xmm2 \n"
  4668. "lea " MEMLEA(0x4,0) ",%0 \n"
  4669. "punpcklbw %%xmm1,%%xmm2 \n"
  4670. "punpcklwd %%xmm1,%%xmm2 \n"
  4671. "paddd %%xmm2,%%xmm0 \n"
  4672. "movdqu " MEMACCESS(2) ",%%xmm2 \n"
  4673. "lea " MEMLEA(0x10,2) ",%2 \n"
  4674. "paddd %%xmm0,%%xmm2 \n"
  4675. "movdqu %%xmm2," MEMACCESS(1) " \n"
  4676. "lea " MEMLEA(0x10,1) ",%1 \n"
  4677. "sub $0x1,%3 \n"
  4678. "jge 10b \n"
  4679. "19: \n"
  4680. : "+r"(row), // %0
  4681. "+r"(cumsum), // %1
  4682. "+r"(previous_cumsum), // %2
  4683. "+r"(width) // %3
  4684. :
  4685. : "memory", "cc"
  4686. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  4687. );
  4688. }
  4689. #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
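// Illustrative scalar sketch of the cumulative-sum kernel above (hypothetical
// helper): a running per-channel sum is carried along the row and added to
// the previous row's cumulative sums, producing 4 int32 values per pixel.
static inline void ComputeCumulativeSumRow_ScalarSketch(
    const uint8* row, int32* cumsum, const int32* previous_cumsum, int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      row_sum[ch] += row[i * 4 + ch];
      cumsum[i * 4 + ch] = row_sum[ch] + previous_cumsum[i * 4 + ch];
    }
  }
}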
  4690. #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  4691. void CumulativeSumToAverageRow_SSE2(const int32* topleft,
  4692. const int32* botleft,
  4693. int width,
  4694. int area,
  4695. uint8* dst,
  4696. int count) {
  4697. asm volatile (
  4698. "movd %5,%%xmm5 \n"
  4699. "cvtdq2ps %%xmm5,%%xmm5 \n"
  4700. "rcpss %%xmm5,%%xmm4 \n"
  4701. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  4702. "sub $0x4,%3 \n"
  4703. "jl 49f \n"
  4704. "cmpl $0x80,%5 \n"
  4705. "ja 40f \n"
  4706. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  4707. "pcmpeqb %%xmm6,%%xmm6 \n"
  4708. "psrld $0x10,%%xmm6 \n"
  4709. "cvtdq2ps %%xmm6,%%xmm6 \n"
  4710. "addps %%xmm6,%%xmm5 \n"
  4711. "mulps %%xmm4,%%xmm5 \n"
  4712. "cvtps2dq %%xmm5,%%xmm5 \n"
  4713. "packssdw %%xmm5,%%xmm5 \n"
  4714. // 4 pixel small loop.
  4715. LABELALIGN
  4716. "4: \n"
  4717. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4718. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  4719. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  4720. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  4721. MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
  4722. MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
  4723. MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
  4724. MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
  4725. "lea " MEMLEA(0x40,0) ",%0 \n"
  4726. "psubd " MEMACCESS(1) ",%%xmm0 \n"
  4727. "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
  4728. "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
  4729. "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
  4730. MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
  4731. MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
  4732. MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
  4733. MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
  4734. "lea " MEMLEA(0x40,1) ",%1 \n"
  4735. "packssdw %%xmm1,%%xmm0 \n"
  4736. "packssdw %%xmm3,%%xmm2 \n"
  4737. "pmulhuw %%xmm5,%%xmm0 \n"
  4738. "pmulhuw %%xmm5,%%xmm2 \n"
  4739. "packuswb %%xmm2,%%xmm0 \n"
  4740. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4741. "lea " MEMLEA(0x10,2) ",%2 \n"
  4742. "sub $0x4,%3 \n"
  4743. "jge 4b \n"
  4744. "jmp 49f \n"
4745. // 4 pixel loop.
  4746. LABELALIGN
  4747. "40: \n"
  4748. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4749. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  4750. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  4751. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  4752. MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
  4753. MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
  4754. MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
  4755. MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
  4756. "lea " MEMLEA(0x40,0) ",%0 \n"
  4757. "psubd " MEMACCESS(1) ",%%xmm0 \n"
  4758. "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
  4759. "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
  4760. "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
  4761. MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
  4762. MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
  4763. MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
  4764. MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
  4765. "lea " MEMLEA(0x40,1) ",%1 \n"
  4766. "cvtdq2ps %%xmm0,%%xmm0 \n"
  4767. "cvtdq2ps %%xmm1,%%xmm1 \n"
  4768. "mulps %%xmm4,%%xmm0 \n"
  4769. "mulps %%xmm4,%%xmm1 \n"
  4770. "cvtdq2ps %%xmm2,%%xmm2 \n"
  4771. "cvtdq2ps %%xmm3,%%xmm3 \n"
  4772. "mulps %%xmm4,%%xmm2 \n"
  4773. "mulps %%xmm4,%%xmm3 \n"
  4774. "cvtps2dq %%xmm0,%%xmm0 \n"
  4775. "cvtps2dq %%xmm1,%%xmm1 \n"
  4776. "cvtps2dq %%xmm2,%%xmm2 \n"
  4777. "cvtps2dq %%xmm3,%%xmm3 \n"
  4778. "packssdw %%xmm1,%%xmm0 \n"
  4779. "packssdw %%xmm3,%%xmm2 \n"
  4780. "packuswb %%xmm2,%%xmm0 \n"
  4781. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4782. "lea " MEMLEA(0x10,2) ",%2 \n"
  4783. "sub $0x4,%3 \n"
  4784. "jge 40b \n"
  4785. "49: \n"
  4786. "add $0x3,%3 \n"
  4787. "jl 19f \n"
4788. // 1 pixel loop.
  4789. LABELALIGN
  4790. "10: \n"
  4791. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4792. MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
  4793. "lea " MEMLEA(0x10,0) ",%0 \n"
  4794. "psubd " MEMACCESS(1) ",%%xmm0 \n"
  4795. MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
  4796. "lea " MEMLEA(0x10,1) ",%1 \n"
  4797. "cvtdq2ps %%xmm0,%%xmm0 \n"
  4798. "mulps %%xmm4,%%xmm0 \n"
  4799. "cvtps2dq %%xmm0,%%xmm0 \n"
  4800. "packssdw %%xmm0,%%xmm0 \n"
  4801. "packuswb %%xmm0,%%xmm0 \n"
  4802. "movd %%xmm0," MEMACCESS(2) " \n"
  4803. "lea " MEMLEA(0x4,2) ",%2 \n"
  4804. "sub $0x1,%3 \n"
  4805. "jge 10b \n"
  4806. "19: \n"
  4807. : "+r"(topleft), // %0
  4808. "+r"(botleft), // %1
  4809. "+r"(dst), // %2
  4810. "+rm"(count) // %3
  4811. : "r"((intptr_t)(width)), // %4
  4812. "rm"(area) // %5
  4813. : "memory", "cc", NACL_R14
  4814. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  4815. );
  4816. }
  4817. #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
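// Illustrative scalar sketch of the averaging kernel above (hypothetical
// helper). Each output pixel is the classic summed-area-table box sum,
// topleft - topright - botleft + botright, divided by 'area'; here 'width'
// indexes int32 elements (4 per pixel), matching the (%reg,%4,4) addressing
// above. The SIMD code divides via a reciprocal approximation, so rounding
// can differ slightly from this exact division.
static inline void CumulativeSumToAverageRow_ScalarSketch(const int32* topleft,
                                                          const int32* botleft,
                                                          int width,
                                                          int area,
                                                          uint8* dst,
                                                          int count) {
  for (int i = 0; i < count; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      const int32 sum = topleft[ch] - topleft[width + ch] - botleft[ch] +
                        botleft[width + ch];
      const int32 v = sum / area;
      dst[ch] = v < 0 ? 0 : (v > 255 ? 255 : (uint8)v);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}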
  4818. #ifdef HAS_ARGBAFFINEROW_SSE2
  4819. // Copy ARGB pixels from source image with slope to a row of destination.
  4820. LIBYUV_API
  4821. void ARGBAffineRow_SSE2(const uint8* src_argb,
  4822. int src_argb_stride,
  4823. uint8* dst_argb,
  4824. const float* src_dudv,
  4825. int width) {
  4826. intptr_t src_argb_stride_temp = src_argb_stride;
  4827. intptr_t temp;
  4828. asm volatile (
  4829. "movq " MEMACCESS(3) ",%%xmm2 \n"
  4830. "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
  4831. "shl $0x10,%1 \n"
  4832. "add $0x4,%1 \n"
  4833. "movd %1,%%xmm5 \n"
  4834. "sub $0x4,%4 \n"
  4835. "jl 49f \n"
  4836. "pshufd $0x44,%%xmm7,%%xmm7 \n"
  4837. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  4838. "movdqa %%xmm2,%%xmm0 \n"
  4839. "addps %%xmm7,%%xmm0 \n"
  4840. "movlhps %%xmm0,%%xmm2 \n"
  4841. "movdqa %%xmm7,%%xmm4 \n"
  4842. "addps %%xmm4,%%xmm4 \n"
  4843. "movdqa %%xmm2,%%xmm3 \n"
  4844. "addps %%xmm4,%%xmm3 \n"
  4845. "addps %%xmm4,%%xmm4 \n"
4846. // 4 pixel loop.
  4847. LABELALIGN
  4848. "40: \n"
  4849. "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
  4850. "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
  4851. "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
  4852. "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
  4853. "movd %%xmm0,%k1 \n"
  4854. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  4855. "movd %%xmm0,%k5 \n"
  4856. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  4857. MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
  4858. MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
  4859. "punpckldq %%xmm6,%%xmm1 \n"
  4860. "addps %%xmm4,%%xmm2 \n"
  4861. "movq %%xmm1," MEMACCESS(2) " \n"
  4862. "movd %%xmm0,%k1 \n"
  4863. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  4864. "movd %%xmm0,%k5 \n"
  4865. MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
  4866. MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
  4867. "punpckldq %%xmm6,%%xmm0 \n"
  4868. "addps %%xmm4,%%xmm3 \n"
  4869. "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
  4870. "lea " MEMLEA(0x10,2) ",%2 \n"
  4871. "sub $0x4,%4 \n"
  4872. "jge 40b \n"
  4873. "49: \n"
  4874. "add $0x3,%4 \n"
  4875. "jl 19f \n"
4876. // 1 pixel loop.
  4877. LABELALIGN
  4878. "10: \n"
  4879. "cvttps2dq %%xmm2,%%xmm0 \n"
  4880. "packssdw %%xmm0,%%xmm0 \n"
  4881. "pmaddwd %%xmm5,%%xmm0 \n"
  4882. "addps %%xmm7,%%xmm2 \n"
  4883. "movd %%xmm0,%k1 \n"
  4884. MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
  4885. "movd %%xmm0," MEMACCESS(2) " \n"
  4886. "lea " MEMLEA(0x04,2) ",%2 \n"
  4887. "sub $0x1,%4 \n"
  4888. "jge 10b \n"
  4889. "19: \n"
  4890. : "+r"(src_argb), // %0
  4891. "+r"(src_argb_stride_temp), // %1
  4892. "+r"(dst_argb), // %2
  4893. "+r"(src_dudv), // %3
  4894. "+rm"(width), // %4
  4895. "=&r"(temp) // %5
  4896. :
  4897. : "memory", "cc", NACL_R14
  4898. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  4899. );
  4900. }
  4901. #endif // HAS_ARGBAFFINEROW_SSE2
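// Illustrative scalar sketch of the affine kernel above (hypothetical helper):
// src_dudv holds {u, v, du, dv}; each destination pixel copies the 4 source
// bytes at column (int)u, row (int)v (cvttps2dq truncates toward zero), then
// steps u and v by du and dv.
static inline void ARGBAffineRow_ScalarSketch(const uint8* src_argb,
                                              int src_argb_stride,
                                              uint8* dst_argb,
                                              const float* src_dudv,
                                              int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  const float du = src_dudv[2];
  const float dv = src_dudv[3];
  for (int i = 0; i < width; ++i) {
    const uint8* src = src_argb + (int)v * src_argb_stride + (int)u * 4;
    dst_argb[0] = src[0];
    dst_argb[1] = src[1];
    dst_argb[2] = src[2];
    dst_argb[3] = src[3];
    dst_argb += 4;
    u += du;
    v += dv;
  }
}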
  4902. #ifdef HAS_INTERPOLATEROW_SSSE3
  4903. // Bilinear filter 16x2 -> 16x1
  4904. void InterpolateRow_SSSE3(uint8* dst_ptr,
  4905. const uint8* src_ptr,
  4906. ptrdiff_t src_stride,
  4907. int dst_width,
  4908. int source_y_fraction) {
  4909. asm volatile (
  4910. "sub %1,%0 \n"
  4911. "cmp $0x0,%3 \n"
  4912. "je 100f \n"
  4913. "cmp $0x80,%3 \n"
  4914. "je 50f \n"
  4915. "movd %3,%%xmm0 \n"
  4916. "neg %3 \n"
  4917. "add $0x100,%3 \n"
  4918. "movd %3,%%xmm5 \n"
  4919. "punpcklbw %%xmm0,%%xmm5 \n"
  4920. "punpcklwd %%xmm5,%%xmm5 \n"
  4921. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  4922. "mov $0x80808080,%%eax \n"
  4923. "movd %%eax,%%xmm4 \n"
  4924. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  4925. // General purpose row blend.
  4926. LABELALIGN
  4927. "1: \n"
  4928. "movdqu " MEMACCESS(1) ",%%xmm0 \n"
  4929. MEMOPREG(movdqu,0x00,1,4,1,xmm2)
  4930. "movdqa %%xmm0,%%xmm1 \n"
  4931. "punpcklbw %%xmm2,%%xmm0 \n"
  4932. "punpckhbw %%xmm2,%%xmm1 \n"
  4933. "psubb %%xmm4,%%xmm0 \n"
  4934. "psubb %%xmm4,%%xmm1 \n"
  4935. "movdqa %%xmm5,%%xmm2 \n"
  4936. "movdqa %%xmm5,%%xmm3 \n"
  4937. "pmaddubsw %%xmm0,%%xmm2 \n"
  4938. "pmaddubsw %%xmm1,%%xmm3 \n"
  4939. "paddw %%xmm4,%%xmm2 \n"
  4940. "paddw %%xmm4,%%xmm3 \n"
  4941. "psrlw $0x8,%%xmm2 \n"
  4942. "psrlw $0x8,%%xmm3 \n"
  4943. "packuswb %%xmm3,%%xmm2 \n"
  4944. MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
  4945. "lea " MEMLEA(0x10,1) ",%1 \n"
  4946. "sub $0x10,%2 \n"
  4947. "jg 1b \n"
  4948. "jmp 99f \n"
  4949. // Blend 50 / 50.
  4950. LABELALIGN
  4951. "50: \n"
  4952. "movdqu " MEMACCESS(1) ",%%xmm0 \n"
  4953. MEMOPREG(movdqu,0x00,1,4,1,xmm1)
  4954. "pavgb %%xmm1,%%xmm0 \n"
  4955. MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  4956. "lea " MEMLEA(0x10,1) ",%1 \n"
  4957. "sub $0x10,%2 \n"
  4958. "jg 50b \n"
  4959. "jmp 99f \n"
  4960. // Blend 100 / 0 - Copy row unchanged.
  4961. LABELALIGN
  4962. "100: \n"
  4963. "movdqu " MEMACCESS(1) ",%%xmm0 \n"
  4964. MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  4965. "lea " MEMLEA(0x10,1) ",%1 \n"
  4966. "sub $0x10,%2 \n"
  4967. "jg 100b \n"
  4968. "99: \n"
  4969. : "+r"(dst_ptr), // %0
  4970. "+r"(src_ptr), // %1
  4971. "+rm"(dst_width), // %2
  4972. "+r"(source_y_fraction) // %3
  4973. : "r"((intptr_t)(src_stride)) // %4
  4974. : "memory", "cc", "eax", NACL_R14
  4975. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  4976. );
  4977. }
  4978. #endif // HAS_INTERPOLATEROW_SSSE3
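// Illustrative scalar sketch of the row blend above (hypothetical helper).
// With f = source_y_fraction (0..256), every byte becomes
// (row0 * (256 - f) + row1 * f + 128) >> 8; the 0x80 bias and pmaddubsw in the
// SSSE3 code work out to the same value. dst_width counts bytes, so the same
// kernel serves planar and ARGB rows; f == 0 copies row0 and f == 128 matches
// the pavgb fast path.
static inline void InterpolateRow_ScalarSketch(uint8* dst_ptr,
                                               const uint8* src_ptr,
                                               ptrdiff_t src_stride,
                                               int dst_width,
                                               int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const int f1 = source_y_fraction;
  const int f0 = 256 - f1;
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * f0 + src_ptr1[i] * f1 + 128) >> 8);
  }
}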
  4979. #ifdef HAS_INTERPOLATEROW_AVX2
  4980. // Bilinear filter 32x2 -> 32x1
  4981. void InterpolateRow_AVX2(uint8* dst_ptr,
  4982. const uint8* src_ptr,
  4983. ptrdiff_t src_stride,
  4984. int dst_width,
  4985. int source_y_fraction) {
  4986. asm volatile (
  4987. "cmp $0x0,%3 \n"
  4988. "je 100f \n"
  4989. "sub %1,%0 \n"
  4990. "cmp $0x80,%3 \n"
  4991. "je 50f \n"
  4992. "vmovd %3,%%xmm0 \n"
  4993. "neg %3 \n"
  4994. "add $0x100,%3 \n"
  4995. "vmovd %3,%%xmm5 \n"
  4996. "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
  4997. "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
  4998. "vbroadcastss %%xmm5,%%ymm5 \n"
  4999. "mov $0x80808080,%%eax \n"
  5000. "vmovd %%eax,%%xmm4 \n"
  5001. "vbroadcastss %%xmm4,%%ymm4 \n"
  5002. // General purpose row blend.
  5003. LABELALIGN
  5004. "1: \n"
  5005. "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
  5006. MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
  5007. "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
  5008. "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
  5009. "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
  5010. "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
  5011. "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
  5012. "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
  5013. "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
  5014. "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
  5015. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  5016. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  5017. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  5018. MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
  5019. "lea " MEMLEA(0x20,1) ",%1 \n"
  5020. "sub $0x20,%2 \n"
  5021. "jg 1b \n"
  5022. "jmp 99f \n"
  5023. // Blend 50 / 50.
  5024. LABELALIGN
  5025. "50: \n"
  5026. "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
  5027. VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
  5028. MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
  5029. "lea " MEMLEA(0x20,1) ",%1 \n"
  5030. "sub $0x20,%2 \n"
  5031. "jg 50b \n"
  5032. "jmp 99f \n"
  5033. // Blend 100 / 0 - Copy row unchanged.
  5034. LABELALIGN
  5035. "100: \n"
  5036. "rep movsb " MEMMOVESTRING(1,0) " \n"
  5037. "jmp 999f \n"
  5038. "99: \n"
  5039. "vzeroupper \n"
  5040. "999: \n"
  5041. : "+D"(dst_ptr), // %0
  5042. "+S"(src_ptr), // %1
  5043. "+cm"(dst_width), // %2
  5044. "+r"(source_y_fraction) // %3
  5045. : "r"((intptr_t)(src_stride)) // %4
  5046. : "memory", "cc", "eax", NACL_R14
  5047. "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  5048. );
  5049. }
  5050. #endif // HAS_INTERPOLATEROW_AVX2
  5051. #ifdef HAS_ARGBSHUFFLEROW_SSSE3
  5052. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5053. void ARGBShuffleRow_SSSE3(const uint8* src_argb,
  5054. uint8* dst_argb,
  5055. const uint8* shuffler,
  5056. int width) {
  5057. asm volatile (
  5058. "movdqu " MEMACCESS(3) ",%%xmm5 \n"
  5059. LABELALIGN
  5060. "1: \n"
  5061. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5062. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  5063. "lea " MEMLEA(0x20,0) ",%0 \n"
  5064. "pshufb %%xmm5,%%xmm0 \n"
  5065. "pshufb %%xmm5,%%xmm1 \n"
  5066. "movdqu %%xmm0," MEMACCESS(1) " \n"
  5067. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  5068. "lea " MEMLEA(0x20,1) ",%1 \n"
  5069. "sub $0x8,%2 \n"
  5070. "jg 1b \n"
  5071. : "+r"(src_argb), // %0
  5072. "+r"(dst_argb), // %1
  5073. "+r"(width) // %2
  5074. : "r"(shuffler) // %3
  5075. : "memory", "cc"
  5076. , "xmm0", "xmm1", "xmm5"
  5077. );
  5078. }
  5079. #endif // HAS_ARGBSHUFFLEROW_SSSE3
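// Illustrative scalar sketch of the shuffle kernel above (hypothetical
// helper): shuffler[0..3] give, for each output byte of a pixel, the index of
// the source byte within that same 4-byte pixel (the masks used for
// BGRA/ABGR/RGBA/ARGB only permute within a pixel).
static inline void ARGBShuffleRow_ScalarSketch(const uint8* src_argb,
                                               uint8* dst_argb,
                                               const uint8* shuffler,
                                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}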
  5080. #ifdef HAS_ARGBSHUFFLEROW_AVX2
  5081. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5082. void ARGBShuffleRow_AVX2(const uint8* src_argb,
  5083. uint8* dst_argb,
  5084. const uint8* shuffler,
  5085. int width) {
  5086. asm volatile (
  5087. "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
  5088. LABELALIGN
  5089. "1: \n"
  5090. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  5091. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  5092. "lea " MEMLEA(0x40,0) ",%0 \n"
  5093. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  5094. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  5095. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  5096. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  5097. "lea " MEMLEA(0x40,1) ",%1 \n"
  5098. "sub $0x10,%2 \n"
  5099. "jg 1b \n"
  5100. "vzeroupper \n"
  5101. : "+r"(src_argb), // %0
  5102. "+r"(dst_argb), // %1
  5103. "+r"(width) // %2
  5104. : "r"(shuffler) // %3
  5105. : "memory", "cc"
  5106. , "xmm0", "xmm1", "xmm5"
  5107. );
  5108. }
  5109. #endif // HAS_ARGBSHUFFLEROW_AVX2
  5110. #ifdef HAS_ARGBSHUFFLEROW_SSE2
  5111. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5112. void ARGBShuffleRow_SSE2(const uint8* src_argb,
  5113. uint8* dst_argb,
  5114. const uint8* shuffler,
  5115. int width) {
  5116. uintptr_t pixel_temp;
  5117. asm volatile (
  5118. "pxor %%xmm5,%%xmm5 \n"
  5119. "mov " MEMACCESS(4) ",%k2 \n"
  5120. "cmp $0x3000102,%k2 \n"
  5121. "je 3012f \n"
  5122. "cmp $0x10203,%k2 \n"
  5123. "je 123f \n"
  5124. "cmp $0x30201,%k2 \n"
  5125. "je 321f \n"
  5126. "cmp $0x2010003,%k2 \n"
  5127. "je 2103f \n"
  5128. LABELALIGN
  5129. "1: \n"
  5130. "movzb " MEMACCESS(4) ",%2 \n"
  5131. MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
  5132. "mov %b2," MEMACCESS(1) " \n"
  5133. "movzb " MEMACCESS2(0x1,4) ",%2 \n"
  5134. MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
  5135. "mov %b2," MEMACCESS2(0x1,1) " \n"
  5136. "movzb " MEMACCESS2(0x2,4) ",%2 \n"
  5137. MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
  5138. "mov %b2," MEMACCESS2(0x2,1) " \n"
  5139. "movzb " MEMACCESS2(0x3,4) ",%2 \n"
  5140. MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
  5141. "mov %b2," MEMACCESS2(0x3,1) " \n"
  5142. "lea " MEMLEA(0x4,0) ",%0 \n"
  5143. "lea " MEMLEA(0x4,1) ",%1 \n"
  5144. "sub $0x1,%3 \n"
  5145. "jg 1b \n"
  5146. "jmp 99f \n"
  5147. LABELALIGN
  5148. "123: \n"
  5149. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5150. "lea " MEMLEA(0x10,0) ",%0 \n"
  5151. "movdqa %%xmm0,%%xmm1 \n"
  5152. "punpcklbw %%xmm5,%%xmm0 \n"
  5153. "punpckhbw %%xmm5,%%xmm1 \n"
  5154. "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
  5155. "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
  5156. "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
  5157. "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
  5158. "packuswb %%xmm1,%%xmm0 \n"
  5159. "movdqu %%xmm0," MEMACCESS(1) " \n"
  5160. "lea " MEMLEA(0x10,1) ",%1 \n"
  5161. "sub $0x4,%3 \n"
  5162. "jg 123b \n"
  5163. "jmp 99f \n"
  5164. LABELALIGN
  5165. "321: \n"
  5166. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5167. "lea " MEMLEA(0x10,0) ",%0 \n"
  5168. "movdqa %%xmm0,%%xmm1 \n"
  5169. "punpcklbw %%xmm5,%%xmm0 \n"
  5170. "punpckhbw %%xmm5,%%xmm1 \n"
  5171. "pshufhw $0x39,%%xmm0,%%xmm0 \n"
  5172. "pshuflw $0x39,%%xmm0,%%xmm0 \n"
  5173. "pshufhw $0x39,%%xmm1,%%xmm1 \n"
  5174. "pshuflw $0x39,%%xmm1,%%xmm1 \n"
  5175. "packuswb %%xmm1,%%xmm0 \n"
  5176. "movdqu %%xmm0," MEMACCESS(1) " \n"
  5177. "lea " MEMLEA(0x10,1) ",%1 \n"
  5178. "sub $0x4,%3 \n"
  5179. "jg 321b \n"
  5180. "jmp 99f \n"
  5181. LABELALIGN
  5182. "2103: \n"
  5183. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5184. "lea " MEMLEA(0x10,0) ",%0 \n"
  5185. "movdqa %%xmm0,%%xmm1 \n"
  5186. "punpcklbw %%xmm5,%%xmm0 \n"
  5187. "punpckhbw %%xmm5,%%xmm1 \n"
  5188. "pshufhw $0x93,%%xmm0,%%xmm0 \n"
  5189. "pshuflw $0x93,%%xmm0,%%xmm0 \n"
  5190. "pshufhw $0x93,%%xmm1,%%xmm1 \n"
  5191. "pshuflw $0x93,%%xmm1,%%xmm1 \n"
  5192. "packuswb %%xmm1,%%xmm0 \n"
  5193. "movdqu %%xmm0," MEMACCESS(1) " \n"
  5194. "lea " MEMLEA(0x10,1) ",%1 \n"
  5195. "sub $0x4,%3 \n"
  5196. "jg 2103b \n"
  5197. "jmp 99f \n"
  5198. LABELALIGN
  5199. "3012: \n"
  5200. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5201. "lea " MEMLEA(0x10,0) ",%0 \n"
  5202. "movdqa %%xmm0,%%xmm1 \n"
  5203. "punpcklbw %%xmm5,%%xmm0 \n"
  5204. "punpckhbw %%xmm5,%%xmm1 \n"
  5205. "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
  5206. "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
  5207. "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
  5208. "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
  5209. "packuswb %%xmm1,%%xmm0 \n"
  5210. "movdqu %%xmm0," MEMACCESS(1) " \n"
  5211. "lea " MEMLEA(0x10,1) ",%1 \n"
  5212. "sub $0x4,%3 \n"
  5213. "jg 3012b \n"
  5214. "99: \n"
  5215. : "+r"(src_argb), // %0
  5216. "+r"(dst_argb), // %1
  5217. "=&d"(pixel_temp), // %2
  5218. "+r"(width) // %3
  5219. : "r"(shuffler) // %4
  5220. : "memory", "cc", NACL_R14
  5221. "xmm0", "xmm1", "xmm5"
  5222. );
  5223. }
  5224. #endif // HAS_ARGBSHUFFLEROW_SSE2
  5225. #ifdef HAS_I422TOYUY2ROW_SSE2
  5226. void I422ToYUY2Row_SSE2(const uint8* src_y,
  5227. const uint8* src_u,
  5228. const uint8* src_v,
  5229. uint8* dst_frame,
  5230. int width) {
  5231. asm volatile (
  5232. "sub %1,%2 \n"
  5233. LABELALIGN
  5234. "1: \n"
  5235. "movq " MEMACCESS(1) ",%%xmm2 \n"
  5236. MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
  5237. "lea " MEMLEA(0x8,1) ",%1 \n"
  5238. "punpcklbw %%xmm3,%%xmm2 \n"
  5239. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5240. "lea " MEMLEA(0x10,0) ",%0 \n"
  5241. "movdqa %%xmm0,%%xmm1 \n"
  5242. "punpcklbw %%xmm2,%%xmm0 \n"
  5243. "punpckhbw %%xmm2,%%xmm1 \n"
  5244. "movdqu %%xmm0," MEMACCESS(3) " \n"
  5245. "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
  5246. "lea " MEMLEA(0x20,3) ",%3 \n"
  5247. "sub $0x10,%4 \n"
  5248. "jg 1b \n"
  5249. : "+r"(src_y), // %0
  5250. "+r"(src_u), // %1
  5251. "+r"(src_v), // %2
  5252. "+r"(dst_frame), // %3
  5253. "+rm"(width) // %4
  5254. :
  5255. : "memory", "cc", NACL_R14
  5256. "xmm0", "xmm1", "xmm2", "xmm3"
  5257. );
  5258. }
  5259. #endif // HAS_I422TOYUY2ROW_SSE2
  5260. #ifdef HAS_I422TOUYVYROW_SSE2
  5261. void I422ToUYVYRow_SSE2(const uint8* src_y,
  5262. const uint8* src_u,
  5263. const uint8* src_v,
  5264. uint8* dst_frame,
  5265. int width) {
  5266. asm volatile (
  5267. "sub %1,%2 \n"
  5268. LABELALIGN
  5269. "1: \n"
  5270. "movq " MEMACCESS(1) ",%%xmm2 \n"
  5271. MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
  5272. "lea " MEMLEA(0x8,1) ",%1 \n"
  5273. "punpcklbw %%xmm3,%%xmm2 \n"
  5274. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  5275. "movdqa %%xmm2,%%xmm1 \n"
  5276. "lea " MEMLEA(0x10,0) ",%0 \n"
  5277. "punpcklbw %%xmm0,%%xmm1 \n"
  5278. "punpckhbw %%xmm0,%%xmm2 \n"
  5279. "movdqu %%xmm1," MEMACCESS(3) " \n"
  5280. "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
  5281. "lea " MEMLEA(0x20,3) ",%3 \n"
  5282. "sub $0x10,%4 \n"
  5283. "jg 1b \n"
  5284. : "+r"(src_y), // %0
  5285. "+r"(src_u), // %1
  5286. "+r"(src_v), // %2
  5287. "+r"(dst_frame), // %3
  5288. "+rm"(width) // %4
  5289. :
  5290. : "memory", "cc", NACL_R14
  5291. "xmm0", "xmm1", "xmm2", "xmm3"
  5292. );
  5293. }
  5294. #endif // HAS_I422TOUYVYROW_SSE2
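// Illustrative scalar sketch of the two packers above (hypothetical helper):
// every pair of Y samples shares one U and one V. YUY2 byte order is
// Y0 U Y1 V; UYVY (the function above) swaps to U Y0 V Y1. Assumes an even
// width, as the SIMD versions do.
static inline void I422ToYUY2Row_ScalarSketch(const uint8* src_y,
                                              const uint8* src_u,
                                              const uint8* src_v,
                                              uint8* dst_frame,
                                              int width) {
  for (int i = 0; i < width; i += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}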
  5295. #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
  5296. void ARGBPolynomialRow_SSE2(const uint8* src_argb,
  5297. uint8* dst_argb,
  5298. const float* poly,
  5299. int width) {
  5300. asm volatile (
  5301. "pxor %%xmm3,%%xmm3 \n"
  5302. // 2 pixel loop.
  5303. LABELALIGN
  5304. "1: \n"
  5305. "movq " MEMACCESS(0) ",%%xmm0 \n"
  5306. "lea " MEMLEA(0x8,0) ",%0 \n"
  5307. "punpcklbw %%xmm3,%%xmm0 \n"
  5308. "movdqa %%xmm0,%%xmm4 \n"
  5309. "punpcklwd %%xmm3,%%xmm0 \n"
  5310. "punpckhwd %%xmm3,%%xmm4 \n"
  5311. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5312. "cvtdq2ps %%xmm4,%%xmm4 \n"
  5313. "movdqa %%xmm0,%%xmm1 \n"
  5314. "movdqa %%xmm4,%%xmm5 \n"
  5315. "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
  5316. "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
  5317. "addps " MEMACCESS(3) ",%%xmm0 \n"
  5318. "addps " MEMACCESS(3) ",%%xmm4 \n"
  5319. "movdqa %%xmm1,%%xmm2 \n"
  5320. "movdqa %%xmm5,%%xmm6 \n"
  5321. "mulps %%xmm1,%%xmm2 \n"
  5322. "mulps %%xmm5,%%xmm6 \n"
  5323. "mulps %%xmm2,%%xmm1 \n"
  5324. "mulps %%xmm6,%%xmm5 \n"
  5325. "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
  5326. "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
  5327. "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
  5328. "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
  5329. "addps %%xmm2,%%xmm0 \n"
  5330. "addps %%xmm6,%%xmm4 \n"
  5331. "addps %%xmm1,%%xmm0 \n"
  5332. "addps %%xmm5,%%xmm4 \n"
  5333. "cvttps2dq %%xmm0,%%xmm0 \n"
  5334. "cvttps2dq %%xmm4,%%xmm4 \n"
  5335. "packuswb %%xmm4,%%xmm0 \n"
  5336. "packuswb %%xmm0,%%xmm0 \n"
  5337. "movq %%xmm0," MEMACCESS(1) " \n"
  5338. "lea " MEMLEA(0x8,1) ",%1 \n"
  5339. "sub $0x2,%2 \n"
  5340. "jg 1b \n"
  5341. : "+r"(src_argb), // %0
  5342. "+r"(dst_argb), // %1
  5343. "+r"(width) // %2
  5344. : "r"(poly) // %3
  5345. : "memory", "cc"
  5346. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  5347. );
  5348. }
  5349. #endif // HAS_ARGBPOLYNOMIALROW_SSE2
  5350. #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
  5351. void ARGBPolynomialRow_AVX2(const uint8* src_argb,
  5352. uint8* dst_argb,
  5353. const float* poly,
  5354. int width) {
  5355. asm volatile (
  5356. "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
  5357. "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
  5358. "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
  5359. "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
  5360. // 2 pixel loop.
  5361. LABELALIGN
  5362. "1: \n"
  5363. "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
  5364. "lea " MEMLEA(0x8,0) ",%0 \n"
  5365. "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
  5366. "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
  5367. "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
  5368. "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
  5369. "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
  5370. "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
  5371. "vcvttps2dq %%ymm0,%%ymm0 \n"
  5372. "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
  5373. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  5374. "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
  5375. "vmovq %%xmm0," MEMACCESS(1) " \n"
  5376. "lea " MEMLEA(0x8,1) ",%1 \n"
  5377. "sub $0x2,%2 \n"
  5378. "jg 1b \n"
  5379. "vzeroupper \n"
  5380. : "+r"(src_argb), // %0
  5381. "+r"(dst_argb), // %1
  5382. "+r"(width) // %2
  5383. : "r"(poly) // %3
  5384. : "memory", "cc",
  5385. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  5386. );
  5387. }
  5388. #endif // HAS_ARGBPOLYNOMIALROW_AVX2
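// Illustrative scalar sketch of the polynomial kernels above (hypothetical
// helper). As consumed by the SIMD code, 'poly' holds 16 floats: C0 for
// B, G, R, A, then C1, C2 and C3 in the same per-channel order; each channel
// is evaluated as C0 + C1*x + C2*x^2 + C3*x^3, truncated (cvttps2dq) and
// clamped to 0..255.
static inline void ARGBPolynomialRow_ScalarSketch(const uint8* src_argb,
                                                  uint8* dst_argb,
                                                  const float* poly,
                                                  int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      const float x = (float)src_argb[ch];
      const float r = poly[ch] + poly[ch + 4] * x + poly[ch + 8] * x * x +
                      poly[ch + 12] * x * x * x;
      const int v = (int)r;
      dst_argb[ch] = v < 0 ? 0 : (v > 255 ? 255 : (uint8)v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}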
  5389. #ifdef HAS_HALFFLOATROW_SSE2
5390. static float kScaleBias = 1.9259299444e-34f;  // 2^-112
  5391. void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  5392. scale *= kScaleBias;
  5393. asm volatile (
  5394. "pshufd $0x0,%3,%%xmm4 \n"
  5395. "pxor %%xmm5,%%xmm5 \n"
  5396. "sub %0,%1 \n"
5397. // 8 pixel loop.
  5398. LABELALIGN
  5399. "1: \n"
  5400. "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts
  5401. "add $0x10,%0 \n"
  5402. "movdqa %%xmm2,%%xmm3 \n"
  5403. "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
  5404. "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
  5405. "punpckhwd %%xmm5,%%xmm3 \n"
  5406. "cvtdq2ps %%xmm3,%%xmm3 \n"
  5407. "mulps %%xmm4,%%xmm2 \n"
  5408. "mulps %%xmm4,%%xmm3 \n"
  5409. "psrld $0xd,%%xmm2 \n"
  5410. "psrld $0xd,%%xmm3 \n"
  5411. "packssdw %%xmm3,%%xmm2 \n"
  5412. MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
  5413. "sub $0x8,%2 \n"
  5414. "jg 1b \n"
  5415. : "+r"(src), // %0
  5416. "+r"(dst), // %1
  5417. "+r"(width) // %2
  5418. #if defined(__x86_64__)
  5419. : "x"(scale) // %3
  5420. #else
  5421. : "m"(scale) // %3
  5422. #endif
  5423. : "memory", "cc",
  5424. "xmm2", "xmm3", "xmm4", "xmm5"
  5425. );
  5426. }
  5427. #endif // HAS_HALFFLOATROW_SSE2
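// Why the kScaleBias multiply works (illustrative sketch, hypothetical
// helper): scaling by 2^-112 lowers the single-precision exponent by 112, so
// a value with exponent e is stored with biased exponent e + 127 - 112 =
// e + 15, which is the half-float biased exponent. Shifting the float's bit
// pattern right by 13 drops the extra mantissa bits and leaves the IEEE
// half-float encoding in the low 16 bits, for values in the half normal
// range, with truncation and no NaN/overflow handling. The union type pun
// relies on GCC's defined behavior, which fits this gcc-only file.
static inline uint16 HalfFloatSketch(uint16 value, float scale) {
  union { float f; uint32 u; } bits;
  bits.f = value * scale * 1.9259299444e-34f;  // Scale, then bias by 2^-112.
  return (uint16)(bits.u >> 13);
}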
  5428. #ifdef HAS_HALFFLOATROW_AVX2
  5429. void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  5430. scale *= kScaleBias;
  5431. asm volatile (
  5432. "vbroadcastss %3, %%ymm4 \n"
  5433. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
  5434. "sub %0,%1 \n"
  5435. // 16 pixel loop.
  5436. LABELALIGN
  5437. "1: \n"
  5438. "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
  5439. "add $0x20,%0 \n"
  5440. "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
  5441. "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
  5442. "vcvtdq2ps %%ymm3,%%ymm3 \n"
  5443. "vcvtdq2ps %%ymm2,%%ymm2 \n"
  5444. "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
  5445. "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
  5446. "vpsrld $0xd,%%ymm3,%%ymm3 \n"
  5447. "vpsrld $0xd,%%ymm2,%%ymm2 \n"
  5448. "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
  5449. MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
  5450. "sub $0x10,%2 \n"
  5451. "jg 1b \n"
  5452. "vzeroupper \n"
  5453. : "+r"(src), // %0
  5454. "+r"(dst), // %1
  5455. "+r"(width) // %2
  5456. #if defined(__x86_64__)
  5457. : "x"(scale) // %3
  5458. #else
  5459. : "m"(scale) // %3
  5460. #endif
  5461. : "memory", "cc",
  5462. "xmm2", "xmm3", "xmm4", "xmm5"
  5463. );
  5464. }
  5465. #endif // HAS_HALFFLOATROW_AVX2
  5466. #ifdef HAS_HALFFLOATROW_F16C
  5467. void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
  5468. asm volatile (
  5469. "vbroadcastss %3, %%ymm4 \n"
  5470. "sub %0,%1 \n"
  5471. // 16 pixel loop.
  5472. LABELALIGN
  5473. "1: \n"
  5474. "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
  5475. "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
  5476. "vcvtdq2ps %%ymm2,%%ymm2 \n"
  5477. "vcvtdq2ps %%ymm3,%%ymm3 \n"
  5478. "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
  5479. "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
  5480. "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
  5481. "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
  5482. MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
  5483. MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
  5484. "add $0x20,%0 \n"
  5485. "sub $0x10,%2 \n"
  5486. "jg 1b \n"
  5487. "vzeroupper \n"
  5488. : "+r"(src), // %0
  5489. "+r"(dst), // %1
  5490. "+r"(width) // %2
  5491. #if defined(__x86_64__)
  5492. : "x"(scale) // %3
  5493. #else
  5494. : "m"(scale) // %3
  5495. #endif
  5496. : "memory", "cc",
  5497. "xmm2", "xmm3", "xmm4"
  5498. );
  5499. }
  5500. #endif // HAS_HALFFLOATROW_F16C
  5501. #ifdef HAS_HALFFLOATROW_F16C
  5502. void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
  5503. asm volatile (
  5504. "sub %0,%1 \n"
  5505. // 16 pixel loop.
  5506. LABELALIGN
  5507. "1: \n"
  5508. "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
  5509. "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
  5510. "vcvtdq2ps %%ymm2,%%ymm2 \n"
  5511. "vcvtdq2ps %%ymm3,%%ymm3 \n"
  5512. "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
  5513. "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
  5514. MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
  5515. MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
  5516. "add $0x20,%0 \n"
  5517. "sub $0x10,%2 \n"
  5518. "jg 1b \n"
  5519. "vzeroupper \n"
  5520. : "+r"(src), // %0
  5521. "+r"(dst), // %1
  5522. "+r"(width) // %2
  5523. :
  5524. : "memory", "cc",
  5525. "xmm2", "xmm3"
  5526. );
  5527. }
  5528. #endif // HAS_HALFFLOATROW_F16C
  5529. #ifdef HAS_ARGBCOLORTABLEROW_X86
5530. // Transform ARGB pixels with color table.
  5531. void ARGBColorTableRow_X86(uint8* dst_argb,
  5532. const uint8* table_argb,
  5533. int width) {
  5534. uintptr_t pixel_temp;
  5535. asm volatile (
  5536. // 1 pixel loop.
  5537. LABELALIGN
  5538. "1: \n"
  5539. "movzb " MEMACCESS(0) ",%1 \n"
  5540. "lea " MEMLEA(0x4,0) ",%0 \n"
  5541. MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
  5542. "mov %b1," MEMACCESS2(-0x4,0) " \n"
  5543. "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
  5544. MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
  5545. "mov %b1," MEMACCESS2(-0x3,0) " \n"
  5546. "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
  5547. MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
  5548. "mov %b1," MEMACCESS2(-0x2,0) " \n"
  5549. "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
  5550. MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
  5551. "mov %b1," MEMACCESS2(-0x1,0) " \n"
  5552. "dec %2 \n"
  5553. "jg 1b \n"
  5554. : "+r"(dst_argb), // %0
  5555. "=&d"(pixel_temp), // %1
  5556. "+r"(width) // %2
  5557. : "r"(table_argb) // %3
  5558. : "memory", "cc");
  5559. }
  5560. #endif // HAS_ARGBCOLORTABLEROW_X86
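// Illustrative scalar sketch of the color-table kernel above (hypothetical
// helper): each channel indexes its own column of the interleaved table, i.e.
// channel c becomes table_argb[old_value * 4 + c]. The RGB variant below does
// the same for B, G and R only, leaving alpha untouched.
static inline void ARGBColorTableRow_ScalarSketch(uint8* dst_argb,
                                                  const uint8* table_argb,
                                                  int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}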
  5561. #ifdef HAS_RGBCOLORTABLEROW_X86
5562. // Transform RGB pixels with color table.
  5563. void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  5564. uintptr_t pixel_temp;
  5565. asm volatile (
  5566. // 1 pixel loop.
  5567. LABELALIGN
  5568. "1: \n"
  5569. "movzb " MEMACCESS(0) ",%1 \n"
  5570. "lea " MEMLEA(0x4,0) ",%0 \n"
  5571. MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
  5572. "mov %b1," MEMACCESS2(-0x4,0) " \n"
  5573. "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
  5574. MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
  5575. "mov %b1," MEMACCESS2(-0x3,0) " \n"
  5576. "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
  5577. MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
  5578. "mov %b1," MEMACCESS2(-0x2,0) " \n"
  5579. "dec %2 \n"
  5580. "jg 1b \n"
  5581. : "+r"(dst_argb), // %0
  5582. "=&d"(pixel_temp), // %1
  5583. "+r"(width) // %2
  5584. : "r"(table_argb) // %3
  5585. : "memory", "cc");
  5586. }
  5587. #endif // HAS_RGBCOLORTABLEROW_X86
  5588. #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5589. // Transform RGB pixels with luma table.
  5590. void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
  5591. uint8* dst_argb,
  5592. int width,
  5593. const uint8* luma,
  5594. uint32 lumacoeff) {
  5595. uintptr_t pixel_temp;
  5596. uintptr_t table_temp;
  5597. asm volatile (
  5598. "movd %6,%%xmm3 \n"
  5599. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  5600. "pcmpeqb %%xmm4,%%xmm4 \n"
  5601. "psllw $0x8,%%xmm4 \n"
  5602. "pxor %%xmm5,%%xmm5 \n"
  5603. // 4 pixel loop.
  5604. LABELALIGN
  5605. "1: \n"
  5606. "movdqu " MEMACCESS(2) ",%%xmm0 \n"
  5607. "pmaddubsw %%xmm3,%%xmm0 \n"
  5608. "phaddw %%xmm0,%%xmm0 \n"
  5609. "pand %%xmm4,%%xmm0 \n"
  5610. "punpcklwd %%xmm5,%%xmm0 \n"
  5611. "movd %%xmm0,%k1 \n" // 32 bit offset
  5612. "add %5,%1 \n"
  5613. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5614. "movzb " MEMACCESS(2) ",%0 \n"
  5615. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5616. "mov %b0," MEMACCESS(3) " \n"
  5617. "movzb " MEMACCESS2(0x1,2) ",%0 \n"
  5618. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5619. "mov %b0," MEMACCESS2(0x1,3) " \n"
  5620. "movzb " MEMACCESS2(0x2,2) ",%0 \n"
  5621. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5622. "mov %b0," MEMACCESS2(0x2,3) " \n"
  5623. "movzb " MEMACCESS2(0x3,2) ",%0 \n"
  5624. "mov %b0," MEMACCESS2(0x3,3) " \n"
  5625. "movd %%xmm0,%k1 \n" // 32 bit offset
  5626. "add %5,%1 \n"
  5627. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5628. "movzb " MEMACCESS2(0x4,2) ",%0 \n"
  5629. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5630. "mov %b0," MEMACCESS2(0x4,3) " \n"
  5631. "movzb " MEMACCESS2(0x5,2) ",%0 \n"
  5632. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5633. "mov %b0," MEMACCESS2(0x5,3) " \n"
  5634. "movzb " MEMACCESS2(0x6,2) ",%0 \n"
  5635. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5636. "mov %b0," MEMACCESS2(0x6,3) " \n"
  5637. "movzb " MEMACCESS2(0x7,2) ",%0 \n"
  5638. "mov %b0," MEMACCESS2(0x7,3) " \n"
  5639. "movd %%xmm0,%k1 \n" // 32 bit offset
  5640. "add %5,%1 \n"
  5641. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5642. "movzb " MEMACCESS2(0x8,2) ",%0 \n"
  5643. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5644. "mov %b0," MEMACCESS2(0x8,3) " \n"
  5645. "movzb " MEMACCESS2(0x9,2) ",%0 \n"
  5646. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5647. "mov %b0," MEMACCESS2(0x9,3) " \n"
  5648. "movzb " MEMACCESS2(0xa,2) ",%0 \n"
  5649. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5650. "mov %b0," MEMACCESS2(0xa,3) " \n"
  5651. "movzb " MEMACCESS2(0xb,2) ",%0 \n"
  5652. "mov %b0," MEMACCESS2(0xb,3) " \n"
  5653. "movd %%xmm0,%k1 \n" // 32 bit offset
  5654. "add %5,%1 \n"
  5655. "movzb " MEMACCESS2(0xc,2) ",%0 \n"
  5656. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5657. "mov %b0," MEMACCESS2(0xc,3) " \n"
  5658. "movzb " MEMACCESS2(0xd,2) ",%0 \n"
  5659. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5660. "mov %b0," MEMACCESS2(0xd,3) " \n"
  5661. "movzb " MEMACCESS2(0xe,2) ",%0 \n"
  5662. MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
  5663. "mov %b0," MEMACCESS2(0xe,3) " \n"
  5664. "movzb " MEMACCESS2(0xf,2) ",%0 \n"
  5665. "mov %b0," MEMACCESS2(0xf,3) " \n"
  5666. "lea " MEMLEA(0x10,2) ",%2 \n"
  5667. "lea " MEMLEA(0x10,3) ",%3 \n"
  5668. "sub $0x4,%4 \n"
  5669. "jg 1b \n"
  5670. : "=&d"(pixel_temp), // %0
  5671. "=&a"(table_temp), // %1
  5672. "+r"(src_argb), // %2
  5673. "+r"(dst_argb), // %3
  5674. "+rm"(width) // %4
  5675. : "r"(luma), // %5
  5676. "rm"(lumacoeff) // %6
  5677. : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  5678. );
  5679. }
  5680. #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
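// Illustrative scalar sketch of the luma color-table kernel above
// (hypothetical helper): a weighted luma is computed from the four source
// bytes using the four signed weights packed in lumacoeff (pmaddubsw), its
// high byte selects one 256-byte table starting at luma + (l & 0xff00), and
// B, G and R are mapped through that table while alpha is copied unchanged.
// The SIMD path saturates the weighted sum at 32767, which this sketch omits.
static inline void ARGBLumaColorTableRow_ScalarSketch(const uint8* src_argb,
                                                      uint8* dst_argb,
                                                      int width,
                                                      const uint8* luma,
                                                      uint32 lumacoeff) {
  const int cb = (int8)(lumacoeff & 0xff);
  const int cg = (int8)((lumacoeff >> 8) & 0xff);
  const int cr = (int8)((lumacoeff >> 16) & 0xff);
  const int ca = (int8)((lumacoeff >> 24) & 0xff);
  for (int i = 0; i < width; ++i) {
    const int l = src_argb[0] * cb + src_argb[1] * cg + src_argb[2] * cr +
                  src_argb[3] * ca;
    const uint8* table = luma + (l & 0xff00);
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha copied unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}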
  5681. #endif // defined(__x86_64__) || defined(__i386__)
  5682. #ifdef __cplusplus
  5683. } // extern "C"
  5684. } // namespace libyuv
  5685. #endif