row_win.cc

/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  u_buf += 4; \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  u_buf += 4; \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  y_buf += 8; \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \
  xmm1 = _mm_loadu_si128(&xmm0); \
  xmm2 = _mm_loadu_si128(&xmm0); \
  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
  xmm0 = _mm_adds_epi16(xmm0, xmm4); \
  xmm1 = _mm_adds_epi16(xmm1, xmm4); \
  xmm2 = _mm_adds_epi16(xmm2, xmm4); \
  xmm0 = _mm_srai_epi16(xmm0, 6); \
  xmm1 = _mm_srai_epi16(xmm1, 6); \
  xmm2 = _mm_srai_epi16(xmm2, 6); \
  xmm0 = _mm_packus_epi16(xmm0, xmm0); \
  xmm1 = _mm_packus_epi16(xmm1, xmm1); \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);

// Store 8 ARGB values.
#define STOREARGB \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
  xmm1 = _mm_loadu_si128(&xmm0); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;

#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif
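
// Illustrative scalar sketch (not part of upstream libyuv) of what
// READYUV422 + YUVTORGB + STOREARGB compute for a single pixel, using plain
// BT.601 integer coefficients. The SIMD path above uses pre-scaled constants
// from YuvConstants (kUVToB/kUVToG/kUVToR, kUVBias*, kYToRgb) and a final
// arithmetic shift by 6, so exact rounding may differ. The guard macro and
// helper names below are hypothetical and off by default.
#ifdef LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH
static uint8 ClampToByte_Sketch(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void I422PixelToARGB_Sketch(uint8 y, uint8 u, uint8 v, uint8* dst_argb) {
  int c = (int)y - 16;
  int d = (int)u - 128;
  int e = (int)v - 128;
  dst_argb[0] = ClampToByte_Sketch((298 * c + 516 * d + 128) >> 8);            // B
  dst_argb[1] = ClampToByte_Sketch((298 * c - 100 * d - 208 * e + 128) >> 8);  // G
  dst_argb[2] = ClampToByte_Sketch((298 * c + 409 * e + 128) >> 8);            // R
  dst_argb[3] = 255;                                                           // A
}
#endif  // LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH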

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Constants for BGRA.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};
static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};
static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24

  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
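
// Illustrative scalar sketch (not part of upstream libyuv) of what
// J400ToARGBRow does per pixel: replicate the gray value into B, G and R and
// set A opaque (the 0xff000000 mask in the SIMD code above). Guard macro and
// helper name are hypothetical.
#ifdef LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH
static void J400PixelToARGB_Sketch(uint8 gray, uint8* dst_argb) {
  dst_argb[0] = gray;  // B
  dst_argb[1] = gray;  // G
  dst_argb[2] = gray;  // R
  dst_argb[3] = 255;   // A
}
#endif  // LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH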

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8
    vpunpcklbw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0
    vpunpcklwd ymm0, ymm0, ymm0
    vpor ymm0, ymm0, ymm5
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov eax, [esp + 4] // src_rgb24
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
                                           uint8* dst_rgb24,
                                           int width) {
  __asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_rgb24
    mov ecx, [esp + 12] // width
    movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 4]
    movdqu xmm2, [eax + 8]
    lea eax, [eax + 24]
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8
    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pmulhuw xmm1, xmm5 // * (256 + 8)
    pmulhuw xmm2, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    pand xmm0, xmm4 // G in middle 6 bits
    pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
    por xmm0, xmm7 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
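
// Illustrative scalar sketch (not part of upstream libyuv) of the bit
// replication described in the comment above RGB565ToARGBRow_SSE2: a 5-bit
// field widens to 8 bits as (v << 3) | (v >> 2), which matches the pmulhuw by
// 0x0108 (256 + 8) applied to the field shifted to the top of a 16-bit lane;
// a 6-bit field widens as (v << 2) | (v >> 4). Guard macro and helper names
// are hypothetical.
#ifdef LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH
static uint8 Replicate5To8_Sketch(int v5) {  // v5 in [0, 31]
  return (uint8)((v5 << 3) | (v5 >> 2));
}
static uint8 Replicate6To8_Sketch(int v6) {  // v6 in [0, 63]
  return (uint8)((v6 << 2) | (v6 >> 4));
}
#endif  // LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH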

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8
    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3 // R in upper 5 bits
    vpsllw ymm2, ymm0, 11 // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2 // RB
    vpand ymm0, ymm0, ymm4 // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7 // AG
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8
    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1 // R in upper 5 bits
    vpsllw ymm2, ymm0, 11 // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2 // RB
    vpsraw ymm2, ymm0, 8 // A
    vpand ymm0, ymm0, ymm4 // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2 // AG
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5 // mask high nibbles
    vpand ymm0, ymm0, ymm4 // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3
    vpor ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8
    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5 // * (256 + 8)
    pmulhuw xmm1, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // G in middle 5 bits
    psraw xmm2, 8 // A
    pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // mask low nibbles
    pand xmm2, xmm5 // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
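
// Illustrative scalar sketch (not part of upstream libyuv) of the nibble
// replication used by the ARGB4444 conversions above: each 4-bit channel is
// widened to 8 bits by duplicating the nibble, (v << 4) | v, i.e. v * 17.
// Guard macro and helper name are hypothetical.
#ifdef LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH
static uint8 Replicate4To8_Sketch(int v4) {  // v4 in [0, 15]
  return (uint8)((v4 << 4) | v4);
}
#endif  // LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH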

__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
                                          uint8* dst_rgb,
                                          int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
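
// Illustrative scalar sketch (not part of upstream libyuv) of the packing done
// by ARGBToRGB565Row_SSE2 above: keep the top 5/6/5 bits of B, G and R and
// pack them into one little-endian RGB565 value. Guard macro and helper name
// are hypothetical.
#ifdef LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH
static uint16 ARGBPixelToRGB565_Sketch(const uint8* src_argb) {
  int b = src_argb[0] >> 3;  // 5 bits
  int g = src_argb[1] >> 2;  // 6 bits
  int r = src_argb[2] >> 3;  // 5 bits
  return (uint16)(b | (g << 5) | (r << 11));
}
#endif  // LIBYUV_ILLUSTRATIVE_SCALAR_SKETCH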

__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
                                                  uint8* dst_rgb,
                                                  const uint32 dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    movd xmm6, [esp + 12] // dither4
    mov ecx, [esp + 16] // width
    punpcklbw xmm6, xmm6 // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    paddusb xmm0, xmm6 // add dither
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
                                                  uint8* dst_rgb,
                                                  const uint32 dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    vbroadcastss xmm6, [esp + 12] // dither4
    mov ecx, [esp + 16] // width
    vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11 // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6 // add dither
    vpsrld ymm2, ymm0, 5 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrld ymm0, ymm0, 8 // R
    vpand ymm2, ymm2, ymm4 // G
    vpand ymm1, ymm1, ymm3 // B
    vpand ymm0, ymm0, ymm5 // R
    vpor ymm1, ymm1, ymm2 // BG
    vpor ymm0, ymm0, ymm1 // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4 // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4 // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    movdqa xmm3, xmm0 // R
    psrad xmm0, 16 // A
    psrld xmm1, 3 // B
    psrld xmm2, 6 // G
    psrld xmm3, 9 // R
    pand xmm0, xmm7 // A
    pand xmm1, xmm4 // B
    pand xmm2, xmm5 // G
    pand xmm3, xmm6 // R
    por xmm0, xmm1 // BA
    por xmm2, xmm3 // GR
    por xmm0, xmm2 // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4 // generate mask 0x00f000f0
    psrlw xmm3, 8

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3 // low nibble
    pand xmm1, xmm4 // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
  913. #ifdef HAS_ARGBTORGB565ROW_AVX2
  914. __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
  915. uint8* dst_rgb,
  916. int width) {
  917. __asm {
  918. mov eax, [esp + 4] // src_argb
  919. mov edx, [esp + 8] // dst_rgb
  920. mov ecx, [esp + 12] // width
  921. vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
  922. vpsrld ymm3, ymm3, 27
  923. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
  924. vpsrld ymm4, ymm4, 26
  925. vpslld ymm4, ymm4, 5
  926. vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
  927. convertloop:
  928. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  929. vpsrld ymm2, ymm0, 5 // G
  930. vpsrld ymm1, ymm0, 3 // B
  931. vpsrld ymm0, ymm0, 8 // R
  932. vpand ymm2, ymm2, ymm4 // G
  933. vpand ymm1, ymm1, ymm3 // B
  934. vpand ymm0, ymm0, ymm5 // R
  935. vpor ymm1, ymm1, ymm2 // BG
  936. vpor ymm0, ymm0, ymm1 // BGR
  937. vpackusdw ymm0, ymm0, ymm0
  938. vpermq ymm0, ymm0, 0xd8
  939. lea eax, [eax + 32]
  940. vmovdqu [edx], xmm0 // store 8 pixels of RGB565
  941. lea edx, [edx + 16]
  942. sub ecx, 8
  943. jg convertloop
  944. vzeroupper
  945. ret
  946. }
  947. }
  948. #endif // HAS_ARGBTORGB565ROW_AVX2
  949. #ifdef HAS_ARGBTOARGB1555ROW_AVX2
  950. __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
  951. uint8* dst_rgb,
  952. int width) {
  953. __asm {
  954. mov eax, [esp + 4] // src_argb
  955. mov edx, [esp + 8] // dst_rgb
  956. mov ecx, [esp + 12] // width
  957. vpcmpeqb ymm4, ymm4, ymm4
  958. vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
  959. vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
  960. vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
  961. vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
  962. vpslld ymm7, ymm7, 15
  963. convertloop:
  964. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  965. vpsrld ymm3, ymm0, 9 // R
  966. vpsrld ymm2, ymm0, 6 // G
  967. vpsrld ymm1, ymm0, 3 // B
  968. vpsrad ymm0, ymm0, 16 // A
  969. vpand ymm3, ymm3, ymm6 // R
  970. vpand ymm2, ymm2, ymm5 // G
  971. vpand ymm1, ymm1, ymm4 // B
  972. vpand ymm0, ymm0, ymm7 // A
  973. vpor ymm0, ymm0, ymm1 // BA
  974. vpor ymm2, ymm2, ymm3 // GR
  975. vpor ymm0, ymm0, ymm2 // BGRA
  976. vpackssdw ymm0, ymm0, ymm0
  977. vpermq ymm0, ymm0, 0xd8
  978. lea eax, [eax + 32]
  979. vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
  980. lea edx, [edx + 16]
  981. sub ecx, 8
  982. jg convertloop
  983. vzeroupper
  984. ret
  985. }
  986. }
  987. #endif // HAS_ARGBTOARGB1555ROW_AVX2
  988. #ifdef HAS_ARGBTOARGB4444ROW_AVX2
  989. __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
  990. uint8* dst_rgb,
  991. int width) {
  992. __asm {
  993. mov eax, [esp + 4] // src_argb
  994. mov edx, [esp + 8] // dst_rgb
  995. mov ecx, [esp + 12] // width
  996. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
  997. vpsllw ymm4, ymm4, 12
  998. vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
  999. convertloop:
  1000. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  1001. vpand ymm1, ymm0, ymm4 // high nibble
  1002. vpand ymm0, ymm0, ymm3 // low nibble
  1003. vpsrld ymm1, ymm1, 8
  1004. vpsrld ymm0, ymm0, 4
  1005. vpor ymm0, ymm0, ymm1
  1006. vpackuswb ymm0, ymm0, ymm0
  1007. vpermq ymm0, ymm0, 0xd8
  1008. lea eax, [eax + 32]
  1009. vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
  1010. lea edx, [edx + 16]
  1011. sub ecx, 8
  1012. jg convertloop
  1013. vzeroupper
  1014. ret
  1015. }
  1016. }
  1017. #endif // HAS_ARGBTOARGB4444ROW_AVX2
  1018. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  1019. __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
  1020. uint8* dst_y,
  1021. int width) {
  1022. __asm {
  1023. mov eax, [esp + 4] /* src_argb */
  1024. mov edx, [esp + 8] /* dst_y */
  1025. mov ecx, [esp + 12] /* width */
  1026. movdqa xmm4, xmmword ptr kARGBToY
  1027. movdqa xmm5, xmmword ptr kAddY16
  1028. convertloop:
  1029. movdqu xmm0, [eax]
  1030. movdqu xmm1, [eax + 16]
  1031. movdqu xmm2, [eax + 32]
  1032. movdqu xmm3, [eax + 48]
  1033. pmaddubsw xmm0, xmm4
  1034. pmaddubsw xmm1, xmm4
  1035. pmaddubsw xmm2, xmm4
  1036. pmaddubsw xmm3, xmm4
  1037. lea eax, [eax + 64]
  1038. phaddw xmm0, xmm1
  1039. phaddw xmm2, xmm3
  1040. psrlw xmm0, 7
  1041. psrlw xmm2, 7
  1042. packuswb xmm0, xmm2
  1043. paddb xmm0, xmm5
  1044. movdqu [edx], xmm0
  1045. lea edx, [edx + 16]
  1046. sub ecx, 16
  1047. jg convertloop
  1048. ret
  1049. }
  1050. }
  1051. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
  1052. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
  1053. __declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
  1054. uint8* dst_y,
  1055. int width) {
  1056. __asm {
  1057. mov eax, [esp + 4] /* src_argb */
  1058. mov edx, [esp + 8] /* dst_y */
  1059. mov ecx, [esp + 12] /* width */
  1060. movdqa xmm4, xmmword ptr kARGBToYJ
  1061. movdqa xmm5, xmmword ptr kAddYJ64
  1062. convertloop:
  1063. movdqu xmm0, [eax]
  1064. movdqu xmm1, [eax + 16]
  1065. movdqu xmm2, [eax + 32]
  1066. movdqu xmm3, [eax + 48]
  1067. pmaddubsw xmm0, xmm4
  1068. pmaddubsw xmm1, xmm4
  1069. pmaddubsw xmm2, xmm4
  1070. pmaddubsw xmm3, xmm4
  1071. lea eax, [eax + 64]
  1072. phaddw xmm0, xmm1
  1073. phaddw xmm2, xmm3
  1074. paddw xmm0, xmm5 // Add .5 for rounding.
  1075. paddw xmm2, xmm5
  1076. psrlw xmm0, 7
  1077. psrlw xmm2, 7
  1078. packuswb xmm0, xmm2
  1079. movdqu [edx], xmm0
  1080. lea edx, [edx + 16]
  1081. sub ecx, 16
  1082. jg convertloop
  1083. ret
  1084. }
  1085. }
  1086. #ifdef HAS_ARGBTOYROW_AVX2
  1087. // vpermd for vphaddw + vpackuswb vpermd.
  1088. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
  1089. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1090. __declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
  1091. uint8* dst_y,
  1092. int width) {
  1093. __asm {
  1094. mov eax, [esp + 4] /* src_argb */
  1095. mov edx, [esp + 8] /* dst_y */
  1096. mov ecx, [esp + 12] /* width */
  1097. vbroadcastf128 ymm4, xmmword ptr kARGBToY
  1098. vbroadcastf128 ymm5, xmmword ptr kAddY16
  1099. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1100. convertloop:
  1101. vmovdqu ymm0, [eax]
  1102. vmovdqu ymm1, [eax + 32]
  1103. vmovdqu ymm2, [eax + 64]
  1104. vmovdqu ymm3, [eax + 96]
  1105. vpmaddubsw ymm0, ymm0, ymm4
  1106. vpmaddubsw ymm1, ymm1, ymm4
  1107. vpmaddubsw ymm2, ymm2, ymm4
  1108. vpmaddubsw ymm3, ymm3, ymm4
  1109. lea eax, [eax + 128]
  1110. vphaddw ymm0, ymm0, ymm1 // mutates.
  1111. vphaddw ymm2, ymm2, ymm3
  1112. vpsrlw ymm0, ymm0, 7
  1113. vpsrlw ymm2, ymm2, 7
  1114. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1115. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1116. vpaddb ymm0, ymm0, ymm5 // add 16 for Y
  1117. vmovdqu [edx], ymm0
  1118. lea edx, [edx + 32]
  1119. sub ecx, 32
  1120. jg convertloop
  1121. vzeroupper
  1122. ret
  1123. }
  1124. }
  1125. #endif // HAS_ARGBTOYROW_AVX2
  1126. #ifdef HAS_ARGBTOYJROW_AVX2
  1127. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1128. __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
  1129. uint8* dst_y,
  1130. int width) {
  1131. __asm {
  1132. mov eax, [esp + 4] /* src_argb */
  1133. mov edx, [esp + 8] /* dst_y */
  1134. mov ecx, [esp + 12] /* width */
  1135. vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
  1136. vbroadcastf128 ymm5, xmmword ptr kAddYJ64
  1137. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1138. convertloop:
  1139. vmovdqu ymm0, [eax]
  1140. vmovdqu ymm1, [eax + 32]
  1141. vmovdqu ymm2, [eax + 64]
  1142. vmovdqu ymm3, [eax + 96]
  1143. vpmaddubsw ymm0, ymm0, ymm4
  1144. vpmaddubsw ymm1, ymm1, ymm4
  1145. vpmaddubsw ymm2, ymm2, ymm4
  1146. vpmaddubsw ymm3, ymm3, ymm4
  1147. lea eax, [eax + 128]
  1148. vphaddw ymm0, ymm0, ymm1 // mutates.
  1149. vphaddw ymm2, ymm2, ymm3
  1150. vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
  1151. vpaddw ymm2, ymm2, ymm5
  1152. vpsrlw ymm0, ymm0, 7
  1153. vpsrlw ymm2, ymm2, 7
  1154. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1155. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1156. vmovdqu [edx], ymm0
  1157. lea edx, [edx + 32]
  1158. sub ecx, 32
  1159. jg convertloop
  1160. vzeroupper
  1161. ret
  1162. }
  1163. }
  1164. #endif // HAS_ARGBTOYJROW_AVX2
  1165. __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
  1166. uint8* dst_y,
  1167. int width) {
  1168. __asm {
  1169. mov eax, [esp + 4] /* src_argb */
  1170. mov edx, [esp + 8] /* dst_y */
  1171. mov ecx, [esp + 12] /* width */
  1172. movdqa xmm4, xmmword ptr kBGRAToY
  1173. movdqa xmm5, xmmword ptr kAddY16
  1174. convertloop:
  1175. movdqu xmm0, [eax]
  1176. movdqu xmm1, [eax + 16]
  1177. movdqu xmm2, [eax + 32]
  1178. movdqu xmm3, [eax + 48]
  1179. pmaddubsw xmm0, xmm4
  1180. pmaddubsw xmm1, xmm4
  1181. pmaddubsw xmm2, xmm4
  1182. pmaddubsw xmm3, xmm4
  1183. lea eax, [eax + 64]
  1184. phaddw xmm0, xmm1
  1185. phaddw xmm2, xmm3
  1186. psrlw xmm0, 7
  1187. psrlw xmm2, 7
  1188. packuswb xmm0, xmm2
  1189. paddb xmm0, xmm5
  1190. movdqu [edx], xmm0
  1191. lea edx, [edx + 16]
  1192. sub ecx, 16
  1193. jg convertloop
  1194. ret
  1195. }
  1196. }
  1197. __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
  1198. uint8* dst_y,
  1199. int width) {
  1200. __asm {
  1201. mov eax, [esp + 4] /* src_argb */
  1202. mov edx, [esp + 8] /* dst_y */
  1203. mov ecx, [esp + 12] /* width */
  1204. movdqa xmm4, xmmword ptr kABGRToY
  1205. movdqa xmm5, xmmword ptr kAddY16
  1206. convertloop:
  1207. movdqu xmm0, [eax]
  1208. movdqu xmm1, [eax + 16]
  1209. movdqu xmm2, [eax + 32]
  1210. movdqu xmm3, [eax + 48]
  1211. pmaddubsw xmm0, xmm4
  1212. pmaddubsw xmm1, xmm4
  1213. pmaddubsw xmm2, xmm4
  1214. pmaddubsw xmm3, xmm4
  1215. lea eax, [eax + 64]
  1216. phaddw xmm0, xmm1
  1217. phaddw xmm2, xmm3
  1218. psrlw xmm0, 7
  1219. psrlw xmm2, 7
  1220. packuswb xmm0, xmm2
  1221. paddb xmm0, xmm5
  1222. movdqu [edx], xmm0
  1223. lea edx, [edx + 16]
  1224. sub ecx, 16
  1225. jg convertloop
  1226. ret
  1227. }
  1228. }
  1229. __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
  1230. uint8* dst_y,
  1231. int width) {
  1232. __asm {
  1233. mov eax, [esp + 4] /* src_argb */
  1234. mov edx, [esp + 8] /* dst_y */
  1235. mov ecx, [esp + 12] /* width */
  1236. movdqa xmm4, xmmword ptr kRGBAToY
  1237. movdqa xmm5, xmmword ptr kAddY16
  1238. convertloop:
  1239. movdqu xmm0, [eax]
  1240. movdqu xmm1, [eax + 16]
  1241. movdqu xmm2, [eax + 32]
  1242. movdqu xmm3, [eax + 48]
  1243. pmaddubsw xmm0, xmm4
  1244. pmaddubsw xmm1, xmm4
  1245. pmaddubsw xmm2, xmm4
  1246. pmaddubsw xmm3, xmm4
  1247. lea eax, [eax + 64]
  1248. phaddw xmm0, xmm1
  1249. phaddw xmm2, xmm3
  1250. psrlw xmm0, 7
  1251. psrlw xmm2, 7
  1252. packuswb xmm0, xmm2
  1253. paddb xmm0, xmm5
  1254. movdqu [edx], xmm0
  1255. lea edx, [edx + 16]
  1256. sub ecx, 16
  1257. jg convertloop
  1258. ret
  1259. }
  1260. }
  1261. __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
  1262. int src_stride_argb,
  1263. uint8* dst_u,
  1264. uint8* dst_v,
  1265. int width) {
  1266. __asm {
  1267. push esi
  1268. push edi
  1269. mov eax, [esp + 8 + 4] // src_argb
  1270. mov esi, [esp + 8 + 8] // src_stride_argb
  1271. mov edx, [esp + 8 + 12] // dst_u
  1272. mov edi, [esp + 8 + 16] // dst_v
  1273. mov ecx, [esp + 8 + 20] // width
  1274. movdqa xmm5, xmmword ptr kAddUV128
  1275. movdqa xmm6, xmmword ptr kARGBToV
  1276. movdqa xmm7, xmmword ptr kARGBToU
  1277. sub edi, edx // stride from u to v
  1278. convertloop:
  1279. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1280. movdqu xmm0, [eax]
  1281. movdqu xmm4, [eax + esi]
  1282. pavgb xmm0, xmm4
  1283. movdqu xmm1, [eax + 16]
  1284. movdqu xmm4, [eax + esi + 16]
  1285. pavgb xmm1, xmm4
  1286. movdqu xmm2, [eax + 32]
  1287. movdqu xmm4, [eax + esi + 32]
  1288. pavgb xmm2, xmm4
  1289. movdqu xmm3, [eax + 48]
  1290. movdqu xmm4, [eax + esi + 48]
  1291. pavgb xmm3, xmm4
  1292. lea eax, [eax + 64]
  1293. movdqa xmm4, xmm0
  1294. shufps xmm0, xmm1, 0x88
  1295. shufps xmm4, xmm1, 0xdd
  1296. pavgb xmm0, xmm4
  1297. movdqa xmm4, xmm2
  1298. shufps xmm2, xmm3, 0x88
  1299. shufps xmm4, xmm3, 0xdd
  1300. pavgb xmm2, xmm4
  1301. // step 2 - convert to U and V
  1302. // from here down is very similar to Y code except
  1303. // instead of 16 different pixels, its 8 pixels of U and 8 of V
  1304. movdqa xmm1, xmm0
  1305. movdqa xmm3, xmm2
  1306. pmaddubsw xmm0, xmm7 // U
  1307. pmaddubsw xmm2, xmm7
  1308. pmaddubsw xmm1, xmm6 // V
  1309. pmaddubsw xmm3, xmm6
  1310. phaddw xmm0, xmm2
  1311. phaddw xmm1, xmm3
  1312. psraw xmm0, 8
  1313. psraw xmm1, 8
  1314. packsswb xmm0, xmm1
  1315. paddb xmm0, xmm5 // -> unsigned
  1316. // step 3 - store 8 U and 8 V values
  1317. movlps qword ptr [edx], xmm0 // U
  1318. movhps qword ptr [edx + edi], xmm0 // V
  1319. lea edx, [edx + 8]
  1320. sub ecx, 16
  1321. jg convertloop
  1322. pop edi
  1323. pop esi
  1324. ret
  1325. }
  1326. }
  1327. __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
  1328. int src_stride_argb,
  1329. uint8* dst_u,
  1330. uint8* dst_v,
  1331. int width) {
  1332. __asm {
  1333. push esi
  1334. push edi
  1335. mov eax, [esp + 8 + 4] // src_argb
  1336. mov esi, [esp + 8 + 8] // src_stride_argb
  1337. mov edx, [esp + 8 + 12] // dst_u
  1338. mov edi, [esp + 8 + 16] // dst_v
  1339. mov ecx, [esp + 8 + 20] // width
  1340. movdqa xmm5, xmmword ptr kAddUVJ128
  1341. movdqa xmm6, xmmword ptr kARGBToVJ
  1342. movdqa xmm7, xmmword ptr kARGBToUJ
  1343. sub edi, edx // stride from u to v
  1344. convertloop:
  1345. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1346. movdqu xmm0, [eax]
  1347. movdqu xmm4, [eax + esi]
  1348. pavgb xmm0, xmm4
  1349. movdqu xmm1, [eax + 16]
  1350. movdqu xmm4, [eax + esi + 16]
  1351. pavgb xmm1, xmm4
  1352. movdqu xmm2, [eax + 32]
  1353. movdqu xmm4, [eax + esi + 32]
  1354. pavgb xmm2, xmm4
  1355. movdqu xmm3, [eax + 48]
  1356. movdqu xmm4, [eax + esi + 48]
  1357. pavgb xmm3, xmm4
  1358. lea eax, [eax + 64]
  1359. movdqa xmm4, xmm0
  1360. shufps xmm0, xmm1, 0x88
  1361. shufps xmm4, xmm1, 0xdd
  1362. pavgb xmm0, xmm4
  1363. movdqa xmm4, xmm2
  1364. shufps xmm2, xmm3, 0x88
  1365. shufps xmm4, xmm3, 0xdd
  1366. pavgb xmm2, xmm4
  1367. // step 2 - convert to U and V
  1368. // from here down is very similar to Y code except
  1369. // instead of 16 different pixels, its 8 pixels of U and 8 of V
  1370. movdqa xmm1, xmm0
  1371. movdqa xmm3, xmm2
  1372. pmaddubsw xmm0, xmm7 // U
  1373. pmaddubsw xmm2, xmm7
  1374. pmaddubsw xmm1, xmm6 // V
  1375. pmaddubsw xmm3, xmm6
  1376. phaddw xmm0, xmm2
  1377. phaddw xmm1, xmm3
  1378. paddw xmm0, xmm5 // +.5 rounding -> unsigned
  1379. paddw xmm1, xmm5
  1380. psraw xmm0, 8
  1381. psraw xmm1, 8
  1382. packsswb xmm0, xmm1
  1383. // step 3 - store 8 U and 8 V values
  1384. movlps qword ptr [edx], xmm0 // U
  1385. movhps qword ptr [edx + edi], xmm0 // V
  1386. lea edx, [edx + 8]
  1387. sub ecx, 16
  1388. jg convertloop
  1389. pop edi
  1390. pop esi
  1391. ret
  1392. }
  1393. }
  1394. #ifdef HAS_ARGBTOUVROW_AVX2
  1395. __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
  1396. int src_stride_argb,
  1397. uint8* dst_u,
  1398. uint8* dst_v,
  1399. int width) {
  1400. __asm {
  1401. push esi
  1402. push edi
  1403. mov eax, [esp + 8 + 4] // src_argb
  1404. mov esi, [esp + 8 + 8] // src_stride_argb
  1405. mov edx, [esp + 8 + 12] // dst_u
  1406. mov edi, [esp + 8 + 16] // dst_v
  1407. mov ecx, [esp + 8 + 20] // width
  1408. vbroadcastf128 ymm5, xmmword ptr kAddUV128
  1409. vbroadcastf128 ymm6, xmmword ptr kARGBToV
  1410. vbroadcastf128 ymm7, xmmword ptr kARGBToU
  1411. sub edi, edx // stride from u to v
  1412. convertloop:
  1413. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1414. vmovdqu ymm0, [eax]
  1415. vmovdqu ymm1, [eax + 32]
  1416. vmovdqu ymm2, [eax + 64]
  1417. vmovdqu ymm3, [eax + 96]
  1418. vpavgb ymm0, ymm0, [eax + esi]
  1419. vpavgb ymm1, ymm1, [eax + esi + 32]
  1420. vpavgb ymm2, ymm2, [eax + esi + 64]
  1421. vpavgb ymm3, ymm3, [eax + esi + 96]
  1422. lea eax, [eax + 128]
  1423. vshufps ymm4, ymm0, ymm1, 0x88
  1424. vshufps ymm0, ymm0, ymm1, 0xdd
  1425. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1426. vshufps ymm4, ymm2, ymm3, 0x88
  1427. vshufps ymm2, ymm2, ymm3, 0xdd
  1428. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1429. // step 2 - convert to U and V
  1430. // from here down is very similar to Y code except
  1431. // instead of 32 different pixels, its 16 pixels of U and 16 of V
  1432. vpmaddubsw ymm1, ymm0, ymm7 // U
  1433. vpmaddubsw ymm3, ymm2, ymm7
  1434. vpmaddubsw ymm0, ymm0, ymm6 // V
  1435. vpmaddubsw ymm2, ymm2, ymm6
  1436. vphaddw ymm1, ymm1, ymm3 // mutates
  1437. vphaddw ymm0, ymm0, ymm2
  1438. vpsraw ymm1, ymm1, 8
  1439. vpsraw ymm0, ymm0, 8
  1440. vpacksswb ymm0, ymm1, ymm0 // mutates
  1441. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1442. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1443. vpaddb ymm0, ymm0, ymm5 // -> unsigned
  1444. // step 3 - store 16 U and 16 V values
  1445. vextractf128 [edx], ymm0, 0 // U
  1446. vextractf128 [edx + edi], ymm0, 1 // V
  1447. lea edx, [edx + 16]
  1448. sub ecx, 32
  1449. jg convertloop
  1450. pop edi
  1451. pop esi
  1452. vzeroupper
  1453. ret
  1454. }
  1455. }
  1456. #endif // HAS_ARGBTOUVROW_AVX2
  1457. #ifdef HAS_ARGBTOUVJROW_AVX2
  1458. __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
  1459. int src_stride_argb,
  1460. uint8* dst_u,
  1461. uint8* dst_v,
  1462. int width) {
  1463. __asm {
  1464. push esi
  1465. push edi
  1466. mov eax, [esp + 8 + 4] // src_argb
  1467. mov esi, [esp + 8 + 8] // src_stride_argb
  1468. mov edx, [esp + 8 + 12] // dst_u
  1469. mov edi, [esp + 8 + 16] // dst_v
  1470. mov ecx, [esp + 8 + 20] // width
  1471. vbroadcastf128 ymm5, xmmword ptr kAddUV128
  1472. vbroadcastf128 ymm6, xmmword ptr kARGBToV
  1473. vbroadcastf128 ymm7, xmmword ptr kARGBToU
  1474. sub edi, edx // stride from u to v
  1475. convertloop:
  1476. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1477. vmovdqu ymm0, [eax]
  1478. vmovdqu ymm1, [eax + 32]
  1479. vmovdqu ymm2, [eax + 64]
  1480. vmovdqu ymm3, [eax + 96]
  1481. vpavgb ymm0, ymm0, [eax + esi]
  1482. vpavgb ymm1, ymm1, [eax + esi + 32]
  1483. vpavgb ymm2, ymm2, [eax + esi + 64]
  1484. vpavgb ymm3, ymm3, [eax + esi + 96]
  1485. lea eax, [eax + 128]
  1486. vshufps ymm4, ymm0, ymm1, 0x88
  1487. vshufps ymm0, ymm0, ymm1, 0xdd
  1488. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1489. vshufps ymm4, ymm2, ymm3, 0x88
  1490. vshufps ymm2, ymm2, ymm3, 0xdd
  1491. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1492. // step 2 - convert to U and V
  1493. // from here down is very similar to Y code except
  1494. // instead of 32 different pixels, its 16 pixels of U and 16 of V
  1495. vpmaddubsw ymm1, ymm0, ymm7 // U
  1496. vpmaddubsw ymm3, ymm2, ymm7
  1497. vpmaddubsw ymm0, ymm0, ymm6 // V
  1498. vpmaddubsw ymm2, ymm2, ymm6
  1499. vphaddw ymm1, ymm1, ymm3 // mutates
  1500. vphaddw ymm0, ymm0, ymm2
  1501. vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
  1502. vpaddw ymm0, ymm0, ymm5
  1503. vpsraw ymm1, ymm1, 8
  1504. vpsraw ymm0, ymm0, 8
  1505. vpacksswb ymm0, ymm1, ymm0 // mutates
  1506. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1507. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1508. // step 3 - store 16 U and 16 V values
  1509. vextractf128 [edx], ymm0, 0 // U
  1510. vextractf128 [edx + edi], ymm0, 1 // V
  1511. lea edx, [edx + 16]
  1512. sub ecx, 32
  1513. jg convertloop
  1514. pop edi
  1515. pop esi
  1516. vzeroupper
  1517. ret
  1518. }
  1519. }
  1520. #endif // HAS_ARGBTOUVJROW_AVX2
  1521. __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
  1522. uint8* dst_u,
  1523. uint8* dst_v,
  1524. int width) {
  1525. __asm {
  1526. push edi
  1527. mov eax, [esp + 4 + 4] // src_argb
  1528. mov edx, [esp + 4 + 8] // dst_u
  1529. mov edi, [esp + 4 + 12] // dst_v
  1530. mov ecx, [esp + 4 + 16] // width
  1531. movdqa xmm5, xmmword ptr kAddUV128
  1532. movdqa xmm6, xmmword ptr kARGBToV
  1533. movdqa xmm7, xmmword ptr kARGBToU
  1534. sub edi, edx // stride from u to v
  1535. convertloop:
  1536. /* convert to U and V */
  1537. movdqu xmm0, [eax] // U
  1538. movdqu xmm1, [eax + 16]
  1539. movdqu xmm2, [eax + 32]
  1540. movdqu xmm3, [eax + 48]
  1541. pmaddubsw xmm0, xmm7
  1542. pmaddubsw xmm1, xmm7
  1543. pmaddubsw xmm2, xmm7
  1544. pmaddubsw xmm3, xmm7
  1545. phaddw xmm0, xmm1
  1546. phaddw xmm2, xmm3
  1547. psraw xmm0, 8
  1548. psraw xmm2, 8
  1549. packsswb xmm0, xmm2
  1550. paddb xmm0, xmm5
  1551. movdqu [edx], xmm0
  1552. movdqu xmm0, [eax] // V
  1553. movdqu xmm1, [eax + 16]
  1554. movdqu xmm2, [eax + 32]
  1555. movdqu xmm3, [eax + 48]
  1556. pmaddubsw xmm0, xmm6
  1557. pmaddubsw xmm1, xmm6
  1558. pmaddubsw xmm2, xmm6
  1559. pmaddubsw xmm3, xmm6
  1560. phaddw xmm0, xmm1
  1561. phaddw xmm2, xmm3
  1562. psraw xmm0, 8
  1563. psraw xmm2, 8
  1564. packsswb xmm0, xmm2
  1565. paddb xmm0, xmm5
  1566. lea eax, [eax + 64]
  1567. movdqu [edx + edi], xmm0
  1568. lea edx, [edx + 16]
  1569. sub ecx, 16
  1570. jg convertloop
  1571. pop edi
  1572. ret
  1573. }
  1574. }
  1575. __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
  1576. int src_stride_argb,
  1577. uint8* dst_u,
  1578. uint8* dst_v,
  1579. int width) {
  1580. __asm {
  1581. push esi
  1582. push edi
  1583. mov eax, [esp + 8 + 4] // src_argb
  1584. mov esi, [esp + 8 + 8] // src_stride_argb
  1585. mov edx, [esp + 8 + 12] // dst_u
  1586. mov edi, [esp + 8 + 16] // dst_v
  1587. mov ecx, [esp + 8 + 20] // width
  1588. movdqa xmm5, xmmword ptr kAddUV128
  1589. movdqa xmm6, xmmword ptr kBGRAToV
  1590. movdqa xmm7, xmmword ptr kBGRAToU
  1591. sub edi, edx // stride from u to v
  1592. convertloop:
  1593. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1594. movdqu xmm0, [eax]
  1595. movdqu xmm4, [eax + esi]
  1596. pavgb xmm0, xmm4
  1597. movdqu xmm1, [eax + 16]
  1598. movdqu xmm4, [eax + esi + 16]
  1599. pavgb xmm1, xmm4
  1600. movdqu xmm2, [eax + 32]
  1601. movdqu xmm4, [eax + esi + 32]
  1602. pavgb xmm2, xmm4
  1603. movdqu xmm3, [eax + 48]
  1604. movdqu xmm4, [eax + esi + 48]
  1605. pavgb xmm3, xmm4
  1606. lea eax, [eax + 64]
  1607. movdqa xmm4, xmm0
  1608. shufps xmm0, xmm1, 0x88
  1609. shufps xmm4, xmm1, 0xdd
  1610. pavgb xmm0, xmm4
  1611. movdqa xmm4, xmm2
  1612. shufps xmm2, xmm3, 0x88
  1613. shufps xmm4, xmm3, 0xdd
  1614. pavgb xmm2, xmm4
  1615. // step 2 - convert to U and V
  1616. // from here down is very similar to Y code except
  1617. // instead of 16 different pixels, its 8 pixels of U and 8 of V
  1618. movdqa xmm1, xmm0
  1619. movdqa xmm3, xmm2
  1620. pmaddubsw xmm0, xmm7 // U
  1621. pmaddubsw xmm2, xmm7
  1622. pmaddubsw xmm1, xmm6 // V
  1623. pmaddubsw xmm3, xmm6
  1624. phaddw xmm0, xmm2
  1625. phaddw xmm1, xmm3
  1626. psraw xmm0, 8
  1627. psraw xmm1, 8
  1628. packsswb xmm0, xmm1
  1629. paddb xmm0, xmm5 // -> unsigned
  1630. // step 3 - store 8 U and 8 V values
  1631. movlps qword ptr [edx], xmm0 // U
  1632. movhps qword ptr [edx + edi], xmm0 // V
  1633. lea edx, [edx + 8]
  1634. sub ecx, 16
  1635. jg convertloop
  1636. pop edi
  1637. pop esi
  1638. ret
  1639. }
  1640. }
  1641. __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
  1642. int src_stride_argb,
  1643. uint8* dst_u,
  1644. uint8* dst_v,
  1645. int width) {
  1646. __asm {
  1647. push esi
  1648. push edi
  1649. mov eax, [esp + 8 + 4] // src_argb
  1650. mov esi, [esp + 8 + 8] // src_stride_argb
  1651. mov edx, [esp + 8 + 12] // dst_u
  1652. mov edi, [esp + 8 + 16] // dst_v
  1653. mov ecx, [esp + 8 + 20] // width
  1654. movdqa xmm5, xmmword ptr kAddUV128
  1655. movdqa xmm6, xmmword ptr kABGRToV
  1656. movdqa xmm7, xmmword ptr kABGRToU
  1657. sub edi, edx // stride from u to v
  1658. convertloop:
  1659. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1660. movdqu xmm0, [eax]
  1661. movdqu xmm4, [eax + esi]
  1662. pavgb xmm0, xmm4
  1663. movdqu xmm1, [eax + 16]
  1664. movdqu xmm4, [eax + esi + 16]
  1665. pavgb xmm1, xmm4
  1666. movdqu xmm2, [eax + 32]
  1667. movdqu xmm4, [eax + esi + 32]
  1668. pavgb xmm2, xmm4
  1669. movdqu xmm3, [eax + 48]
  1670. movdqu xmm4, [eax + esi + 48]
  1671. pavgb xmm3, xmm4
  1672. lea eax, [eax + 64]
  1673. movdqa xmm4, xmm0
  1674. shufps xmm0, xmm1, 0x88
  1675. shufps xmm4, xmm1, 0xdd
  1676. pavgb xmm0, xmm4
  1677. movdqa xmm4, xmm2
  1678. shufps xmm2, xmm3, 0x88
  1679. shufps xmm4, xmm3, 0xdd
  1680. pavgb xmm2, xmm4
  1681. // step 2 - convert to U and V
  1682. // from here down is very similar to Y code except
  1683. // instead of 16 different pixels, its 8 pixels of U and 8 of V
  1684. movdqa xmm1, xmm0
  1685. movdqa xmm3, xmm2
  1686. pmaddubsw xmm0, xmm7 // U
  1687. pmaddubsw xmm2, xmm7
  1688. pmaddubsw xmm1, xmm6 // V
  1689. pmaddubsw xmm3, xmm6
  1690. phaddw xmm0, xmm2
  1691. phaddw xmm1, xmm3
  1692. psraw xmm0, 8
  1693. psraw xmm1, 8
  1694. packsswb xmm0, xmm1
  1695. paddb xmm0, xmm5 // -> unsigned
  1696. // step 3 - store 8 U and 8 V values
  1697. movlps qword ptr [edx], xmm0 // U
  1698. movhps qword ptr [edx + edi], xmm0 // V
  1699. lea edx, [edx + 8]
  1700. sub ecx, 16
  1701. jg convertloop
  1702. pop edi
  1703. pop esi
  1704. ret
  1705. }
  1706. }
  1707. __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
  1708. int src_stride_argb,
  1709. uint8* dst_u,
  1710. uint8* dst_v,
  1711. int width) {
  1712. __asm {
  1713. push esi
  1714. push edi
  1715. mov eax, [esp + 8 + 4] // src_argb
  1716. mov esi, [esp + 8 + 8] // src_stride_argb
  1717. mov edx, [esp + 8 + 12] // dst_u
  1718. mov edi, [esp + 8 + 16] // dst_v
  1719. mov ecx, [esp + 8 + 20] // width
  1720. movdqa xmm5, xmmword ptr kAddUV128
  1721. movdqa xmm6, xmmword ptr kRGBAToV
  1722. movdqa xmm7, xmmword ptr kRGBAToU
  1723. sub edi, edx // stride from u to v
  1724. convertloop:
  1725. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1726. movdqu xmm0, [eax]
  1727. movdqu xmm4, [eax + esi]
  1728. pavgb xmm0, xmm4
  1729. movdqu xmm1, [eax + 16]
  1730. movdqu xmm4, [eax + esi + 16]
  1731. pavgb xmm1, xmm4
  1732. movdqu xmm2, [eax + 32]
  1733. movdqu xmm4, [eax + esi + 32]
  1734. pavgb xmm2, xmm4
  1735. movdqu xmm3, [eax + 48]
  1736. movdqu xmm4, [eax + esi + 48]
  1737. pavgb xmm3, xmm4
  1738. lea eax, [eax + 64]
  1739. movdqa xmm4, xmm0
  1740. shufps xmm0, xmm1, 0x88
  1741. shufps xmm4, xmm1, 0xdd
  1742. pavgb xmm0, xmm4
  1743. movdqa xmm4, xmm2
  1744. shufps xmm2, xmm3, 0x88
  1745. shufps xmm4, xmm3, 0xdd
  1746. pavgb xmm2, xmm4
  1747. // step 2 - convert to U and V
  1748. // from here down is very similar to Y code except
  1749. // instead of 16 different pixels, its 8 pixels of U and 8 of V
  1750. movdqa xmm1, xmm0
  1751. movdqa xmm3, xmm2
  1752. pmaddubsw xmm0, xmm7 // U
  1753. pmaddubsw xmm2, xmm7
  1754. pmaddubsw xmm1, xmm6 // V
  1755. pmaddubsw xmm3, xmm6
  1756. phaddw xmm0, xmm2
  1757. phaddw xmm1, xmm3
  1758. psraw xmm0, 8
  1759. psraw xmm1, 8
  1760. packsswb xmm0, xmm1
  1761. paddb xmm0, xmm5 // -> unsigned
  1762. // step 3 - store 8 U and 8 V values
  1763. movlps qword ptr [edx], xmm0 // U
  1764. movhps qword ptr [edx + edi], xmm0 // V
  1765. lea edx, [edx + 8]
  1766. sub ecx, 16
  1767. jg convertloop
  1768. pop edi
  1769. pop esi
  1770. ret
  1771. }
  1772. }
  1773. #endif // HAS_ARGBTOYROW_SSSE3
  1774. // Read 16 UV from 444
  1775. #define READYUV444_AVX2 \
  1776. __asm { \
  1777. __asm vmovdqu xmm0, [esi] /* U */ \
  1778. __asm vmovdqu xmm1, [esi + edi] /* V */ \
  1779. __asm lea esi, [esi + 16] \
  1780. __asm vpermq ymm0, ymm0, 0xd8 \
  1781. __asm vpermq ymm1, ymm1, 0xd8 \
  1782. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1783. __asm vmovdqu xmm4, [eax] /* Y */ \
  1784. __asm vpermq ymm4, ymm4, 0xd8 \
  1785. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1786. __asm lea eax, [eax + 16]}
  1787. // Read 8 UV from 422, upsample to 16 UV.
  1788. #define READYUV422_AVX2 \
  1789. __asm { \
  1790. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1791. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1792. __asm lea esi, [esi + 8] \
  1793. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1794. __asm vpermq ymm0, ymm0, 0xd8 \
  1795. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1796. __asm vmovdqu xmm4, [eax] /* Y */ \
  1797. __asm vpermq ymm4, ymm4, 0xd8 \
  1798. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1799. __asm lea eax, [eax + 16]}
  1800. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  1801. #define READYUVA422_AVX2 \
  1802. __asm { \
  1803. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1804. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1805. __asm lea esi, [esi + 8] \
  1806. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1807. __asm vpermq ymm0, ymm0, 0xd8 \
  1808. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1809. __asm vmovdqu xmm4, [eax] /* Y */ \
  1810. __asm vpermq ymm4, ymm4, 0xd8 \
  1811. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1812. __asm lea eax, [eax + 16] \
  1813. __asm vmovdqu xmm5, [ebp] /* A */ \
  1814. __asm vpermq ymm5, ymm5, 0xd8 \
  1815. __asm lea ebp, [ebp + 16]}
  1816. // Read 8 UV from NV12, upsample to 16 UV.
  1817. #define READNV12_AVX2 \
  1818. __asm { \
  1819. __asm vmovdqu xmm0, [esi] /* UV */ \
  1820. __asm lea esi, [esi + 16] \
  1821. __asm vpermq ymm0, ymm0, 0xd8 \
  1822. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1823. __asm vmovdqu xmm4, [eax] /* Y */ \
  1824. __asm vpermq ymm4, ymm4, 0xd8 \
  1825. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1826. __asm lea eax, [eax + 16]}
  1827. // Read 8 UV from NV21, upsample to 16 UV.
  1828. #define READNV21_AVX2 \
  1829. __asm { \
  1830. __asm vmovdqu xmm0, [esi] /* UV */ \
  1831. __asm lea esi, [esi + 16] \
  1832. __asm vpermq ymm0, ymm0, 0xd8 \
  1833. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
  1834. __asm vmovdqu xmm4, [eax] /* Y */ \
  1835. __asm vpermq ymm4, ymm4, 0xd8 \
  1836. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1837. __asm lea eax, [eax + 16]}
  1838. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  1839. #define READYUY2_AVX2 \
  1840. __asm { \
  1841. __asm vmovdqu ymm4, [eax] /* YUY2 */ \
  1842. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
  1843. __asm vmovdqu ymm0, [eax] /* UV */ \
  1844. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
  1845. __asm lea eax, [eax + 32]}
  1846. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  1847. #define READUYVY_AVX2 \
  1848. __asm { \
  1849. __asm vmovdqu ymm4, [eax] /* UYVY */ \
  1850. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
  1851. __asm vmovdqu ymm0, [eax] /* UV */ \
  1852. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
  1853. __asm lea eax, [eax + 32]}
  1854. // Convert 16 pixels: 16 UV and 16 Y.
  1855. #define YUVTORGB_AVX2(YuvConstants) \
  1856. __asm { \
  1857. __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
  1858. __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
  1859. __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
  1860. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
  1861. __asm vpsubw ymm2, ymm3, ymm2 \
  1862. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
  1863. __asm vpsubw ymm1, ymm3, ymm1 \
  1864. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
  1865. __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
  1866. __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
  1867. __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
  1868. __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
  1869. __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
  1870. __asm vpsraw ymm0, ymm0, 6 \
  1871. __asm vpsraw ymm1, ymm1, 6 \
  1872. __asm vpsraw ymm2, ymm2, 6 \
  1873. __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
  1874. __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
  1875. __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
  1876. }
  1877. // Store 16 ARGB values.
  1878. #define STOREARGB_AVX2 \
  1879. __asm { \
  1880. __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
  1881. __asm vpermq ymm0, ymm0, 0xd8 \
  1882. __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
  1883. __asm vpermq ymm2, ymm2, 0xd8 \
  1884. __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
  1885. __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
  1886. __asm vmovdqu 0[edx], ymm1 \
  1887. __asm vmovdqu 32[edx], ymm0 \
  1888. __asm lea edx, [edx + 64]}
  1889. // Store 16 RGBA values.
  1890. #define STORERGBA_AVX2 \
  1891. __asm { \
  1892. __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
  1893. __asm vpermq ymm1, ymm1, 0xd8 \
  1894. __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
  1895. __asm vpermq ymm2, ymm2, 0xd8 \
  1896. __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
  1897. __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
  1898. __asm vmovdqu [edx], ymm0 \
  1899. __asm vmovdqu [edx + 32], ymm1 \
  1900. __asm lea edx, [edx + 64]}
  1901. #ifdef HAS_I422TOARGBROW_AVX2
  1902. // 16 pixels
  1903. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  1904. __declspec(naked) void I422ToARGBRow_AVX2(
  1905. const uint8* y_buf,
  1906. const uint8* u_buf,
  1907. const uint8* v_buf,
  1908. uint8* dst_argb,
  1909. const struct YuvConstants* yuvconstants,
  1910. int width) {
  1911. __asm {
  1912. push esi
  1913. push edi
  1914. push ebx
  1915. mov eax, [esp + 12 + 4] // Y
  1916. mov esi, [esp + 12 + 8] // U
  1917. mov edi, [esp + 12 + 12] // V
  1918. mov edx, [esp + 12 + 16] // argb
  1919. mov ebx, [esp + 12 + 20] // yuvconstants
  1920. mov ecx, [esp + 12 + 24] // width
  1921. sub edi, esi
  1922. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1923. convertloop:
  1924. READYUV422_AVX2
  1925. YUVTORGB_AVX2(ebx)
  1926. STOREARGB_AVX2
  1927. sub ecx, 16
  1928. jg convertloop
  1929. pop ebx
  1930. pop edi
  1931. pop esi
  1932. vzeroupper
  1933. ret
  1934. }
  1935. }
  1936. #endif // HAS_I422TOARGBROW_AVX2
  1937. #ifdef HAS_I422ALPHATOARGBROW_AVX2
  1938. // 16 pixels
  1939. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  1940. __declspec(naked) void I422AlphaToARGBRow_AVX2(
  1941. const uint8* y_buf,
  1942. const uint8* u_buf,
  1943. const uint8* v_buf,
  1944. const uint8* a_buf,
  1945. uint8* dst_argb,
  1946. const struct YuvConstants* yuvconstants,
  1947. int width) {
  1948. __asm {
  1949. push esi
  1950. push edi
  1951. push ebx
  1952. push ebp
  1953. mov eax, [esp + 16 + 4] // Y
  1954. mov esi, [esp + 16 + 8] // U
  1955. mov edi, [esp + 16 + 12] // V
  1956. mov ebp, [esp + 16 + 16] // A
  1957. mov edx, [esp + 16 + 20] // argb
  1958. mov ebx, [esp + 16 + 24] // yuvconstants
  1959. mov ecx, [esp + 16 + 28] // width
  1960. sub edi, esi
  1961. convertloop:
  1962. READYUVA422_AVX2
  1963. YUVTORGB_AVX2(ebx)
  1964. STOREARGB_AVX2
  1965. sub ecx, 16
  1966. jg convertloop
  1967. pop ebp
  1968. pop ebx
  1969. pop edi
  1970. pop esi
  1971. vzeroupper
  1972. ret
  1973. }
  1974. }
  1975. #endif // HAS_I422ALPHATOARGBROW_AVX2
  1976. #ifdef HAS_I444TOARGBROW_AVX2
  1977. // 16 pixels
  1978. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  1979. __declspec(naked) void I444ToARGBRow_AVX2(
  1980. const uint8* y_buf,
  1981. const uint8* u_buf,
  1982. const uint8* v_buf,
  1983. uint8* dst_argb,
  1984. const struct YuvConstants* yuvconstants,
  1985. int width) {
  1986. __asm {
  1987. push esi
  1988. push edi
  1989. push ebx
  1990. mov eax, [esp + 12 + 4] // Y
  1991. mov esi, [esp + 12 + 8] // U
  1992. mov edi, [esp + 12 + 12] // V
  1993. mov edx, [esp + 12 + 16] // argb
  1994. mov ebx, [esp + 12 + 20] // yuvconstants
  1995. mov ecx, [esp + 12 + 24] // width
  1996. sub edi, esi
  1997. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1998. convertloop:
  1999. READYUV444_AVX2
  2000. YUVTORGB_AVX2(ebx)
  2001. STOREARGB_AVX2
  2002. sub ecx, 16
  2003. jg convertloop
  2004. pop ebx
  2005. pop edi
  2006. pop esi
  2007. vzeroupper
  2008. ret
  2009. }
  2010. }
  2011. #endif // HAS_I444TOARGBROW_AVX2
  2012. #ifdef HAS_NV12TOARGBROW_AVX2
  2013. // 16 pixels.
  2014. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2015. __declspec(naked) void NV12ToARGBRow_AVX2(
  2016. const uint8* y_buf,
  2017. const uint8* uv_buf,
  2018. uint8* dst_argb,
  2019. const struct YuvConstants* yuvconstants,
  2020. int width) {
  2021. __asm {
  2022. push esi
  2023. push ebx
  2024. mov eax, [esp + 8 + 4] // Y
  2025. mov esi, [esp + 8 + 8] // UV
  2026. mov edx, [esp + 8 + 12] // argb
  2027. mov ebx, [esp + 8 + 16] // yuvconstants
  2028. mov ecx, [esp + 8 + 20] // width
  2029. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2030. convertloop:
  2031. READNV12_AVX2
  2032. YUVTORGB_AVX2(ebx)
  2033. STOREARGB_AVX2
  2034. sub ecx, 16
  2035. jg convertloop
  2036. pop ebx
  2037. pop esi
  2038. vzeroupper
  2039. ret
  2040. }
  2041. }
  2042. #endif // HAS_NV12TOARGBROW_AVX2
  2043. #ifdef HAS_NV21TOARGBROW_AVX2
  2044. // 16 pixels.
  2045. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2046. __declspec(naked) void NV21ToARGBRow_AVX2(
  2047. const uint8* y_buf,
  2048. const uint8* vu_buf,
  2049. uint8* dst_argb,
  2050. const struct YuvConstants* yuvconstants,
  2051. int width) {
  2052. __asm {
  2053. push esi
  2054. push ebx
  2055. mov eax, [esp + 8 + 4] // Y
  2056. mov esi, [esp + 8 + 8] // VU
  2057. mov edx, [esp + 8 + 12] // argb
  2058. mov ebx, [esp + 8 + 16] // yuvconstants
  2059. mov ecx, [esp + 8 + 20] // width
  2060. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2061. convertloop:
  2062. READNV21_AVX2
  2063. YUVTORGB_AVX2(ebx)
  2064. STOREARGB_AVX2
  2065. sub ecx, 16
  2066. jg convertloop
  2067. pop ebx
  2068. pop esi
  2069. vzeroupper
  2070. ret
  2071. }
  2072. }
  2073. #endif // HAS_NV21TOARGBROW_AVX2
  2074. #ifdef HAS_YUY2TOARGBROW_AVX2
  2075. // 16 pixels.
  2076. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2077. __declspec(naked) void YUY2ToARGBRow_AVX2(
  2078. const uint8* src_yuy2,
  2079. uint8* dst_argb,
  2080. const struct YuvConstants* yuvconstants,
  2081. int width) {
  2082. __asm {
  2083. push ebx
  2084. mov eax, [esp + 4 + 4] // yuy2
  2085. mov edx, [esp + 4 + 8] // argb
  2086. mov ebx, [esp + 4 + 12] // yuvconstants
  2087. mov ecx, [esp + 4 + 16] // width
  2088. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2089. convertloop:
  2090. READYUY2_AVX2
  2091. YUVTORGB_AVX2(ebx)
  2092. STOREARGB_AVX2
  2093. sub ecx, 16
  2094. jg convertloop
  2095. pop ebx
  2096. vzeroupper
  2097. ret
  2098. }
  2099. }
  2100. #endif // HAS_YUY2TOARGBROW_AVX2
  2101. #ifdef HAS_UYVYTOARGBROW_AVX2
  2102. // 16 pixels.
  2103. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2104. __declspec(naked) void UYVYToARGBRow_AVX2(
  2105. const uint8* src_uyvy,
  2106. uint8* dst_argb,
  2107. const struct YuvConstants* yuvconstants,
  2108. int width) {
  2109. __asm {
  2110. push ebx
  2111. mov eax, [esp + 4 + 4] // uyvy
  2112. mov edx, [esp + 4 + 8] // argb
  2113. mov ebx, [esp + 4 + 12] // yuvconstants
  2114. mov ecx, [esp + 4 + 16] // width
  2115. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2116. convertloop:
  2117. READUYVY_AVX2
  2118. YUVTORGB_AVX2(ebx)
  2119. STOREARGB_AVX2
  2120. sub ecx, 16
  2121. jg convertloop
  2122. pop ebx
  2123. vzeroupper
  2124. ret
  2125. }
  2126. }
  2127. #endif // HAS_UYVYTOARGBROW_AVX2
  2128. #ifdef HAS_I422TORGBAROW_AVX2
  2129. // 16 pixels
  2130. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2131. __declspec(naked) void I422ToRGBARow_AVX2(
  2132. const uint8* y_buf,
  2133. const uint8* u_buf,
  2134. const uint8* v_buf,
  2135. uint8* dst_argb,
  2136. const struct YuvConstants* yuvconstants,
  2137. int width) {
  2138. __asm {
  2139. push esi
  2140. push edi
  2141. push ebx
  2142. mov eax, [esp + 12 + 4] // Y
  2143. mov esi, [esp + 12 + 8] // U
  2144. mov edi, [esp + 12 + 12] // V
  2145. mov edx, [esp + 12 + 16] // abgr
  2146. mov ebx, [esp + 12 + 20] // yuvconstants
  2147. mov ecx, [esp + 12 + 24] // width
  2148. sub edi, esi
  2149. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2150. convertloop:
  2151. READYUV422_AVX2
  2152. YUVTORGB_AVX2(ebx)
  2153. STORERGBA_AVX2
  2154. sub ecx, 16
  2155. jg convertloop
  2156. pop ebx
  2157. pop edi
  2158. pop esi
  2159. vzeroupper
  2160. ret
  2161. }
  2162. }
  2163. #endif // HAS_I422TORGBAROW_AVX2
  2164. #if defined(HAS_I422TOARGBROW_SSSE3)
  2165. // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
  2166. // Allows a conversion with half size scaling.
  2167. // Read 8 UV from 444.
  2168. #define READYUV444 \
  2169. __asm { \
  2170. __asm movq xmm0, qword ptr [esi] /* U */ \
  2171. __asm movq xmm1, qword ptr [esi + edi] /* V */ \
  2172. __asm lea esi, [esi + 8] \
  2173. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2174. __asm movq xmm4, qword ptr [eax] \
  2175. __asm punpcklbw xmm4, xmm4 \
  2176. __asm lea eax, [eax + 8]}
  2177. // Read 4 UV from 422, upsample to 8 UV.
  2178. #define READYUV422 \
  2179. __asm { \
  2180. __asm movd xmm0, [esi] /* U */ \
  2181. __asm movd xmm1, [esi + edi] /* V */ \
  2182. __asm lea esi, [esi + 4] \
  2183. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2184. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2185. __asm movq xmm4, qword ptr [eax] \
  2186. __asm punpcklbw xmm4, xmm4 \
  2187. __asm lea eax, [eax + 8]}
  2188. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  2189. #define READYUVA422 \
  2190. __asm { \
  2191. __asm movd xmm0, [esi] /* U */ \
  2192. __asm movd xmm1, [esi + edi] /* V */ \
  2193. __asm lea esi, [esi + 4] \
  2194. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2195. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2196. __asm movq xmm4, qword ptr [eax] /* Y */ \
  2197. __asm punpcklbw xmm4, xmm4 \
  2198. __asm lea eax, [eax + 8] \
  2199. __asm movq xmm5, qword ptr [ebp] /* A */ \
  2200. __asm lea ebp, [ebp + 8]}
  2201. // Read 4 UV from NV12, upsample to 8 UV.
  2202. #define READNV12 \
  2203. __asm { \
  2204. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2205. __asm lea esi, [esi + 8] \
  2206. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2207. __asm movq xmm4, qword ptr [eax] \
  2208. __asm punpcklbw xmm4, xmm4 \
  2209. __asm lea eax, [eax + 8]}
  2210. // Read 4 VU from NV21, upsample to 8 UV.
  2211. #define READNV21 \
  2212. __asm { \
  2213. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2214. __asm lea esi, [esi + 8] \
  2215. __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
  2216. __asm movq xmm4, qword ptr [eax] \
  2217. __asm punpcklbw xmm4, xmm4 \
  2218. __asm lea eax, [eax + 8]}
  2219. // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
  2220. #define READYUY2 \
  2221. __asm { \
  2222. __asm movdqu xmm4, [eax] /* YUY2 */ \
  2223. __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
  2224. __asm movdqu xmm0, [eax] /* UV */ \
  2225. __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
  2226. __asm lea eax, [eax + 16]}
  2227. // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
  2228. #define READUYVY \
  2229. __asm { \
  2230. __asm movdqu xmm4, [eax] /* UYVY */ \
  2231. __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
  2232. __asm movdqu xmm0, [eax] /* UV */ \
  2233. __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
  2234. __asm lea eax, [eax + 16]}
  2235. // Convert 8 pixels: 8 UV and 8 Y.
  2236. #define YUVTORGB(YuvConstants) \
  2237. __asm { \
  2238. __asm movdqa xmm1, xmm0 \
  2239. __asm movdqa xmm2, xmm0 \
  2240. __asm movdqa xmm3, xmm0 \
  2241. __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
  2242. __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
  2243. __asm psubw xmm0, xmm1 \
  2244. __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
  2245. __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
  2246. __asm psubw xmm1, xmm2 \
  2247. __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
  2248. __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
  2249. __asm psubw xmm2, xmm3 \
  2250. __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
  2251. __asm paddsw xmm0, xmm4 /* B += Y */ \
  2252. __asm paddsw xmm1, xmm4 /* G += Y */ \
  2253. __asm paddsw xmm2, xmm4 /* R += Y */ \
  2254. __asm psraw xmm0, 6 \
  2255. __asm psraw xmm1, 6 \
  2256. __asm psraw xmm2, 6 \
  2257. __asm packuswb xmm0, xmm0 /* B */ \
  2258. __asm packuswb xmm1, xmm1 /* G */ \
  2259. __asm packuswb xmm2, xmm2 /* R */ \
  2260. }
  2261. // Store 8 ARGB values.
  2262. #define STOREARGB \
  2263. __asm { \
  2264. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2265. __asm punpcklbw xmm2, xmm5 /* RA */ \
  2266. __asm movdqa xmm1, xmm0 \
  2267. __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
  2268. __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
  2269. __asm movdqu 0[edx], xmm0 \
  2270. __asm movdqu 16[edx], xmm1 \
  2271. __asm lea edx, [edx + 32]}
  2272. // Store 8 BGRA values.
  2273. #define STOREBGRA \
  2274. __asm { \
  2275. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2276. __asm punpcklbw xmm1, xmm0 /* GB */ \
  2277. __asm punpcklbw xmm5, xmm2 /* AR */ \
  2278. __asm movdqa xmm0, xmm5 \
  2279. __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
  2280. __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
  2281. __asm movdqu 0[edx], xmm5 \
  2282. __asm movdqu 16[edx], xmm0 \
  2283. __asm lea edx, [edx + 32]}
  2284. // Store 8 RGBA values.
  2285. #define STORERGBA \
  2286. __asm { \
  2287. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2288. __asm punpcklbw xmm1, xmm2 /* GR */ \
  2289. __asm punpcklbw xmm5, xmm0 /* AB */ \
  2290. __asm movdqa xmm0, xmm5 \
  2291. __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
  2292. __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
  2293. __asm movdqu 0[edx], xmm5 \
  2294. __asm movdqu 16[edx], xmm0 \
  2295. __asm lea edx, [edx + 32]}
  2296. // Store 8 RGB24 values.
  2297. #define STORERGB24 \
  2298. __asm {/* Weave into RRGB */ \
  2299. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2300. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2301. __asm movdqa xmm1, xmm0 \
  2302. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2303. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
  2304. __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
  2305. __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
  2306. __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
  2307. __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
  2308. __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
  2309. __asm lea edx, [edx + 24]}
  2310. // Store 8 RGB565 values.
  2311. #define STORERGB565 \
  2312. __asm {/* Weave into RRGB */ \
  2313. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2314. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2315. __asm movdqa xmm1, xmm0 \
  2316. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2317. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
  2318. __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
  2319. __asm movdqa xmm2, xmm0 /* G */ \
  2320. __asm pslld xmm0, 8 /* R */ \
  2321. __asm psrld xmm3, 3 /* B */ \
  2322. __asm psrld xmm2, 5 /* G */ \
  2323. __asm psrad xmm0, 16 /* R */ \
  2324. __asm pand xmm3, xmm5 /* B */ \
  2325. __asm pand xmm2, xmm6 /* G */ \
  2326. __asm pand xmm0, xmm7 /* R */ \
  2327. __asm por xmm3, xmm2 /* BG */ \
  2328. __asm por xmm0, xmm3 /* BGR */ \
  2329. __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
  2330. __asm movdqa xmm2, xmm1 /* G */ \
  2331. __asm pslld xmm1, 8 /* R */ \
  2332. __asm psrld xmm3, 3 /* B */ \
  2333. __asm psrld xmm2, 5 /* G */ \
  2334. __asm psrad xmm1, 16 /* R */ \
  2335. __asm pand xmm3, xmm5 /* B */ \
  2336. __asm pand xmm2, xmm6 /* G */ \
  2337. __asm pand xmm1, xmm7 /* R */ \
  2338. __asm por xmm3, xmm2 /* BG */ \
  2339. __asm por xmm1, xmm3 /* BGR */ \
  2340. __asm packssdw xmm0, xmm1 \
  2341. __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
  2342. __asm lea edx, [edx + 16]}
  2343. // 8 pixels.
  2344. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  2345. __declspec(naked) void I444ToARGBRow_SSSE3(
  2346. const uint8* y_buf,
  2347. const uint8* u_buf,
  2348. const uint8* v_buf,
  2349. uint8* dst_argb,
  2350. const struct YuvConstants* yuvconstants,
  2351. int width) {
  2352. __asm {
  2353. push esi
  2354. push edi
  2355. push ebx
  2356. mov eax, [esp + 12 + 4] // Y
  2357. mov esi, [esp + 12 + 8] // U
  2358. mov edi, [esp + 12 + 12] // V
  2359. mov edx, [esp + 12 + 16] // argb
  2360. mov ebx, [esp + 12 + 20] // yuvconstants
  2361. mov ecx, [esp + 12 + 24] // width
  2362. sub edi, esi
  2363. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2364. convertloop:
  2365. READYUV444
  2366. YUVTORGB(ebx)
  2367. STOREARGB
  2368. sub ecx, 8
  2369. jg convertloop
  2370. pop ebx
  2371. pop edi
  2372. pop esi
  2373. ret
  2374. }
  2375. }
  2376. // 8 pixels.
  2377. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
  2378. __declspec(naked) void I422ToRGB24Row_SSSE3(
  2379. const uint8* y_buf,
  2380. const uint8* u_buf,
  2381. const uint8* v_buf,
  2382. uint8* dst_rgb24,
  2383. const struct YuvConstants* yuvconstants,
  2384. int width) {
  2385. __asm {
  2386. push esi
  2387. push edi
  2388. push ebx
  2389. mov eax, [esp + 12 + 4] // Y
  2390. mov esi, [esp + 12 + 8] // U
  2391. mov edi, [esp + 12 + 12] // V
  2392. mov edx, [esp + 12 + 16] // argb
  2393. mov ebx, [esp + 12 + 20] // yuvconstants
  2394. mov ecx, [esp + 12 + 24] // width
  2395. sub edi, esi
  2396. movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
  2397. movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
  2398. convertloop:
  2399. READYUV422
  2400. YUVTORGB(ebx)
  2401. STORERGB24
  2402. sub ecx, 8
  2403. jg convertloop
  2404. pop ebx
  2405. pop edi
  2406. pop esi
  2407. ret
  2408. }
  2409. }
  2410. // 8 pixels
  2411. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
  2412. __declspec(naked) void I422ToRGB565Row_SSSE3(
  2413. const uint8* y_buf,
  2414. const uint8* u_buf,
  2415. const uint8* v_buf,
  2416. uint8* rgb565_buf,
  2417. const struct YuvConstants* yuvconstants,
  2418. int width) {
  2419. __asm {
  2420. push esi
  2421. push edi
  2422. push ebx
  2423. mov eax, [esp + 12 + 4] // Y
  2424. mov esi, [esp + 12 + 8] // U
  2425. mov edi, [esp + 12 + 12] // V
  2426. mov edx, [esp + 12 + 16] // argb
  2427. mov ebx, [esp + 12 + 20] // yuvconstants
  2428. mov ecx, [esp + 12 + 24] // width
  2429. sub edi, esi
  2430. pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
  2431. psrld xmm5, 27
  2432. pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
  2433. psrld xmm6, 26
  2434. pslld xmm6, 5
  2435. pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
  2436. pslld xmm7, 11
  2437. convertloop:
  2438. READYUV422
  2439. YUVTORGB(ebx)
  2440. STORERGB565
  2441. sub ecx, 8
  2442. jg convertloop
  2443. pop ebx
  2444. pop edi
  2445. pop esi
  2446. ret
  2447. }
  2448. }
  2449. // 8 pixels.
  2450. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2451. __declspec(naked) void I422ToARGBRow_SSSE3(
  2452. const uint8* y_buf,
  2453. const uint8* u_buf,
  2454. const uint8* v_buf,
  2455. uint8* dst_argb,
  2456. const struct YuvConstants* yuvconstants,
  2457. int width) {
  2458. __asm {
  2459. push esi
  2460. push edi
  2461. push ebx
  2462. mov eax, [esp + 12 + 4] // Y
  2463. mov esi, [esp + 12 + 8] // U
  2464. mov edi, [esp + 12 + 12] // V
  2465. mov edx, [esp + 12 + 16] // argb
  2466. mov ebx, [esp + 12 + 20] // yuvconstants
  2467. mov ecx, [esp + 12 + 24] // width
  2468. sub edi, esi
  2469. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2470. convertloop:
  2471. READYUV422
  2472. YUVTORGB(ebx)
  2473. STOREARGB
  2474. sub ecx, 8
  2475. jg convertloop
  2476. pop ebx
  2477. pop edi
  2478. pop esi
  2479. ret
  2480. }
  2481. }
  2482. // 8 pixels.
  2483. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
  2484. __declspec(naked) void I422AlphaToARGBRow_SSSE3(
  2485. const uint8* y_buf,
  2486. const uint8* u_buf,
  2487. const uint8* v_buf,
  2488. const uint8* a_buf,
  2489. uint8* dst_argb,
  2490. const struct YuvConstants* yuvconstants,
  2491. int width) {
  2492. __asm {
  2493. push esi
  2494. push edi
  2495. push ebx
  2496. push ebp
  2497. mov eax, [esp + 16 + 4] // Y
  2498. mov esi, [esp + 16 + 8] // U
  2499. mov edi, [esp + 16 + 12] // V
  2500. mov ebp, [esp + 16 + 16] // A
  2501. mov edx, [esp + 16 + 20] // argb
  2502. mov ebx, [esp + 16 + 24] // yuvconstants
  2503. mov ecx, [esp + 16 + 28] // width
  2504. sub edi, esi
  2505. convertloop:
  2506. READYUVA422
  2507. YUVTORGB(ebx)
  2508. STOREARGB
  2509. sub ecx, 8
  2510. jg convertloop
  2511. pop ebp
  2512. pop ebx
  2513. pop edi
  2514. pop esi
  2515. ret
  2516. }
  2517. }
  2518. // 8 pixels.
  2519. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2520. __declspec(naked) void NV12ToARGBRow_SSSE3(
  2521. const uint8* y_buf,
  2522. const uint8* uv_buf,
  2523. uint8* dst_argb,
  2524. const struct YuvConstants* yuvconstants,
  2525. int width) {
  2526. __asm {
  2527. push esi
  2528. push ebx
  2529. mov eax, [esp + 8 + 4] // Y
  2530. mov esi, [esp + 8 + 8] // UV
  2531. mov edx, [esp + 8 + 12] // argb
  2532. mov ebx, [esp + 8 + 16] // yuvconstants
  2533. mov ecx, [esp + 8 + 20] // width
  2534. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2535. convertloop:
  2536. READNV12
  2537. YUVTORGB(ebx)
  2538. STOREARGB
  2539. sub ecx, 8
  2540. jg convertloop
  2541. pop ebx
  2542. pop esi
  2543. ret
  2544. }
  2545. }
  2546. // 8 pixels.
  2547. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2548. __declspec(naked) void NV21ToARGBRow_SSSE3(
  2549. const uint8* y_buf,
  2550. const uint8* vu_buf,
  2551. uint8* dst_argb,
  2552. const struct YuvConstants* yuvconstants,
  2553. int width) {
  2554. __asm {
  2555. push esi
  2556. push ebx
  2557. mov eax, [esp + 8 + 4] // Y
  2558. mov esi, [esp + 8 + 8] // VU
  2559. mov edx, [esp + 8 + 12] // argb
  2560. mov ebx, [esp + 8 + 16] // yuvconstants
  2561. mov ecx, [esp + 8 + 20] // width
  2562. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2563. convertloop:
  2564. READNV21
  2565. YUVTORGB(ebx)
  2566. STOREARGB
  2567. sub ecx, 8
  2568. jg convertloop
  2569. pop ebx
  2570. pop esi
  2571. ret
  2572. }
  2573. }
  2574. // 8 pixels.
  2575. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  2576. __declspec(naked) void YUY2ToARGBRow_SSSE3(
  2577. const uint8* src_yuy2,
  2578. uint8* dst_argb,
  2579. const struct YuvConstants* yuvconstants,
  2580. int width) {
  2581. __asm {
  2582. push ebx
  2583. mov eax, [esp + 4 + 4] // yuy2
  2584. mov edx, [esp + 4 + 8] // argb
  2585. mov ebx, [esp + 4 + 12] // yuvconstants
  2586. mov ecx, [esp + 4 + 16] // width
  2587. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2588. convertloop:
  2589. READYUY2
  2590. YUVTORGB(ebx)
  2591. STOREARGB
  2592. sub ecx, 8
  2593. jg convertloop
  2594. pop ebx
  2595. ret
  2596. }
  2597. }
  2598. // 8 pixels.
  2599. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  2600. __declspec(naked) void UYVYToARGBRow_SSSE3(
  2601. const uint8* src_uyvy,
  2602. uint8* dst_argb,
  2603. const struct YuvConstants* yuvconstants,
  2604. int width) {
  2605. __asm {
  2606. push ebx
  2607. mov eax, [esp + 4 + 4] // uyvy
  2608. mov edx, [esp + 4 + 8] // argb
  2609. mov ebx, [esp + 4 + 12] // yuvconstants
  2610. mov ecx, [esp + 4 + 16] // width
  2611. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2612. convertloop:
  2613. READUYVY
  2614. YUVTORGB(ebx)
  2615. STOREARGB
  2616. sub ecx, 8
  2617. jg convertloop
  2618. pop ebx
  2619. ret
  2620. }
  2621. }
  2622. __declspec(naked) void I422ToRGBARow_SSSE3(
  2623. const uint8* y_buf,
  2624. const uint8* u_buf,
  2625. const uint8* v_buf,
  2626. uint8* dst_rgba,
  2627. const struct YuvConstants* yuvconstants,
  2628. int width) {
  2629. __asm {
  2630. push esi
  2631. push edi
  2632. push ebx
  2633. mov eax, [esp + 12 + 4] // Y
  2634. mov esi, [esp + 12 + 8] // U
  2635. mov edi, [esp + 12 + 12] // V
2636. mov edx, [esp + 12 + 16] // rgba
  2637. mov ebx, [esp + 12 + 20] // yuvconstants
  2638. mov ecx, [esp + 12 + 24] // width
  2639. sub edi, esi
  2640. convertloop:
  2641. READYUV422
  2642. YUVTORGB(ebx)
  2643. STORERGBA
  2644. sub ecx, 8
  2645. jg convertloop
  2646. pop ebx
  2647. pop edi
  2648. pop esi
  2649. ret
  2650. }
  2651. }
  2652. #endif // HAS_I422TOARGBROW_SSSE3
  2653. #ifdef HAS_I400TOARGBROW_SSE2
  2654. // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
  2655. __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
  2656. uint8* rgb_buf,
  2657. int width) {
  2658. __asm {
  2659. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2660. movd xmm2, eax
  2661. pshufd xmm2, xmm2,0
  2662. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2663. movd xmm3, eax
  2664. pshufd xmm3, xmm3, 0
  2665. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  2666. pslld xmm4, 24
  2667. mov eax, [esp + 4] // Y
  2668. mov edx, [esp + 8] // rgb
  2669. mov ecx, [esp + 12] // width
  2670. convertloop:
  2671. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2672. movq xmm0, qword ptr [eax]
  2673. lea eax, [eax + 8]
  2674. punpcklbw xmm0, xmm0 // Y.Y
  2675. pmulhuw xmm0, xmm2
  2676. psubusw xmm0, xmm3
  2677. psrlw xmm0, 6
  2678. packuswb xmm0, xmm0 // G
  2679. // Step 2: Weave into ARGB
  2680. punpcklbw xmm0, xmm0 // GG
  2681. movdqa xmm1, xmm0
  2682. punpcklwd xmm0, xmm0 // BGRA first 4 pixels
  2683. punpckhwd xmm1, xmm1 // BGRA next 4 pixels
  2684. por xmm0, xmm4
  2685. por xmm1, xmm4
  2686. movdqu [edx], xmm0
  2687. movdqu [edx + 16], xmm1
  2688. lea edx, [edx + 32]
  2689. sub ecx, 8
  2690. jg convertloop
  2691. ret
  2692. }
  2693. }
  2694. #endif // HAS_I400TOARGBROW_SSE2
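// An illustrative scalar sketch (not part of the original file, helper name is
// hypothetical) of the per-pixel math the I400 SSE2/AVX2 kernels above use:
// Y is duplicated into a 16 bit lane, scaled by round(1.164 * 64 * 256),
// debiased by round(1.164 * 64 * 16) with unsigned saturation, shifted down 6
// and packed with saturation. Assumes the uint8/uint32 typedefs this file
// already pulls in from the libyuv headers.
static uint8 I400ToGrayPixelSketch_C(uint8 y) {
  uint32 yy = y * 0x0101u;                 // punpcklbw xmm0, xmm0
  uint32 g = (yy * 0x4a35u) >> 16;         // pmulhuw by 18997
  g = (g > 0x0488u) ? (g - 0x0488u) : 0u;  // psubusw by 1160
  g >>= 6;                                 // psrlw xmm0, 6
  return (uint8)(g > 255u ? 255u : g);     // packuswb saturation
}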
  2695. #ifdef HAS_I400TOARGBROW_AVX2
  2696. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2697. // note: vpunpcklbw mutates and vpackuswb unmutates.
  2698. __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
  2699. uint8* rgb_buf,
  2700. int width) {
  2701. __asm {
  2702. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2703. vmovd xmm2, eax
  2704. vbroadcastss ymm2, xmm2
  2705. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2706. vmovd xmm3, eax
  2707. vbroadcastss ymm3, xmm3
  2708. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
  2709. vpslld ymm4, ymm4, 24
  2710. mov eax, [esp + 4] // Y
  2711. mov edx, [esp + 8] // rgb
  2712. mov ecx, [esp + 12] // width
  2713. convertloop:
2714. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2715. vmovdqu xmm0, [eax]
  2716. lea eax, [eax + 16]
  2717. vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
  2718. vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
  2719. vpmulhuw ymm0, ymm0, ymm2
  2720. vpsubusw ymm0, ymm0, ymm3
  2721. vpsrlw ymm0, ymm0, 6
  2722. vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
  2723. // TODO(fbarchard): Weave alpha with unpack.
  2724. // Step 2: Weave into ARGB
  2725. vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
  2726. vpermq ymm1, ymm1, 0xd8
  2727. vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
  2728. vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
  2729. vpor ymm0, ymm0, ymm4
  2730. vpor ymm1, ymm1, ymm4
  2731. vmovdqu [edx], ymm0
  2732. vmovdqu [edx + 32], ymm1
  2733. lea edx, [edx + 64]
  2734. sub ecx, 16
  2735. jg convertloop
  2736. vzeroupper
  2737. ret
  2738. }
  2739. }
  2740. #endif // HAS_I400TOARGBROW_AVX2
  2741. #ifdef HAS_MIRRORROW_SSSE3
  2742. // Shuffle table for reversing the bytes.
  2743. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
  2744. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2745. // TODO(fbarchard): Replace lea with -16 offset.
  2746. __declspec(naked) void MirrorRow_SSSE3(const uint8* src,
  2747. uint8* dst,
  2748. int width) {
  2749. __asm {
  2750. mov eax, [esp + 4] // src
  2751. mov edx, [esp + 8] // dst
  2752. mov ecx, [esp + 12] // width
  2753. movdqa xmm5, xmmword ptr kShuffleMirror
  2754. convertloop:
  2755. movdqu xmm0, [eax - 16 + ecx]
  2756. pshufb xmm0, xmm5
  2757. movdqu [edx], xmm0
  2758. lea edx, [edx + 16]
  2759. sub ecx, 16
  2760. jg convertloop
  2761. ret
  2762. }
  2763. }
  2764. #endif // HAS_MIRRORROW_SSSE3
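// For reference, a scalar sketch of what MirrorRow computes: the SSSE3/AVX2
// kernels above read from the tail of the row and use pshufb with
// kShuffleMirror to reverse byte order inside each vector. Illustration only;
// the helper name is hypothetical and libyuv's own MirrorRow_C lives elsewhere.
static void MirrorRowSketch_C(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // reverse byte order
  }
}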
  2765. #ifdef HAS_MIRRORROW_AVX2
  2766. __declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2767. __asm {
  2768. mov eax, [esp + 4] // src
  2769. mov edx, [esp + 8] // dst
  2770. mov ecx, [esp + 12] // width
  2771. vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
  2772. convertloop:
  2773. vmovdqu ymm0, [eax - 32 + ecx]
  2774. vpshufb ymm0, ymm0, ymm5
2775. vpermq ymm0, ymm0, 0x4e // swap high and low halves
  2776. vmovdqu [edx], ymm0
  2777. lea edx, [edx + 32]
  2778. sub ecx, 32
  2779. jg convertloop
  2780. vzeroupper
  2781. ret
  2782. }
  2783. }
  2784. #endif // HAS_MIRRORROW_AVX2
  2785. #ifdef HAS_MIRRORUVROW_SSSE3
  2786. // Shuffle table for reversing the bytes of UV channels.
  2787. static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
  2788. 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
  2789. __declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
  2790. uint8* dst_u,
  2791. uint8* dst_v,
  2792. int width) {
  2793. __asm {
  2794. push edi
  2795. mov eax, [esp + 4 + 4] // src
  2796. mov edx, [esp + 4 + 8] // dst_u
  2797. mov edi, [esp + 4 + 12] // dst_v
  2798. mov ecx, [esp + 4 + 16] // width
  2799. movdqa xmm1, xmmword ptr kShuffleMirrorUV
  2800. lea eax, [eax + ecx * 2 - 16]
  2801. sub edi, edx
  2802. convertloop:
  2803. movdqu xmm0, [eax]
  2804. lea eax, [eax - 16]
  2805. pshufb xmm0, xmm1
  2806. movlpd qword ptr [edx], xmm0
  2807. movhpd qword ptr [edx + edi], xmm0
  2808. lea edx, [edx + 8]
  2809. sub ecx, 8
  2810. jg convertloop
  2811. pop edi
  2812. ret
  2813. }
  2814. }
  2815. #endif // HAS_MIRRORUVROW_SSSE3
  2816. #ifdef HAS_ARGBMIRRORROW_SSE2
  2817. __declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
  2818. uint8* dst,
  2819. int width) {
  2820. __asm {
  2821. mov eax, [esp + 4] // src
  2822. mov edx, [esp + 8] // dst
  2823. mov ecx, [esp + 12] // width
  2824. lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
  2825. convertloop:
  2826. movdqu xmm0, [eax]
  2827. lea eax, [eax - 16]
  2828. pshufd xmm0, xmm0, 0x1b
  2829. movdqu [edx], xmm0
  2830. lea edx, [edx + 16]
  2831. sub ecx, 4
  2832. jg convertloop
  2833. ret
  2834. }
  2835. }
  2836. #endif // HAS_ARGBMIRRORROW_SSE2
  2837. #ifdef HAS_ARGBMIRRORROW_AVX2
  2838. // Shuffle table for reversing the bytes.
  2839. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2840. __declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
  2841. uint8* dst,
  2842. int width) {
  2843. __asm {
  2844. mov eax, [esp + 4] // src
  2845. mov edx, [esp + 8] // dst
  2846. mov ecx, [esp + 12] // width
  2847. vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
  2848. convertloop:
  2849. vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
  2850. vmovdqu [edx], ymm0
  2851. lea edx, [edx + 32]
  2852. sub ecx, 8
  2853. jg convertloop
  2854. vzeroupper
  2855. ret
  2856. }
  2857. }
  2858. #endif // HAS_ARGBMIRRORROW_AVX2
  2859. #ifdef HAS_SPLITUVROW_SSE2
  2860. __declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
  2861. uint8* dst_u,
  2862. uint8* dst_v,
  2863. int width) {
  2864. __asm {
  2865. push edi
  2866. mov eax, [esp + 4 + 4] // src_uv
  2867. mov edx, [esp + 4 + 8] // dst_u
  2868. mov edi, [esp + 4 + 12] // dst_v
  2869. mov ecx, [esp + 4 + 16] // width
  2870. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  2871. psrlw xmm5, 8
  2872. sub edi, edx
  2873. convertloop:
  2874. movdqu xmm0, [eax]
  2875. movdqu xmm1, [eax + 16]
  2876. lea eax, [eax + 32]
  2877. movdqa xmm2, xmm0
  2878. movdqa xmm3, xmm1
  2879. pand xmm0, xmm5 // even bytes
  2880. pand xmm1, xmm5
  2881. packuswb xmm0, xmm1
  2882. psrlw xmm2, 8 // odd bytes
  2883. psrlw xmm3, 8
  2884. packuswb xmm2, xmm3
  2885. movdqu [edx], xmm0
  2886. movdqu [edx + edi], xmm2
  2887. lea edx, [edx + 16]
  2888. sub ecx, 16
  2889. jg convertloop
  2890. pop edi
  2891. ret
  2892. }
  2893. }
  2894. #endif // HAS_SPLITUVROW_SSE2
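// A scalar sketch of the UV deinterleave done above with the 0x00ff00ff mask
// (even bytes are U) and a right shift by 8 (odd bytes are V). Illustration
// only; the helper name is hypothetical and libyuv's SplitUVRow_C is defined
// elsewhere.
static void SplitUVRowSketch_C(const uint8* src_uv,
                               uint8* dst_u,
                               uint8* dst_v,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes
  }
}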
  2895. #ifdef HAS_SPLITUVROW_AVX2
  2896. __declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
  2897. uint8* dst_u,
  2898. uint8* dst_v,
  2899. int width) {
  2900. __asm {
  2901. push edi
  2902. mov eax, [esp + 4 + 4] // src_uv
  2903. mov edx, [esp + 4 + 8] // dst_u
  2904. mov edi, [esp + 4 + 12] // dst_v
  2905. mov ecx, [esp + 4 + 16] // width
  2906. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  2907. vpsrlw ymm5, ymm5, 8
  2908. sub edi, edx
  2909. convertloop:
  2910. vmovdqu ymm0, [eax]
  2911. vmovdqu ymm1, [eax + 32]
  2912. lea eax, [eax + 64]
  2913. vpsrlw ymm2, ymm0, 8 // odd bytes
  2914. vpsrlw ymm3, ymm1, 8
  2915. vpand ymm0, ymm0, ymm5 // even bytes
  2916. vpand ymm1, ymm1, ymm5
  2917. vpackuswb ymm0, ymm0, ymm1
  2918. vpackuswb ymm2, ymm2, ymm3
  2919. vpermq ymm0, ymm0, 0xd8
  2920. vpermq ymm2, ymm2, 0xd8
  2921. vmovdqu [edx], ymm0
  2922. vmovdqu [edx + edi], ymm2
  2923. lea edx, [edx + 32]
  2924. sub ecx, 32
  2925. jg convertloop
  2926. pop edi
  2927. vzeroupper
  2928. ret
  2929. }
  2930. }
  2931. #endif // HAS_SPLITUVROW_AVX2
  2932. #ifdef HAS_MERGEUVROW_SSE2
  2933. __declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
  2934. const uint8* src_v,
  2935. uint8* dst_uv,
  2936. int width) {
  2937. __asm {
  2938. push edi
  2939. mov eax, [esp + 4 + 4] // src_u
  2940. mov edx, [esp + 4 + 8] // src_v
  2941. mov edi, [esp + 4 + 12] // dst_uv
  2942. mov ecx, [esp + 4 + 16] // width
  2943. sub edx, eax
  2944. convertloop:
  2945. movdqu xmm0, [eax] // read 16 U's
  2946. movdqu xmm1, [eax + edx] // and 16 V's
  2947. lea eax, [eax + 16]
  2948. movdqa xmm2, xmm0
  2949. punpcklbw xmm0, xmm1 // first 8 UV pairs
  2950. punpckhbw xmm2, xmm1 // next 8 UV pairs
  2951. movdqu [edi], xmm0
  2952. movdqu [edi + 16], xmm2
  2953. lea edi, [edi + 32]
  2954. sub ecx, 16
  2955. jg convertloop
  2956. pop edi
  2957. ret
  2958. }
  2959. }
  2960. #endif // HAS_MERGEUVROW_SSE2
  2961. #ifdef HAS_MERGEUVROW_AVX2
  2962. __declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
  2963. const uint8* src_v,
  2964. uint8* dst_uv,
  2965. int width) {
  2966. __asm {
  2967. push edi
  2968. mov eax, [esp + 4 + 4] // src_u
  2969. mov edx, [esp + 4 + 8] // src_v
  2970. mov edi, [esp + 4 + 12] // dst_uv
  2971. mov ecx, [esp + 4 + 16] // width
  2972. sub edx, eax
  2973. convertloop:
  2974. vmovdqu ymm0, [eax] // read 32 U's
  2975. vmovdqu ymm1, [eax + edx] // and 32 V's
  2976. lea eax, [eax + 32]
  2977. vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
  2978. vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
  2979. vextractf128 [edi], ymm2, 0 // bytes 0..15
  2980. vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
  2981. vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
2982. vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
  2983. lea edi, [edi + 64]
  2984. sub ecx, 32
  2985. jg convertloop
  2986. pop edi
  2987. vzeroupper
  2988. ret
  2989. }
  2990. }
  2991. #endif // HAS_MERGEUVROW_AVX2
  2992. #ifdef HAS_COPYROW_SSE2
2993. // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
  2994. __declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  2995. __asm {
  2996. mov eax, [esp + 4] // src
  2997. mov edx, [esp + 8] // dst
  2998. mov ecx, [esp + 12] // count
  2999. test eax, 15
  3000. jne convertloopu
  3001. test edx, 15
  3002. jne convertloopu
  3003. convertloopa:
  3004. movdqa xmm0, [eax]
  3005. movdqa xmm1, [eax + 16]
  3006. lea eax, [eax + 32]
  3007. movdqa [edx], xmm0
  3008. movdqa [edx + 16], xmm1
  3009. lea edx, [edx + 32]
  3010. sub ecx, 32
  3011. jg convertloopa
  3012. ret
  3013. convertloopu:
  3014. movdqu xmm0, [eax]
  3015. movdqu xmm1, [eax + 16]
  3016. lea eax, [eax + 32]
  3017. movdqu [edx], xmm0
  3018. movdqu [edx + 16], xmm1
  3019. lea edx, [edx + 32]
  3020. sub ecx, 32
  3021. jg convertloopu
  3022. ret
  3023. }
  3024. }
  3025. #endif // HAS_COPYROW_SSE2
  3026. #ifdef HAS_COPYROW_AVX
3027. // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
  3028. __declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  3029. __asm {
  3030. mov eax, [esp + 4] // src
  3031. mov edx, [esp + 8] // dst
  3032. mov ecx, [esp + 12] // count
  3033. convertloop:
  3034. vmovdqu ymm0, [eax]
  3035. vmovdqu ymm1, [eax + 32]
  3036. lea eax, [eax + 64]
  3037. vmovdqu [edx], ymm0
  3038. vmovdqu [edx + 32], ymm1
  3039. lea edx, [edx + 64]
  3040. sub ecx, 64
  3041. jg convertloop
  3042. vzeroupper
  3043. ret
  3044. }
  3045. }
  3046. #endif // HAS_COPYROW_AVX
  3047. // Multiple of 1.
  3048. __declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  3049. __asm {
  3050. mov eax, esi
  3051. mov edx, edi
  3052. mov esi, [esp + 4] // src
  3053. mov edi, [esp + 8] // dst
  3054. mov ecx, [esp + 12] // count
  3055. rep movsb
  3056. mov edi, edx
  3057. mov esi, eax
  3058. ret
  3059. }
  3060. }
  3061. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3062. // width in pixels
  3063. __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
  3064. uint8* dst,
  3065. int width) {
  3066. __asm {
  3067. mov eax, [esp + 4] // src
  3068. mov edx, [esp + 8] // dst
  3069. mov ecx, [esp + 12] // count
  3070. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3071. pslld xmm0, 24
  3072. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3073. psrld xmm1, 8
  3074. convertloop:
  3075. movdqu xmm2, [eax]
  3076. movdqu xmm3, [eax + 16]
  3077. lea eax, [eax + 32]
  3078. movdqu xmm4, [edx]
  3079. movdqu xmm5, [edx + 16]
  3080. pand xmm2, xmm0
  3081. pand xmm3, xmm0
  3082. pand xmm4, xmm1
  3083. pand xmm5, xmm1
  3084. por xmm2, xmm4
  3085. por xmm3, xmm5
  3086. movdqu [edx], xmm2
  3087. movdqu [edx + 16], xmm3
  3088. lea edx, [edx + 32]
  3089. sub ecx, 8
  3090. jg convertloop
  3091. ret
  3092. }
  3093. }
  3094. #endif // HAS_ARGBCOPYALPHAROW_SSE2
  3095. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3096. // width in pixels
  3097. __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
  3098. uint8* dst,
  3099. int width) {
  3100. __asm {
  3101. mov eax, [esp + 4] // src
  3102. mov edx, [esp + 8] // dst
  3103. mov ecx, [esp + 12] // count
  3104. vpcmpeqb ymm0, ymm0, ymm0
  3105. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3106. convertloop:
  3107. vmovdqu ymm1, [eax]
  3108. vmovdqu ymm2, [eax + 32]
  3109. lea eax, [eax + 64]
  3110. vpblendvb ymm1, ymm1, [edx], ymm0
  3111. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3112. vmovdqu [edx], ymm1
  3113. vmovdqu [edx + 32], ymm2
  3114. lea edx, [edx + 64]
  3115. sub ecx, 16
  3116. jg convertloop
  3117. vzeroupper
  3118. ret
  3119. }
  3120. }
  3121. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  3122. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  3123. // width in pixels
  3124. __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
  3125. uint8* dst_a,
  3126. int width) {
  3127. __asm {
  3128. mov eax, [esp + 4] // src_argb
  3129. mov edx, [esp + 8] // dst_a
  3130. mov ecx, [esp + 12] // width
  3131. extractloop:
  3132. movdqu xmm0, [eax]
  3133. movdqu xmm1, [eax + 16]
  3134. lea eax, [eax + 32]
  3135. psrld xmm0, 24
  3136. psrld xmm1, 24
  3137. packssdw xmm0, xmm1
  3138. packuswb xmm0, xmm0
  3139. movq qword ptr [edx], xmm0
  3140. lea edx, [edx + 8]
  3141. sub ecx, 8
  3142. jg extractloop
  3143. ret
  3144. }
  3145. }
  3146. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  3147. #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
  3148. // width in pixels
  3149. __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
  3150. uint8* dst_a,
  3151. int width) {
  3152. __asm {
  3153. mov eax, [esp + 4] // src_argb
  3154. mov edx, [esp + 8] // dst_a
  3155. mov ecx, [esp + 12] // width
  3156. vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
  3157. extractloop:
  3158. vmovdqu ymm0, [eax]
  3159. vmovdqu ymm1, [eax + 32]
  3160. vpsrld ymm0, ymm0, 24
  3161. vpsrld ymm1, ymm1, 24
  3162. vmovdqu ymm2, [eax + 64]
  3163. vmovdqu ymm3, [eax + 96]
  3164. lea eax, [eax + 128]
  3165. vpackssdw ymm0, ymm0, ymm1 // mutates
  3166. vpsrld ymm2, ymm2, 24
  3167. vpsrld ymm3, ymm3, 24
  3168. vpackssdw ymm2, ymm2, ymm3 // mutates
  3169. vpackuswb ymm0, ymm0, ymm2 // mutates
  3170. vpermd ymm0, ymm4, ymm0 // unmutate
  3171. vmovdqu [edx], ymm0
  3172. lea edx, [edx + 32]
  3173. sub ecx, 32
  3174. jg extractloop
  3175. vzeroupper
  3176. ret
  3177. }
  3178. }
  3179. #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
  3180. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3181. // width in pixels
  3182. __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
  3183. uint8* dst,
  3184. int width) {
  3185. __asm {
  3186. mov eax, [esp + 4] // src
  3187. mov edx, [esp + 8] // dst
  3188. mov ecx, [esp + 12] // count
  3189. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3190. pslld xmm0, 24
  3191. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3192. psrld xmm1, 8
  3193. convertloop:
  3194. movq xmm2, qword ptr [eax] // 8 Y's
  3195. lea eax, [eax + 8]
  3196. punpcklbw xmm2, xmm2
  3197. punpckhwd xmm3, xmm2
  3198. punpcklwd xmm2, xmm2
  3199. movdqu xmm4, [edx]
  3200. movdqu xmm5, [edx + 16]
  3201. pand xmm2, xmm0
  3202. pand xmm3, xmm0
  3203. pand xmm4, xmm1
  3204. pand xmm5, xmm1
  3205. por xmm2, xmm4
  3206. por xmm3, xmm5
  3207. movdqu [edx], xmm2
  3208. movdqu [edx + 16], xmm3
  3209. lea edx, [edx + 32]
  3210. sub ecx, 8
  3211. jg convertloop
  3212. ret
  3213. }
  3214. }
  3215. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3216. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3217. // width in pixels
  3218. __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
  3219. uint8* dst,
  3220. int width) {
  3221. __asm {
  3222. mov eax, [esp + 4] // src
  3223. mov edx, [esp + 8] // dst
  3224. mov ecx, [esp + 12] // count
  3225. vpcmpeqb ymm0, ymm0, ymm0
  3226. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3227. convertloop:
  3228. vpmovzxbd ymm1, qword ptr [eax]
  3229. vpmovzxbd ymm2, qword ptr [eax + 8]
  3230. lea eax, [eax + 16]
  3231. vpslld ymm1, ymm1, 24
  3232. vpslld ymm2, ymm2, 24
  3233. vpblendvb ymm1, ymm1, [edx], ymm0
  3234. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3235. vmovdqu [edx], ymm1
  3236. vmovdqu [edx + 32], ymm2
  3237. lea edx, [edx + 64]
  3238. sub ecx, 16
  3239. jg convertloop
  3240. vzeroupper
  3241. ret
  3242. }
  3243. }
  3244. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3245. #ifdef HAS_SETROW_X86
  3246. // Write 'count' bytes using an 8 bit value repeated.
  3247. // Count should be multiple of 4.
  3248. __declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
  3249. __asm {
  3250. movzx eax, byte ptr [esp + 8] // v8
  3251. mov edx, 0x01010101 // Duplicate byte to all bytes.
  3252. mul edx // overwrites edx with upper part of result.
  3253. mov edx, edi
  3254. mov edi, [esp + 4] // dst
  3255. mov ecx, [esp + 12] // count
  3256. shr ecx, 2
  3257. rep stosd
  3258. mov edi, edx
  3259. ret
  3260. }
  3261. }
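// The kernel above replicates the 8 bit value into all four bytes of eax by
// multiplying with 0x01010101, then stores dwords with rep stosd. A scalar
// sketch of the same trick (illustration only; helper name is hypothetical):
static uint32 ReplicateByteSketch(uint8 v8) {
  return (uint32)v8 * 0x01010101u;  // 0xAB -> 0xABABABAB
}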
  3262. // Write 'count' bytes using an 8 bit value repeated.
  3263. __declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
  3264. __asm {
  3265. mov edx, edi
  3266. mov edi, [esp + 4] // dst
  3267. mov eax, [esp + 8] // v8
  3268. mov ecx, [esp + 12] // count
  3269. rep stosb
  3270. mov edi, edx
  3271. ret
  3272. }
  3273. }
  3274. // Write 'count' 32 bit values.
  3275. __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
  3276. __asm {
  3277. mov edx, edi
  3278. mov edi, [esp + 4] // dst
  3279. mov eax, [esp + 8] // v32
  3280. mov ecx, [esp + 12] // count
  3281. rep stosd
  3282. mov edi, edx
  3283. ret
  3284. }
  3285. }
  3286. #endif // HAS_SETROW_X86
  3287. #ifdef HAS_YUY2TOYROW_AVX2
  3288. __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
  3289. uint8* dst_y,
  3290. int width) {
  3291. __asm {
  3292. mov eax, [esp + 4] // src_yuy2
  3293. mov edx, [esp + 8] // dst_y
  3294. mov ecx, [esp + 12] // width
  3295. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3296. vpsrlw ymm5, ymm5, 8
  3297. convertloop:
  3298. vmovdqu ymm0, [eax]
  3299. vmovdqu ymm1, [eax + 32]
  3300. lea eax, [eax + 64]
  3301. vpand ymm0, ymm0, ymm5 // even bytes are Y
  3302. vpand ymm1, ymm1, ymm5
  3303. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3304. vpermq ymm0, ymm0, 0xd8
  3305. vmovdqu [edx], ymm0
  3306. lea edx, [edx + 32]
  3307. sub ecx, 32
  3308. jg convertloop
  3309. vzeroupper
  3310. ret
  3311. }
  3312. }
  3313. __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
  3314. int stride_yuy2,
  3315. uint8* dst_u,
  3316. uint8* dst_v,
  3317. int width) {
  3318. __asm {
  3319. push esi
  3320. push edi
  3321. mov eax, [esp + 8 + 4] // src_yuy2
  3322. mov esi, [esp + 8 + 8] // stride_yuy2
  3323. mov edx, [esp + 8 + 12] // dst_u
  3324. mov edi, [esp + 8 + 16] // dst_v
  3325. mov ecx, [esp + 8 + 20] // width
  3326. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3327. vpsrlw ymm5, ymm5, 8
  3328. sub edi, edx
  3329. convertloop:
  3330. vmovdqu ymm0, [eax]
  3331. vmovdqu ymm1, [eax + 32]
  3332. vpavgb ymm0, ymm0, [eax + esi]
  3333. vpavgb ymm1, ymm1, [eax + esi + 32]
  3334. lea eax, [eax + 64]
  3335. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3336. vpsrlw ymm1, ymm1, 8
  3337. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3338. vpermq ymm0, ymm0, 0xd8
  3339. vpand ymm1, ymm0, ymm5 // U
  3340. vpsrlw ymm0, ymm0, 8 // V
  3341. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3342. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3343. vpermq ymm1, ymm1, 0xd8
  3344. vpermq ymm0, ymm0, 0xd8
  3345. vextractf128 [edx], ymm1, 0 // U
  3346. vextractf128 [edx + edi], ymm0, 0 // V
  3347. lea edx, [edx + 16]
  3348. sub ecx, 32
  3349. jg convertloop
  3350. pop edi
  3351. pop esi
  3352. vzeroupper
  3353. ret
  3354. }
  3355. }
  3356. __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  3357. uint8* dst_u,
  3358. uint8* dst_v,
  3359. int width) {
  3360. __asm {
  3361. push edi
  3362. mov eax, [esp + 4 + 4] // src_yuy2
  3363. mov edx, [esp + 4 + 8] // dst_u
  3364. mov edi, [esp + 4 + 12] // dst_v
  3365. mov ecx, [esp + 4 + 16] // width
  3366. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3367. vpsrlw ymm5, ymm5, 8
  3368. sub edi, edx
  3369. convertloop:
  3370. vmovdqu ymm0, [eax]
  3371. vmovdqu ymm1, [eax + 32]
  3372. lea eax, [eax + 64]
  3373. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3374. vpsrlw ymm1, ymm1, 8
  3375. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3376. vpermq ymm0, ymm0, 0xd8
  3377. vpand ymm1, ymm0, ymm5 // U
  3378. vpsrlw ymm0, ymm0, 8 // V
  3379. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3380. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3381. vpermq ymm1, ymm1, 0xd8
  3382. vpermq ymm0, ymm0, 0xd8
  3383. vextractf128 [edx], ymm1, 0 // U
  3384. vextractf128 [edx + edi], ymm0, 0 // V
  3385. lea edx, [edx + 16]
  3386. sub ecx, 32
  3387. jg convertloop
  3388. pop edi
  3389. vzeroupper
  3390. ret
  3391. }
  3392. }
  3393. __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
  3394. uint8* dst_y,
  3395. int width) {
  3396. __asm {
  3397. mov eax, [esp + 4] // src_uyvy
  3398. mov edx, [esp + 8] // dst_y
  3399. mov ecx, [esp + 12] // width
  3400. convertloop:
  3401. vmovdqu ymm0, [eax]
  3402. vmovdqu ymm1, [eax + 32]
  3403. lea eax, [eax + 64]
  3404. vpsrlw ymm0, ymm0, 8 // odd bytes are Y
  3405. vpsrlw ymm1, ymm1, 8
  3406. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3407. vpermq ymm0, ymm0, 0xd8
  3408. vmovdqu [edx], ymm0
  3409. lea edx, [edx + 32]
  3410. sub ecx, 32
  3411. jg convertloop
  3412. vzeroupper
  3413. ret
  3414. }
  3415. }
  3416. __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
  3417. int stride_uyvy,
  3418. uint8* dst_u,
  3419. uint8* dst_v,
  3420. int width) {
  3421. __asm {
  3422. push esi
  3423. push edi
3424. mov eax, [esp + 8 + 4] // src_uyvy
3425. mov esi, [esp + 8 + 8] // stride_uyvy
  3426. mov edx, [esp + 8 + 12] // dst_u
  3427. mov edi, [esp + 8 + 16] // dst_v
  3428. mov ecx, [esp + 8 + 20] // width
  3429. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3430. vpsrlw ymm5, ymm5, 8
  3431. sub edi, edx
  3432. convertloop:
  3433. vmovdqu ymm0, [eax]
  3434. vmovdqu ymm1, [eax + 32]
  3435. vpavgb ymm0, ymm0, [eax + esi]
  3436. vpavgb ymm1, ymm1, [eax + esi + 32]
  3437. lea eax, [eax + 64]
  3438. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3439. vpand ymm1, ymm1, ymm5
  3440. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3441. vpermq ymm0, ymm0, 0xd8
  3442. vpand ymm1, ymm0, ymm5 // U
  3443. vpsrlw ymm0, ymm0, 8 // V
  3444. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3445. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3446. vpermq ymm1, ymm1, 0xd8
  3447. vpermq ymm0, ymm0, 0xd8
  3448. vextractf128 [edx], ymm1, 0 // U
  3449. vextractf128 [edx + edi], ymm0, 0 // V
  3450. lea edx, [edx + 16]
  3451. sub ecx, 32
  3452. jg convertloop
  3453. pop edi
  3454. pop esi
  3455. vzeroupper
  3456. ret
  3457. }
  3458. }
  3459. __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  3460. uint8* dst_u,
  3461. uint8* dst_v,
  3462. int width) {
  3463. __asm {
  3464. push edi
3465. mov eax, [esp + 4 + 4] // src_uyvy
  3466. mov edx, [esp + 4 + 8] // dst_u
  3467. mov edi, [esp + 4 + 12] // dst_v
  3468. mov ecx, [esp + 4 + 16] // width
  3469. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3470. vpsrlw ymm5, ymm5, 8
  3471. sub edi, edx
  3472. convertloop:
  3473. vmovdqu ymm0, [eax]
  3474. vmovdqu ymm1, [eax + 32]
  3475. lea eax, [eax + 64]
  3476. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3477. vpand ymm1, ymm1, ymm5
  3478. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3479. vpermq ymm0, ymm0, 0xd8
  3480. vpand ymm1, ymm0, ymm5 // U
  3481. vpsrlw ymm0, ymm0, 8 // V
  3482. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3483. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3484. vpermq ymm1, ymm1, 0xd8
  3485. vpermq ymm0, ymm0, 0xd8
  3486. vextractf128 [edx], ymm1, 0 // U
  3487. vextractf128 [edx + edi], ymm0, 0 // V
  3488. lea edx, [edx + 16]
  3489. sub ecx, 32
  3490. jg convertloop
  3491. pop edi
  3492. vzeroupper
  3493. ret
  3494. }
  3495. }
  3496. #endif // HAS_YUY2TOYROW_AVX2
  3497. #ifdef HAS_YUY2TOYROW_SSE2
  3498. __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
  3499. uint8* dst_y,
  3500. int width) {
  3501. __asm {
  3502. mov eax, [esp + 4] // src_yuy2
  3503. mov edx, [esp + 8] // dst_y
  3504. mov ecx, [esp + 12] // width
  3505. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3506. psrlw xmm5, 8
  3507. convertloop:
  3508. movdqu xmm0, [eax]
  3509. movdqu xmm1, [eax + 16]
  3510. lea eax, [eax + 32]
  3511. pand xmm0, xmm5 // even bytes are Y
  3512. pand xmm1, xmm5
  3513. packuswb xmm0, xmm1
  3514. movdqu [edx], xmm0
  3515. lea edx, [edx + 16]
  3516. sub ecx, 16
  3517. jg convertloop
  3518. ret
  3519. }
  3520. }
  3521. __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
  3522. int stride_yuy2,
  3523. uint8* dst_u,
  3524. uint8* dst_v,
  3525. int width) {
  3526. __asm {
  3527. push esi
  3528. push edi
  3529. mov eax, [esp + 8 + 4] // src_yuy2
  3530. mov esi, [esp + 8 + 8] // stride_yuy2
  3531. mov edx, [esp + 8 + 12] // dst_u
  3532. mov edi, [esp + 8 + 16] // dst_v
  3533. mov ecx, [esp + 8 + 20] // width
  3534. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3535. psrlw xmm5, 8
  3536. sub edi, edx
  3537. convertloop:
  3538. movdqu xmm0, [eax]
  3539. movdqu xmm1, [eax + 16]
  3540. movdqu xmm2, [eax + esi]
  3541. movdqu xmm3, [eax + esi + 16]
  3542. lea eax, [eax + 32]
  3543. pavgb xmm0, xmm2
  3544. pavgb xmm1, xmm3
  3545. psrlw xmm0, 8 // YUYV -> UVUV
  3546. psrlw xmm1, 8
  3547. packuswb xmm0, xmm1
  3548. movdqa xmm1, xmm0
  3549. pand xmm0, xmm5 // U
  3550. packuswb xmm0, xmm0
  3551. psrlw xmm1, 8 // V
  3552. packuswb xmm1, xmm1
  3553. movq qword ptr [edx], xmm0
  3554. movq qword ptr [edx + edi], xmm1
  3555. lea edx, [edx + 8]
  3556. sub ecx, 16
  3557. jg convertloop
  3558. pop edi
  3559. pop esi
  3560. ret
  3561. }
  3562. }
  3563. __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  3564. uint8* dst_u,
  3565. uint8* dst_v,
  3566. int width) {
  3567. __asm {
  3568. push edi
  3569. mov eax, [esp + 4 + 4] // src_yuy2
  3570. mov edx, [esp + 4 + 8] // dst_u
  3571. mov edi, [esp + 4 + 12] // dst_v
  3572. mov ecx, [esp + 4 + 16] // width
  3573. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3574. psrlw xmm5, 8
  3575. sub edi, edx
  3576. convertloop:
  3577. movdqu xmm0, [eax]
  3578. movdqu xmm1, [eax + 16]
  3579. lea eax, [eax + 32]
  3580. psrlw xmm0, 8 // YUYV -> UVUV
  3581. psrlw xmm1, 8
  3582. packuswb xmm0, xmm1
  3583. movdqa xmm1, xmm0
  3584. pand xmm0, xmm5 // U
  3585. packuswb xmm0, xmm0
  3586. psrlw xmm1, 8 // V
  3587. packuswb xmm1, xmm1
  3588. movq qword ptr [edx], xmm0
  3589. movq qword ptr [edx + edi], xmm1
  3590. lea edx, [edx + 8]
  3591. sub ecx, 16
  3592. jg convertloop
  3593. pop edi
  3594. ret
  3595. }
  3596. }
  3597. __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
  3598. uint8* dst_y,
  3599. int width) {
  3600. __asm {
  3601. mov eax, [esp + 4] // src_uyvy
  3602. mov edx, [esp + 8] // dst_y
  3603. mov ecx, [esp + 12] // width
  3604. convertloop:
  3605. movdqu xmm0, [eax]
  3606. movdqu xmm1, [eax + 16]
  3607. lea eax, [eax + 32]
  3608. psrlw xmm0, 8 // odd bytes are Y
  3609. psrlw xmm1, 8
  3610. packuswb xmm0, xmm1
  3611. movdqu [edx], xmm0
  3612. lea edx, [edx + 16]
  3613. sub ecx, 16
  3614. jg convertloop
  3615. ret
  3616. }
  3617. }
  3618. __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
  3619. int stride_uyvy,
  3620. uint8* dst_u,
  3621. uint8* dst_v,
  3622. int width) {
  3623. __asm {
  3624. push esi
  3625. push edi
3626. mov eax, [esp + 8 + 4] // src_uyvy
3627. mov esi, [esp + 8 + 8] // stride_uyvy
  3628. mov edx, [esp + 8 + 12] // dst_u
  3629. mov edi, [esp + 8 + 16] // dst_v
  3630. mov ecx, [esp + 8 + 20] // width
  3631. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3632. psrlw xmm5, 8
  3633. sub edi, edx
  3634. convertloop:
  3635. movdqu xmm0, [eax]
  3636. movdqu xmm1, [eax + 16]
  3637. movdqu xmm2, [eax + esi]
  3638. movdqu xmm3, [eax + esi + 16]
  3639. lea eax, [eax + 32]
  3640. pavgb xmm0, xmm2
  3641. pavgb xmm1, xmm3
  3642. pand xmm0, xmm5 // UYVY -> UVUV
  3643. pand xmm1, xmm5
  3644. packuswb xmm0, xmm1
  3645. movdqa xmm1, xmm0
  3646. pand xmm0, xmm5 // U
  3647. packuswb xmm0, xmm0
  3648. psrlw xmm1, 8 // V
  3649. packuswb xmm1, xmm1
  3650. movq qword ptr [edx], xmm0
  3651. movq qword ptr [edx + edi], xmm1
  3652. lea edx, [edx + 8]
  3653. sub ecx, 16
  3654. jg convertloop
  3655. pop edi
  3656. pop esi
  3657. ret
  3658. }
  3659. }
  3660. __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  3661. uint8* dst_u,
  3662. uint8* dst_v,
  3663. int width) {
  3664. __asm {
  3665. push edi
3666. mov eax, [esp + 4 + 4] // src_uyvy
  3667. mov edx, [esp + 4 + 8] // dst_u
  3668. mov edi, [esp + 4 + 12] // dst_v
  3669. mov ecx, [esp + 4 + 16] // width
  3670. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3671. psrlw xmm5, 8
  3672. sub edi, edx
  3673. convertloop:
  3674. movdqu xmm0, [eax]
  3675. movdqu xmm1, [eax + 16]
  3676. lea eax, [eax + 32]
  3677. pand xmm0, xmm5 // UYVY -> UVUV
  3678. pand xmm1, xmm5
  3679. packuswb xmm0, xmm1
  3680. movdqa xmm1, xmm0
  3681. pand xmm0, xmm5 // U
  3682. packuswb xmm0, xmm0
  3683. psrlw xmm1, 8 // V
  3684. packuswb xmm1, xmm1
  3685. movq qword ptr [edx], xmm0
  3686. movq qword ptr [edx + edi], xmm1
  3687. lea edx, [edx + 8]
  3688. sub ecx, 16
  3689. jg convertloop
  3690. pop edi
  3691. ret
  3692. }
  3693. }
  3694. #endif // HAS_YUY2TOYROW_SSE2
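// Packed 4:2:2 byte order assumed by the kernels above: YUY2 stores
// Y0 U Y1 V per two pixels, UYVY stores U Y0 V Y1. A scalar sketch of the Y
// extraction (illustration only; the SIMD paths use the 0x00ff00ff mask or
// psrlw 8 instead of indexing; helper name is hypothetical):
static void YUY2ToYRowSketch_C(const uint8* src_yuy2, uint8* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // even bytes are Y; UYVY would use 2 * x + 1
  }
}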
  3695. #ifdef HAS_BLENDPLANEROW_SSSE3
  3696. // Blend 8 pixels at a time.
  3697. // unsigned version of math
  3698. // =((A2*C2)+(B2*(255-C2))+255)/256
  3699. // signed version of math
  3700. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3701. __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
  3702. const uint8* src1,
  3703. const uint8* alpha,
  3704. uint8* dst,
  3705. int width) {
  3706. __asm {
  3707. push esi
  3708. push edi
  3709. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3710. psllw xmm5, 8
  3711. mov eax, 0x80808080 // 128 for biasing image to signed.
  3712. movd xmm6, eax
  3713. pshufd xmm6, xmm6, 0x00
  3714. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3715. movd xmm7, eax
  3716. pshufd xmm7, xmm7, 0x00
  3717. mov eax, [esp + 8 + 4] // src0
  3718. mov edx, [esp + 8 + 8] // src1
  3719. mov esi, [esp + 8 + 12] // alpha
  3720. mov edi, [esp + 8 + 16] // dst
  3721. mov ecx, [esp + 8 + 20] // width
  3722. sub eax, esi
  3723. sub edx, esi
  3724. sub edi, esi
  3725. // 8 pixel loop.
  3726. convertloop8:
  3727. movq xmm0, qword ptr [esi] // alpha
  3728. punpcklbw xmm0, xmm0
  3729. pxor xmm0, xmm5 // a, 255-a
  3730. movq xmm1, qword ptr [eax + esi] // src0
  3731. movq xmm2, qword ptr [edx + esi] // src1
  3732. punpcklbw xmm1, xmm2
  3733. psubb xmm1, xmm6 // bias src0/1 - 128
  3734. pmaddubsw xmm0, xmm1
  3735. paddw xmm0, xmm7 // unbias result - 32768 and round.
  3736. psrlw xmm0, 8
  3737. packuswb xmm0, xmm0
  3738. movq qword ptr [edi + esi], xmm0
  3739. lea esi, [esi + 8]
  3740. sub ecx, 8
  3741. jg convertloop8
  3742. pop edi
  3743. pop esi
  3744. ret
  3745. }
  3746. }
  3747. #endif // HAS_BLENDPLANEROW_SSSE3
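// A scalar sketch of the unsigned blend formula quoted above,
// ((A2*C2)+(B2*(255-C2))+255)/256. The SIMD kernels compute the signed
// variant so pmaddubsw can pair a and 255-a in a single multiply-add.
// Illustration only; helper name is hypothetical.
static uint8 BlendPlanePixelSketch_C(uint8 a, uint8 b, uint8 c) {
  return (uint8)((a * c + b * (255 - c) + 255) >> 8);
}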
  3748. #ifdef HAS_BLENDPLANEROW_AVX2
  3749. // Blend 32 pixels at a time.
  3750. // unsigned version of math
  3751. // =((A2*C2)+(B2*(255-C2))+255)/256
  3752. // signed version of math
  3753. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3754. __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
  3755. const uint8* src1,
  3756. const uint8* alpha,
  3757. uint8* dst,
  3758. int width) {
  3759. __asm {
  3760. push esi
  3761. push edi
  3762. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
  3763. vpsllw ymm5, ymm5, 8
  3764. mov eax, 0x80808080 // 128 for biasing image to signed.
  3765. vmovd xmm6, eax
  3766. vbroadcastss ymm6, xmm6
  3767. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3768. vmovd xmm7, eax
  3769. vbroadcastss ymm7, xmm7
  3770. mov eax, [esp + 8 + 4] // src0
  3771. mov edx, [esp + 8 + 8] // src1
  3772. mov esi, [esp + 8 + 12] // alpha
  3773. mov edi, [esp + 8 + 16] // dst
  3774. mov ecx, [esp + 8 + 20] // width
  3775. sub eax, esi
  3776. sub edx, esi
  3777. sub edi, esi
  3778. // 32 pixel loop.
  3779. convertloop32:
  3780. vmovdqu ymm0, [esi] // alpha
  3781. vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
  3782. vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
  3783. vpxor ymm3, ymm3, ymm5 // a, 255-a
  3784. vpxor ymm0, ymm0, ymm5 // a, 255-a
  3785. vmovdqu ymm1, [eax + esi] // src0
  3786. vmovdqu ymm2, [edx + esi] // src1
  3787. vpunpckhbw ymm4, ymm1, ymm2
  3788. vpunpcklbw ymm1, ymm1, ymm2
  3789. vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
  3790. vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
  3791. vpmaddubsw ymm3, ymm3, ymm4
  3792. vpmaddubsw ymm0, ymm0, ymm1
  3793. vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
  3794. vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
  3795. vpsrlw ymm3, ymm3, 8
  3796. vpsrlw ymm0, ymm0, 8
  3797. vpackuswb ymm0, ymm0, ymm3
  3798. vmovdqu [edi + esi], ymm0
  3799. lea esi, [esi + 32]
  3800. sub ecx, 32
  3801. jg convertloop32
  3802. pop edi
  3803. pop esi
  3804. vzeroupper
  3805. ret
  3806. }
  3807. }
  3808. #endif // HAS_BLENDPLANEROW_AVX2
  3809. #ifdef HAS_ARGBBLENDROW_SSSE3
  3810. // Shuffle table for isolating alpha.
  3811. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3812. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
  3813. // Blend 8 pixels at a time.
  3814. __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
  3815. const uint8* src_argb1,
  3816. uint8* dst_argb,
  3817. int width) {
  3818. __asm {
  3819. push esi
  3820. mov eax, [esp + 4 + 4] // src_argb0
  3821. mov esi, [esp + 4 + 8] // src_argb1
  3822. mov edx, [esp + 4 + 12] // dst_argb
  3823. mov ecx, [esp + 4 + 16] // width
  3824. pcmpeqb xmm7, xmm7 // generate constant 0x0001
  3825. psrlw xmm7, 15
  3826. pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
  3827. psrlw xmm6, 8
  3828. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3829. psllw xmm5, 8
  3830. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  3831. pslld xmm4, 24
  3832. sub ecx, 4
  3833. jl convertloop4b // less than 4 pixels?
  3834. // 4 pixel loop.
  3835. convertloop4:
  3836. movdqu xmm3, [eax] // src argb
  3837. lea eax, [eax + 16]
  3838. movdqa xmm0, xmm3 // src argb
  3839. pxor xmm3, xmm4 // ~alpha
  3840. movdqu xmm2, [esi] // _r_b
  3841. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3842. pand xmm2, xmm6 // _r_b
  3843. paddw xmm3, xmm7 // 256 - alpha
  3844. pmullw xmm2, xmm3 // _r_b * alpha
  3845. movdqu xmm1, [esi] // _a_g
  3846. lea esi, [esi + 16]
  3847. psrlw xmm1, 8 // _a_g
  3848. por xmm0, xmm4 // set alpha to 255
  3849. pmullw xmm1, xmm3 // _a_g * alpha
  3850. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3851. paddusb xmm0, xmm2 // + src argb
  3852. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3853. paddusb xmm0, xmm1 // + src argb
  3854. movdqu [edx], xmm0
  3855. lea edx, [edx + 16]
  3856. sub ecx, 4
  3857. jge convertloop4
  3858. convertloop4b:
  3859. add ecx, 4 - 1
  3860. jl convertloop1b
  3861. // 1 pixel loop.
  3862. convertloop1:
  3863. movd xmm3, [eax] // src argb
  3864. lea eax, [eax + 4]
  3865. movdqa xmm0, xmm3 // src argb
  3866. pxor xmm3, xmm4 // ~alpha
  3867. movd xmm2, [esi] // _r_b
  3868. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3869. pand xmm2, xmm6 // _r_b
  3870. paddw xmm3, xmm7 // 256 - alpha
  3871. pmullw xmm2, xmm3 // _r_b * alpha
  3872. movd xmm1, [esi] // _a_g
  3873. lea esi, [esi + 4]
  3874. psrlw xmm1, 8 // _a_g
  3875. por xmm0, xmm4 // set alpha to 255
  3876. pmullw xmm1, xmm3 // _a_g * alpha
  3877. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3878. paddusb xmm0, xmm2 // + src argb
  3879. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3880. paddusb xmm0, xmm1 // + src argb
  3881. movd [edx], xmm0
  3882. lea edx, [edx + 4]
  3883. sub ecx, 1
  3884. jge convertloop1
  3885. convertloop1b:
  3886. pop esi
  3887. ret
  3888. }
  3889. }
  3890. #endif // HAS_ARGBBLENDROW_SSSE3
  3891. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3892. // Shuffle table duplicating alpha.
  3893. static const uvec8 kShuffleAlpha0 = {
  3894. 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  3895. };
  3896. static const uvec8 kShuffleAlpha1 = {
  3897. 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3898. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  3899. };
  3900. __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
  3901. uint8* dst_argb,
  3902. int width) {
  3903. __asm {
  3904. mov eax, [esp + 4] // src_argb0
  3905. mov edx, [esp + 8] // dst_argb
  3906. mov ecx, [esp + 12] // width
  3907. pcmpeqb xmm3, xmm3 // generate mask 0xff000000
  3908. pslld xmm3, 24
  3909. movdqa xmm4, xmmword ptr kShuffleAlpha0
  3910. movdqa xmm5, xmmword ptr kShuffleAlpha1
  3911. convertloop:
  3912. movdqu xmm0, [eax] // read 4 pixels
  3913. pshufb xmm0, xmm4 // isolate first 2 alphas
  3914. movdqu xmm1, [eax] // read 4 pixels
  3915. punpcklbw xmm1, xmm1 // first 2 pixel rgbs
  3916. pmulhuw xmm0, xmm1 // rgb * a
  3917. movdqu xmm1, [eax] // read 4 pixels
  3918. pshufb xmm1, xmm5 // isolate next 2 alphas
  3919. movdqu xmm2, [eax] // read 4 pixels
  3920. punpckhbw xmm2, xmm2 // next 2 pixel rgbs
  3921. pmulhuw xmm1, xmm2 // rgb * a
  3922. movdqu xmm2, [eax] // mask original alpha
  3923. lea eax, [eax + 16]
  3924. pand xmm2, xmm3
  3925. psrlw xmm0, 8
  3926. psrlw xmm1, 8
  3927. packuswb xmm0, xmm1
  3928. por xmm0, xmm2 // copy original alpha
  3929. movdqu [edx], xmm0
  3930. lea edx, [edx + 16]
  3931. sub ecx, 4
  3932. jg convertloop
  3933. ret
  3934. }
  3935. }
  3936. #endif // HAS_ARGBATTENUATEROW_SSSE3
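// A scalar sketch of the attenuate (premultiply) math above: each color
// channel is multiplied by the pixel's alpha. The kernels duplicate bytes into
// 16 bit lanes and use pmulhuw, which works out to roughly c * a / 255.
// Illustration only; helper name is hypothetical and libyuv's exact C path is
// elsewhere.
static uint8 AttenuatePixelSketch_C(uint8 c, uint8 a) {
  uint32 cc = c * 0x0101u;                 // punpcklbw duplication
  uint32 aa = a * 0x0101u;                 // kShuffleAlpha duplication
  return (uint8)(((cc * aa) >> 16) >> 8);  // pmulhuw then psrlw 8
}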
  3937. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3938. // Shuffle table duplicating alpha.
  3939. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  3940. 128u, 128u, 14u, 15u, 14u, 15u,
  3941. 14u, 15u, 128u, 128u};
  3942. __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
  3943. uint8* dst_argb,
  3944. int width) {
  3945. __asm {
  3946. mov eax, [esp + 4] // src_argb0
  3947. mov edx, [esp + 8] // dst_argb
  3948. mov ecx, [esp + 12] // width
  3949. sub edx, eax
  3950. vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
  3951. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
  3952. vpslld ymm5, ymm5, 24
  3953. convertloop:
  3954. vmovdqu ymm6, [eax] // read 8 pixels.
  3955. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  3956. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  3957. vpshufb ymm2, ymm0, ymm4 // low 4 alphas
  3958. vpshufb ymm3, ymm1, ymm4 // high 4 alphas
  3959. vpmulhuw ymm0, ymm0, ymm2 // rgb * a
  3960. vpmulhuw ymm1, ymm1, ymm3 // rgb * a
  3961. vpand ymm6, ymm6, ymm5 // isolate alpha
  3962. vpsrlw ymm0, ymm0, 8
  3963. vpsrlw ymm1, ymm1, 8
  3964. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  3965. vpor ymm0, ymm0, ymm6 // copy original alpha
  3966. vmovdqu [eax + edx], ymm0
  3967. lea eax, [eax + 32]
  3968. sub ecx, 8
  3969. jg convertloop
  3970. vzeroupper
  3971. ret
  3972. }
  3973. }
  3974. #endif // HAS_ARGBATTENUATEROW_AVX2
  3975. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  3976. // Unattenuate 4 pixels at a time.
  3977. __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
  3978. uint8* dst_argb,
  3979. int width) {
  3980. __asm {
  3981. push ebx
  3982. push esi
  3983. push edi
  3984. mov eax, [esp + 12 + 4] // src_argb
  3985. mov edx, [esp + 12 + 8] // dst_argb
  3986. mov ecx, [esp + 12 + 12] // width
  3987. lea ebx, fixed_invtbl8
  3988. convertloop:
  3989. movdqu xmm0, [eax] // read 4 pixels
  3990. movzx esi, byte ptr [eax + 3] // first alpha
  3991. movzx edi, byte ptr [eax + 7] // second alpha
  3992. punpcklbw xmm0, xmm0 // first 2
  3993. movd xmm2, dword ptr [ebx + esi * 4]
  3994. movd xmm3, dword ptr [ebx + edi * 4]
  3995. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
  3996. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  3997. movlhps xmm2, xmm3
  3998. pmulhuw xmm0, xmm2 // rgb * a
  3999. movdqu xmm1, [eax] // read 4 pixels
  4000. movzx esi, byte ptr [eax + 11] // third alpha
4001. movzx edi, byte ptr [eax + 15] // fourth alpha
  4002. punpckhbw xmm1, xmm1 // next 2
  4003. movd xmm2, dword ptr [ebx + esi * 4]
  4004. movd xmm3, dword ptr [ebx + edi * 4]
  4005. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
  4006. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4007. movlhps xmm2, xmm3
  4008. pmulhuw xmm1, xmm2 // rgb * a
  4009. lea eax, [eax + 16]
  4010. packuswb xmm0, xmm1
  4011. movdqu [edx], xmm0
  4012. lea edx, [edx + 16]
  4013. sub ecx, 4
  4014. jg convertloop
  4015. pop edi
  4016. pop esi
  4017. pop ebx
  4018. ret
  4019. }
  4020. }
  4021. #endif // HAS_ARGBUNATTENUATEROW_SSE2
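// Unattenuate is the inverse: divide each channel by the pixel's alpha. The
// SSE2/AVX2 kernels above avoid division by looking up a fixed point
// reciprocal in fixed_invtbl8 (defined elsewhere in libyuv) and using
// pmulhuw. A rough scalar sketch of the intent using real division
// (illustration only; rounding differs from the table-based SIMD path and the
// helper name is hypothetical):
static uint8 UnattenuatePixelSketch_C(uint8 c, uint8 a) {
  uint32 v;
  if (a == 0) {
    return 0;  // sketch assumption: fully transparent channel stays 0
  }
  v = (c * 255u + a / 2) / a;  // undo c * a / 255, rounded
  return (uint8)(v > 255u ? 255u : v);
}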
  4022. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4023. // Shuffle table duplicating alpha.
  4024. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4025. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  4026. // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
  4027. // USE_GATHER is not on by default, due to being a slow instruction.
  4028. #ifdef USE_GATHER
  4029. __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
  4030. uint8* dst_argb,
  4031. int width) {
  4032. __asm {
  4033. mov eax, [esp + 4] // src_argb0
  4034. mov edx, [esp + 8] // dst_argb
  4035. mov ecx, [esp + 12] // width
  4036. sub edx, eax
  4037. vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
  4038. convertloop:
  4039. vmovdqu ymm6, [eax] // read 8 pixels.
  4040. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
  4041. vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
  4042. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4043. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4044. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
  4045. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4046. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4047. vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
  4048. vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
  4049. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4050. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4051. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4052. vmovdqu [eax + edx], ymm0
  4053. lea eax, [eax + 32]
  4054. sub ecx, 8
  4055. jg convertloop
  4056. vzeroupper
  4057. ret
  4058. }
  4059. }
  4060. #else // USE_GATHER
  4061. __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
  4062. uint8* dst_argb,
  4063. int width) {
  4064. __asm {
  4065. push ebx
  4066. push esi
  4067. push edi
  4068. mov eax, [esp + 12 + 4] // src_argb
  4069. mov edx, [esp + 12 + 8] // dst_argb
  4070. mov ecx, [esp + 12 + 12] // width
  4071. sub edx, eax
  4072. lea ebx, fixed_invtbl8
  4073. vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
  4074. convertloop:
  4075. // replace VPGATHER
  4076. movzx esi, byte ptr [eax + 3] // alpha0
  4077. movzx edi, byte ptr [eax + 7] // alpha1
  4078. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
  4079. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
  4080. movzx esi, byte ptr [eax + 11] // alpha2
  4081. movzx edi, byte ptr [eax + 15] // alpha3
  4082. vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
  4083. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
  4084. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
  4085. movzx esi, byte ptr [eax + 19] // alpha4
  4086. movzx edi, byte ptr [eax + 23] // alpha5
  4087. vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
  4088. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
  4089. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
  4090. movzx esi, byte ptr [eax + 27] // alpha6
  4091. movzx edi, byte ptr [eax + 31] // alpha7
  4092. vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
  4093. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
  4094. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
  4095. vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
  4096. vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
  4097. vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
  4098. vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
  4099. // end of VPGATHER
  4100. vmovdqu ymm6, [eax] // read 8 pixels.
  4101. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4102. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4103. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4104. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4105. vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
  4106. vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
  4107. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4108. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4109. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4110. vmovdqu [eax + edx], ymm0
  4111. lea eax, [eax + 32]
  4112. sub ecx, 8
  4113. jg convertloop
  4114. pop edi
  4115. pop esi
  4116. pop ebx
  4117. vzeroupper
  4118. ret
  4119. }
  4120. }
  4121. #endif // USE_GATHER
4122. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  4123. #ifdef HAS_ARGBGRAYROW_SSSE3
4124. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
  4125. __declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
  4126. uint8* dst_argb,
  4127. int width) {
  4128. __asm {
  4129. mov eax, [esp + 4] /* src_argb */
  4130. mov edx, [esp + 8] /* dst_argb */
  4131. mov ecx, [esp + 12] /* width */
  4132. movdqa xmm4, xmmword ptr kARGBToYJ
  4133. movdqa xmm5, xmmword ptr kAddYJ64
  4134. convertloop:
  4135. movdqu xmm0, [eax] // G
  4136. movdqu xmm1, [eax + 16]
  4137. pmaddubsw xmm0, xmm4
  4138. pmaddubsw xmm1, xmm4
  4139. phaddw xmm0, xmm1
  4140. paddw xmm0, xmm5 // Add .5 for rounding.
  4141. psrlw xmm0, 7
  4142. packuswb xmm0, xmm0 // 8 G bytes
  4143. movdqu xmm2, [eax] // A
  4144. movdqu xmm3, [eax + 16]
  4145. lea eax, [eax + 32]
  4146. psrld xmm2, 24
  4147. psrld xmm3, 24
  4148. packuswb xmm2, xmm3
  4149. packuswb xmm2, xmm2 // 8 A bytes
  4150. movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
  4151. punpcklbw xmm0, xmm0 // 8 GG words
  4152. punpcklbw xmm3, xmm2 // 8 GA words
  4153. movdqa xmm1, xmm0
  4154. punpcklwd xmm0, xmm3 // GGGA first 4
  4155. punpckhwd xmm1, xmm3 // GGGA next 4
  4156. movdqu [edx], xmm0
  4157. movdqu [edx + 16], xmm1
  4158. lea edx, [edx + 32]
  4159. sub ecx, 8
  4160. jg convertloop
  4161. ret
  4162. }
  4163. }
  4164. #endif // HAS_ARGBGRAYROW_SSSE3
  4165. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4166. // b = (r * 35 + g * 68 + b * 17) >> 7
  4167. // g = (r * 45 + g * 88 + b * 22) >> 7
  4168. // r = (r * 50 + g * 98 + b * 24) >> 7
  4169. // Constant for ARGB color to sepia tone.
  4170. static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  4171. 17, 68, 35, 0, 17, 68, 35, 0};
  4172. static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  4173. 22, 88, 45, 0, 22, 88, 45, 0};
  4174. static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  4175. 24, 98, 50, 0, 24, 98, 50, 0};
  4176. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  4177. __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  4178. __asm {
  4179. mov eax, [esp + 4] /* dst_argb */
  4180. mov ecx, [esp + 8] /* width */
  4181. movdqa xmm2, xmmword ptr kARGBToSepiaB
  4182. movdqa xmm3, xmmword ptr kARGBToSepiaG
  4183. movdqa xmm4, xmmword ptr kARGBToSepiaR
  4184. convertloop:
  4185. movdqu xmm0, [eax] // B
  4186. movdqu xmm6, [eax + 16]
  4187. pmaddubsw xmm0, xmm2
  4188. pmaddubsw xmm6, xmm2
  4189. phaddw xmm0, xmm6
  4190. psrlw xmm0, 7
  4191. packuswb xmm0, xmm0 // 8 B values
  4192. movdqu xmm5, [eax] // G
  4193. movdqu xmm1, [eax + 16]
  4194. pmaddubsw xmm5, xmm3
  4195. pmaddubsw xmm1, xmm3
  4196. phaddw xmm5, xmm1
  4197. psrlw xmm5, 7
  4198. packuswb xmm5, xmm5 // 8 G values
  4199. punpcklbw xmm0, xmm5 // 8 BG values
  4200. movdqu xmm5, [eax] // R
  4201. movdqu xmm1, [eax + 16]
  4202. pmaddubsw xmm5, xmm4
  4203. pmaddubsw xmm1, xmm4
  4204. phaddw xmm5, xmm1
  4205. psrlw xmm5, 7
  4206. packuswb xmm5, xmm5 // 8 R values
  4207. movdqu xmm6, [eax] // A
  4208. movdqu xmm1, [eax + 16]
  4209. psrld xmm6, 24
  4210. psrld xmm1, 24
  4211. packuswb xmm6, xmm1
  4212. packuswb xmm6, xmm6 // 8 A values
  4213. punpcklbw xmm5, xmm6 // 8 RA values
  4214. movdqa xmm1, xmm0 // Weave BG, RA together
  4215. punpcklwd xmm0, xmm5 // BGRA first 4
  4216. punpckhwd xmm1, xmm5 // BGRA next 4
  4217. movdqu [eax], xmm0
  4218. movdqu [eax + 16], xmm1
  4219. lea eax, [eax + 32]
  4220. sub ecx, 8
  4221. jg convertloop
  4222. ret
  4223. }
  4224. }
  4225. #endif // HAS_ARGBSEPIAROW_SSSE3
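// A scalar sketch of the sepia formulas listed above the constants
// (illustration only; helper name is hypothetical, saturation mirrors
// packuswb):
static void SepiaPixelSketch_C(uint8* b, uint8* g, uint8* r) {
  int b0 = *b, g0 = *g, r0 = *r;
  int nb = (r0 * 35 + g0 * 68 + b0 * 17) >> 7;
  int ng = (r0 * 45 + g0 * 88 + b0 * 22) >> 7;
  int nr = (r0 * 50 + g0 * 98 + b0 * 24) >> 7;
  *b = (uint8)(nb > 255 ? 255 : nb);
  *g = (uint8)(ng > 255 ? 255 : ng);
  *r = (uint8)(nr > 255 ? 255 : nr);
}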
  4226. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4227. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  4228. // Same as Sepia except matrix is provided.
  4229. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
  4230. // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
  4231. __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
  4232. uint8* dst_argb,
  4233. const int8* matrix_argb,
  4234. int width) {
  4235. __asm {
  4236. mov eax, [esp + 4] /* src_argb */
  4237. mov edx, [esp + 8] /* dst_argb */
  4238. mov ecx, [esp + 12] /* matrix_argb */
  4239. movdqu xmm5, [ecx]
  4240. pshufd xmm2, xmm5, 0x00
  4241. pshufd xmm3, xmm5, 0x55
  4242. pshufd xmm4, xmm5, 0xaa
  4243. pshufd xmm5, xmm5, 0xff
  4244. mov ecx, [esp + 16] /* width */
  4245. convertloop:
  4246. movdqu xmm0, [eax] // B
  4247. movdqu xmm7, [eax + 16]
  4248. pmaddubsw xmm0, xmm2
  4249. pmaddubsw xmm7, xmm2
  4250. movdqu xmm6, [eax] // G
  4251. movdqu xmm1, [eax + 16]
  4252. pmaddubsw xmm6, xmm3
  4253. pmaddubsw xmm1, xmm3
  4254. phaddsw xmm0, xmm7 // B
  4255. phaddsw xmm6, xmm1 // G
  4256. psraw xmm0, 6 // B
  4257. psraw xmm6, 6 // G
  4258. packuswb xmm0, xmm0 // 8 B values
  4259. packuswb xmm6, xmm6 // 8 G values
  4260. punpcklbw xmm0, xmm6 // 8 BG values
  4261. movdqu xmm1, [eax] // R
  4262. movdqu xmm7, [eax + 16]
  4263. pmaddubsw xmm1, xmm4
  4264. pmaddubsw xmm7, xmm4
  4265. phaddsw xmm1, xmm7 // R
  4266. movdqu xmm6, [eax] // A
  4267. movdqu xmm7, [eax + 16]
  4268. pmaddubsw xmm6, xmm5
  4269. pmaddubsw xmm7, xmm5
  4270. phaddsw xmm6, xmm7 // A
  4271. psraw xmm1, 6 // R
  4272. psraw xmm6, 6 // A
  4273. packuswb xmm1, xmm1 // 8 R values
  4274. packuswb xmm6, xmm6 // 8 A values
  4275. punpcklbw xmm1, xmm6 // 8 RA values
  4276. movdqa xmm6, xmm0 // Weave BG, RA together
  4277. punpcklwd xmm0, xmm1 // BGRA first 4
  4278. punpckhwd xmm6, xmm1 // BGRA next 4
  4279. movdqu [edx], xmm0
  4280. movdqu [edx + 16], xmm6
  4281. lea eax, [eax + 32]
  4282. lea edx, [edx + 32]
  4283. sub ecx, 8
  4284. jg convertloop
  4285. ret
  4286. }
  4287. }
  4288. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
  4289. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4290. // Quantize 4 ARGB pixels (16 bytes).
  4291. __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
  4292. int scale,
  4293. int interval_size,
  4294. int interval_offset,
  4295. int width) {
  4296. __asm {
  4297. mov eax, [esp + 4] /* dst_argb */
  4298. movd xmm2, [esp + 8] /* scale */
  4299. movd xmm3, [esp + 12] /* interval_size */
  4300. movd xmm4, [esp + 16] /* interval_offset */
  4301. mov ecx, [esp + 20] /* width */
  4302. pshuflw xmm2, xmm2, 040h
  4303. pshufd xmm2, xmm2, 044h
  4304. pshuflw xmm3, xmm3, 040h
  4305. pshufd xmm3, xmm3, 044h
  4306. pshuflw xmm4, xmm4, 040h
  4307. pshufd xmm4, xmm4, 044h
  4308. pxor xmm5, xmm5 // constant 0
  4309. pcmpeqb xmm6, xmm6 // generate mask 0xff000000
  4310. pslld xmm6, 24
  4311. convertloop:
  4312. movdqu xmm0, [eax] // read 4 pixels
  4313. punpcklbw xmm0, xmm5 // first 2 pixels
  4314. pmulhuw xmm0, xmm2 // pixel * scale >> 16
  4315. movdqu xmm1, [eax] // read 4 pixels
  4316. punpckhbw xmm1, xmm5 // next 2 pixels
  4317. pmulhuw xmm1, xmm2
  4318. pmullw xmm0, xmm3 // * interval_size
  4319. movdqu xmm7, [eax] // read 4 pixels
  4320. pmullw xmm1, xmm3
  4321. pand xmm7, xmm6 // mask alpha
  4322. paddw xmm0, xmm4 // + interval_size / 2
  4323. paddw xmm1, xmm4
  4324. packuswb xmm0, xmm1
  4325. por xmm0, xmm7
  4326. movdqu [eax], xmm0
  4327. lea eax, [eax + 16]
  4328. sub ecx, 4
  4329. jg convertloop
  4330. ret
  4331. }
  4332. }
  4333. #endif // HAS_ARGBQUANTIZEROW_SSE2
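// A scalar sketch of the quantize math above: scale the channel into a bucket
// index with a 16.16 fixed point multiply, expand by interval_size and offset
// by interval_offset; alpha is left untouched by the SIMD kernel.
// Illustration only; helper name is hypothetical.
static uint8 QuantizePixelSketch_C(uint8 c,
                                   int scale,
                                   int interval_size,
                                   int interval_offset) {
  int v = ((c * scale) >> 16) * interval_size + interval_offset;
  if (v < 0) v = 0;        // packuswb saturates low
  return (uint8)(v > 255 ? 255 : v);  // and high
}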
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
        uint8* dst_argb,
        int width,
        uint32 value) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    movd xmm2, [esp + 16] // value
    punpcklbw xmm2, xmm2
    punpcklqdq xmm2, xmm2
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    pmulhuw xmm0, xmm2 // argb * value
    pmulhuw xmm1, xmm2 // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBSHADEROW_SSE2
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
        const uint8* src_argb1,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pxor xmm5, xmm5 // constant 0
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    movdqu xmm2, [esi] // read 4 pixels from src_argb1
    movdqu xmm1, xmm0
    movdqu xmm3, xmm2
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    punpcklbw xmm2, xmm5 // first 2
    punpckhbw xmm3, xmm5 // next 2
    pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_SSE2
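
// Illustration only, not part of the original source: the SSE2 loop above
// approximates (a * b) / 255 per byte by widening one operand to a * 257
// (punpcklbw with itself) and keeping the high 16 bits of the word product
// (pmulhuw). Hypothetical helper name; disabled via #if 0.
#if 0
static void ARGBMultiplyRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1, uint8* dst_argb,
                                   int width) {
  for (int i = 0; i < width * 4; ++i) {
    uint32 a = src_argb0[i] * 0x101;      // byte replicated to 16 bits
    uint32 b = src_argb1[i];
    dst_argb[i] = (uint8)((a * b) >> 16); // ~ src0 * src1 / 255
  }
}
#endif
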
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
        const uint8* src_argb1,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub ecx, 4
    jl convertloop49
  convertloop4:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4
  convertloop49:
    add ecx, 4 - 1
    jl convertloop19
  convertloop1:
    movd xmm0, [eax] // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi] // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1
  convertloop19:
    pop esi
    ret
  }
}
#endif // HAS_ARGBADDROW_SSE2
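
// Illustration only, not part of the original source: paddusb above is a
// per-byte saturating add; the subtract routine below is the mirrored
// psubusb case, clamping at 0 instead of 255. Hypothetical helper name;
// disabled via #if 0.
#if 0
static void ARGBAddRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                              uint8* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int sum = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);  // saturate like paddusb
  }
}
#endif
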
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
        const uint8* src_argb1,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1 // src_argb0 - src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_SSE2
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
        const uint8* src_argb1,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    vpxor ymm5, ymm5, ymm5 // constant 0
  convertloop:
    vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1 // low 4
    vpunpckhbw ymm1, ymm1, ymm1 // high 4
    vpunpcklbw ymm2, ymm3, ymm5 // low 4
    vpunpckhbw ymm3, ymm3, ymm5 // high 4
    vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
        const uint8* src_argb1,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBADDROW_AVX2
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
        const uint8* src_argb1,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_AVX2
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
        const uint8* src_y1,
        const uint8* src_y2,
        uint8* dst_sobelx,
        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y0
    mov esi, [esp + 8 + 8] // src_y1
    mov edi, [esp + 8 + 12] // src_y2
    mov edx, [esp + 8 + 16] // dst_sobelx
    mov ecx, [esp + 8 + 20] // width
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0
  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_SOBELXROW_SSE2
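
// Illustration only, not part of the original source: scalar sketch of the
// 3x3 SobelX filter above. The horizontal differences of the three rows are
// summed (middle row weighted by 2), then the absolute value is clamped to
// 255. The SobelY routine below is the transposed filter over two rows.
// Hypothetical helper name; disabled via #if 0.
#if 0
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  for (int x = 0; x < width; ++x) {
    int s = (src_y0[x] - src_y0[x + 2]) + 2 * (src_y1[x] - src_y1[x + 2]) +
            (src_y2[x] - src_y2[x + 2]);
    if (s < 0) s = -s;                       // abs, as pmaxsw(x, -x) does
    dst_sobelx[x] = (uint8)(s > 255 ? 255 : s);
  }
}
#endif
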
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
        const uint8* src_y1,
        uint8* dst_sobely,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_y0
    mov esi, [esp + 4 + 8] // src_y1
    mov edx, [esp + 4 + 12] // dst_sobely
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0
  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELYROW_SSE2
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
        const uint8* src_sobely,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
    pslld xmm5, 24 // 0xff000000
  convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqa xmm2, xmm0 // GG
    punpcklbw xmm2, xmm0 // First 8
    punpckhbw xmm0, xmm0 // Next 8
    movdqa xmm1, xmm2 // GGGG
    punpcklwd xmm1, xmm2 // First 4
    punpckhwd xmm2, xmm2 // Next 4
    por xmm1, xmm5 // GGGA
    por xmm2, xmm5
    movdqa xmm3, xmm0 // GGGG
    punpcklwd xmm3, xmm0 // Next 4
    punpckhwd xmm0, xmm0 // Last 4
    por xmm3, xmm5 // GGGA
    por xmm0, xmm5
    movdqu [edx], xmm1
    movdqu [edx + 16], xmm2
    movdqu [edx + 32], xmm3
    movdqu [edx + 48], xmm0
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELROW_SSE2
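
// Illustration only, not part of the original source: scalar sketch of the
// interleave above. The saturated sum of the two Sobel planes is replicated
// into B, G and R, with alpha forced to 255 as the comment block describes.
// Hypothetical helper name; disabled via #if 0.
#if 0
static void SobelRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                            uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8 g = (uint8)(s > 255 ? 255 : s);  // paddusb saturation
    dst_argb[0] = g;    // B
    dst_argb[1] = g;    // G
    dst_argb[2] = g;    // R
    dst_argb[3] = 255;  // A
    dst_argb += 4;
  }
}
#endif
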
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
        const uint8* src_sobely,
        uint8* dst_y,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_y
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
  convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELTOPLANEROW_SSE2
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
        const uint8* src_sobely,
        uint8* dst_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
  convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1 // sobel = sobelx + sobely
    movdqa xmm3, xmm0 // XA
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1 // YS
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4 // YSXA
    punpcklwd xmm6, xmm3 // First 4
    punpckhwd xmm4, xmm3 // Next 4
    movdqa xmm7, xmm1 // YSXA
    punpcklwd xmm7, xmm0 // Next 4
    punpckhwd xmm1, xmm0 // Last 4
    movdqu [edx], xmm6
    movdqu [edx + 16], xmm4
    movdqu [edx + 32], xmm7
    movdqu [edx + 48], xmm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELXYROW_SSE2
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
void CumulativeSumToAverageRow_SSE2(const int32* topleft,
        const int32* botleft,
        int width,
        int area,
        uint8* dst,
        int count) {
  __asm {
    mov eax, topleft // eax topleft
    mov esi, botleft // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5 // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b
    cmp area, 128 // 128 pixels will not overflow 15 bits.
    ja l4
    pshufd xmm5, xmm5, 0 // area
    pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6 // (65536.0 + area - 1)
    mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5 // 0.16 fixed point
    packssdw xmm5, xmm5 // 16 bit shorts
    // 4 pixel loop small blocks.
  s4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]
    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]
    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]
    packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3
    pmulhuw xmm0, xmm5
    pmulhuw xmm2, xmm5
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4
    jmp l4b
    // 4 pixel loop
  l4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]
    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]
    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]
    cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4
  l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
  l1:
    movdqu xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
  l1b:
  }
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
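
// Illustration only, not part of the original source: the integral-image
// arithmetic performed above, one output pixel (4 int32 channels) at a time.
// topleft/botleft point into the cumulative-sum buffer and width is the area
// width measured in ints, as the comment block states. The SIMD code uses a
// reciprocal or 0.16 fixed-point multiply instead of a true divide.
// Hypothetical helper name; disabled via #if 0.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft, int width,
                                             int area, uint8* dst, int count) {
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c]    // top left - top right
                  - botleft[c] + botleft[width + c]; // - bottom left + bottom right
      dst[c] = (uint8)(sum / area);
    }
    topleft += 4;  // advance one ARGB pixel (4 ints)
    botleft += 4;
    dst += 4;
  }
}
#endif
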
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row,
        int32* cumsum,
        const int32* previous_cumsum,
        int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0
    pxor xmm1, xmm1
    sub ecx, 4
    jl l4b
    test edx, 15
    jne l4b
    // 4 pixel loop
  l4:
    movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2
    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1
    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi] // previous row above.
    paddd xmm2, xmm0
    paddd xmm0, xmm3
    movdqu xmm3, [esi + 16]
    paddd xmm3, xmm0
    paddd xmm0, xmm4
    movdqu xmm4, [esi + 32]
    paddd xmm4, xmm0
    paddd xmm0, xmm5
    movdqu xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    movdqu [edx + 32], xmm4
    movdqu [edx + 48], xmm5
    lea edx, [edx + 64]
    sub ecx, 4
    jge l4
  l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
  l1:
    movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1
  l1b:
  }
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
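
// Illustration only, not part of the original source: scalar form of the
// running-sum recurrence above. Each output is the sum of all pixels above
// and to the left, built from a per-row running total (xmm0 in the asm)
// plus the matching entry of the previous row's cumulative sums.
// Hypothetical helper name; disabled via #if 0.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}
#endif
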
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
        int src_argb_stride,
        uint8* dst_argb,
        const float* uv_dudv,
        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 12] // src_argb
    mov esi, [esp + 16] // stride
    mov edx, [esp + 20] // dst_argb
    mov ecx, [esp + 24] // pointer to uv_dudv
    movq xmm2, qword ptr [ecx] // uv
    movq xmm7, qword ptr [ecx + 8] // dudv
    mov ecx, [esp + 28] // width
    shl esi, 16 // 4, stride
    add esi, 4
    movd xmm5, esi
    sub ecx, 4
    jl l4b
    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44 // dup dudv
    pshufd xmm5, xmm5, 0 // dup 4, stride
    movdqa xmm0, xmm2 // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4 // dudv *= 2
    movdqa xmm3, xmm2 // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4 // dudv *= 4
    // 4 pixel loop
  l4:
    cvttps2dq xmm0, xmm2 // x, y float to int first 2
    cvttps2dq xmm1, xmm3 // x, y float to int next 2
    packssdw xmm0, xmm1 // x, y as 8 shorts
    pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd xmm1, [eax + esi] // read pixel 0
    movd xmm6, [eax + edi] // read pixel 1
    punpckldq xmm1, xmm6 // combine pixel 0 and 1
    addps xmm2, xmm4 // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi] // read pixel 2
    movd xmm0, [eax + edi] // read pixel 3
    punpckldq xmm6, xmm0 // combine pixel 2 and 3
    addps xmm3, xmm4 // x, y += dx, dy next 2
    movq qword ptr 8[edx], xmm6
    lea edx, [edx + 16]
    sub ecx, 4
    jge l4
  l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
  l1:
    cvttps2dq xmm0, xmm2 // x, y float to int
    packssdw xmm0, xmm0 // x, y as shorts
    pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
    addps xmm2, xmm7 // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi] // copy a pixel
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge l1
  l1b:
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBAFFINEROW_SSE2
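
// Illustration only, not part of the original source: scalar version of the
// affine walk above. uv_dudv holds the starting (u, v) followed by the
// per-pixel step (du, dv); each source offset is (int)u * 4 + (int)v * stride,
// matching the truncating cvttps2dq/pmaddwd sequence. Hypothetical helper
// name; disabled via #if 0.
#if 0
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int offset = (int)u * 4 + (int)v * src_argb_stride;  // truncate toward 0
    const uint8* p = src_argb + offset;
    dst_argb[0] = p[0];  // copy one ARGB pixel
    dst_argb[1] = p[1];
    dst_argb[2] = p[2];
    dst_argb[3] = p[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif
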
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
        const uint8* src_ptr,
        ptrdiff_t src_stride,
        int dst_width,
        int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    sub edi, esi
    cmp eax, 128
    je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
    vmovd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    vmovd xmm5, eax // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5
    mov eax, 0x80808080 // 128b for bias and rounding.
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
  xloop:
    vmovdqu ymm0, [esi]
    vmovdqu ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2 // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb ymm1, ymm1, ymm4 // bias to signed image
    vpsubb ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw ymm1, ymm1, ymm4 // unbias and round
    vpaddw ymm0, ymm0, ymm4
    vpsrlw ymm1, ymm1, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutates
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop
    jmp xloop99
    // Blend 50 / 50.
  xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop50
    jmp xloop99
    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb
  xloop99:
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_INTERPOLATEROW_AVX2
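
// Illustration only, not part of the original source: the vertical blend that
// the AVX2 routine above and the SSSE3 routine below approximate (the SIMD
// versions also apply a signed bias and rounding term), with the same special
// cases for fractions of 0 (plain copy) and 128 (average). Hypothetical
// helper name; disabled via #if 0.
#if 0
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1 = source_y_fraction;  // weight of the second row, 0..255
  int y0 = 256 - y1;           // weight of the first row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1) >> 8);
  }
}
#endif
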
// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
        const uint8* src_ptr,
        ptrdiff_t src_stride,
        int dst_width,
        int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    sub edi, esi
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    cmp eax, 128
    je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
    movd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    movd xmm5, eax // low fraction 255..1
    punpcklbw xmm5, xmm0
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0
    mov eax, 0x80808080 // 128 for biasing image to signed.
    movd xmm4, eax
    pshufd xmm4, xmm4, 0x00
  xloop:
    movdqu xmm0, [esi]
    movdqu xmm2, [esi + edx]
    movdqu xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    psubb xmm0, xmm4 // bias image by -128
    psubb xmm1, xmm4
    movdqa xmm2, xmm5
    movdqa xmm3, xmm5
    pmaddubsw xmm2, xmm0
    pmaddubsw xmm3, xmm1
    paddw xmm2, xmm4
    paddw xmm3, xmm4
    psrlw xmm2, 8
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [esi + edi], xmm2
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop
    jmp xloop99
    // Blend 50 / 50.
  xloop50:
    movdqu xmm0, [esi]
    movdqu xmm1, [esi + edx]
    pavgb xmm0, xmm1
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop50
    jmp xloop99
    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu xmm0, [esi]
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop100
  xloop99:
    pop edi
    pop esi
    ret
  }
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
        uint8* dst_argb,
        const uint8* shuffler,
        int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // shuffler
    movdqu xmm5, [ecx]
    mov ecx, [esp + 16] // width
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pshufb xmm0, xmm5
    pshufb xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}
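
// Illustration only, not part of the original source: the channel reorder
// performed by the shuffle rows above and below, written byte-at-a-time the
// way the scalar fallback (shuf_any1) in the SSE2 version further down does:
// output byte j of each pixel comes from input byte shuffler[j].
// Hypothetical helper name; disabled via #if 0.
#if 0
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {
      dst_argb[j] = src_argb[shuffler[j]];
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
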
#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
        uint8* dst_argb,
        const uint8* shuffler,
        int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // shuffler
    vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
    mov ecx, [esp + 16] // width
  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpshufb ymm0, ymm0, ymm5
    vpshufb ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg wloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
        uint8* dst_argb,
        const uint8* shuffler,
        int width) {
  __asm {
    push ebx
    push esi
    mov eax, [esp + 8 + 4] // src_argb
    mov edx, [esp + 8 + 8] // dst_argb
    mov esi, [esp + 8 + 12] // shuffler
    mov ecx, [esp + 8 + 16] // width
    pxor xmm5, xmm5
    mov ebx, [esi] // shuffler
    cmp ebx, 0x03000102
    je shuf_3012
    cmp ebx, 0x00010203
    je shuf_0123
    cmp ebx, 0x00030201
    je shuf_0321
    cmp ebx, 0x02010003
    je shuf_2103
    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx ebx, byte ptr [esi]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx], bl
    movzx ebx, byte ptr [esi + 1]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx + 1], bl
    movzx ebx, byte ptr [esi + 2]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx + 2], bl
    movzx ebx, byte ptr [esi + 3]
    movzx ebx, byte ptr [eax + ebx]
    mov [edx + 3], bl
    lea eax, [eax + 4]
    lea edx, [edx + 4]
    sub ecx, 1
    jg shuf_any1
    jmp shuf99
  shuf_0123:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw xmm0, xmm0, 01Bh
    pshufhw xmm1, xmm1, 01Bh
    pshuflw xmm1, xmm1, 01Bh
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_0123
    jmp shuf99
  shuf_0321:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw xmm0, xmm0, 039h
    pshufhw xmm1, xmm1, 039h
    pshuflw xmm1, xmm1, 039h
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_0321
    jmp shuf99
  shuf_2103:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw xmm0, xmm0, 093h
    pshufhw xmm1, xmm1, 093h
    pshuflw xmm1, xmm1, 093h
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_2103
    jmp shuf99
  shuf_3012:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpckhbw xmm1, xmm5
    pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw xmm0, xmm0, 0C6h
    pshufhw xmm1, xmm1, 0C6h
    pshuflw xmm1, xmm1, 0C6h
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg shuf_3012
  shuf99:
    pop esi
    pop ebx
    ret
  }
}
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
        const uint8* src_u,
        const uint8* src_v,
        uint8* dst_frame,
        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y
    mov esi, [esp + 8 + 8] // src_u
    mov edx, [esp + 8 + 12] // src_v
    mov edi, [esp + 8 + 16] // dst_frame
    mov ecx, [esp + 8 + 20] // width
    sub edx, esi
  convertloop:
    movq xmm2, qword ptr [esi] // U
    movq xmm3, qword ptr [esi + edx] // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3 // UV
    movdqu xmm0, [eax] // Y
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2 // YUYV
    punpckhbw xmm1, xmm2
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm1
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
        const uint8* src_u,
        const uint8* src_v,
        uint8* dst_frame,
        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y
    mov esi, [esp + 8 + 8] // src_u
    mov edx, [esp + 8 + 12] // src_v
    mov edi, [esp + 8 + 16] // dst_frame
    mov ecx, [esp + 8 + 20] // width
    sub edx, esi
  convertloop:
    movq xmm2, qword ptr [esi] // U
    movq xmm3, qword ptr [esi + edx] // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3 // UV
    movdqu xmm0, [eax] // Y
    movdqa xmm1, xmm2
    lea eax, [eax + 16]
    punpcklbw xmm1, xmm0 // UYVY
    punpckhbw xmm2, xmm0
    movdqu [edi], xmm1
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
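
// Illustration only, not part of the original source: the packing the two
// routines above perform. Each macro-pixel covers two Y samples plus one
// shared U/V pair, as the format comment before I422ToYUY2Row_SSE2 describes.
// Hypothetical helper name; disabled via #if 0.
#if 0
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];  // YUY2 order: Y0 U Y1 V
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    // For UYVY the same four bytes are written as U Y0 V Y1.
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}
#endif
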
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
        uint8* dst_argb,
        const float* poly,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* src_argb */
    mov edx, [esp + 4 + 8] /* dst_argb */
    mov esi, [esp + 4 + 12] /* poly */
    mov ecx, [esp + 4 + 16] /* width */
    pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
    // 2 pixel loop.
  convertloop:
    // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
    // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
    movq xmm0, qword ptr [eax] // BGRABGRA
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm3
    movdqa xmm4, xmm0
    punpcklwd xmm0, xmm3 // pixel 0
    punpckhwd xmm4, xmm3 // pixel 1
    cvtdq2ps xmm0, xmm0 // 4 floats
    cvtdq2ps xmm4, xmm4
    movdqa xmm1, xmm0 // X
    movdqa xmm5, xmm4
    mulps xmm0, [esi + 16] // C1 * X
    mulps xmm4, [esi + 16]
    addps xmm0, [esi] // result = C0 + C1 * X
    addps xmm4, [esi]
    movdqa xmm2, xmm1
    movdqa xmm6, xmm5
    mulps xmm2, xmm1 // X * X
    mulps xmm6, xmm5
    mulps xmm1, xmm2 // X * X * X
    mulps xmm5, xmm6
    mulps xmm2, [esi + 32] // C2 * X * X
    mulps xmm6, [esi + 32]
    mulps xmm1, [esi + 48] // C3 * X * X * X
    mulps xmm5, [esi + 48]
    addps xmm0, xmm2 // result += C2 * X * X
    addps xmm4, xmm6
    addps xmm0, xmm1 // result += C3 * X * X * X
    addps xmm4, xmm5
    cvttps2dq xmm0, xmm0
    cvttps2dq xmm4, xmm4
    packuswb xmm0, xmm4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
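
// Illustration only, not part of the original source: the cubic polynomial
// evaluated above. poly holds four 4-float coefficient vectors C0..C3, one
// float per channel in each vector; the result is truncated and limited to
// the 0..255 output range when repacked to bytes. Hypothetical helper name;
// disabled via #if 0.
#if 0
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel index within the pixel
    float x = (float)src_argb[i];
    float result = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                   poly[c + 12] * x * x * x;
    int v = (int)result;  // cvttps2dq truncates toward zero
    dst_argb[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
#endif
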
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
        uint8* dst_argb,
        const float* poly,
        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* poly */
    vbroadcastf128 ymm4, [ecx] // C0
    vbroadcastf128 ymm5, [ecx + 16] // C1
    vbroadcastf128 ymm6, [ecx + 32] // C2
    vbroadcastf128 ymm7, [ecx + 48] // C3
    mov ecx, [esp + 16] /* width */
    // 2 pixel loop.
  convertloop:
    vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
    lea eax, [eax + 8]
    vcvtdq2ps ymm0, ymm0 // X 8 floats
    vmulps ymm2, ymm0, ymm0 // X * X
    vmulps ymm3, ymm0, ymm7 // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
    vcvttps2dq ymm0, ymm0
    vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
    vmovq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_HALFFLOATROW_SSE2
static float kExpBias = 1.9259299444e-34f;
__declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
        uint16* dst,
        float scale,
        int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    movd xmm4, dword ptr [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    mulss xmm4, kExpBias
    pshufd xmm4, xmm4, 0
    pxor xmm5, xmm5
    sub edx, eax
    // 8 pixel loop.
  convertloop:
    movdqu xmm2, xmmword ptr [eax] // 8 shorts
    add eax, 16
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm5
    cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
    punpckhwd xmm3, xmm5
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    psrld xmm2, 13
    psrld xmm3, 13
    packssdw xmm2, xmm3
    movdqu [eax + edx - 16], xmm2
    sub ecx, 8
    jg convertloop
    ret
  }
}
#endif // HAS_HALFFLOATROW_SSE2
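
// Illustration only, not part of the original source: the exponent-bias trick
// used above. kExpBias is 2^-112; multiplying by scale * 2^-112 rebases the
// float exponent so that shifting the raw float bits right by 13 yields a
// truncated IEEE half-float without a dedicated conversion instruction (the
// F16C variant below uses vcvtps2ph instead). width counts uint16 samples.
// Hypothetical helper name; disabled via #if 0.
#if 0
static void HalfFloatRow_Sketch(const uint16* src, uint16* dst, float scale,
                                int width) {
  const float kBias = 1.9259299444e-34f;  // 2^-112
  for (int i = 0; i < width; ++i) {
    union {
      float f;
      uint32 u;
    } bits;
    bits.f = (float)src[i] * scale * kBias;
    dst[i] = (uint16)(bits.u >> 13);  // keep sign/exponent/top mantissa bits
  }
}
#endif
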
#ifdef HAS_HALFFLOATROW_AVX2
__declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
        uint16* dst,
        float scale,
        int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    movd xmm4, dword ptr [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    vmulss xmm4, xmm4, kExpBias
    vbroadcastss ymm4, xmm4
    vpxor ymm5, ymm5, ymm5
    sub edx, eax
    // 16 pixel loop.
  convertloop:
    vmovdqu ymm2, [eax] // 16 shorts
    add eax, 32
    vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
    vpunpcklwd ymm2, ymm2, ymm5
    vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
    vcvtdq2ps ymm2, ymm2
    vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
    vmulps ymm2, ymm2, ymm4
    vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
    vpsrld ymm2, ymm2, 13
    vpackssdw ymm2, ymm2, ymm3
    vmovdqu [eax + edx - 32], ymm2
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
__declspec(naked) void HalfFloatRow_F16C(const uint16* src,
        uint16* dst,
        float scale,
        int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    vbroadcastss ymm4, [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    sub edx, eax
    // 16 pixel loop.
  convertloop:
    vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
    vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
    add eax, 32
    vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
    vcvtdq2ps ymm3, ymm3
    vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
    vmulps ymm3, ymm3, ymm4
    vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
    vcvtps2ph xmm3, ymm3, 3
    vmovdqu [eax + edx - 32], xmm2 // store at the samples just read; eax already advanced by 32
    vmovdqu [eax + edx - 32 + 16], xmm3
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
        const uint8* table_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* dst_argb */
    mov esi, [esp + 4 + 8] /* table_argb */
    mov ecx, [esp + 4 + 12] /* width */
    // 1 pixel loop.
  convertloop:
    movzx edx, byte ptr [eax]
    lea eax, [eax + 4]
    movzx edx, byte ptr [esi + edx * 4]
    mov byte ptr [eax - 4], dl
    movzx edx, byte ptr [eax - 4 + 1]
    movzx edx, byte ptr [esi + edx * 4 + 1]
    mov byte ptr [eax - 4 + 1], dl
    movzx edx, byte ptr [eax - 4 + 2]
    movzx edx, byte ptr [esi + edx * 4 + 2]
    mov byte ptr [eax - 4 + 2], dl
    movzx edx, byte ptr [eax - 4 + 3]
    movzx edx, byte ptr [esi + edx * 4 + 3]
    mov byte ptr [eax - 4 + 3], dl
    dec ecx
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBCOLORTABLEROW_X86
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
        const uint8* table_argb,
        int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* dst_argb */
    mov esi, [esp + 4 + 8] /* table_argb */
    mov ecx, [esp + 4 + 12] /* width */
    // 1 pixel loop.
  convertloop:
    movzx edx, byte ptr [eax]
    lea eax, [eax + 4]
    movzx edx, byte ptr [esi + edx * 4]
    mov byte ptr [eax - 4], dl
    movzx edx, byte ptr [eax - 4 + 1]
    movzx edx, byte ptr [esi + edx * 4 + 1]
    mov byte ptr [eax - 4 + 1], dl
    movzx edx, byte ptr [eax - 4 + 2]
    movzx edx, byte ptr [esi + edx * 4 + 2]
    mov byte ptr [eax - 4 + 2], dl
    dec ecx
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_RGBCOLORTABLEROW_X86
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
        uint8* dst_argb,
        int width,
        const uint8* luma,
        uint32 lumacoeff) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] /* src_argb */
    mov edi, [esp + 8 + 8] /* dst_argb */
    mov ecx, [esp + 8 + 12] /* width */
    movd xmm2, dword ptr [esp + 8 + 16] // luma table
    movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
    pshufd xmm2, xmm2, 0
    pshufd xmm3, xmm3, 0
    pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
    psllw xmm4, 8
    pxor xmm5, xmm5
    // 4 pixel loop.
  convertloop:
    movdqu xmm0, xmmword ptr [eax] // generate luma ptr
    pmaddubsw xmm0, xmm3
    phaddw xmm0, xmm0
    pand xmm0, xmm4 // mask out low bits
    punpcklwd xmm0, xmm5
    paddd xmm0, xmm2 // add table base
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi], dl
    movzx edx, byte ptr [eax + 1]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 1], dl
    movzx edx, byte ptr [eax + 2]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 2], dl
    movzx edx, byte ptr [eax + 3] // copy alpha.
    mov byte ptr [edi + 3], dl
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax + 4]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 4], dl
    movzx edx, byte ptr [eax + 5]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 5], dl
    movzx edx, byte ptr [eax + 6]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 6], dl
    movzx edx, byte ptr [eax + 7] // copy alpha.
    mov byte ptr [edi + 7], dl
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax + 8]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 8], dl
    movzx edx, byte ptr [eax + 9]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 9], dl
    movzx edx, byte ptr [eax + 10]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 10], dl
    movzx edx, byte ptr [eax + 11] // copy alpha.
    mov byte ptr [edi + 11], dl
    movd esi, xmm0
    movzx edx, byte ptr [eax + 12]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 12], dl
    movzx edx, byte ptr [eax + 13]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 13], dl
    movzx edx, byte ptr [eax + 14]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 14], dl
    movzx edx, byte ptr [eax + 15] // copy alpha.
    mov byte ptr [edi + 15], dl
    lea eax, [eax + 16]
    lea edi, [edi + 16]
    sub ecx, 4
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // defined(_M_X64)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))