macros_msa.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. /*
  2. * Copyright 2016 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
  11. #define INCLUDE_LIBYUV_MACROS_MSA_H_
  12. #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
  13. #include <msa.h>
  14. #include <stdint.h>
  15. #if (__mips_isa_rev >= 6)
  /* LW: load a 32-bit word from 'psrc' with one "lw" instruction and yield it
   * as the value of the statement expression. This definition is the one
   * selected when __mips_isa_rev >= 6.
   * The locals use <stdint.h> fixed-width types (this header includes
   * <stdint.h>), the same types the SW()/SD() store macros already use. */
  #define LW(psrc) \
    ({ \
      uint8_t* psrc_lw_m = (uint8_t*)(psrc); /* NOLINT */ \
      uint32_t val_m; \
      asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
                   : [val_m] "=r"(val_m) \
                   : [psrc_lw_m] "m"(*psrc_lw_m)); \
      val_m; \
    })
  #if (__mips == 64)
  /* LD: load a 64-bit doubleword from 'psrc' with one "ld" instruction and
   * yield it as the value of the statement expression. */
  #define LD(psrc) \
    ({ \
      uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \
      uint64_t val_m = 0; \
      asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
                   : [val_m] "=r"(val_m) \
                   : [psrc_ld_m] "m"(*psrc_ld_m)); \
      val_m; \
    })
  #else  // !(__mips == 64)
  /* LD: build a 64-bit value from two LW() loads. The word at 'psrc' becomes
   * bits 0..31 and the word at 'psrc' + 4 becomes bits 32..63. */
  #define LD(psrc) \
    ({ \
      uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \
      uint32_t val0_m, val1_m; \
      uint64_t val_m = 0; \
      val0_m = LW(psrc_ld_m); \
      val1_m = LW(psrc_ld_m + 4); \
      /* Widen before shifting so the high word is not shifted out. */ \
      val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; /* NOLINT */ \
      val_m; \
    })
  #endif  // (__mips == 64)
  /* SW: store the 32-bit value 'val' to 'pdst' with one "sw" instruction.
   * 'pdst' is evaluated first, then 'val'; each is evaluated exactly once.
   * The value is held in 'val_sw_m' (named after the macro, like 'pdst_sw_m')
   * so that a caller expression that mentions a variable called 'val_m' is not
   * captured by the macro's own local. */
  #define SW(val, pdst) \
    ({ \
      uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
      uint32_t val_sw_m = (val); \
      asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
                   : [pdst_sw_m] "=m"(*pdst_sw_m) \
                   : [val_m] "r"(val_sw_m)); \
    })
  #if (__mips == 64)
  /* SD: store the 64-bit value 'val' to 'pdst' with one "sd" instruction.
   * 'val' is captured once in 'val_sd_m' so a caller variable named 'val_m'
   * cannot be shadowed by the macro's own local. */
  #define SD(val, pdst) \
    ({ \
      uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
      uint64_t val_sd_m = (val); \
      asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
                   : [pdst_sd_m] "=m"(*pdst_sd_m) \
                   : [val_m] "r"(val_sd_m)); \
    })
  #else  // !(__mips == 64)
  /* SD: store the 64-bit value 'val' as two SW() stores. Bits 0..31 go to
   * 'pdst' and bits 32..63 go to 'pdst' + 4.
   * 'val' is evaluated exactly once and widened to 64 bits before it is split,
   * so arguments with side effects are safe and the shift by 32 is always
   * performed on a 64-bit operand. */
  #define SD(val, pdst) \
    ({ \
      uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
      uint64_t val_sd_m = (uint64_t)(val); \
      uint32_t val0_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF); \
      uint32_t val1_m = (uint32_t)((val_sd_m >> 32) & 0x00000000FFFFFFFF); \
      SW(val0_m, pdst_sd_m); \
      SW(val1_m, pdst_sd_m + 4); \
    })
  #endif  // !(__mips == 64)
  77. #else // !(__mips_isa_rev >= 6)
  /* LW: load a 32-bit word from 'psrc' using the "ulw" mnemonic and yield it
   * as the value of the statement expression. This definition is the one
   * selected when __mips_isa_rev is below 6.
   * NOTE(review): "ulw" is presumably the assembler's unaligned load-word
   * form - confirm against the assembler documentation.
   * The locals use <stdint.h> fixed-width types (this header includes
   * <stdint.h>), the same types the SW()/SD() store macros already use. */
  #define LW(psrc) \
    ({ \
      uint8_t* psrc_lw_m = (uint8_t*)(psrc); /* NOLINT */ \
      uint32_t val_m; \
      asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
                   : [val_m] "=r"(val_m) \
                   : [psrc_lw_m] "m"(*psrc_lw_m)); \
      val_m; \
    })
  #if (__mips == 64)
  /* LD: load a 64-bit doubleword from 'psrc' using the "uld" mnemonic and
   * yield it as the value of the statement expression. */
  #define LD(psrc) \
    ({ \
      uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \
      uint64_t val_m = 0; \
      asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
                   : [val_m] "=r"(val_m) \
                   : [psrc_ld_m] "m"(*psrc_ld_m)); \
      val_m; \
    })
  #else  // !(__mips == 64)
  /* LD: build a 64-bit value from two LW() loads. The word at 'psrc' becomes
   * bits 0..31 and the word at 'psrc' + 4 becomes bits 32..63. */
  #define LD(psrc) \
    ({ \
      uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \
      uint32_t val0_m, val1_m; \
      uint64_t val_m = 0; \
      val0_m = LW(psrc_ld_m); \
      val1_m = LW(psrc_ld_m + 4); \
      /* Widen before shifting so the high word is not shifted out. */ \
      val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; /* NOLINT */ \
      val_m; \
    })
  #endif  // (__mips == 64)
  /* SW: store the 32-bit value 'val' to 'pdst' using the "usw" mnemonic.
   * 'pdst' is evaluated first, then 'val'; each is evaluated exactly once.
   * The value is held in 'val_sw_m' (named after the macro, like 'pdst_sw_m')
   * so that a caller expression that mentions a variable called 'val_m' is not
   * captured by the macro's own local. */
  #define SW(val, pdst) \
    ({ \
      uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
      uint32_t val_sw_m = (val); \
      asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
                   : [pdst_sw_m] "=m"(*pdst_sw_m) \
                   : [val_m] "r"(val_sw_m)); \
    })
  /* SD: store the 64-bit value 'val' as two SW() stores. Bits 0..31 go to
   * 'pdst' and bits 32..63 go to 'pdst' + 4.
   * 'val' is evaluated exactly once and widened to 64 bits before it is split,
   * so arguments with side effects are safe and the shift by 32 is always
   * performed on a 64-bit operand. */
  #define SD(val, pdst) \
    ({ \
      uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
      uint64_t val_sd_m = (uint64_t)(val); \
      uint32_t val0_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF); \
      uint32_t val1_m = (uint32_t)((val_sd_m >> 32) & 0x00000000FFFFFFFF); \
      SW(val0_m, pdst_sd_m); \
      SW(val1_m, pdst_sd_m + 4); \
    })
  128. #endif // (__mips_isa_rev >= 6)
  // TODO(fbarchard): Consider removing __VA_ARGS__ versions.

  /* Description : Load or store a single vector through a pointer cast
     Arguments   : Inputs  - psrc (loads), in and pdst (stores)
                   Return Type - as per RTYPE
     Details     : LD_B yields the RTYPE object that 'psrc' points at.
                   ST_B and ST_H assign 'in' to the RTYPE object that 'pdst'
                   points at.
                   Each expansion is wrapped in parentheses so that it stays one
                   operand when it is used next to other operators.
  */
  #define LD_B(RTYPE, psrc) (*((RTYPE*)(psrc))) /* NOLINT */
  #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)

  #define ST_B(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in)) /* NOLINT */
  #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)

  #define ST_H(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in)) /* NOLINT */
  #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)

  /* Description : Load two or four vectors with 16 'byte' sized elements
     Arguments   : Inputs  - psrc, stride
                   Outputs - out0, out1 (LD_B4 also: out2, out3)
                   Return Type - as per RTYPE
     Details     : Load 'out0' from (psrc)
                   Load 'out1' from (psrc + stride)
                   Load 'out2' from (psrc + 2 * stride)
                   Load 'out3' from (psrc + 3 * stride)
                   'stride' is added to 'psrc' before the cast to RTYPE*, so it
                   counts elements of the type 'psrc' points at.
                   'stride' is parenthesized, so an expression such as
                   (w + 1) is scaled as a whole.
  */
  #define LD_B2(RTYPE, psrc, stride, out0, out1) \
    { \
      out0 = LD_B(RTYPE, (psrc)); \
      out1 = LD_B(RTYPE, (psrc) + (stride)); \
    }
  #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)

  #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    { \
      LD_B2(RTYPE, (psrc), (stride), out0, out1); \
      LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
    }
  #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)

  /* Description : Store two or four vectors with stride, each having 16 'byte'
                   sized elements
     Arguments   : Inputs - in0, in1 (ST_B4 also: in2, in3), pdst, stride
     Details     : Store 'in0' to (pdst)
                   Store 'in1' to (pdst + stride)
                   Store 'in2' to (pdst + 2 * stride)
                   Store 'in3' to (pdst + 3 * stride)
                   'stride' is added to 'pdst' before the cast to RTYPE*, and is
                   parenthesized so a compound expression is scaled as a whole.
  */
  #define ST_B2(RTYPE, in0, in1, pdst, stride) \
    { \
      ST_B(RTYPE, in0, (pdst)); \
      ST_B(RTYPE, in1, (pdst) + (stride)); \
    }
  #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

  #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
    { \
      ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
      ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
    }
  #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

  /* Description : Store vectors of 8 halfword elements with stride
     Arguments   : Inputs - in0, in1, pdst, stride
     Details     : Store 8 halfword elements from 'in0' to (pdst)
                   Store 8 halfword elements from 'in1' to (pdst + stride)
  */
  #define ST_H2(RTYPE, in0, in1, pdst, stride) \
    { \
      ST_H(RTYPE, in0, (pdst)); \
      ST_H(RTYPE, in1, (pdst) + (stride)); \
    }
  #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
  // TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
  /* Description : Shuffle byte vector elements as per mask vector
     Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                   'out0' as per control vector 'mask0'
                   Byte elements from 'in2' & 'in3' are copied selectively to
                   'out1' as per control vector 'mask1'
                   Every argument is parenthesized before it is cast to v16i8,
                   so the cast applies to the whole argument expression.
  */
  #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
      out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
    }
  #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
  /* Description : Interleave both left and right half of input vectors
     Arguments   : Inputs  - in0, in1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Right half of byte elements from 'in0' and 'in1' are
                   interleaved and written to 'out0' (__msa_ilvr_b)
                   Left half of byte elements from 'in0' and 'in1' are
                   interleaved and written to 'out1' (__msa_ilvl_b)
                   Both arguments are parenthesized before they are cast to
                   v16i8, so the cast applies to the whole argument expression.
  */
  #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
      out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    }
  #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
  211. #endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
  212. #endif // INCLUDE_LIBYUV_MACROS_MSA_H_