1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- #include <xmmintrin.h>
- #define OVERRIDE_INNER_PROD
/**
 * Inner product of two float vectors, computed with SSE.
 *
 * Unaligned loads are used, so neither pointer needs 16-byte alignment.
 * Exactly `len` elements of each input are read: the bulk is processed
 * eight floats at a time, followed by at most one group of four and a
 * scalar tail of up to three elements.
 *
 * @param a    first vector, at least `len` readable floats
 * @param b    second vector, at least `len` readable floats
 * @param len  number of elements to multiply-accumulate; a value <= 0
 *             yields 0
 * @return     sum of a[i]*b[i] for i in [0, len)
 */
float inner_prod(const float *a, const float *b, int len)
{
   int i = 0;
   float ret;
   __m128 sum = _mm_setzero_ps();

   /* Main loop: two 4-wide multiply-accumulates per pass. The bound is
      written as a remaining-count test so a pass never starts unless all
      eight elements it touches are inside the input. */
   for (; len - i >= 8; i += 8)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
   }

   /* At most one complete group of four can remain. */
   if (len - i >= 4)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      i += 4;
   }

   /* Horizontal reduction: fold the upper pair onto the lower pair, then
      add lane 1 into lane 0. */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&ret, sum);

   /* Scalar tail for the last len % 4 elements. */
   for (; i < len; i++)
      ret += a[i] * b[i];

   return ret;
}
- #define OVERRIDE_PITCH_XCORR
- void pitch_xcorr(const float *_x, const float *_y, float *corr, int len, int nb_pitch, char *stack)
- {
- int i, offset;
- VARDECL(__m128 *x);
- VARDECL(__m128 *y);
- int N, L;
- N = len>>2;
- L = nb_pitch>>2;
- ALLOC(x, N, __m128);
- ALLOC(y, N+L, __m128);
- for (i=0;i<N;i++)
- x[i] = _mm_loadu_ps(_x+(i<<2));
- for (offset=0;offset<4;offset++)
- {
- for (i=0;i<N+L;i++)
- y[i] = _mm_loadu_ps(_y+(i<<2)+offset);
- for (i=0;i<L;i++)
- {
- int j;
- __m128 sum, *xx, *yy;
- sum = _mm_setzero_ps();
- yy = y+i;
- xx = x;
- for (j=0;j<N;j+=2)
- {
- sum = _mm_add_ps(sum, _mm_mul_ps(xx[0], yy[0]));
- sum = _mm_add_ps(sum, _mm_mul_ps(xx[1], yy[1]));
- xx += 2;
- yy += 2;
- }
- sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
- sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
- _mm_store_ss(corr+nb_pitch-1-(i<<2)-offset, sum);
- }
- }
- }
|