Code:
*** linden/indra/libopenjpeg/tcd.c	Mon Apr 22 22:15:55 2019
--- linden/indra/libopenjpeg/tcd.c	Sun Oct 4 02:33:58 2020
***************
*** 33,38 ****
--- 33,43 ----
  #define _ISOC99_SOURCE /* lrintf is C99 */
  #include "opj_includes.h"
  
+ #if defined(__AVX2__)
+ #include <immintrin.h>
+ #include <stdint.h>
+ #endif
+ 
  void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) {
  	int tileno, compno, resno, bandno, precno;//, cblkno;
  
***************
*** 1564,1569 ****
--- 1569,1599 ----
  				}
  			}
  		}else{
+ #if defined(__AVX2__)
+ 			__m256i adjustv = _mm256_set1_epi32(adjust);
+ 			__m256i minv = _mm256_set1_epi32(min);
+ 			__m256i maxv = _mm256_set1_epi32(max);
+ 			for (j = res->y0; j < res->y1; ++j) {
+ 				// vector path: 8 samples per iteration for as long as a full group of 8 still fits in the row
+ 				for (i = res->x0; i + 8 <= res->x1; i += 8) {
+ 					// unaligned load of 8 floats, then float -> int32 conversion (scalar path uses lrintf for this step)
+ 					float* start = &(((float*)tilec->data)[i - res->x0 + (j - res->y0) * tw]);
+ 					__m256i tmp = _mm256_cvtps_epi32(_mm256_loadu_ps(start));
+ 					tmp = _mm256_add_epi32(tmp, adjustv);
+ 					// clamp each lane to [min, max]: the 8-wide counterpart of int_clamp(v, min, max)
+ 					tmp = _mm256_min_epi32(_mm256_max_epi32(tmp, minv), maxv);
+ 					int32_t* target = &(imagec->data[(i - offset_x) + (j - offset_y) * w]);
+ 					_mm256_storeu_si256((__m256i*)target, tmp);
+ 				}
+ 				// scalar tail: the remaining (fewer than 8) samples of the row, same code as the non-AVX2 path
+ 				for (; i < res->x1; ++i) {
+ 					float tmp = ((float*)tilec->data)[i - res->x0 + (j - res->y0) * tw];
+ 					int v = lrintf(tmp);
+ 					v += adjust;
+ 					imagec->data[(i - offset_x) + (j - offset_y) * w] = int_clamp(v, min, max);
+ 				}
+ 			}
+ #else
  			for (j = res->y0; j < res->y1; ++j) {
  				for (i = res->x0; i < res->x1; ++i) {
  					float tmp = ((float*)tilec->data)[i - res->x0 + (j - res->y0) * tw];
***************
*** 1572,1577 ****
--- 1602,1608 ----
  					imagec->data[(i - offset_x) + (j - offset_y) * w] = int_clamp(v, min, max);
  				}
  			}
+ #endif
  		}
  		opj_aligned_free(tilec->data);
  	}
|