Some AVX2 / SSE2 optimization for llface.cpp
Author |
Message |
kathrine
Joined: 2011-10-07 10:39:20 Posts: 207
|

Optimized some calls of xform() to use AVX2 and inlined them. The profiler claimed that part wasted 1% CPU on the render thread. Basically unrolled some loop a bit and moved some branches outside.  |  |  |  | Code: --- C:/devel/sl/cool/linden_old/indra/newview/llface.cpp Wed Jul 29 00:25:54 2020 +++ C:/devel/sl/cool/linden/indra/newview/llface.cpp Fri Oct 30 01:35:45 2020 @@ -57,10 +57,14 @@ #include "llvoclouds.h" #include "llvopartgroup.h" #include "llvosky.h" #include "llvovolume.h" +#if defined(__AVX2__) +#include <immintrin.h> +#endif + #define LL_MAX_INDICES_COUNT 1000000 static LLStaticHashedString sTextureIndexIn("texture_index_in"); static LLStaticHashedString sColorIn("color_in"); @@ -1720,43 +1724,183 @@ if (!mat && do_bump) { bump_tc.reserve(num_vertices); } + + if (texgen == LLTextureEntry::TEX_GEN_PLANAR) { LLVector4a vec; for (S32 i = 0; i < num_vertices; ++i) { LLVector2 tc(vf.mTexCoords[i]); LLVector4a& norm = vf.mNormals[i]; LLVector4a& center = *(vf.mCenter); - if (texgen == LLTextureEntry::TEX_GEN_PLANAR) - { vec = vf.mPositions[i]; vec.mul(scalea); planarProjection(tc, norm, center, vec); - } if (tex_mode && mTextureMatrix) { LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f); tmp = tmp * *mTextureMatrix; tc.mV[0] = tmp.mV[0]; tc.mV[1] = tmp.mV[1]; } else { - xform(tc, cos_ang, sin_ang, os, ot, ms, mt); + // xform(tc, cos_ang, sin_ang, os, ot, ms, mt); + // Transform the texture coordinates for this face. + + // Texture transforms are done about the center of the face. 
+ F32 s = tc.mV[0] - 0.5f; + F32 t = tc.mV[1] - 0.5f; + + // Handle rotation + F32 temp = s; + s = s * cos_ang + t * sin_ang; + t = -temp * sin_ang + t * cos_ang; + + // Then scale + s *= ms; + t *= mt; + + // Then offset + s += os + 0.5f; + t += ot + 0.5f; + + tc.mV[0] = s; + tc.mV[1] = t; + } + + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } + } + else if (tex_mode && mTextureMatrix) + { + for (S32 i = 0; i < num_vertices; ++i) + { + LLVector2 tc(vf.mTexCoords[i]); + + LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f); + tmp = tmp * *mTextureMatrix; + tc.mV[0] = tmp.mV[0]; + tc.mV[1] = tmp.mV[1]; + + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } + } + else + { + S32 i = 0; +#if defined(__AVX2__) + __m256 cos_vec = _mm256_set1_ps(cos_ang); + __m256 sin_vec = _mm256_set1_ps(sin_ang); + __m256 off = _mm256_set1_ps(-0.5f); + __m256 osoff = _mm256_set1_ps(os + 0.5f); + __m256 otoff = _mm256_set1_ps(ot + 0.5f); + __m256 ms_vec = _mm256_set1_ps(ms); + __m256 mt_vec = _mm256_set1_ps(mt); + + for (; i + 8 <= num_vertices; i += 8) + { + F32 sv[8]; + F32 tv[8]; + + sv[0] = vf.mTexCoords[i].mV[0]; + tv[0] = vf.mTexCoords[i].mV[1]; + sv[1] = vf.mTexCoords[i+1].mV[0]; + tv[1] = vf.mTexCoords[i+1].mV[1]; + sv[2] = vf.mTexCoords[i+2].mV[0]; + tv[2] = vf.mTexCoords[i+2].mV[1]; + sv[3] = vf.mTexCoords[i+3].mV[0]; + tv[3] = vf.mTexCoords[i+3].mV[1]; + sv[4] = vf.mTexCoords[i+4].mV[0]; + tv[4] = vf.mTexCoords[i+4].mV[1]; + sv[5] = vf.mTexCoords[i+5].mV[0]; + tv[5] = vf.mTexCoords[i+5].mV[1]; + sv[6] = vf.mTexCoords[i+6].mV[0]; + tv[6] = vf.mTexCoords[i+6].mV[1]; + sv[7] = vf.mTexCoords[i+7].mV[0]; + tv[7] = vf.mTexCoords[i+7].mV[1]; + + __m256 svv = _mm256_loadu_ps(sv); + __m256 tvv = _mm256_loadu_ps(tv); + // Texture transforms are done about the center of the face. + svv = _mm256_add_ps(svv, off); + tvv = _mm256_add_ps(tvv, off); + + // Transform the texture coordinates for this face. 
+ __m256 coss = _mm256_mul_ps(svv, cos_vec); + __m256 sins = _mm256_mul_ps(svv, sin_vec); + svv = _mm256_fmadd_ps(tvv, sin_vec, coss); + tvv = _mm256_fmsub_ps(tvv, cos_vec, sins); + + // Then scale and offset + svv = _mm256_fmadd_ps(svv, ms_vec, osoff); + tvv = _mm256_fmadd_ps(tvv, mt_vec, otoff); + + _mm256_storeu_ps(sv, svv); + _mm256_storeu_ps(tv, tvv); + + for (S32 j = 0; j < 8; ++j) + { + LLVector2 tc(sv[j], tv[j]); + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } } +#endif + for (; i < num_vertices; ++i) + { + LLVector2 tc(vf.mTexCoords[i]); + // xform(tc, cos_ang, sin_ang, os, ot, ms, mt); + // Transform the texture coordinates for this face. + + // Texture transforms are done about the center of the face. + F32 s = tc.mV[0] - 0.5f; + F32 t = tc.mV[1] - 0.5f; + + // Handle rotation + F32 temp = s; + s = s * cos_ang + t * sin_ang; + t = -temp * sin_ang + t * cos_ang; + + // Then scale + s *= ms; + t *= mt; + + // Then offset + s += os + 0.5f; + t += ot + 0.5f; + + tc.mV[0] = s; + tc.mV[1] = t; *dst++ = tc; if (!mat && do_bump) { bump_tc.push_back(tc); } } + } + } #if USE_MAP_RANGE if (map_range) {
|  |  |  |  |
Last edited by kathrine on 2020-10-30 18:37:54, edited 1 time in total.
|
2020-10-30 09:43:44 |
|
 |
kathrine
Joined: 2011-10-07 10:39:20 Posts: 207
|

A bit of cleanup and a SSE2 version as well.  |  |  |  | Code: -- C:/devel/sl/cool/linden_old/indra/newview/llface.cpp Wed Jul 29 00:25:54 2020 +++ C:/devel/sl/cool/linden/indra/newview/llface.cpp Fri Oct 30 19:34:35 2020 @@ -57,10 +57,16 @@ #include "llvoclouds.h" #include "llvopartgroup.h" #include "llvosky.h" #include "llvovolume.h" +#if defined(__AVX2__) +#include <immintrin.h> +#else +#include <xmmintrin.h> +#endif + #define LL_MAX_INDICES_COUNT 1000000 static LLStaticHashedString sTextureIndexIn("texture_index_in"); static LLStaticHashedString sColorIn("color_in"); @@ -1720,23 +1726,22 @@ if (!mat && do_bump) { bump_tc.reserve(num_vertices); } + + if (texgen == LLTextureEntry::TEX_GEN_PLANAR) { LLVector4a vec; for (S32 i = 0; i < num_vertices; ++i) { LLVector2 tc(vf.mTexCoords[i]); LLVector4a& norm = vf.mNormals[i]; LLVector4a& center = *(vf.mCenter); - if (texgen == LLTextureEntry::TEX_GEN_PLANAR) - { vec = vf.mPositions[i]; vec.mul(scalea); planarProjection(tc, norm, center, vec); - } if (tex_mode && mTextureMatrix) { LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f); tmp = tmp * *mTextureMatrix; @@ -1753,10 +1758,164 @@ if (!mat && do_bump) { bump_tc.push_back(tc); } } + } + else if (tex_mode && mTextureMatrix) + { + for (S32 i = 0; i < num_vertices; ++i) + { + LLVector2 tc(vf.mTexCoords[i]); + + LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f); + tmp = tmp * *mTextureMatrix; + tc.mV[0] = tmp.mV[0]; + tc.mV[1] = tmp.mV[1]; + + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } + } + else + { + S32 i = 0; +#if defined(__AVX2__) + __m256 cos_vec = _mm256_set1_ps(cos_ang); + __m256 sin_vec = _mm256_set1_ps(sin_ang); + __m256 off = _mm256_set1_ps(-0.5f); + __m256 osoff = _mm256_set1_ps(os + 0.5f); + __m256 otoff = _mm256_set1_ps(ot + 0.5f); + __m256 ms_vec = _mm256_set1_ps(ms); + __m256 mt_vec = _mm256_set1_ps(mt); + + for (; i + 8 <= num_vertices; i += 8) + { + F32 sv[8]; + F32 tv[8]; + + sv[0] = vf.mTexCoords[i].mV[0]; + tv[0] = vf.mTexCoords[i].mV[1]; + 
sv[1] = vf.mTexCoords[i+1].mV[0]; + tv[1] = vf.mTexCoords[i+1].mV[1]; + sv[2] = vf.mTexCoords[i+2].mV[0]; + tv[2] = vf.mTexCoords[i+2].mV[1]; + sv[3] = vf.mTexCoords[i+3].mV[0]; + tv[3] = vf.mTexCoords[i+3].mV[1]; + sv[4] = vf.mTexCoords[i+4].mV[0]; + tv[4] = vf.mTexCoords[i+4].mV[1]; + sv[5] = vf.mTexCoords[i+5].mV[0]; + tv[5] = vf.mTexCoords[i+5].mV[1]; + sv[6] = vf.mTexCoords[i+6].mV[0]; + tv[6] = vf.mTexCoords[i+6].mV[1]; + sv[7] = vf.mTexCoords[i+7].mV[0]; + tv[7] = vf.mTexCoords[i+7].mV[1]; + + __m256 svv = _mm256_loadu_ps(sv); + __m256 tvv = _mm256_loadu_ps(tv); + // Texture transforms are done about the center of the face. + svv = _mm256_add_ps(svv, off); + tvv = _mm256_add_ps(tvv, off); + + // Transform the texture coordinates for this face. + __m256 coss = _mm256_mul_ps(svv, cos_vec); + __m256 sins = _mm256_mul_ps(svv, sin_vec); + svv = _mm256_fmadd_ps(tvv, sin_vec, coss); + tvv = _mm256_fmsub_ps(tvv, cos_vec, sins); + + // Then scale and offset + svv = _mm256_fmadd_ps(svv, ms_vec, osoff); + tvv = _mm256_fmadd_ps(tvv, mt_vec, otoff); + + _mm256_storeu_ps(sv, svv); + _mm256_storeu_ps(tv, tvv); + + for (S32 j = 0; j < 8; ++j) + { + LLVector2 tc(sv[j], tv[j]); + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } + } +#else + /* SSE2 Version, we have no FMA :-( */ + __m128 cos_vec = _mm_set1_ps(cos_ang); + __m128 sin_vec = _mm_set1_ps(sin_ang); + __m128 off = _mm_set1_ps(-0.5f); + __m128 osoff = _mm_set1_ps(os + 0.5f); + __m128 otoff = _mm_set1_ps(ot + 0.5f); + __m128 ms_vec = _mm_set1_ps(ms); + __m128 mt_vec = _mm_set1_ps(mt); + + for (; i + 4 <= num_vertices; i += 4) + { + F32 sv[4]; + F32 tv[4]; + + sv[0] = vf.mTexCoords[i].mV[0]; + tv[0] = vf.mTexCoords[i].mV[1]; + sv[1] = vf.mTexCoords[i + 1].mV[0]; + tv[1] = vf.mTexCoords[i + 1].mV[1]; + sv[2] = vf.mTexCoords[i + 2].mV[0]; + tv[2] = vf.mTexCoords[i + 2].mV[1]; + sv[3] = vf.mTexCoords[i + 3].mV[0]; + tv[3] = vf.mTexCoords[i + 3].mV[1]; + + __m128 svv = _mm_loadu_ps(sv); + __m128 
tvv = _mm_loadu_ps(tv); + // Texture transforms are done about the center of the face. + svv = _mm_add_ps(svv, off); + tvv = _mm_add_ps(tvv, off); + + // Transform the texture coordinates for this face. + __m128 coss = _mm_mul_ps(svv, cos_vec); + __m128 sins = _mm_mul_ps(svv, sin_vec); + /* no fmadd, so do it in two steps */ + svv = _mm_add_ps(_mm_mul_ps(tvv, sin_vec), coss); + tvv = _mm_add_ps(_mm_mul_ps(tvv, cos_vec), sins); + + // Then scale and offset + svv = _mm_add_ps(_mm_mul_ps(svv, ms_vec), osoff); + tvv = _mm_add_ps(_mm_mul_ps(tvv, mt_vec), otoff); + + _mm_storeu_ps(sv, svv); + _mm_storeu_ps(tv, tvv); + + for (S32 j = 0; j < 4; ++j) + { + LLVector2 tc(sv[j], tv[j]); + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } + } +#endif + for (; i < num_vertices; ++i) + { + LLVector2 tc(vf.mTexCoords[i]); + xform(tc, cos_ang, sin_ang, os, ot, ms, mt); + + *dst++ = tc; + + if (!mat && do_bump) + { + bump_tc.push_back(tc); + } + } + } + } #if USE_MAP_RANGE if (map_range) {
|  |  |  |  |
|
2020-10-30 18:37:35 |
|
 |
Henri Beauchamp
Joined: 2009-03-17 18:42:51 Posts: 5907
|
Thank you! I slightly optimized your code further (in particular, the SSE2 version is used, even when AVX2 is available, when there are fewer than 8 vertices (initially or remaining) to process). See the attached patch. I will test it over this coming week, and if everything works as expected, it will be part of the next releases. PS: for your patches, please proceed as follows: - From a terminal (preferably Cygwin's or WSL's), change to the directory containing the original and the patched source trees (it would be C:/devel/sl/cool/ for your case).
- Use the "diff" utility as follows: diff -durN linden_old linden >your-patch-name.txt
- Compress it using gzip: gzip -9 your-patch-name.txt
- Post the result to the forum.
Failing to proceed as above makes it hard for me to apply your patches other than manually, line by line.
|
2020-11-01 18:38:34 |
|
 |
ZaneZimer
Joined: 2016-06-19 21:33:37 Posts: 384 Location: Columbus area, OH, USA
|
I gave the patch a try on the 1.28.1.4 source from yesterday. Most things work properly, but it orients textures incorrectly on transparent/flexi prims. See the attached photos.
|
2020-11-01 19:52:06 |
|
 |
Henri Beauchamp
Joined: 2009-03-17 18:42:51 Posts: 5907
|
Interesting... Not seeing such an issue so far here. Was it an AVX2 optimized build or just the SSE2 one ?
Also, is there a chance those wings can be tested (demo available) ?
|
2020-11-01 23:01:05 |
|
 |
ZaneZimer
Joined: 2016-06-19 21:33:37 Posts: 384 Location: Columbus area, OH, USA
|
My CPU is only AVX, not AVX2, so I believe it's just an SSE2 build, but I do use the --tune & --usesystemlibs options while building. Here is my version info: The wings are my own creation, based on a freebie texture from long ago. I have rebuilt them a few times and could share them with you for testing.
|
2020-11-01 23:16:53 |
|
 |
Henri Beauchamp
Joined: 2009-03-17 18:42:51 Posts: 5907
|
Yes, that would be the SSE2-only version then. Yes, please, send them to me in-world, thank you !
|
2020-11-01 23:20:46 |
|
 |
ZaneZimer
Joined: 2016-06-19 21:33:37 Posts: 384 Location: Columbus area, OH, USA
|
You've got inventory! 
|
2020-11-01 23:26:14 |
|
 |
Henri Beauchamp
Joined: 2009-03-17 18:42:51 Posts: 5907
|
I confirm a problem with the SSE2 version... I'll double-check my own code to see whether I missed something in Kathrine's patch.
EDIT: I see nothing wrong in my copy of Kathrine's code for SSE2...
|
2020-11-01 23:31:23 |
|
 |
ZaneZimer
Joined: 2016-06-19 21:33:37 Posts: 384 Location: Columbus area, OH, USA
|
Good that it's at least reproducible. I have a vague memory of this happening to those same wings (or a version of them) before, with an older build of the viewer. *Edit: Not quite the same issue as I commented on in: http://sldev.free.fr/forum/viewtopic.php?f=4&t=1977
|
2020-11-01 23:47:29 |
|
|
Who is online |
Users browsing this forum: No registered users and 24 guests |
|
You cannot post new topics in this forum You cannot reply to topics in this forum You cannot edit your posts in this forum You cannot delete your posts in this forum You cannot post attachments in this forum
|
|