A better Blender::blend_vec()
by asmaloney (Andy) · in Torque Game Engine · 01/17/2007 (12:59 pm) · 9 replies
A first attempt at Blender::blend_vec()
Blender::blend_vec() is only taking about 1.4% of the time in my Stronghold profile journal, but Rubes's project seems to be dominated by it. It's an extremely dense function and I don't pretend to understand all that's going on in there. I applied the same techniques I used on the other two optimizations and taken it from 1.4% to 1.1% - about a 20% speedup. Hopefully this will start to address things for you Rubes!
There are many other possible optimizations here. If someone can explain the alphaTable - what is does and how it's calculated [my brain hurts looking at it] - maybe we can improve the AlphaCalc::Calc() function. The majority of the time is spent on the vec_or call in AlphaCalc::Calc() - even if I simplify it to vec_or( temp_var, src ). Shark doesn't give me any clues why, so if anyone has suggestions to improve this, please let me know.
Let me know how it goes!
----------
In terrain/blender.cc, add this after the transpose() function:
Blender::blend_vec() is only taking about 1.4% of the time in my Stronghold profile journal, but Rubes's project seems to be dominated by it. It's an extremely dense function and I don't pretend to understand all that's going on in there. I applied the same techniques I used on the other two optimizations and taken it from 1.4% to 1.1% - about a 20% speedup. Hopefully this will start to address things for you Rubes!
There are many other possible optimizations here. If someone can explain the alphaTable - what is does and how it's calculated [my brain hurts looking at it] - maybe we can improve the AlphaCalc::Calc() function. The majority of the time is spent on the vec_or call in AlphaCalc::Calc() - even if I simplify it to vec_or( temp_var, src ). Shark doesn't give me any clues why, so if anyone has suggestions to improve this, please let me know.
Let me know how it goes!
----------
In terrain/blender.cc, add this after the transpose() function:
inline vector unsigned int vec_loadAndSplatU32( unsigned int *scalarPtr )
{
register vector unsigned char splatMap = vec_lvsl( 0, scalarPtr );
const register vector unsigned int result = vec_lde( 0, scalarPtr );
splatMap = (vector unsigned char) vec_splat( (vector unsigned int) splatMap, 0 );
return( vec_perm( result, result, splatMap ) );
}
// Move the alpha calculation out of the inner loops of blender_vec() to allow the compiler to work its magic
class AlphaCalc
{
public:
AlphaCalc::AlphaCalc( void )
: vec_two( (vector unsigned int)( 2 ) ),
index_const( (vector unsigned int)( 0x3F00 ) ),
v255( (vector unsigned int)( 255 ) ),
dstcol_const( (vector unsigned int) (0xf8) ),
#if SRC_IS_ABGR
col_adjust1( (vector unsigned int) (7, 2, 0, 0) ),
col_adjust2( (vector unsigned int) (0, 0, 3, 0) ),
#else
// for some strange and nonexistent altivec processor which uses RGBA5551
col_adjust1( (vector unsigned int) (8, 3, 0, 0) ),
col_adjust2( (vector unsigned int) (0, 0, 2, 0) ),
#endif
globalAlphaTable( alphaTable )
{
}
inline U16 Calc( vector unsigned int hscan_component, vector unsigned int src ) const
{
u_tmp.v = vec_or(vec_and(vec_sr(hscan_component, vec_two), index_const), src);
vector unsigned int dstcol;
unsigned int *sloader = (unsigned int *) &dstcol;
sloader[0] = globalAlphaTable[u_tmp.s[0]];
sloader[1] = globalAlphaTable[u_tmp.s[1]];
sloader[2] = globalAlphaTable[u_tmp.s[2]];
dstcol = vec_add( dstcol, dstcol );
dstcol = vec_min( dstcol, v255 );
// NOTE that on Mac, color order is flipped (ABGR1555 instead of RGBA5551), so:
// 1. we already reversed color order via BIG_ENDIAN indexing above, but
// 2. we need to change the shifts for alpha being the high bit instead of the low.
u_tmp.v = vec_sr(vec_sl(vec_and(dstcol, dstcol_const), col_adjust1), col_adjust2);
return( (unsigned short) (u_tmp.s[0] | u_tmp.s[1] | u_tmp.s[2]) );
}
private:
const vector unsigned int vec_two;
const vector unsigned int index_const;
const vector unsigned int v255;
const vector unsigned int dstcol_const;
const vector unsigned int col_adjust1;
const vector unsigned int col_adjust2;
mutable union {
vector unsigned int v;
U32 s[4];
} u_tmp;
const U8 * const globalAlphaTable;
};Continued...
#2
01/17/2007 (1:01 pm)
// copy in the lighting info
// Once again we make with the linear 2D array
const U32 xTexelInTarget(xInTarget << texelsPerSquareEdge_log2);
const U32 yTexelInTarget(yInTarget << texelsPerSquareEdge_log2);
const U32 yTexelInTarget_offset((yTexelInTarget << TEXELS_PER_TARGET_EDGE_LOG2));
U16 *const bits0 = &destmips[0][ yTexelInTarget_offset + xTexelInTarget ];
U16 *const bits1 = &destmips[1][ (yTexelInTarget_offset >> 2) + (xTexelInTarget >> 1) ];
U16 *const bits2 = &destmips[2][ (yTexelInTarget_offset >> 4) + (xTexelInTarget >> 2) ];
const U32 base_xInLightmap(xInTile << LUMELS_PER_SQUARE_EDGE_LOG2);
const U32 base_yInLightmap(yInTile << LUMELS_PER_SQUARE_EDGE_LOG2);
U32 yInLightmap_offset(base_yInLightmap << LUMELS_PER_TILE_EDGE_LOG2);
U32 next_yInLightmap(base_yInLightmap);
U32 yTexelInTargetSquare_offset(0);
U32 yTexelInSquare_offset(0);
for(U32 yLumelInSquare(0); yLumelInSquare < LUMELS_PER_SQUARE_EDGE; ++yLumelInSquare)
{
next_yInLightmap = (next_yInLightmap + 1) & LUMELS_PER_TILE_EDGE_MASK;
const U32 next_yInLightmap_offset(next_yInLightmap << LUMELS_PER_TILE_EDGE_LOG2);
U32 xInLightmap(base_xInLightmap);
U32 xTexelInSquare_offset = 0;
for(U32 xLumelInSquare(0); xLumelInSquare < LUMELS_PER_SQUARE_EDGE; ++xLumelInSquare)
{
const U32 next_xInLightmap((xInLightmap + 1) & LUMELS_PER_TILE_EDGE_MASK);
U32 texelInTargetSquare_offset = yTexelInTargetSquare_offset + xTexelInSquare_offset;
U32 texelInSquare_offset = yTexelInSquare_offset + xTexelInSquare_offset;
unsigned int *loader = (unsigned int *) &vlumels;
//lumels = (vector unsigned int) (lmap[xInLightmap | yInLightmap_offset], lmap[next_xInLightmap | yInLightmap_offset],
// lmap[xInLightmap | next_yInLightmap_offset], lmap[next_xInLightmap | next_yInLightmap_offset]);
loader[0] = lmap[xInLightmap | yInLightmap_offset];
loader[1] = lmap[next_xInLightmap | yInLightmap_offset];
loader[2] = lmap[xInLightmap | next_yInLightmap_offset];
loader[3] = lmap[next_xInLightmap | next_yInLightmap_offset];
// Split the LUMELs into colors
vector unsigned int col[4];
vector unsigned int col_const = (vector unsigned int) (0x1f << 11);
col[2] = vec_and(vlumels, col_const);
col[1] = vec_and(vec_sl(vlumels, (vector unsigned int) (5)), col_const);
col[0] = vec_and(vec_sl(vlumels, (vector unsigned int) (10)), col_const);
vector unsigned int vec_targetTexelsPerLumel_log2 = vec_loadAndSplatU32( &targetTexelsPerLumel_log2 );
transpose(4, col); // transpose the matrix since we were using rows and now we need columns
// One for each color component
vector unsigned int left_component_delta = vec_sr(vec_sub(col[2], col[0]), vec_targetTexelsPerLumel_log2);
vector unsigned int right_component_delta = vec_sr(vec_sub(col[3], col[1]), vec_targetTexelsPerLumel_log2);
vector unsigned int vscan_left_component = col[0];
vector unsigned int vscan_right_component = col[1];
// Now we interpolate the color shifts across the square
for(U32 yTexelInLumel = 0; yTexelInLumel < targetTexelsPerLumel; ++yTexelInLumel)
{
vector unsigned int across_component_delta = vec_sr(vec_sub(vscan_right_component, vscan_left_component), vec_targetTexelsPerLumel_log2);
vector unsigned int hscan_component = vscan_left_component;
vscan_left_component = vec_add(vscan_left_component, left_component_delta);
vscan_right_component = vec_add(vscan_right_component, right_component_delta);
U16 *dstbits = &bits0[ texelInTargetSquare_offset ];
const U8 *srcbits = (U8 *)&bufferToLightFrom[ texelInSquare_offset ];
for(U32 xTexelInLumel = 0; xTexelInLumel < targetTexelsPerLumel; ++xTexelInLumel)
{
vector unsigned int cur_srcbits;
loader = (unsigned int *) &cur_srcbits;
loader[0] = srcbits[0];
loader[1] = srcbits[1];
loader[2] = srcbits[2];
*dstbits++ = alphaCalc.Calc( hscan_component, cur_srcbits );
srcbits += 4;
}
texelInTargetSquare_offset += yStrideThroughTarget;
texelInSquare_offset += yStrideThroughSquare;
}
xTexelInSquare_offset += xStrideAcrossLumels;
xInLightmap = next_xInLightmap;
}
yInLightmap_offset = next_yInLightmap_offset;
yTexelInTargetSquare_offset += yStrideThroughTargetAcrossLumels;
yTexelInSquare_offset += yStrideThroughSquareAcrossLumels;
}
// end of lighting.
}
}
extrude( destmips, TEXELS_PER_TARGET_EDGE_LOG2 );
PROFILE_END();
}Done - had to split the function in the middle
#3
Nice work! I'll definitely plug this in when I get home and see how it does. A 20% boost would be huge for me.
I wonder why my project is so dominated by it, though?
01/17/2007 (1:50 pm)
Holy crap, this is the thread to stop the presses.Nice work! I'll definitely plug this in when I get home and see how it does. A 20% boost would be huge for me.
I wonder why my project is so dominated by it, though?
#4
I await the verdict!
01/17/2007 (2:22 pm)
I hope it does something for you. As I mentioned in other threads, I'm working with Stronghold so it may have very different characteristics. The changes I'm making though should be general - while you may not see speedups they should definitely not slow anybody's game down!I await the verdict!
#5
Rubes, how much of your mission time is spent in the terrain blender?
01/17/2007 (2:56 pm)
Terrain textures are cached after blending, so though it looks like blender is only using 1.4%, it's actually using much more in the frames where textures are generated. This can cause hitching (especially on Mac), so any improvement is a good one. :)Rubes, how much of your mission time is spent in the terrain blender?
#6
01/17/2007 (3:11 pm)
Thanks John - that makes sense. Do you know anything about the global alphaTable? Indexing into it and storing into a vector as I'm doing in AlphaCalc::Calc() is expensive. Could you explain its purpose and what is stored in it? I can read the code, but I don't understand what's behind it.
#7
@John: The hitching on the Mac (both PPC and Intel) is my personal crusade, so I'm with you on that. The performance test I have been using in my game is a journaled sequence where I spin around a couple of times, move to a different room, and then spin around a couple more times; in that sequence, the hitching is pronounced. When I profile with Shark on the PPC, Blender::blend_vec() leads the way at a whopping 13.2%.
01/17/2007 (3:21 pm)
@Andy: You're right, I think; the interiors in Stronghold may be more complicated structures than the one in my game; mine is large, and there are some complicated areas, but much of it is simplified. I should try tests in different parts of the structure to see if there are performance differences.@John: The hitching on the Mac (both PPC and Intel) is my personal crusade, so I'm with you on that. The performance test I have been using in my game is a journaled sequence where I spin around a couple of times, move to a different room, and then spin around a couple more times; in that sequence, the hitching is pronounced. When I profile with Shark on the PPC, Blender::blend_vec() leads the way at a whopping 13.2%.
#8
Before: Blender::blend_vec() 13.6%
After: Blender::blend_vec() 11.5%
That's about a 15% improvement. Subjectively, however, there wasn't much effect; it still appeared and felt the same, with lots of similar hitching. Nice work to get those numbers, though!
01/17/2007 (5:43 pm)
So I incorporated these changes into my build (which also incorporates the improved Interior::setupActivePolyList() function) and used the same journal playback as before with the spinning around. Shark profiling gave:Before: Blender::blend_vec() 13.6%
After: Blender::blend_vec() 11.5%
That's about a 15% improvement. Subjectively, however, there wasn't much effect; it still appeared and felt the same, with lots of similar hitching. Nice work to get those numbers, though!
#9
01/17/2007 (5:59 pm)
Rubes: Thanks for the report. Obviously there's more work to be done :-) At this point I'll need to get some input from someone with more knowledge of what's going on in this function. That vec_or() problem is perplexing too [and a real hit], so maybe someone with Altivec mastery can comment on that.
Torque Owner asmaloney (Andy)
Default Studio Name
inline void Blender::blend_vec( int x, int y, int squaresPerTargetEdge_log2, const U16 *lmap, U16 **destmips ) { PROFILE_START(ALTIVEC_BLEND); const U32 squaresPerTargetEdge(1 << squaresPerTargetEdge_log2); // 32 (low detail) to 4 (high detail). const U32 texelsPerSquareEdge_log2(TEXELS_PER_TARGET_EDGE_LOG2 - squaresPerTargetEdge_log2); // 5 (high detail) to 2 (low detail) const U32 texelsPerSquareEdge(1 << texelsPerSquareEdge_log2); // == TEXELS_PER_TARGET_EDGE / squaresPerTargetEdge); 4 (low) to 32 (high) detail. const U32 texelsPerSquare_log2(texelsPerSquareEdge_log2 << 1); // 10 (high detail) to 4 (low detail) const U32 sourceMipMapIndex(MAX_TEXELS_PER_SQUARE_EDGE_LOG2 - texelsPerSquareEdge_log2); const U32 targetTexelsPerLumel_log2(texelsPerSquareEdge_log2 - LUMELS_PER_SQUARE_EDGE_LOG2); const U32 targetTexelsPerLumel(1 << targetTexelsPerLumel_log2); const U32 yStrideThroughTarget(TEXELS_PER_TARGET_EDGE); const U32 yStrideThroughSquare(texelsPerSquareEdge); const U32 xStrideAcrossLumels(targetTexelsPerLumel); const U32 yStrideThroughTargetAcrossLumels(yStrideThroughTarget << targetTexelsPerLumel_log2); const U32 yStrideThroughSquareAcrossLumels(yStrideThroughSquare << targetTexelsPerLumel_log2); const U32 *const*const allSourceBitMaps = &bmpdata[sourceMipMapIndex * num_src_bmps]; const AlphaCalc alphaCalc; // sy & sx index through the SQUAREs of the DESTINATION MIP-MAP // All Source MIP-MAPs are 2D arrays of squares: // SQUARE source_mip_map_2D[SQUARES_PER_MIPMAP_EDGE][SQUARES_PER_MIPMAP_EDGE]; // But they are stored as 1D arrays: // SQUARE source_mip_map_1D[SQUARES_PER_MIPMAP_EDGE*SQUARES_PER_MIPMAP_EDGE]; // therefore the following are equivalent: // source_mip_map_2D[Y][X] // source_mip_map_1D[(Y * SQUARES_PER_MIPMAP_EDGE) + X] // source_mip_map_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) + X] // source_mip_map_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) | X] // This loop is from [0] through [squaresPerTargetEdge - 1] of the destination // and from [y] through [y + squaresPerTargetEdge - 1] of the source. // A single terrain TILE is equivalently: // SQUARE terrain_tile_2D[SQUARES_PER_TILE_EDGE][SQUARES_PER_TILE_EDGE]; // or // SQUARE terrain_tile_1D[SQUARES_PER_TILE_EDGE*SQUARES_PER_TILE_EDGE]; // therefore the following are equivalent: // terrain_tile_2D[Y][X] // terrain_tile_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) | X] // Neither source_mip_map_1D nor terrain_tile_1D appear explicitly. for ( int yInTarget = 0; yInTarget < squaresPerTargetEdge; ++yInTarget ) { // This whole section is called "doing 2-dimensional array indexing the hard way" // yInTile & after_yInTile are the bottom and top of the source square we are actually processing, // masked to tile size which is what causes the "repeating" effect const int yInTile((y + yInTarget) & SQUARES_PER_TILE_EDGE_MASK); const int after_yInTile((yInTile + 1) & SQUARES_PER_TILE_EDGE_MASK); // yInTile_offset and after_yInTile_offset are the offsets into the terrain_tile_1D format arrays const int yInTile_offset(yInTile << SQUARES_PER_TILE_EDGE_LOG2); const int after_yInTile_offset(after_yInTile << SQUARES_PER_TILE_EDGE_LOG2); // py is the row index in squares into the source_mip_map_2D const int yInSource(yInTile & SQUARES_PER_MIPMAP_EDGE_MASK); // yInSource_offset is the offset in squares into the source_mip_map_1D, times the size // of the squares. const int yInSource_offset(yInSource << (texelsPerSquare_log2 + SQUARES_PER_MIPMAP_EDGE_LOG2)); // This loop is from [yInTarget][0] through [yInTarget][squaresPerTargetEdge - 1] of the destination // and from [yInTile][x] through [yInTile][x + squaresPerTargetEdge - 1] of the source. for ( int xInTarget = 0; xInTarget < squaresPerTargetEdge; xInTarget++ ) { // xInTile & after_xInTile are the left and right side of the source square we are actually processing, // masked to tile size which is what causes the "repeating" effect const int xInTile((x + xInTarget) & SQUARES_PER_TILE_EDGE_MASK); const int after_xInTile((xInTile + 1) & SQUARES_PER_TILE_EDGE_MASK); // xInSource is the column index in squares into the source_mip_map_2D const int xInSource(xInTile & SQUARES_PER_MIPMAP_EDGE_MASK); // As you can see the GRID is accessed in TILE co-ordinates const U32 gridflags(GRIDFLAGS( xInTile, yInTile )); // Cache the source textures at our mip-map level as specified by the GRID-FLAGS const U32 *sourceSquareBitMaps[MAXIMUM_TEXTURES]; // Cache the Alpha-Maps as specified by the GRID-FLAGS const U8 *alphaMaps[MAXIMUM_TEXTURES]; // Pre-calculate (U8*) &source_mip_map_2D[yInSource][xInSource] -- // ( (yInSource * SQUARES_PER_MIPMAP_EDGE) + xInSource ) * sizeof(SQUARE) const int bitmapOffset(yInSource_offset | (xInSource << texelsPerSquare_log2)); int numTexturesToBlend = 0; for ( int i = 0; i < num_src_bmps; ++i ) { if ( gridflags & (MATERIALSTART << i) ) // Gridflags tell us which materials are used for this square {// Cache Alpha maps and bitmaps. alphaMaps[ numTexturesToBlend ] = alpha_data[ i ]; sourceSquareBitMaps[ numTexturesToBlend++ ] = &allSourceBitMaps[ i ][bitmapOffset]; if ( numTexturesToBlend == MAXIMUM_TEXTURES ) break; // Why? What happens if more than (4) textures should be blended? } } const U32 *bufferToLightFrom = blendbuffer; if ( numTexturesToBlend < 2 ) { // don't copy the square over...just leave it and tell // lighting code to use src bmp as the source instead of // the blend_buffer; bufferToLightFrom = sourceSquareBitMaps[ 0 ]; } else { int alphaOffsets[4]; alphaOffsets[0] = yInTile_offset | xInTile; // precalculate offsets for tile-coords[yInTile][xInTile] alphaOffsets[1] = yInTile_offset | after_xInTile; // and so on for the square bounded by alphaOffsets[2] = after_yInTile_offset | xInTile; // [yInTile][xInTile] [yInTile][after_xInTile] alphaOffsets[3] = after_yInTile_offset | after_xInTile;// [after_yInTile][xInTile] [after_yInTile][after_xInTile] switch( numTexturesToBlend ) // Blend 1 square of the numTexturesToBlend bit-maps into the blend buffer { case 2: doSquare2( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps ); break; case 3: doSquare3( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps ); break; default: // more subtle paranoia doSquare4( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps ); break; } } // [these comments are making me paranoid -- Ed.]Continued...