| /* |
| Copyright 2005-2010 Intel Corporation. All Rights Reserved. |
| |
| This file is part of Threading Building Blocks. |
| |
| Threading Building Blocks is free software; you can redistribute it |
| and/or modify it under the terms of the GNU General Public License |
| version 2 as published by the Free Software Foundation. |
| |
| Threading Building Blocks is distributed in the hope that it will be |
| useful, but WITHOUT ANY WARRANTY; without even the implied warranty |
| of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with Threading Building Blocks; if not, write to the Free Software |
| Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| |
| As a special exception, you may use this file as part of a free software |
| library without restriction. Specifically, if other files instantiate |
| templates or use macros or inline functions from this file, or you compile |
| this file and link it with other files to produce an executable, this |
| file does not by itself cause the resulting executable to be covered by |
| the GNU General Public License. This exception does not however |
| invalidate any other reasons why the executable file might be covered by |
| the GNU General Public License. |
| */ |
| |
| #include "Evolution.h" |
| |
| #ifdef USE_SSE |
| /* Update states with SSE */ |
| |
| #include <xmmintrin.h> |
| #include <emmintrin.h> |
| |
| inline void create_record( char * src, unsigned * dst, unsigned width) |
| { |
| dst[0] |= src[width - 1]; |
| for( unsigned a=0; a<31u; ++a ) |
| dst[0] |= src[a]<<(a+1); |
| unsigned a; |
| for( a=31u; a<width; ++a ) |
| dst[(a+1)/32u] |= src[a]<<((a+1)%32u); |
| dst[(a+1)/32u] |= src[0]<<((a+1)%32u); |
| } |
| |
| inline void sum_offset( __m128i * X, __m128i * A, __m128i * B, __m128i * C, |
| unsigned size_sse_ar, unsigned shift ) |
| { |
| for(unsigned i=0; i<size_sse_ar; ++i) |
| { |
| __m128i tmp = _mm_and_si128(A[i],X[shift + i]); |
| A[i]=_mm_xor_si128(A[i],X[shift + i]); |
| C[i]=_mm_or_si128(C[i],_mm_and_si128(B[i],tmp)); |
| B[i]=_mm_xor_si128(B[i],tmp); |
| } |
| } |
| |
| inline void shift_left2D( __m128i * X, unsigned height, unsigned size_sse_row ) |
| { |
| for( unsigned b=0; b<height; ++b ) |
| { |
| unsigned ind = b*size_sse_row; |
| unsigned x0 = X[ind].m128i_u32[0] & 1; |
| |
| X[ind] =_mm_or_si128( _mm_srli_epi16(X[ind],1), |
| _mm_slli_epi16( _mm_srli_si128( X[ind], 2), 15) ); |
| |
| unsigned x1 = X[ind + 1].m128i_u32[0] & 1; |
| X[ind+1] =_mm_or_si128( _mm_srli_epi16( X[ind+1],1), |
| _mm_slli_epi16( _mm_srli_si128( X[ind+1], 2), 15) ); |
| X[ind].m128i_u32[3] |= x1<<31; |
| |
| unsigned x2 = X[ind + 2].m128i_u32[0] & 1; |
| X[ind+2] =_mm_or_si128( _mm_srli_epi16( X[ind+2],1), |
| _mm_slli_epi16( _mm_srli_si128( X[ind+2], 2), 15) ); |
| X[ind+1].m128i_u32[3] |= x2<<31; |
| |
| unsigned* dst = (unsigned*)&X[ind]; |
| dst[301/32u] |= x0<<(301%32u); |
| } |
| } |
| |
| inline void shift_right2D( __m128i * X, unsigned height, unsigned size_sse_row ) |
| { |
| for( unsigned b=0; b<height; ++b ) |
| { |
| unsigned ind = b*size_sse_row; |
| |
| unsigned x0 = X[ind].m128i_u32[3]; x0>>=31; |
| X[ind] =_mm_or_si128( _mm_slli_epi16(X[ind],1), |
| _mm_srli_epi16( _mm_slli_si128( X[ind], 2), 15) ); |
| |
| unsigned x1 = X[ind + 1].m128i_u32[3]; x1>>=31; |
| X[ind + 1] =_mm_or_si128( _mm_slli_epi16(X[ind + 1],1), |
| _mm_srli_epi16( _mm_slli_si128( X[ind + 1], 2), 15) ); |
| X[ind + 1].m128i_u32[0] |= x0; |
| |
| unsigned* dst = (unsigned*)&X[ind]; |
| unsigned x2 = dst[301/32u] & (1<<(301%32u)); x2>>=(301%32u); |
| X[ind + 2] =_mm_or_si128( _mm_slli_epi16(X[ind + 2],1), |
| _mm_srli_epi16( _mm_slli_si128( X[ind + 2], 2), 15) ); |
| X[ind + 2].m128i_u32[0] |= x1; |
| X[ind].m128i_u32[0] |= x2; |
| } |
| } |
| |
| void UpdateState(Matrix * m_matrix, char * dest ,int begin, int end) |
| { |
| //300/128 + 1 =3, 3*300=900 |
| unsigned size_sse_row = m_matrix->width/128 + 1; //3 |
| unsigned size_sse_ar=size_sse_row * (end - begin); |
| __m128i X[906], A[900], B[900], C[900]; |
| char * mas = m_matrix->data; |
| |
| for( unsigned i=0; i<size_sse_ar; ++i) |
| { |
| A[i].m128i_u32[0]=0;A[i].m128i_u32[1]=0;A[i].m128i_u32[2]=0;A[i].m128i_u32[3]=0; |
| B[i].m128i_u32[0]=0;B[i].m128i_u32[1]=0;B[i].m128i_u32[2]=0;B[i].m128i_u32[3]=0; |
| C[i].m128i_u32[0]=0;C[i].m128i_u32[1]=0;C[i].m128i_u32[2]=0;C[i].m128i_u32[3]=0; |
| } |
| |
| for( unsigned i=0; i<size_sse_ar+6; ++i) |
| { |
| X[i].m128i_u32[0]=0;X[i].m128i_u32[1]=0;X[i].m128i_u32[2]=0;X[i].m128i_u32[3]=0; |
| } |
| |
| // create X[] with bounds |
| unsigned height = end - begin; |
| unsigned width = m_matrix->width; |
| for( unsigned b = 0 ; b < height; ++b ) |
| { |
| char* src = &mas[(b + begin)*width]; |
| unsigned* dst = (unsigned*)&X[(b+1)*size_sse_row]; |
| create_record(src, dst, width); |
| } |
| // create high row in X[] |
| char * src; |
| if(begin == 0) |
| { |
| src = &mas[(m_matrix->height-1)*width]; |
| } |
| else |
| { |
| src = &mas[(begin-1)*width]; |
| } |
| unsigned* dst = (unsigned*)X; |
| create_record(src, dst, width); |
| |
| //create lower row in X[] |
| if(end == m_matrix->height ) |
| { |
| src = mas; |
| } |
| else |
| { |
| src = &mas[end*width]; |
| } |
| dst = (unsigned*)&X[(height+1)*size_sse_row]; |
| create_record(src, dst, width); |
| |
| //sum( C, B, A, X+offset_for_upwards ); high-left friend |
| sum_offset(X,A,B,C,size_sse_ar, 0); |
| |
| //sum( C, B, A, X+offset_for_no_vertical_shift ); |
| sum_offset(X,A,B,C,size_sse_ar, size_sse_row); |
| |
| //sum( C, B, A, X+offset_for_downwards ); |
| sum_offset(X,A,B,C,size_sse_ar, 2*size_sse_row); |
| |
| //shift_left( X ); (when view 2D) in our logic it is in right |
| height = end - begin + 2; |
| shift_left2D( X, height, size_sse_row); |
| |
| //sum( C, B, A, X+offset_for_upwards ); high-left friend |
| sum_offset(X,A,B,C,size_sse_ar, 0); |
| |
| //sum( C, B, A, X+offset_for_downwards ); |
| sum_offset(X,A,B,C,size_sse_ar, 2*size_sse_row); |
| |
| //shift_left( X ); (view in 2D) in our logic it is right shift |
| height = end - begin + 2; |
| shift_left2D( X, height, size_sse_row); |
| |
| //sum( C, B, A, X+offset_for_upwards ); high-right friend |
| sum_offset(X,A,B,C,size_sse_ar, 0); |
| |
| //sum( C, B, A, X+offset_for_no_vertical_shift ); right friend |
| sum_offset(X,A,B,C,size_sse_ar, size_sse_row); |
| |
| //sum( C, B, A, X+offset_for_downwards ); right down friend |
| sum_offset(X,A,B,C,size_sse_ar, 2*size_sse_row); |
| |
| //shift_right( X ); (when view in 2D) in our case it left shift. |
| height = end - begin + 2; |
| shift_right2D( X, height, size_sse_row); |
| |
| //X = (X|A)&B&~C (done bitwise over the arrays) |
| unsigned shift = size_sse_row; |
| for(unsigned i=0; i<size_sse_ar; ++i) |
| { |
| C[i].m128i_u32[0] = ~C[i].m128i_u32[0]; |
| C[i].m128i_u32[1] = ~C[i].m128i_u32[1]; |
| C[i].m128i_u32[2] = ~C[i].m128i_u32[2]; |
| C[i].m128i_u32[3] = ~C[i].m128i_u32[3]; |
| X[shift + i] = _mm_and_si128(_mm_and_si128(_mm_or_si128(X[shift + i], |
| A[i]),B[i]),C[i]); |
| } |
| |
| height = end - begin; |
| width=m_matrix->width; |
| for( unsigned b=0; b<height; ++b ) |
| { |
| char* dst = &dest[(b+begin)*width]; |
| unsigned* src = (unsigned*)&X[(b+1)*size_sse_row]; |
| for( unsigned a=0; a<width; ++a ) |
| { |
| unsigned c = src[a/32u] & 1<<(a%32u); |
| dst[a] = c>>(a%32u); |
| } |
| } |
| } |
| #else |
| /* end SSE block */ |
| |
| // ---------------------------------------------------------------------- |
| // GetAdjacentCellState() - returns the state (value) of the specified |
| // adjacent cell of the current cell "cellNumber" |
| char GetAdjacentCellState( |
| char* source, // pointer to source data block |
| int x, // logical width of field |
| int y, // logical height of field |
| int cellNumber, // number of cell position to examine |
| int cp // which adjacent position |
| ) |
| { |
| /* |
| cp |
| *-- cp=1 ... --- cp=8 (summary: -1-2-3- |
| -x- -x- -4-x-5- |
| --- --* -6-7-8- ) |
| */ |
| char cellState = 0; // return value |
| |
| // set up boundary flags to trigger field-wrap logic |
| bool onTopRow = false; |
| bool onBottomRow = false; |
| bool onLeftColumn = false; |
| bool onRightColumn = false; |
| |
| // check to see if cell is on top row |
| if (cellNumber < x) |
| { |
| onTopRow = true; |
| } |
| // check to see if cell is on bottom row |
| if ((x*y)-cellNumber <= x) |
| { |
| onBottomRow = true; |
| } |
| // check to see if cell is on left column |
| if (cellNumber%x == 0) |
| { |
| onLeftColumn = true; |
| } |
| // check to see if cell is on right column |
| if ((cellNumber+1)%x == 0) |
| { |
| onRightColumn = true; |
| } |
| |
| switch (cp) |
| { |
| case 1: |
| if (onTopRow && onLeftColumn) |
| { |
| return *(source+((x*y)-1)); |
| } |
| if (onTopRow && !onLeftColumn) |
| { |
| return *(source+(((x*y)-x)+(cellNumber-1))); |
| } |
| if (onLeftColumn && !onTopRow) |
| { |
| return *(source+(cellNumber-1)); |
| } |
| return *((source+cellNumber)-(x+1)); |
| |
| case 2: |
| if (onTopRow) |
| { |
| return *(source+(((x*y)-x)+cellNumber)); |
| } |
| return *((source+cellNumber)-x); |
| |
| case 3: |
| if (onTopRow && onRightColumn) |
| { |
| return *(source+((x*y)-x)); |
| } |
| if (onTopRow && !onRightColumn) |
| { |
| return *(source+(((x*y)-x)+(cellNumber+1))); |
| } |
| if (onRightColumn && !onTopRow) |
| { |
| return *(source+((cellNumber-(x*2))+1)); |
| } |
| return *(source+(cellNumber-(x-1))); |
| |
| case 4: |
| if (onRightColumn) |
| { |
| return *(source+(cellNumber-(x-1))); |
| } |
| return *(source+(cellNumber+1)); |
| |
| case 5: |
| if (onBottomRow && onRightColumn) |
| { |
| return *source; |
| } |
| if (onBottomRow && !onRightColumn) |
| { |
| return *(source+((cellNumber-((x*y)-x))+1)); |
| } |
| if (onRightColumn && !onBottomRow) |
| { |
| return *(source+(cellNumber+1)); |
| } |
| return *(source+(((cellNumber+x))+1)); |
| |
| case 6: |
| if (onBottomRow) |
| { |
| return *(source+(cellNumber-((x*y)-x))); |
| } |
| return *(source+(cellNumber+x)); |
| |
| case 7: |
| if (onBottomRow && onLeftColumn) |
| { |
| return *(source+(x-1)); |
| } |
| if (onBottomRow && !onLeftColumn) |
| { |
| return *(source+(cellNumber-((x*y)-x)-1)); |
| } |
| if (onLeftColumn && !onBottomRow) |
| { |
| return *(source+(cellNumber+((x*2)-1))); |
| } |
| return *(source+(cellNumber+(x-1))); |
| |
| case 8: |
| if (onLeftColumn) |
| { |
| return *(source+(cellNumber+(x-1))); |
| } |
| return *(source+(cellNumber-1)); |
| } |
| return cellState; |
| } |
| |
| char CheckCell(Matrix * m_matrix, int cellNumber) |
| { |
| char total = 0; |
| char* source = m_matrix->data; |
| //look around to find cell's with status "alive" |
| for(int i=1; i<9; i++) |
| { |
| total += GetAdjacentCellState(source, m_matrix->width, m_matrix->height, cellNumber, i); |
| } |
| // if the number of adjacent live cells is < 2 or > 3, the result is a dead |
| // cell regardless of its current state. (A live cell dies of loneliness if it |
| // has less than 2 neighbors, and of overcrowding if it has more than 3; a new |
| // cell is born in an empty spot only if it has exactly 3 neighbors. |
| if (total < 2 || total > 3) |
| { |
| return 0; |
| } |
| |
| // if we get here and the cell position holds a living cell, it stays alive |
| if (*(source+cellNumber)) |
| { |
| return 1; |
| } |
| |
| // we have an empty position. If there are only 2 neighbors, the position stays |
| // empty. |
| if (total == 2) |
| { |
| return 0; |
| } |
| |
| // we have an empty position and exactly 3 neighbors. A cell is born. |
| return 1; |
| } |
| |
| void UpdateState(Matrix * m_matrix, char * dest ,int begin, int end) |
| { |
| for (int i=begin; i<=end; i++) |
| { |
| *(dest+i) = CheckCell(m_matrix, i); |
| } |
| } |
| |
| #endif |
| /* end non-SSE block */ |