For this case study let’s try to auto-vectorize an implementation of Abelian sandpile model

void stabilize() {
  while(1) {
    int spills = 0;

    for (size_t y = 1; y <= pixel; ++y) {
      for (size_t x = 1; x <= pixel; ++x) {
          const int pos = y * points + x;
          char currSand = buffer[pos];
          char newSand = currSand >= 4 ? currSand - 4 : currSand;
          spills += currSand >= 4;

          newSand = newSand + buffer[pos - points] >= 4;
          newSand = newSand + buffer[pos + points] >= 4;
          newSand = newSand + buffer[pos-1] >= 4;
          newSand = newSand + buffer[pos+1] >= 4;

          state[pos] = newSand;
      }
    }

    std::swap(buffer, state);
    if (!spills) {
      return;
    }
  }
}

Explore this example in [Godbolt]https://godbolt.org/#z:OYLghAFBqd5TKALEBjA9gEwKYFFMCWALugE4A0BIEAZgQDbYB2AhgLbYgDkAjF%2BTXRMiAZVQtGIHgCYBQogFUAztgAKAD24AGfgCsp5eiyahUAUmkAhC5fIrGqIgSHVmmAMLp6AVzZMDbgAyBEzYAHK%2BAEbYpFIA7OQADuhKxM5Mnj5%2BBsmpTkLBoRFs0bE8CfbYjukiRCykRJm%2B/jx22A75TLX1RIXhUTHxdnUNTdmtSiO9If0lg%2BUAlHbo3qSonFwWAMwhqD44ANRmW%2B6TpCHAx7hmWgCCGEyTByFEB4kE6u1HWwAiBzzHdyAgAcx2sdweTxeb3QLyU3z%2BAJOILBN1uIXoMwOqQAXtgAPqvUjYJReABu2AAkph1BBcQTXgBPcjYgh4wkHdQLI5xSwHYlEVZMA6Mg4AKhhcKOVk5YJ5PzRaLJsMwBxoTAgqCQ9QlkW8NBoMRZWp12LqRGw3LMvLRB2ewmx73o9Hhxz%2BWlRd1tarIBzpbIZIoR/zlosBbreH3acpsNkZVptdztdu2iVILGAbBYB3QbESrLYmG9dsEpD99I56mDSL5VfDv0jn3oMasNi5PPBt2TyZNZdQq1IImMqojeoNMTMAFZLMTSfQKdTaczOQspwqtp3u3bewdQgB3IdMEcN/ukQfD77XBsAFm%2BADEDqfz0eDgBaA63kCPgeHosb4vJkoToutK1gnj%2BF5XBG16el2W4APTwQcIjATmFJljQpC5ru2AEMASCRCspBKABdr7r%2BoGjvqhqkFOM4kuSVI0lAorvjwSwrmul7QbBpFmiwFp0bOjGLhAy5clxEbkcOsHdtaCpJvKipxD8XBLPQ3CTvw/hcDo5DoNwQKtjKpKrOs0pbHw5BENoalLAA1iAWxbAAdAAbFoWzXtePBbHEbmTpOWhubIGlcNe/BsCAPkuQAnNIcRxDwWjAtezlxFsk7kDpekGVw/BKCAWjWbZSxwLAKAYDg%2BDEGQlDUHQjCsBw3BWYIwhiBInA8NecjCMoaiaLpeggKFximLGVhtB06SuEeYwtFs5BBDMxSlFIxW5GkQgLVIS1bZ0fRrfMxWVNUQjdKMXjNHt01VJ0l3TEUAxlKdUy7b5ww9EdL0bUsplrBs2y7Ps2DfKcRDnCYUEQkIUIOu8TbVvW7igv%2BsOPK80LJFKEZIkCJxo52GJYhWRIMfOTG0mTIosjT7bWnyApCkGEo48IroyuoMYqYqdzKgQqrqpq2qkLq1FGo%2BosSpMAmWh23rQkBDAgRGHro7c3qluWAYcmGDY1kG9Z/Ij0YbpRVjxgriklr6/rsq8dYG3KTsEybUbNubk2WAziZwd2O5PhRVHjrR07CZToniauk7rpuW47tJL4RkHkFbFefy3scD6py%2B76ft%2BZ6/rJW7K86nNgX8ufHhnH4l92iHIah6DoWqWFsDheEEURJE28mScjq2DZjjRQkUwuzFiW%2B/wcRJsfcTevF93asuCeH49U2JLJz38UnYAeMka3JvOKfJymqepmnabZ%2BmGd72JEeZ2zSPwNnDQsSxINgLA4LEECX%2BFSKIBJzAhcloOI0hpDAlijwScbkQpuWvLFNy2Ub55QKkVEq78HKjWkC5IK0CeAhWBAFOIPlrxZTClsa%2Bw1b75SwToMqiByrIDQLmJ0RoqCanYQwQYwAeAyAEAwC0xFqCRBvpEEI9RGStX4BgNgHBhAAHkmD0BkbQnAWYTCSA0QQYk1QKSFVodgT4/YLSyMoMIdoN9MSRHTKQRkngcA30hgQKKvA1ICCMMAJQAA1Ag%2B8lGJGYBY9qohxCSB6n1RQKgNA330K0IwJg0De0MAQSIhVIBLHQIkToRjXxKK2G%2BdU6BXyQ2wNgV8FJHBkADMUnJzA8wFXaPdWaEA3AfWWkeH6cwyhJBSNtDI11xj9LyOkHp60JgtPOl0d6wyWh3RmY9CZ8wvpXSyAs2WDQVllH%2Bo/bq1liQbD4AArSqDaF5XUCQ18iCDjAFQKgf4PAXLSD9EZawU0Dg1RIGWbYrQDieDzLw350hLILFfqVHBSCXIQNioQ2KcC4jAgSok7gEVyBRSCuc3K3AMHFTfow8gLCICVR4YwCgXD5EcNiPwwRjURGFQgOI2hkjWAOIsfIxRRAVFqJvpo8aOi9KEH0U4QxN8TFVG8OYjxliLRhT0rY%2BxjisAbD0q49xJyvEZj8QEvcQSQkyrCZ1SJvUwkDTibQ/QY1knmGMrYWxmT/76VyekfJhTmkzRcO0%2Ba8yAjdNWr9VoB10idODQUANvSDBnQenMjZUbpkxu%2BhGyZazGi%2BomFMHZUg9lmU4LIcpxzPFhTOTlfglzrm3PuY8gRLy3mpK%2BYQH5FlZAArJTEZt4KGF2XIF/H%2BgwnVhXRZi4qpa6F4q7R/cgjlpCxXcvFNyMhYoZUSiQyh3BqHYrLbiidRLmEoEIAaeqtB5DGu6qa%2BQ5qhoKqQIVRJN6D00CIIyYJmDSA3tGuQN9SgH1PpfVoU5NCcVcB%2BAQA0Bx/H73bVctyNzbxVqebWsk8JoOwbuQ8hDL8J1QriDCuF144jgK2Mg/D0hYGGDRYBrd9DCr4sheRrgmHR3oKw%2BQdCqQXDXiAA%3D%3D%3D) or here

Restructuring for auto vectorization

const int pixel = 1<<8;
const int points = 1<<8;

void stabilize(char* buffer, char* state) {
  int spills = 0;

    for (size_t y = 1; y <= pixel; ++y) {
      for (size_t x = 1; x <= pixel; ++x) {
          const int pos = y * points + x;
          char currSand = buffer[pos];
          char newSand = currSand >= 4 ? currSand - 4 : currSand;
          spills += currSand >= 4;

          newSand = newSand + buffer[pos - points] >= 4;
          newSand = newSand + buffer[pos + points] >= 4;
          newSand = newSand + buffer[pos-1] >= 4;
          newSand = newSand + buffer[pos+1] >= 4;

          state[pos] = newSand;
      }
    }

    std::swap(buffer, state);
    if (!spills) {
      return;
    }
}

To make this reable I’m going to be extracting the y*points + x as an inline function

const int pixel = 1<<8;
const int points = 1<<8;
inline size_t resolveIdx(size_t y, size_t x) { return y * points + x; }

void stabilize(char* buffer, char* state) {
  int spills = 0;

  for (size_t y = 1; y <= pixel; ++y) {
    for (size_t x = 1; x <= pixel; ++x) {
      char currSand = buffer[resolveIdx(y, x)];
      char newSand = currSand >= 4 ? currSand - 4 : currSand;
      spills += currSand >= 4;
      // Spill over from neighbours
      newSand += buffer[resolveIdx((y - 1), x)] >= 4;
      newSand += buffer[resolveIdx((y + 1), x)] >= 4;
      newSand += buffer[resolveIdx(y, (x - 1))] >= 4;
      newSand += buffer[resolveIdx(y, (x + 1))] >= 4;

      state[resolveIdx(y, x)] = newSand;
    }
  }
}

OpenMP

const int pixel = 1<<8;
const int points = 1<<8;
inline size_t resolveIdx(size_t y, size_t x) { return y * points + x; }

void stabilize(char* buffer, char* state) {
  int spills = 0;

  for (size_t y = 1; y <= pixel; ++y) {
    #pragma omp simd
    for (size_t x = 1; x <= pixel; ++x) {
      char currSand = buffer[resolveIdx(y, x)];
      char newSand = currSand >= 4 ? currSand - 4 : currSand;
      spills += currSand >= 4;
      // Spill over from neighbours
      newSand += buffer[resolveIdx((y - 1), x)] >= 4;
      newSand += buffer[resolveIdx((y + 1), x)] >= 4;
      newSand += buffer[resolveIdx(y, (x - 1))] >= 4;
      newSand += buffer[resolveIdx(y, (x + 1))] >= 4;

      state[resolveIdx(y, x)] = newSand;
    }
  }
}

Encapsulating the function as a class

const size_t pixel = 1 << 8;
const size_t points = pixel;
const size_t size = points * points;

class Sandpile {
  void stabilize(char *state, char *buffer);
  inline size_t resolveIdx(size_t y, size_t x) { return y * points + x; }
};

void Sandpile::stabilize(char *state, char *buffer) {
    size_t spills = 0;
    for (size_t y = 1; y <= pixel; ++y) {
      #pragma omp simd
      for (size_t x = 1; x <= pixel; ++x) {
        const int pos = y * points + x;
        char currSand = buffer[pos];
        char newSand = currSand >= 4 ? currSand - 4 : currSand;
        spills += currSand >= 4;
        // Spill over from neighbours
        newSand = newSand + buffer[pos - points] >= 4;
        newSand = newSand + buffer[pos + points] >= 4;
        newSand = newSand + buffer[pos-1] >= 4;
        newSand = newSand + buffer[pos+1] >= 4;

        state[pos] = newSand;
      }
    }

    // print();
    std::swap(buffer, state);
    if (!spills) {
      return;
    }
}

For some reason with the signature I wasn’t able to auto vectorize it

// Work
class Sandpile {
  void stabilize(char *state, char *buffer); 
  inline size_t resolveIdx(size_t y, size_t x) { return y * points + x; }
};

// Does not allow for vectorization
class Sandpile {
  char *state;
  char *buffer;
  void stabilize();
  inline size_t resolveIdx(size_t y, size_t x) { return y * points + x; }
};


Final auto vectorized verison.

const size_t pixel = 1 << 8;
const size_t points = pixel;
const size_t size = points * points;

class Sandpile {
  vector<char> _state;
  vector<char> _buffer;
private:
  inline size_t resolveIdx(size_t y, size_t x) { return y * points + x; }
  // __restrict__ as we know there will be no overlap
  void stabilize(char *__restrict__ state, char *__restrict__ buffer) {
    size_t spills = 0;
    for (size_t y = 1; y <= pixel; ++y) {
      #pragma omp simd
      for (size_t x = 1; x <= pixel; ++x) {
        const int pos = y * points + x;
        char currSand = buffer[pos];
        char newSand = currSand >= 4 ? currSand - 4 : currSand;
        spills += currSand >= 4;
        // Spill over from neighbours
        newSand = newSand + buffer[pos - points] >= 4;
        newSand = newSand + buffer[pos + points] >= 4;
        newSand = newSand + buffer[pos-1] >= 4;
        newSand = newSand + buffer[pos+1] >= 4;

        state[pos] = newSand;
      }
    }

    // print();
    std::swap(buffer, state);
    if (!spills) {
      return;
    }
  }

  void computeIdentity() {
    // f(ones(n)*6 - f(ones(n)*6)
    char* buffer = _buffer.data();
    char* state = _state.data();

    for (size_t y = 1; y <= pixel; ++y) {
      for (size_t x = 1; x <= pixel; ++x) {
          const int pos = y * points + x;
        buffer[resolveIdx(y, x)] = 6;
      }
    }

    stabilize(buffer, state);

    for (size_t y = 1; y <= pixel; ++y) {
      for (size_t x = 1; x <= pixel; ++x) {
        buffer[resolveIdx(y, x)] = 6 - state[resolveIdx(y, x)];
      }
    }

    stabilize(buffer, state);
  }

};