26
A collection of Micro Optimizations by Alex, Gabriel & Michael

Efficient Programs

Embed Size (px)

DESCRIPTION

This is a collection of micro optimizations done for a course at the Vienna University of Technology. The code is available at: https://github.com/grill/micro-optimisations

Citation preview

Page 1: Efficient Programs

A collection of Micro Optimizations

by Alex, Gabriel & Michael

Page 2: Efficient Programs

Start

● Optimizations from hash.0.c to hash.13.c

● Performance testing: gcc -Wall -O3 hash.c -o hash; perf stat -r 5 -e instructions -e branch-misses hash input input2; perf stat -r 5 -e cycles hash input input2

● Result: Cycles: 7.292.009.385; Instructions: 1.063.178.278; Branch mispredictions: 11.395.359; Time elapsed: 2,2927 s

Page 3: Efficient Programs

Analysis

● Hashtable ○ Max. Collisions: 7 ○ Empty Elements: 363.543 ○ Amount Elements: 1.048.575 ○ Input Elements: 724.129

● Good hash table size: ~ 20% of Input ● Brent-Hashing? ● Parallelism?

Page 4: Efficient Programs

Convert Linked-Lists to Arrays

● fewer cache misses on frequently used lookup

● overhead due to reorganizing

● struct size reduced from 24 to 16 bytes due to removing *next

● faster at large lists○ break even point at HASHSIZE 2^18

Page 5: Efficient Programs

Loop peeling: lookup

if(l != NULL) {

if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)

return l->value;

l = l->next;

while (l!=NULL) {

if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)

return l->value;

l = l->next;

}

}

return -1;

Cycles: 7.255.927.875 (-0,495%)Instructions: 1.067.896.719 (+0,444%)Branch mispredictions: 11.464.124 (+0,603%)Time elapsed: 2,1613 s (-5,731%)

Page 6: Efficient Programs

Inline

inline struct block slurp(char *filename)

inline unsigned long hash(char *addr, size_t len)

inline void insert(char *keyaddr, size_t keylen, int value)

inline int lookup(char *keyaddr, size_t keylen)

Cycles: 7.265.216.080 (+0,128%)Instructions: 1.067.543.945 (-0,033%)Branch mispredictions: 11.541.050 (+0,671%)Time elapsed: 2,1672 s (+0,273%)

Page 7: Efficient Programs

Replace loop with macro

#define REPEAT10(x) { x x x x x x x x x x }

REPEAT10 (

for (p=input2.addr, endp=input2.addr+input2.len; p<endp; ) {

...

}

);

Cycles: 7.313.103.515 (+0,659%)Instructions: 1.062.596.883 (-0,463%)Branch mispredictions: 11.423.373 (-1,020%)Time elapsed: 2,1791 s (+0,549%)

Page 8: Efficient Programs

Some Minor Changes

● New macro HASHSIZE-1 ● Remove unnecessary casts

... with no effects

Page 9: Efficient Programs

Loop peeling + adjust len

inline unsigned long hash(char *addr, size_t len) {

...

if(len > 7 ) {

len = len - 7;

x = (*(unsigned long *)addr)*hashmult;

for (i=8; i<len; i+=8) {

w = *(unsigned long *)(addr+i);

x = (x + w)*hashmult;

}

len = len + 7;

}

...

Page 10: Efficient Programs

Loop peeling + adjust len

Cycles: 8.271.902.713 (+13,111%)Instructions: 1.038.690.398 (-2,250%)Branch mispredictions: 11.809.722 (+3,382%)Time elapsed: 2,4551 s (+12,668%)

=> probably faster for long strings=> changes discarded

Page 11: Efficient Programs

Pointers instead of indices

uint128_t x;

unsigned long * laddr = (unsigned long *) addr;

unsigned long * end = (unsigned long *) (addr+len);

if(len > 7 ) {

x = *laddr * hashmult;

end--;

for (laddr++; laddr <= end; laddr++) {

x = (x + *laddr)*hashmult;

}

if (laddr < (end+1))

x = ( x + ((*laddr)<< ( ((char*)laddr - (char*)end)*8)) ) * hashmult;

return x+(x>>64);

} else if (laddr < end) {

x = (uint128_t)((*laddr)<<((8-len)*8)) * hashmult;

return x+(x>>64);

}

return 0;

Page 12: Efficient Programs

Pointers instead of indices

Cycles: 8.253.559.129 (+12,860%) Instructions: 1.021.822.315 (-3,837%) Branch mispredictions: 11.825.252 (+3,518%) Time elapsed: 2,4558 s (+12,700%)

=> probably faster for long strings => changes discarded

Page 13: Efficient Programs

Improve loop-layout

for (p=input1.addr, endp=input1.addr+input1.len, i=0; p<endp; i++) {

nextp=memchr(p, '\n', endp-p);

if (nextp == NULL)

break;

...

}

------------------------------------------------

for (p=input.addr, endp=input.addr+input.len, r=0,

nextp=memchr(p, '\n', endp-p); nextp != NULL;

r++, nextp=memchr(p, '\n', endp-p)) {

...

}

Page 14: Efficient Programs

Improve loop-layout

Cycles: 7.364.723.755 (+0,705%) Instructions: 1.072.512.560 (+0,933%) Branch mispredictions: 11.606.354 (+1,601%) Time elapsed: 2,2509 s (+3,294%)

=> "if" and "&&" probably similar instructions in this case

Page 15: Efficient Programs

for (p=input.addr, endp=input.addr+input.len, r=0,

nextp=memchr(p, '\n', endp-p); nextp != NULL;

r++, nextp=memchr(p, '\n', endp-p)) {

...

}

struct block input1, input2; struct block input;

unsigned int i; unsigned long r=0;

unsigned long r=0;

Remove unnecessary check

Remove unnecessary variables

Page 16: Efficient Programs

Cycles: 7323904385 (-0,554%)Instructions: 1064977111 (-0,702%)Branch mispredictions: 11734428 (+1,103%)Time elapsed: 2,2129 s (-1,688%)

Remove unnecessary check & variables

Page 17: Efficient Programs

● Idea: ○ replace '\0' with '\n' at the end ○ use rawmemchr without length check instead ○ safe compares

endp=input1.addr+input1.len;

*endp = '\n';

for (p=input1.addr, i=0, nextp=rawmemchr(p, '\n'); p<endp ; i++) {

nextp=rawmemchr(p, '\n');

insert(p, nextp-p, i);

p = nextp+1;

}

Sentinel with rawmemchr

Page 18: Efficient Programs

endp=input.addr+input.len;

*endp = '\n';

p=input.addr;

nextp = p;

for (r=0; nextp<endp; r++) {

for(;*nextp ^ '\n'; nextp++);

insert(p, nextp-p, r);

nextp++;

p = nextp;

}

Cycles: 7.400.275.087 (+1,042%)Instructions: 1.157.591.866 (+8,696%)Branch mispredictions: 11.715.914 (-0,158%)Time elapsed: 2,2064 s (-0,293%)

Sentinel self made rawmemchr

Page 19: Efficient Programs

/*
 * Byte-wise equality test for the first len bytes of in1 and in2 —
 * a replacement for memcmp(a, b, len) == 0 in the lookup hot path.
 *
 * Returns 1 when the first len bytes match, 0 otherwise.
 *
 * Fixes versus the original:
 *  - `static inline`: a plain `inline` definition provides no external
 *    definition in C99/C11, so any call the compiler does not inline
 *    fails to link.
 *  - len <= 0 guard: the do-while executed its body at least once,
 *    reading one byte past each buffer for empty keys (out-of-bounds
 *    read, and two equal empty keys could wrongly compare unequal).
 */
static inline int mycmp(char* in1, char* in2, int len){
    if (len <= 0)
        return 1;               /* empty ranges are trivially equal */
    do {
        if (*in1 ^ *in2)        /* XOR nonzero <=> bytes differ */
            return 0;
        in1++; in2++; len--;
    } while (len > 0);
    return 1;
}

if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen))

Cycles: 5.826.523.410 (-21,266%)Instructions: 1.913.851.749 (+65,330%)Branch mispredictions: 14.810.147 (+26,410%)Time elapsed: 1,7366 s (-21,292%)

Faster memcmp

Page 20: Efficient Programs

*(keyaddr+keylen) = 0; // FROM INSERT

/*
 * Sentinel-based comparison: walks both buffers until the first
 * differing byte, with no length check inside the loop.
 *
 * Precondition (established by the caller): each buffer is followed by
 * a distinct sentinel byte — insert() writes *(keyaddr+keylen) = 0,
 * while the other side ends at the '\n' record separator — so the loop
 * is guaranteed to stop at or before position len.
 * NOTE(review): if both buffers carried the SAME byte at the sentinel
 * position, the loop would run past the end — verify the callers keep
 * the '\0' / '\n' sentinels distinct.
 *
 * Returns 0 on a full match (all len bytes consumed), non-zero
 * (remaining length) on a mismatch — same convention as memcmp()==0
 * tests via !mycmp(...).
 *
 * Fix versus the original: `static inline` — a plain `inline`
 * definition provides no external definition in C99/C11, so any call
 * the compiler does not inline fails to link.
 */
static inline int mycmp(char* in1, char* in2, int len){
    while (*in1 == *in2) {
        in1++; in2++; len--;
    }
    return len;
}

if (keylen==l->keylen && !mycmp(l->keyaddr, keyaddr, keylen)) // FROM LOOKUP

Cycles: 5.766.254.891 (-22,080%)Instructions: 1.747.135.165 (+50,928%)Branch mispredictions: 14.772.984 (+26,093%)Time elapsed: 1,7182 s (-22,126%)

Faster memcmp with Sentinel

Page 21: Efficient Programs

int *cache = malloc(size*sizeof(int));

int *startcache, *endcache;

startcache = cache;

endcache = startcache + size;

while(nextp<endp) {

if (cache >= endcache){

size = size<<1;

cache = realloc(cache, size*sizeof(int));

}

for(;*nextp ^ '\n'; nextp++);

*cache = lookup(p, nextp-p);

r = r * 2654435761L + *cache;

r = r + (r>>32);

cache++; nextp++; p = nextp;

}

Caching

endcache = cache;

REPEAT9 (

cache = startcache;

while (cache < endcache) {

r = r * 2654435761L + *cache;

r = r + (r>>32);

cache++;

} );

Page 22: Efficient Programs

Cycles: 925.886.063 (-84,109%)Instructions: 494.630.615 (-74,155%)Branch mispredictions: 2.395.446 (-83,825%)Time elapsed: 0,2847 s (-83,603%)

Caching + memcmp

Page 23: Efficient Programs

Cycles: 925.783.880 (-84,110%)Instructions: 475.125.520 (-75,172%)Branch mispredictions: 2.418.936 (-83,659%)Time elapsed: 0,2839 s (-83,738%)

Caching + memcmp with Sentinel

Page 24: Efficient Programs

int size = input.len/6; int size = input.len/2;

...

if (cache >= endcache){

size = size<<1;

cache = realloc(cache, size*sizeof(int));

}

...

Cycles: 930.929.061 (+0,544%)Instructions: 475.676.977 (-3,831%)Branch mispredictions: 2.384.999 (-0,436%)Time elapsed: 0,2830 s (-0,586%)

Approximation of cache size

Page 25: Efficient Programs

Cycles: 930.929.061 (-87,233%) Instructions: 475.676.977 (-55,259%) Branch mispredictions: 2.384.999 (-79,070%) Time elapsed: 0,2830 s (-87,656%)

Overall

Page 26: Efficient Programs

Any Questions?

Code available: https://github.com/grill/micro-optimisations

Fin