This is the memcmp() I wrote:
Code:
/*
 * Compare len bytes of s1 and s2, 16 bytes at a time, using AltiVec loads.
 *
 * Precondition: both s1 and s2 must be 16-byte aligned. vec_ld ignores the
 * low four bits of the effective address, so an unaligned pointer would
 * silently load from the preceding 16-byte boundary instead.
 *
 * Returns 0 when the buffers are equal; otherwise the value memcmp() yields
 * for the first 16-byte chunk (or the trailing partial chunk) that differs,
 * so the sign convention is that of memcmp().
 */
int memcmp_vec_aligned(const void *s1, const void *s2, size_t len) {
    /* Keep the const qualifier of the arguments; the buffers are only read. */
    const unsigned char *s1first = s1;
    const unsigned char *s2first = s2;

    while (len >= 16) {
        vector unsigned char s1vector = vec_ld(0, s1first);
        vector unsigned char s2vector = vec_ld(0, s2first);

        if (vec_any_ne(s1vector, s2vector)) {
            /* The vectors only tell us "different"; let memcmp() work out
             * the ordering of this one chunk. */
            return memcmp(s1first, s2first, 16);
        }
        s1first += 16;
        s2first += 16;
        len -= 16;
    }

    /* Fewer than 16 bytes remain: finish with the scalar routine. */
    if (len > 0) {
        return memcmp(s1first, s2first, len);
    }
    return 0;
}
/*
 * memcmp() replacement that compares 16 bytes per iteration with AltiVec.
 *
 * Strategy:
 *   1. Inputs of at most MEMCMP_THRESHOLD bytes go straight to memcmp().
 *   2. If both pointers are 16-byte aligned, defer to memcmp_vec_aligned().
 *   3. Otherwise compare the leading bytes with memcmp() until s1 reaches a
 *      16-byte boundary. If s2 happens to be aligned at that point too, use
 *      memcmp_vec_aligned(); if not, s1 is read with plain aligned loads
 *      and s2 with the two-load + vec_perm realignment sequence.
 *   4. Any tail shorter than 16 bytes is finished with memcmp().
 *
 * Returns 0 when the buffers are equal, otherwise the memcmp() result for
 * the first chunk that differs.
 */
int memcmp_vec(const void *s1, const void *s2, size_t len) {
    if (len <= MEMCMP_THRESHOLD) {
        return memcmp(s1, s2, len);
    }

    /* Byte views of the inputs; const is kept because we only read. */
    const unsigned char *s1first = s1;
    const unsigned char *s2first = s2;

    /* Misalignment of each pointer within its 16-byte block. The cast goes
     * through size_t rather than int so the address is not narrowed to a
     * signed 32-bit value on 64-bit targets. */
    size_t s1offset = (size_t)s1first & 15;
    size_t s2offset = (size_t)s2first & 15;

#ifdef DEBUG
    printf("srcfirst = %p\tsrclast = %p\n",
           (const void *)s1first, (const void *)(s1first + len - 1));
    printf("dstfirst = %p\tdstlast = %p\n",
           (const void *)s2first, (const void *)(s2first + len - 1));
    printf("s1offset = %zu\ts2offset = %zu\n", s1offset, s2offset);
#endif

    if (s1offset == 0 && s2offset == 0) {
        /* Both buffers are 16-byte aligned, just compare the aligned values. */
        return memcmp_vec_aligned(s1first, s2first, len);
    }

    /* Number of bytes needed to bring s1 up to the NEXT 16-byte boundary
     * (0 if it is already there). Advancing by s1offset itself would not
     * align the pointer, and vec_ld would then load from the wrong place. */
    size_t head = (16 - s1offset) & 15;
    if (head > len) {
        /* Only possible when MEMCMP_THRESHOLD is smaller than 15. */
        head = len;
    }
    if (head > 0) {
        /* Use standard memcmp to compare the few bytes at the beginning. */
        int result = memcmp(s1first, s2first, head);
        if (result) {
            return result;
        }
        s1first += head;
        s2first += head;
        len -= head;
    }

    /* s1first is now 16-byte aligned; check whether s2first ended up
     * aligned as well, in which case no realignment is needed. */
    if (((size_t)s2first & 15) == 0) {
        return memcmp_vec_aligned(s1first, s2first, len);
    }

    /* The permute mask depends only on the low four bits of s2first, which
     * do not change while the pointer advances in steps of 16, so build it
     * once outside the loop. */
    vector unsigned char mask = vec_lvsl(0, s2first);

    while (len >= 16) {
        vector unsigned char MSQ = vec_ld(0, s2first);  /* most significant quadword */
        vector unsigned char LSQ = vec_ld(15, s2first); /* least significant quadword */

        if (vec_any_ne(vec_ld(0, s1first), vec_perm(MSQ, LSQ, mask))) {
            /* Let memcmp() determine the ordering of the differing chunk. */
            return memcmp(s1first, s2first, 16);
        }
        s1first += 16;
        s2first += 16;
        len -= 16;
    }

    /* Fewer than 16 bytes remain: finish with the scalar routine. */
    if (len > 0) {
        return memcmp(s1first, s2first, len);
    }
    return 0;
}
And these are the results I got:
Code:
$ ./altivectorize -v -s -g --tests memcmp --norandom --loops 1000000
Altivec is supported
Verbose mode on
Will do both scalar and vector tests
Will also do glibc tests
loops: 1000000
output file:
will do tests: memcmp
#size arrays glibc altivec (Effective bandwidth)
7 599186 0.020 (333.8 MB/s) 0.100 (66.8 MB/s) (0.2x)
13 325000 0.030 (413.3 MB/s) 0.130 (95.4 MB/s) (0.2x)
16 262144 0.020 (762.9 MB/s) 0.130 (117.4 MB/s) (0.2x)
20 209715 0.030 (635.8 MB/s) 0.170 (112.2 MB/s) (0.2x)
27 155344 0.040 (643.7 MB/s) 0.190 (135.5 MB/s) (0.2x)
35 119837 0.040 (834.5 MB/s) 0.140 (238.4 MB/s) (0.3x)
43 97542 0.050 (820.2 MB/s) 0.190 (215.8 MB/s) (0.3x)
54 77672 0.040 (1287.5 MB/s) 0.160 (321.9 MB/s) (0.2x)
64 65536 0.040 (1525.9 MB/s) 0.140 (436.0 MB/s) (0.3x)
90 46603 0.060 (1430.5 MB/s) 0.200 (429.2 MB/s) (0.3x)
128 32768 0.070 (1743.9 MB/s) 0.160 (762.9 MB/s) (0.4x)
185 22672 0.090 (1960.3 MB/s) 0.210 (840.1 MB/s) (0.4x)
256 16384 0.120 (2034.5 MB/s) 0.210 (1162.6 MB/s) (0.6x)
347 12087 0.170 (1946.6 MB/s) 0.300 (1103.1 MB/s) (0.6x)
512 8192 0.250 (1953.1 MB/s) 0.320 (1525.9 MB/s) (0.8x)
831 5047 0.390 (2032.1 MB/s) 0.520 (1524.0 MB/s) (0.8x)
2048 2048 0.920 (2123.0 MB/s) 1.150 (1698.4 MB/s) (0.8x)
3981 1053 1.990 (1907.8 MB/s) 2.570 (1477.3 MB/s) (0.8x)
8192 512 3.630 (2152.2 MB/s) 5.060 (1544.0 MB/s) (0.7x)
13488 311 6.250 (2058.1 MB/s) 10.020 (1283.7 MB/s) (0.6x)
16384 256 7.940 (1967.9 MB/s) 10.630 (1469.9 MB/s) (0.7x)
Now, I really can't understand why I get such abysmal performance.