This is the memcmp() I wrote:
Code:
int memcmp_vec_aligned(const void *s1, const void *s2, size_t len) {
    unsigned char* s1first = s1;    
    unsigned char* s2first = s2;
    vector unsigned char s1vector, s2vector; 
    //printf("s1first = %x\ts2first = %x\tlen = %ld\n", s1first, s2first, len);
    while (len >= 16) {
        s1vector = vec_ld(0, s1first);
        s2vector = vec_ld(0, s2first);
        if (vec_any_ne(s1vector, s2vector)) {
            //printf("Vectors not equal, calling memcmp\n");
            return memcmp(s1first, s2first, 16);
        }
        s1first += 16;
        s2first += 16;
        len -= 16;
        //printf("s1first = %x\ts2first = %x\tlen = %ld\n", s1first, s2first, len);
    }
    
    if (len > 0) {
        //printf("Stuff left, calling memcmp()\n");
        return memcmp(s1first, s2first, len);
    }
    return 0;
}
/*
 * memcmp() replacement that compares 16 bytes per step with AltiVec.
 *
 * Strategy:
 *   1. Inputs of at most MEMCMP_THRESHOLD bytes go straight to memcmp().
 *   2. The leading bytes up to the next 16-byte boundary of s1 are compared
 *      with memcmp(), after which s1 is 16-byte aligned.
 *   3. If s2 is then aligned as well, memcmp_vec_aligned() does the rest.
 *   4. Otherwise each 16-byte chunk of s2 is assembled from the two aligned
 *      quadwords that straddle it (vec_ld + vec_lvsl + vec_perm) and compared
 *      against an aligned load from s1.
 *   5. Any tail shorter than 16 bytes is compared with memcmp().
 *
 * s1, s2: buffers to compare (only read, never written).
 * len:    number of bytes to compare.
 *
 * Returns 0 when the first len bytes are equal; otherwise the value memcmp()
 * yields for the first chunk in which the buffers differ.
 */
int memcmp_vec(const void *s1, const void *s2, size_t len) {

    if (len <= MEMCMP_THRESHOLD) {
        return memcmp(s1, s2, len);
    }

    const unsigned char *s1first = s1;
    const unsigned char *s2first = s2;

    // Number of bytes (0..15) needed to bring s1first up to the next 16-byte
    // boundary. Note this is 16 - (addr & 15), not (addr & 15) itself.
    // size_t is used for the pointer-to-integer conversion so that nothing
    // is truncated on 64-bit targets; only the low four bits matter.
    size_t head = (16 - ((size_t)s1first & 15)) & 15;

#ifdef DEBUG
    printf("srcfirst = %p\tsrclast = %p\n",
           (const void *)s1first, (const void *)(s1first + len - 1));
    printf("dstfirst = %p\tdstlast = %p\n",
           (const void *)s2first, (const void *)(s2first + len - 1));
    printf("s1offset = %lu\ts2offset = %lu\n",
           (unsigned long)((size_t)s1first & 15),
           (unsigned long)((size_t)s2first & 15));
#endif

    if (head != 0) {
        // Guard against a threshold smaller than the alignment gap, which
        // would otherwise read past len and underflow it below.
        if (head > len) {
            head = len;
        }

        // Compare the few unaligned bytes at the beginning with memcmp().
        int result = memcmp(s1first, s2first, head);
        if (result) {
            return result;
        }

        // Advance both pointers; s1first is now 16-byte aligned.
        s1first += head;
        s2first += head;
        len -= head;
    }

    // If s2 ended up aligned too, no permute is needed.
    if (((size_t)s2first & 15) == 0) {
        return memcmp_vec_aligned(s1first, s2first, len);
    }

    if (len >= 16) {
        // s2first advances in steps of 16, so its misalignment and hence the
        // permute mask are the same for every iteration: compute them once.
        const vector unsigned char mask = vec_lvsl(0, s2first);

        // Aligned quadword containing the first byte of the current chunk.
        vector unsigned char MSQ = vec_ld(0, s2first);

        do {
            // Aligned quadword containing the last byte of the current chunk.
            vector unsigned char LSQ = vec_ld(15, s2first);

            if (vec_any_ne(vec_ld(0, s1first), vec_perm(MSQ, LSQ, mask))) {
                return memcmp(s1first, s2first, 16);
            }

            // Because s2first is misaligned, the quadword holding the end of
            // this chunk is also the one holding the start of the next chunk,
            // so it is reused instead of being loaded a second time.
            MSQ = LSQ;

            s1first += 16;
            s2first += 16;
            len -= 16;
        } while (len >= 16);
    }

    // Fewer than 16 bytes remain: finish with the scalar routine.
    if (len > 0) {
        return memcmp(s1first, s2first, len);
    }
    return 0;
}
And these are the results I got:
Code:
$ ./altivectorize -v -s -g --tests memcmp --norandom --loops 1000000
Altivec is supported
Verbose mode on
Will do both scalar and vector tests
Will also do glibc tests
loops: 1000000
output file:
will do tests: memcmp
#size   arrays  glibc                   altivec (Effective bandwidth)
7       599186  0.020 (333.8 MB/s)      0.100 (66.8 MB/s) (0.2x)
13      325000  0.030 (413.3 MB/s)      0.130 (95.4 MB/s) (0.2x)
16      262144  0.020 (762.9 MB/s)      0.130 (117.4 MB/s) (0.2x)
20      209715  0.030 (635.8 MB/s)      0.170 (112.2 MB/s) (0.2x)
27      155344  0.040 (643.7 MB/s)      0.190 (135.5 MB/s) (0.2x)
35      119837  0.040 (834.5 MB/s)      0.140 (238.4 MB/s) (0.3x)
43      97542   0.050 (820.2 MB/s)      0.190 (215.8 MB/s) (0.3x)
54      77672   0.040 (1287.5 MB/s)     0.160 (321.9 MB/s) (0.2x)
64      65536   0.040 (1525.9 MB/s)     0.140 (436.0 MB/s) (0.3x)
90      46603   0.060 (1430.5 MB/s)     0.200 (429.2 MB/s) (0.3x)
128     32768   0.070 (1743.9 MB/s)     0.160 (762.9 MB/s) (0.4x)
185     22672   0.090 (1960.3 MB/s)     0.210 (840.1 MB/s) (0.4x)
256     16384   0.120 (2034.5 MB/s)     0.210 (1162.6 MB/s) (0.6x)
347     12087   0.170 (1946.6 MB/s)     0.300 (1103.1 MB/s) (0.6x)
512     8192    0.250 (1953.1 MB/s)     0.320 (1525.9 MB/s) (0.8x)
831     5047    0.390 (2032.1 MB/s)     0.520 (1524.0 MB/s) (0.8x)
2048    2048    0.920 (2123.0 MB/s)     1.150 (1698.4 MB/s) (0.8x)
3981    1053    1.990 (1907.8 MB/s)     2.570 (1477.3 MB/s) (0.8x)
8192    512     3.630 (2152.2 MB/s)     5.060 (1544.0 MB/s) (0.7x)
13488   311     6.250 (2058.1 MB/s)     10.020 (1283.7 MB/s) (0.6x)
16384   256     7.940 (1967.9 MB/s)     10.630 (1469.9 MB/s) (0.7x)
Now, I really can't understand why I get such abysmal performance. 
