/* ex: set ts=2 et: */ /* * Copyright 2008 Ryan Flynn * * www.parseerror.com * * Mon Nov 24 15:49:08 EST 2008 * * How Fast Can We Reverse a String? * * NOTE: run like so and send me all the results: * $ cat /proc/cpuinfo # or equivalent * $ cc -std=c99 -W -Wall -pedantic -O3 -o strrev strrev.c * $ ./strrev * * I stumbled across this exercise in Palindromic Numbers from * by way of MathWorld's * list of Unsolved Problems * * * The algorithm involves taking the base-10 digits of a number 123 -> [1,2,3] * reversing them Rev([1,2,3]) -> [3,2,1] * and then adding them to the original number [1,2,3]+[3,2,1] -> [4,4,4] * until one gets a palindromic number. * * Most numbers yield palindromic numbers quickly using this method, but a * few do not. * * I built a program to search for a palindromic result for the number * 196. I am now trying to make it run faster, and as the function spends * 99% of its time reversing and adding arrays of integers together I thought * I would try to find a faster way to reverse the contents of a string. * * So the goal is to develop a string reversal function SR for which * * SR("", 0) -> "" * SR("a", 1) -> "a" * SR("ab", 2) -> "ba" * SR("abc", 3) -> "cba" * * and so on. * */ #include #include #include #include /** * this is the first thing that came to my mind. * use only one index and a halfway marker. * it works perfectly well and is easy to understand. */ void obvious(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * work one ascending index forwards, one at the end working frontwards, * copy their bytes until they meet * * this function fits everything we care about into x86 registers, * and yet it is not faster than 'obvious'... why? * * the inner loop consists of 4 movs, 1 inc and 1 dec. the one-byte * reads must be saturating our memory bus */ void obvious_twoindex(char *dst, const char *src, unsigned len) { if (len) { long i = 0, j = len - 1; while (i <= j) { dst[i] = src[j]; dst[j] = src[i]; i++, j--; } } } /** * use only pointers, no indexes */ void obvious_pointer(char *dst, const char *src, unsigned len) { if (len) { char *d = dst + (len-1); const char *s = src + (len-1); while (dst <= d) { *dst++ = *s--; *d-- = *src++; } } } /** * check if the bytes are different before swapping. * surprisingly, this produces a savings from 'obvious' even given * a string with low likelihood of the swap chars matching. * significant savings if the string is a palindrome; but still * not faster than the unrolled ones. */ void obvious_check(char *dst, const char *src, unsigned len) { if (len) { long i = 0, j = len - 1; while (i < j) { if (dst[i] != src[j]) dst[i] = src[j]; if (dst[j] != src[i]) dst[j] = src[i]; i++, j--; } if (dst[i] != src[j]) { dst[i] = src[j]; dst[j] = src[i]; } } } static void recurse_(char *dst, const char *src, unsigned i, unsigned j) { if (i < j) { dst[i] = src[j]; dst[j] = src[i]; recurse_(dst, src, ++i, --j); } } /** * work one ascending index forwards, one at the end working frontwards, * copy their bytes until they meet * NOTE: not acceptable for long strings. */ void obvious_recurse(char *dst, const char *src, unsigned len) { if (len) { recurse_(dst, src, 0, len-1); if (len & 1) dst[len/2] = src[len/2]; } } /** * work from the middle outwards * maybe locality of reference will make a difference in medium-sized strings? * this function is nice and simple */ void insideout(char *dst, const char *src, unsigned len) { if (len) { long i = (len - 1) / 2, j = len / 2; while (j < (long)len) { dst[i] = src[j]; dst[j++] = src[i--]; } } } /** * use GCC's builtin CPU-hinting prefetch to * populate the cache. * use only one index and a halfway marker. * it works perfectly, is easy to understand. */ void obvious_prefetch(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; __builtin_prefetch(dst+i); __builtin_prefetch(src+len-1); __builtin_prefetch(src+i); __builtin_prefetch(dst+len-1); while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; __builtin_prefetch(dst+i); __builtin_prefetch(src+i); __builtin_prefetch(src+i-i); __builtin_prefetch(dst+len-1-i); } } } /** * copy two-byte pieces at a time, when possible. * should be faster. */ void byte2_unroll(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; union { uint16_t i; uint8_t c[2]; } tmp; while (i <= halfway - (long)sizeof tmp.c) { tmp.i = *(uint16_t *)(src+i); dst[len-1-i] = tmp.c[0]; dst[len-2-i] = tmp.c[1]; tmp.i = *(uint16_t *)(src+len-i-sizeof tmp.c); dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; } /* shore up any single bytes */ while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_unroll(char *dst, const char *src, unsigned len) { if (len) { unsigned i = 0, halfway = len / 2; union { uint32_t i; uint8_t c[4]; } tmp; while (i+4 <= halfway) { tmp.i = *(uint32_t *)(src+i); dst[len-1-i] = tmp.c[0]; dst[len-2-i] = tmp.c[1]; dst[len-3-i] = tmp.c[2]; dst[len-4-i] = tmp.c[3]; tmp.i = *(uint32_t *)(src+len-4-i); dst[i++] = tmp.c[3]; dst[i++] = tmp.c[2]; dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_unroll_prefetch(char *dst, const char *src, unsigned len) { if (len) { unsigned i = 0, halfway = len / 2; union { uint32_t i; uint8_t c[4]; } tmp; __builtin_prefetch(src); __builtin_prefetch(src+len-4); while (i+4 <= halfway) { tmp.i = *(uint32_t *)(src+i); dst[len-1-i] = tmp.c[0]; dst[len-2-i] = tmp.c[1]; dst[len-3-i] = tmp.c[2]; dst[len-4-i] = tmp.c[3]; tmp.i = *(uint32_t *)(src+len-4-i); dst[i++] = tmp.c[3]; dst[i++] = tmp.c[2]; dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; __builtin_prefetch(src+i); __builtin_prefetch(src+len-4-i); } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_unroll2(char *dst, const char *src, unsigned len) { if (len) { union { uint32_t i; uint8_t c[sizeof(uint32_t)]; } tmp; const long halfway = len / 2; register long i = 0; while (i <= halfway-4) { tmp.i = *(uint32_t *)(src+i); dst[len-1-i] = tmp.c[0]; dst[len-2-i] = tmp.c[1]; dst[len-3-i] = tmp.c[2]; dst[len-4-i] = tmp.c[3]; tmp.i = *(uint32_t *)(src+len-i-sizeof tmp.c); dst[i++] = tmp.c[3]; dst[i++] = tmp.c[2]; dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_unroll2_expect(char *dst, const char *src, unsigned len) { if (len) { union { uint32_t i; uint8_t c[sizeof(uint32_t)]; } tmp; const long halfway = len / 2; register long i = 0; while (__builtin_expect(i <= halfway-4,1)) { tmp.i = *(uint32_t *)(src+i); dst[len-1-i] = tmp.c[0]; dst[len-2-i] = tmp.c[1]; dst[len-3-i] = tmp.c[2]; dst[len-4-i] = tmp.c[3]; tmp.i = *(uint32_t *)(src+len-i-sizeof tmp.c); dst[i++] = tmp.c[3]; dst[i++] = tmp.c[2]; dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_unroll3(char *dst, const char *src, unsigned len) { if (len) { const long halfway = len / 2, halfway2 = halfway - 4; uint32_t j; register long i = 0; while (i <= halfway2) { j = *(uint32_t *)(src+i); dst[len-1-i] = (uint8_t)(j); dst[len-2-i] = (uint8_t)(j >> 8); dst[len-3-i] = (uint8_t)(j >> 16); dst[len-4-i] = (uint8_t)(j >> 24); j = *(uint32_t *)(src+len-i-sizeof j); dst[i++] = (uint8_t)(j >> 24); dst[i++] = (uint8_t)(j >> 16); dst[i++] = (uint8_t)(j >> 8); dst[i++] = (uint8_t)(j); } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_loop(char *dst, const char *src, unsigned len) { if (len) { unsigned i = 0, halfway = len / 2; union { uint32_t i; uint8_t c[4]; } tmp; while (i+4 <= halfway) { int d = len-1-i; tmp.i = *(uint32_t *)(src+i); dst[d--] = tmp.c[0]; dst[d--] = tmp.c[1]; dst[d--] = tmp.c[2]; dst[d] = tmp.c[3]; tmp.i = *(uint32_t *)(src+len-4-i); dst[i++] = tmp.c[3]; dst[i++] = tmp.c[2]; dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte4_wb(char *dst, const char *src, unsigned len) { if (len) { unsigned i = 0, halfway = len / 2; union { uint32_t i; char c[4]; } tmp; while (i+4 <= halfway) { tmp.i = *(uint32_t *)(src+i); tmp.i = (tmp.c[0] << 24) | (tmp.c[1] << 16) | (tmp.c[2] << 8) | tmp.c[3]; *(uint32_t *)(dst+len-4-i) = tmp.i; tmp.i = *(uint32_t *)(src+len-4-i); tmp.i = (tmp.c[0] << 24) | (tmp.c[1] << 16) | (tmp.c[2] << 8) | tmp.c[3]; *(uint32_t *)(dst+i) = tmp.i; i += 4; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } #include /** * read four bytes, perform bitwise operations on that quantity to order it, * then write the whole thing out in one piece * FIXME: this abuse of ntohl() will only work on little-endian machines */ void byte4_w(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-4) { *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); i += 4; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * read four bytes, perform bitwise operations on that quantity to order it, * then write the whole thing out in one piece * FIXME: this abuse of ntohl() will only work on little-endian machines */ void byte4_wc(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; __builtin_prefetch(dst+len-4-i); __builtin_prefetch(src+i); __builtin_prefetch(dst+i); __builtin_prefetch(src+len-4-i); while (i <= halfway-4) { *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); i += 4; __builtin_prefetch(dst+len-4-i); __builtin_prefetch(src+i); __builtin_prefetch(dst+i); __builtin_prefetch(src+len-4-i); } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte8_unroll(char *dst, const char *src, unsigned len) { if (len) { unsigned i = 0, halfway = len / 2; union { uint64_t i; char c[8]; } tmp; while (i+8 <= halfway) { tmp.i = *(uint64_t *)(src+i); dst[len-1-i] = tmp.c[0]; dst[len-2-i] = tmp.c[1]; dst[len-3-i] = tmp.c[2]; dst[len-4-i] = tmp.c[3]; dst[len-5-i] = tmp.c[4]; dst[len-6-i] = tmp.c[5]; dst[len-7-i] = tmp.c[6]; dst[len-8-i] = tmp.c[7]; tmp.i = *(uint64_t *)(src+len-8-i); dst[i++] = tmp.c[7]; dst[i++] = tmp.c[6]; dst[i++] = tmp.c[5]; dst[i++] = tmp.c[4]; dst[i++] = tmp.c[3]; dst[i++] = tmp.c[2]; dst[i++] = tmp.c[1]; dst[i++] = tmp.c[0]; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * copy four-byte pieces at a time, when possible. * should be faster. */ void byte8_subloop(char *dst, const char *src, unsigned len) { if (len) { unsigned i = 0, halfway = len / 2; union { uint64_t i; char c[8]; } tmp; while (i+8 <= halfway) { int j = 0; tmp.i = *(uint64_t *)(src+i); while (j < 8) { dst[len-j-1-i] = tmp.c[j]; j++; } tmp.i = *(uint64_t *)(src+len-8-i); while (j--) dst[i++] = tmp.c[j]; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 64-bit version of byte4_w */ void byte8_w(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-8) { *(uint32_t *)(dst+len-8-i) = ntohl(*(uint32_t *)(src+i+4)); *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); *(uint32_t *)(dst+i+4) = ntohl(*(uint32_t *)(src+len-8-i)); i += 8; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 64-bit version of byte4_w */ void byte8_w64(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-8) { *(uint64_t *)(dst+len-8-i) = __bswap_64(*(uint64_t *)(src+i)); *(uint64_t *)(dst+i) = __bswap_64(*(uint64_t *)(src+len-8-i)); i += 8; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 128-bit version of byte4_w */ void byte16_w(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-16) { *(uint32_t *)(dst+len-16-i) = ntohl(*(uint32_t *)(src+i+12)); *(uint32_t *)(dst+len-12-i) = ntohl(*(uint32_t *)(src+i+8)); *(uint32_t *)(dst+len-8-i) = ntohl(*(uint32_t *)(src+i+4)); *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); *(uint32_t *)(dst+i+4) = ntohl(*(uint32_t *)(src+len-8-i)); *(uint32_t *)(dst+i+8) = ntohl(*(uint32_t *)(src+len-12-i)); *(uint32_t *)(dst+i+12) = ntohl(*(uint32_t *)(src+len-16-i)); i += 16; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 128-bit version of byte4_write32 */ void byte16_w64(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-16) { *(uint64_t *)(dst+len-16-i) = __bswap_64(*(uint64_t *)(src+i+8)); *(uint64_t *)(dst+len-8-i) = __bswap_64(*(uint64_t *)(src+i)); *(uint64_t *)(dst+i) = __bswap_64(*(uint64_t *)(src+len-8-i)); *(uint64_t *)(dst+i+8) = __bswap_64(*(uint64_t *)(src+len-16-i)); i += 16; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 256-bit version of byte4_write32 */ void byte32_w(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-32) { *(uint32_t *)(dst+len-32-i) = ntohl(*(uint32_t *)(src+i+28)); *(uint32_t *)(dst+len-28-i) = ntohl(*(uint32_t *)(src+i+24)); *(uint32_t *)(dst+len-24-i) = ntohl(*(uint32_t *)(src+i+20)); *(uint32_t *)(dst+len-20-i) = ntohl(*(uint32_t *)(src+i+16)); *(uint32_t *)(dst+len-16-i) = ntohl(*(uint32_t *)(src+i+12)); *(uint32_t *)(dst+len-12-i) = ntohl(*(uint32_t *)(src+i+8)); *(uint32_t *)(dst+len-8-i) = ntohl(*(uint32_t *)(src+i+4)); *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); *(uint32_t *)(dst+i+4) = ntohl(*(uint32_t *)(src+len-8-i)); *(uint32_t *)(dst+i+8) = ntohl(*(uint32_t *)(src+len-12-i)); *(uint32_t *)(dst+i+12) = ntohl(*(uint32_t *)(src+len-16-i)); *(uint32_t *)(dst+i+16) = ntohl(*(uint32_t *)(src+len-20-i)); *(uint32_t *)(dst+i+20) = ntohl(*(uint32_t *)(src+len-24-i)); *(uint32_t *)(dst+i+24) = ntohl(*(uint32_t *)(src+len-28-i)); *(uint32_t *)(dst+i+28) = ntohl(*(uint32_t *)(src+len-32-i)); i += 32; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 256-bit version of byte4_write32 */ void byte32_w_prefetch(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-32) { __builtin_prefetch(dst+i); __builtin_prefetch(src+i); *(uint32_t *)(dst+len-32-i) = ntohl(*(uint32_t *)(src+i+28)); *(uint32_t *)(dst+len-28-i) = ntohl(*(uint32_t *)(src+i+24)); *(uint32_t *)(dst+len-24-i) = ntohl(*(uint32_t *)(src+i+20)); *(uint32_t *)(dst+len-20-i) = ntohl(*(uint32_t *)(src+i+16)); *(uint32_t *)(dst+len-16-i) = ntohl(*(uint32_t *)(src+i+12)); *(uint32_t *)(dst+len-12-i) = ntohl(*(uint32_t *)(src+i+8)); *(uint32_t *)(dst+len-8-i) = ntohl(*(uint32_t *)(src+i+4)); *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); *(uint32_t *)(dst+i+4) = ntohl(*(uint32_t *)(src+len-8-i)); *(uint32_t *)(dst+i+8) = ntohl(*(uint32_t *)(src+len-12-i)); *(uint32_t *)(dst+i+12) = ntohl(*(uint32_t *)(src+len-16-i)); *(uint32_t *)(dst+i+16) = ntohl(*(uint32_t *)(src+len-20-i)); *(uint32_t *)(dst+i+20) = ntohl(*(uint32_t *)(src+len-24-i)); *(uint32_t *)(dst+i+24) = ntohl(*(uint32_t *)(src+len-28-i)); *(uint32_t *)(dst+i+28) = ntohl(*(uint32_t *)(src+len-32-i)); i += 32; } __builtin_prefetch(dst+i); __builtin_prefetch(src+i); while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 256-bit version of byte4_write32 */ void byte32_w64(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-32) { *(uint64_t *)(dst+len-32-i) = __bswap_64(*(uint64_t *)(src+i+24)); *(uint64_t *)(dst+len-24-i) = __bswap_64(*(uint64_t *)(src+i+16)); *(uint64_t *)(dst+len-16-i) = __bswap_64(*(uint64_t *)(src+i+8)); *(uint64_t *)(dst+len- 8-i) = __bswap_64(*(uint64_t *)(src+i)); *(uint64_t *)(dst+i) = __bswap_64(*(uint64_t *)(src+len-8-i)); *(uint64_t *)(dst+i+8) = __bswap_64(*(uint64_t *)(src+len-16-i)); *(uint64_t *)(dst+i+16) = __bswap_64(*(uint64_t *)(src+len-24-i)); *(uint64_t *)(dst+i+24) = __bswap_64(*(uint64_t *)(src+len-32-i)); i += 32; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 512-bit version of byte4_write32 */ void byte64_w(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-64) { *(uint32_t *)(dst+len-64-i) = ntohl(*(uint32_t *)(src+i+60)); *(uint32_t *)(dst+len-60-i) = ntohl(*(uint32_t *)(src+i+56)); *(uint32_t *)(dst+len-56-i) = ntohl(*(uint32_t *)(src+i+52)); *(uint32_t *)(dst+len-52-i) = ntohl(*(uint32_t *)(src+i+48)); *(uint32_t *)(dst+len-48-i) = ntohl(*(uint32_t *)(src+i+44)); *(uint32_t *)(dst+len-44-i) = ntohl(*(uint32_t *)(src+i+40)); *(uint32_t *)(dst+len-40-i) = ntohl(*(uint32_t *)(src+i+36)); *(uint32_t *)(dst+len-36-i) = ntohl(*(uint32_t *)(src+i+32)); *(uint32_t *)(dst+len-32-i) = ntohl(*(uint32_t *)(src+i+28)); *(uint32_t *)(dst+len-28-i) = ntohl(*(uint32_t *)(src+i+24)); *(uint32_t *)(dst+len-24-i) = ntohl(*(uint32_t *)(src+i+20)); *(uint32_t *)(dst+len-20-i) = ntohl(*(uint32_t *)(src+i+16)); *(uint32_t *)(dst+len-16-i) = ntohl(*(uint32_t *)(src+i+12)); *(uint32_t *)(dst+len-12-i) = ntohl(*(uint32_t *)(src+i+8)); *(uint32_t *)(dst+len-8-i) = ntohl(*(uint32_t *)(src+i+4)); *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); *(uint32_t *)(dst+i+4) = ntohl(*(uint32_t *)(src+len-8-i)); *(uint32_t *)(dst+i+8) = ntohl(*(uint32_t *)(src+len-12-i)); *(uint32_t *)(dst+i+12) = ntohl(*(uint32_t *)(src+len-16-i)); *(uint32_t *)(dst+i+16) = ntohl(*(uint32_t *)(src+len-20-i)); *(uint32_t *)(dst+i+20) = ntohl(*(uint32_t *)(src+len-24-i)); *(uint32_t *)(dst+i+24) = ntohl(*(uint32_t *)(src+len-28-i)); *(uint32_t *)(dst+i+28) = ntohl(*(uint32_t *)(src+len-32-i)); *(uint32_t *)(dst+i+32) = ntohl(*(uint32_t *)(src+len-36-i)); *(uint32_t *)(dst+i+36) = ntohl(*(uint32_t *)(src+len-40-i)); *(uint32_t *)(dst+i+40) = ntohl(*(uint32_t *)(src+len-44-i)); *(uint32_t *)(dst+i+44) = ntohl(*(uint32_t *)(src+len-48-i)); *(uint32_t *)(dst+i+48) = ntohl(*(uint32_t *)(src+len-52-i)); *(uint32_t *)(dst+i+52) = ntohl(*(uint32_t *)(src+len-56-i)); *(uint32_t *)(dst+i+56) = ntohl(*(uint32_t *)(src+len-60-i)); *(uint32_t *)(dst+i+60) = ntohl(*(uint32_t *)(src+len-64-i)); i += 64; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 512-bit version of byte4_write32 */ void byte64_w64(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-64) { *(uint64_t *)(dst+len-56-i) = __bswap_64(*(uint64_t *)(src+i+48)); *(uint64_t *)(dst+len-48-i) = __bswap_64(*(uint64_t *)(src+i+40)); *(uint64_t *)(dst+len-40-i) = __bswap_64(*(uint64_t *)(src+i+32)); *(uint64_t *)(dst+len-32-i) = __bswap_64(*(uint64_t *)(src+i+24)); *(uint64_t *)(dst+len-24-i) = __bswap_64(*(uint64_t *)(src+i+16)); *(uint64_t *)(dst+len-16-i) = __bswap_64(*(uint64_t *)(src+i+8)); *(uint64_t *)(dst+len- 8-i) = __bswap_64(*(uint64_t *)(src+i)); *(uint64_t *)(dst+i) = __bswap_64(*(uint64_t *)(src+len-8-i)); *(uint64_t *)(dst+i+8) = __bswap_64(*(uint64_t *)(src+len-16-i)); *(uint64_t *)(dst+i+16) = __bswap_64(*(uint64_t *)(src+len-24-i)); *(uint64_t *)(dst+i+24) = __bswap_64(*(uint64_t *)(src+len-32-i)); *(uint64_t *)(dst+i+32) = __bswap_64(*(uint64_t *)(src+len-40-i)); *(uint64_t *)(dst+i+40) = __bswap_64(*(uint64_t *)(src+len-48-i)); *(uint64_t *)(dst+i+48) = __bswap_64(*(uint64_t *)(src+len-56-i)); *(uint64_t *)(dst+i+56) = __bswap_64(*(uint64_t *)(src+len-64-i)); i += 32; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 1024-bit version of byte4_write32 */ void byte128_w(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-128) { *(uint32_t *)(dst+len-128-i) = ntohl(*(uint32_t *)(src+i+124)); *(uint32_t *)(dst+len-124-i) = ntohl(*(uint32_t *)(src+i+120)); *(uint32_t *)(dst+len-120-i) = ntohl(*(uint32_t *)(src+i+116)); *(uint32_t *)(dst+len-116-i) = ntohl(*(uint32_t *)(src+i+112)); *(uint32_t *)(dst+len-112-i) = ntohl(*(uint32_t *)(src+i+108)); *(uint32_t *)(dst+len-108-i) = ntohl(*(uint32_t *)(src+i+104)); *(uint32_t *)(dst+len-104-i) = ntohl(*(uint32_t *)(src+i+100)); *(uint32_t *)(dst+len-100-i) = ntohl(*(uint32_t *)(src+i+96)); *(uint32_t *)(dst+len-96-i) = ntohl(*(uint32_t *)(src+i+92)); *(uint32_t *)(dst+len-92-i) = ntohl(*(uint32_t *)(src+i+88)); *(uint32_t *)(dst+len-88-i) = ntohl(*(uint32_t *)(src+i+84)); *(uint32_t *)(dst+len-84-i) = ntohl(*(uint32_t *)(src+i+80)); *(uint32_t *)(dst+len-80-i) = ntohl(*(uint32_t *)(src+i+76)); *(uint32_t *)(dst+len-76-i) = ntohl(*(uint32_t *)(src+i+72)); *(uint32_t *)(dst+len-72-i) = ntohl(*(uint32_t *)(src+i+68)); *(uint32_t *)(dst+len-68-i) = ntohl(*(uint32_t *)(src+i+64)); *(uint32_t *)(dst+len-64-i) = ntohl(*(uint32_t *)(src+i+60)); *(uint32_t *)(dst+len-60-i) = ntohl(*(uint32_t *)(src+i+56)); *(uint32_t *)(dst+len-56-i) = ntohl(*(uint32_t *)(src+i+52)); *(uint32_t *)(dst+len-52-i) = ntohl(*(uint32_t *)(src+i+48)); *(uint32_t *)(dst+len-48-i) = ntohl(*(uint32_t *)(src+i+44)); *(uint32_t *)(dst+len-44-i) = ntohl(*(uint32_t *)(src+i+40)); *(uint32_t *)(dst+len-40-i) = ntohl(*(uint32_t *)(src+i+36)); *(uint32_t *)(dst+len-36-i) = ntohl(*(uint32_t *)(src+i+32)); *(uint32_t *)(dst+len-32-i) = ntohl(*(uint32_t *)(src+i+28)); *(uint32_t *)(dst+len-28-i) = ntohl(*(uint32_t *)(src+i+24)); *(uint32_t *)(dst+len-24-i) = ntohl(*(uint32_t *)(src+i+20)); *(uint32_t *)(dst+len-20-i) = ntohl(*(uint32_t *)(src+i+16)); *(uint32_t *)(dst+len-16-i) = ntohl(*(uint32_t *)(src+i+12)); *(uint32_t *)(dst+len-12-i) = ntohl(*(uint32_t *)(src+i+8)); *(uint32_t *)(dst+len-8-i) = ntohl(*(uint32_t *)(src+i+4)); *(uint32_t *)(dst+len-4-i) = ntohl(*(uint32_t *)(src+i)); *(uint32_t *)(dst+i) = ntohl(*(uint32_t *)(src+len-4-i)); *(uint32_t *)(dst+i+4) = ntohl(*(uint32_t *)(src+len-8-i)); *(uint32_t *)(dst+i+8) = ntohl(*(uint32_t *)(src+len-12-i)); *(uint32_t *)(dst+i+12) = ntohl(*(uint32_t *)(src+len-16-i)); *(uint32_t *)(dst+i+16) = ntohl(*(uint32_t *)(src+len-20-i)); *(uint32_t *)(dst+i+20) = ntohl(*(uint32_t *)(src+len-24-i)); *(uint32_t *)(dst+i+24) = ntohl(*(uint32_t *)(src+len-28-i)); *(uint32_t *)(dst+i+28) = ntohl(*(uint32_t *)(src+len-32-i)); *(uint32_t *)(dst+i+32) = ntohl(*(uint32_t *)(src+len-36-i)); *(uint32_t *)(dst+i+36) = ntohl(*(uint32_t *)(src+len-40-i)); *(uint32_t *)(dst+i+40) = ntohl(*(uint32_t *)(src+len-44-i)); *(uint32_t *)(dst+i+44) = ntohl(*(uint32_t *)(src+len-48-i)); *(uint32_t *)(dst+i+48) = ntohl(*(uint32_t *)(src+len-52-i)); *(uint32_t *)(dst+i+52) = ntohl(*(uint32_t *)(src+len-56-i)); *(uint32_t *)(dst+i+56) = ntohl(*(uint32_t *)(src+len-60-i)); *(uint32_t *)(dst+i+60) = ntohl(*(uint32_t *)(src+len-64-i)); *(uint32_t *)(dst+i+64) = ntohl(*(uint32_t *)(src+len-68-i)); *(uint32_t *)(dst+i+68) = ntohl(*(uint32_t *)(src+len-72-i)); *(uint32_t *)(dst+i+72) = ntohl(*(uint32_t *)(src+len-76-i)); *(uint32_t *)(dst+i+76) = ntohl(*(uint32_t *)(src+len-80-i)); *(uint32_t *)(dst+i+80) = ntohl(*(uint32_t *)(src+len-84-i)); *(uint32_t *)(dst+i+84) = ntohl(*(uint32_t *)(src+len-88-i)); *(uint32_t *)(dst+i+88) = ntohl(*(uint32_t *)(src+len-92-i)); *(uint32_t *)(dst+i+92) = ntohl(*(uint32_t *)(src+len-96-i)); *(uint32_t *)(dst+i+96) = ntohl(*(uint32_t *)(src+len-100-i)); *(uint32_t *)(dst+i+100) = ntohl(*(uint32_t *)(src+len-104-i)); *(uint32_t *)(dst+i+104) = ntohl(*(uint32_t *)(src+len-108-i)); *(uint32_t *)(dst+i+108) = ntohl(*(uint32_t *)(src+len-112-i)); *(uint32_t *)(dst+i+112) = ntohl(*(uint32_t *)(src+len-116-i)); *(uint32_t *)(dst+i+116) = ntohl(*(uint32_t *)(src+len-120-i)); *(uint32_t *)(dst+i+120) = ntohl(*(uint32_t *)(src+len-124-i)); *(uint32_t *)(dst+i+124) = ntohl(*(uint32_t *)(src+len-128-i)); i += 128; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /** * 1024-bit version of byte4_write32 */ void byte128_w64(char *dst, const char *src, unsigned len) { if (len) { long i = 0, halfway = len / 2; while (i <= halfway-128) { *(uint64_t *)(dst+len-128-i) = __bswap_64(*(uint64_t *)(src+i+120)); *(uint64_t *)(dst+len-120-i) = __bswap_64(*(uint64_t *)(src+i+112)); *(uint64_t *)(dst+len-112-i) = __bswap_64(*(uint64_t *)(src+i+104)); *(uint64_t *)(dst+len-104-i) = __bswap_64(*(uint64_t *)(src+i+96)); *(uint64_t *)(dst+len-96-i) = __bswap_64(*(uint64_t *)(src+i+88)); *(uint64_t *)(dst+len-88-i) = __bswap_64(*(uint64_t *)(src+i+80)); *(uint64_t *)(dst+len-80-i) = __bswap_64(*(uint64_t *)(src+i+72)); *(uint64_t *)(dst+len-72-i) = __bswap_64(*(uint64_t *)(src+i+64)); *(uint64_t *)(dst+len-64-i) = __bswap_64(*(uint64_t *)(src+i+56)); *(uint64_t *)(dst+len-56-i) = __bswap_64(*(uint64_t *)(src+i+48)); *(uint64_t *)(dst+len-48-i) = __bswap_64(*(uint64_t *)(src+i+40)); *(uint64_t *)(dst+len-40-i) = __bswap_64(*(uint64_t *)(src+i+32)); *(uint64_t *)(dst+len-32-i) = __bswap_64(*(uint64_t *)(src+i+24)); *(uint64_t *)(dst+len-24-i) = __bswap_64(*(uint64_t *)(src+i+16)); *(uint64_t *)(dst+len-16-i) = __bswap_64(*(uint64_t *)(src+i+ 8)); *(uint64_t *)(dst+len- 8-i) = __bswap_64(*(uint64_t *)(src+i)); *(uint64_t *)(dst+i) = __bswap_64(*(uint64_t *)(src+len-8-i)); *(uint64_t *)(dst+i+8) = __bswap_64(*(uint64_t *)(src+len-16-i)); *(uint64_t *)(dst+i+16) = __bswap_64(*(uint64_t *)(src+len-24-i)); *(uint64_t *)(dst+i+24) = __bswap_64(*(uint64_t *)(src+len-32-i)); *(uint64_t *)(dst+i+32) = __bswap_64(*(uint64_t *)(src+len-40-i)); *(uint64_t *)(dst+i+40) = __bswap_64(*(uint64_t *)(src+len-48-i)); *(uint64_t *)(dst+i+48) = __bswap_64(*(uint64_t *)(src+len-56-i)); *(uint64_t *)(dst+i+56) = __bswap_64(*(uint64_t *)(src+len-64-i)); *(uint64_t *)(dst+i+64) = __bswap_64(*(uint64_t *)(src+len-72-i)); *(uint64_t *)(dst+i+72) = __bswap_64(*(uint64_t *)(src+len-80-i)); *(uint64_t *)(dst+i+80) = __bswap_64(*(uint64_t *)(src+len-88-i)); *(uint64_t *)(dst+i+88) = __bswap_64(*(uint64_t *)(src+len-96-i)); *(uint64_t *)(dst+i+96) = __bswap_64(*(uint64_t *)(src+len-104-i)); *(uint64_t *)(dst+i+104) = __bswap_64(*(uint64_t *)(src+len-112-i)); *(uint64_t *)(dst+i+112) = __bswap_64(*(uint64_t *)(src+len-120-i)); *(uint64_t *)(dst+i+120) = __bswap_64(*(uint64_t *)(src+len-128-i)); i += 128; } while (i <= halfway) { dst[i] = src[len-1-i]; dst[len-1-i] = src[i]; i++; } } } /************************* test crap ****************************/ #include #include #include /** * test a function for correctness over a range of strings with * known correct answers. */ void test(void (*f)(char *, const char *, unsigned)) { const struct { const unsigned len; const char * const str, * const rev; } *t, Test[] = { { 0, "", "" }, { 1, "a", "a" }, { 2, "ab", "ba" }, { 3, "abc", "cba" }, { 4, "abcd", "dcba" }, { 5, "abcde", "edcba" }, { 6, "abcdef", "fedcba" }, { 7, "abcdefg", "gfedcba" }, { 8, "01234567", "76543210" }, { 9, "abcdefghi", "ihgfedcba" }, { 10, "abcdefghij", "jihgfedcba" }, { 11, "abcdefghijk", "kjihgfedcba" }, { 12, "abcdefghijkl", "lkjihgfedcba" }, { 13, "abcdefghijklm", "mlkjihgfedcba" }, { 14, "abcdefghijklmn", "nmlkjihgfedcba" }, { 15, "abcdefghijklmno", "onmlkjihgfedcba" }, { 16, "abcdefghijklmnop", "ponmlkjihgfedcba" }, { 17, "abcdefghijklmnopq", "qponmlkjihgfedcba" }, { 18, "aaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaa" }, { 19, "aaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaa" }, { 64, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./", "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" }, { 128, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./" "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./", "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" }, { 256, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./" "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./" "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./" "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./", "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" "/.9876543210ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba" } }; char pad[1] = { 0x7F }; /* detect buffer underflow */ char dst[256]; char pad2[1] = { 0x7F }; /* detect buffer overflow */ unsigned i = 0, matches = 0, len; t = Test; #undef VALIDATE /* #define this for verbose validation output; useful for developing new algorithms */ #ifdef VALIDATE printf("validating %s dst=%p...\n", name, (void *)dst); #endif while (i < sizeof Test / sizeof Test[0]) { int match; #ifdef VALIDATE printf("%2u len=%u \"%.*s\" -> \"%.*s\" ", i, t->len, t->len, t->str, t->len, t->rev); #endif fflush(stdout); len = t->len; assert(0x7F == pad[0]); /* check underflow */ assert(0x7F == pad2[0]); /* check overflow */ f(dst, t->str, t->len); assert(0x7F == pad[0]); /* check underflow */ assert(0x7F == pad2[0]); /* check overflow */ assert(t->len == len); match = 0 == memcmp(dst, t->rev, t->len); /* verify results */ matches += match; #ifdef VALIDATE printf("[%s] (\"%.*s\")\n", match ? "OK" : "!!", t->len, dst); #endif assert(match); i++, t++; } } #define MAXLEN 128*1024 /* maximum string length to test */ /** * return the number of seconds required to run function 'f' against * input 'src' for all lengths [0..len] */ static double speed(char *dst, const char *src, unsigned len, void (*f)()) { double d[2], total; struct timeval tv[2]; /* test speed */ gettimeofday(tv, NULL); do f(dst, src, len); while (len--); /* run for all variations of [len..0] */ gettimeofday(tv + 1, NULL); d[0] = (tv[0].tv_sec * 1000000) + tv[0].tv_usec; d[1] = (tv[1].tv_sec * 1000000) + tv[1].tv_usec; total = (d[1] - d[0]) / 1000000; printf("%5.2f ", total); fflush(stdout); return total; } /** * run function 'f' through a variety of lengths and print the total amount of time */ static void run(const char *name, void (*f)()) { static double FirstRun = 0; char *src = malloc(MAXLEN), *dst = malloc(MAXLEN); double time = 0; unsigned i = MAXLEN; assert(src); assert(dst); while (i--) src[i] = (char)(rand() % 10); /* ~10% chance swapped chars will be the same */ printf("%28s ", name); fflush(stdout); /* test for correctness ... */ test(f); time += speed(dst, src, MAXLEN, f); if (0 == FirstRun) FirstRun = time; printf(" %6.1f%%\n", FirstRun/time*100-100); free(src); free(dst); } /* Stringification */ #define S(str) S_(str) #define S_(str) #str /* handy macro so i can get the function name as a string and a symbol * from the same input, saves me typing and protects against human error */ #define V(f) run(S(f), f) int main(void) { printf("%28s %5s %7s\n", "function", "sec", "speedup"); srand(time(NULL)); V(obvious); V(byte4_w); V(byte8_w); V(byte8_w64); V(byte16_w); V(byte16_w64); V(byte32_w); V(byte32_w_prefetch); V(byte32_w64); V(byte64_w); V(byte64_w64); V(byte128_w); V(byte128_w64); V(obvious_prefetch); V(obvious_check); V(obvious_pointer); V(obvious_twoindex); V(insideout); #if 0 V(obvious_recurse); /* NOTE: not acceptable for long strings, sorry */ #endif V(byte2_unroll); V(byte4_unroll); V(byte4_unroll_prefetch); V(byte4_unroll2); V(byte4_unroll2_expect); V(byte4_unroll3); V(byte4_loop); V(byte4_wb); V(byte4_wc); V(byte8_unroll); V(byte8_subloop); V(obvious); return 0; }