I just tested this loop on my Mac, and it beats memcmp
:
uint64_t *p = (uint64_t *)buffer1;
uint64_t compare;
memset(&compare, 1, sizeof compare);
for (i = 0; i < length/sizeof compare; i++)
{
if (p[i] != compare)
break;
}
Complete example code:
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
// from: http://www.gnu.org/software/libc/manual/html_node/Elapsed-Time.html
void timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y)
{
/* Perform the carry for the later subtraction by updating y. */
if (x->tv_usec < y->tv_usec)
{
int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
y->tv_usec -= 1000000 * nsec;
y->tv_sec += nsec;
}
if (x->tv_usec - y->tv_usec > 1000000)
{
int nsec = (x->tv_usec - y->tv_usec) / 1000000;
y->tv_usec += 1000000 * nsec;
y->tv_sec -= nsec;
}
/* Compute the time remaining to wait. tv_usec is certainly positive. */
result->tv_sec = x->tv_sec - y->tv_sec;
result->tv_usec = x->tv_usec - y->tv_usec;
}
int main(int argc, char **argv)
{
struct rusage before;
struct rusage after;
struct timeval diff;
size_t i;
size_t length = strtoull(argv[1], NULL, 0);
char *buffer1 = malloc(length);
char *buffer2 = malloc(length);
printf("filling...");
fflush(stdout);
memset(buffer1, 1, length);
memset(buffer2, 1, length);
printf(" done\n");
getrusage(RUSAGE_SELF, &before);
uint64_t *p = (uint64_t *)buffer1;
uint64_t compare;
memset(&compare, 1, sizeof compare);
for (i = 0; i < length/sizeof compare; i++)
{
if (p[i] != compare)
break;
}
if (i == length/sizeof compare)
i = 0;
getrusage(RUSAGE_SELF, &after);
printf("\nloop (returned %zu):\n", i);
timeval_subtract(&diff, &after.ru_utime, &before.ru_utime);
printf("User: %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
timeval_subtract(&diff, &after.ru_stime, &before.ru_stime);
printf("System: %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
getrusage(RUSAGE_SELF, &before);
i = memcmp(buffer1, buffer2, length);
getrusage(RUSAGE_SELF, &after);
printf("\nmemcmp (returned %zu):\n", i);
timeval_subtract(&diff, &after.ru_utime, &before.ru_utime);
printf("User: %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
timeval_subtract(&diff, &after.ru_stime, &before.ru_stime);
printf("System: %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
return 0;
}
And run results:
$ make
clang -Wall -Wextra -Werror -O3 -g -o example example.c
./example 0x10000000
filling... done
loop (returned 0):
User: 0.024078 s
System: 0.000011 s
memcmp (returned 0):
User: 0.036752 s
System: 0.000017 s
Maybe you can do something similar?
Note: For those concerned about cache warming, I also tried with the memcmp before the loop and got the same results.
memcmp
is going to do? – Yarndyedmemcmp
takes in comparison to afor
loop you've written yourself before you came to the conclusion thatmemcmp
is faster? Have you tried reading and comparing blocks of 32 or 64 bits at a time in afor
loop? – Factitiousstarting_address - ONE_BYTE
will point below the start of starting_address at the first access. – Sweetmemcmp(start, start+1, size-1)
but the interesting question is whether memcmp has well-defined behavior when the inputs overlap. I was going to instinctively say no, but when I tried to find an authoritative source stating that it's undefined, I couldn't come up with one. – Unmixedmemcpy
easily with a 64-bit compare in a tight loop. (0.036462 s vs 0.024012 s) for OP's buffer size of0x10000000
. – Yarndyeduint64_t *p = (uint64_t *)buffer; uint64_t compare; memset(&compare, 1, sizeof compare); for (i = 0; i < SIZE/sizeof compare; i++) { if (p[i] != compare) break; }
– Yarndyed#define SIZE 0x10000000
elsewhere. – Yarndyed