Is there a reverse function for strstr
Asked Answered
H

18

29

I am trying to find a similar function to strstr that searches a substring starting from the end towards the beginning of the string.

Hierolatry answered 27/10, 2009 at 23:43 Comment(3)
Umm I am not sure if I should modify this question. But I have one more. Is there a C Library function to find the index to the last occurance of a substring within a string?Hierolatry
You can get the same effect using strrstr() and pointer arithmetic.Save
@ManavSharma Why do you ask another question in a comment or consider modifying an existing one? This site is all about questions, you can ask as many as you like. Another question means you create another question on this platform.Dekameter
O
14

The standard C library does not have a "reverse strstr" function, so you have to find or write your own.

I came up with a couple of solutions of my own, and added some testing and benchmarking code together with the other functions in this thread. For those curious, running on my laptop (Ubuntu karmic, amd64 architecture) the output looks like this:

$ gcc -O2 --std=c99 strrstr.c && ./a.out
#1 0.123 us last_strstr
#2 0.440 us theo
#3 0.460 us cordelia
#4 1.690 us digitalross
#5 7.700 us backwards_memcmp
#6 8.600 us sinan

Your results may be different and, depending on your compiler and library, the ordering of the results may also be different.

To get the offset (index) of the match from the beginning of the string, use pointer arithmetic:

char *match = last_strstr(haystack, needle);
ptrdiff_t index;
if (match != NULL)
    index = match - haystack;
else
    index = -1;

And now, the larch (note that this is purely in C, I do not know C++ well enough to give an answer for it):

#include <string.h>
#include <stdlib.h>

/* By liw. */
static char *last_strstr(const char *haystack, const char *needle)
{
    if (*needle == '\0')
        return (char *) haystack;

    char *result = NULL;
    for (;;) {
        char *p = strstr(haystack, needle);
        if (p == NULL)
            break;
        result = p;
        haystack = p + 1;
    }

    return result;
}


/* By liw. */
static char *backwards_memcmp(const char *haystack, const char *needle)
{
    size_t haylen = strlen(haystack);

    if (*needle == '\0')
        return (char *) haystack;

    size_t needlelen = strlen(needle);
    if (needlelen > haylen)
        return NULL;

    const char *p = haystack + haylen - needlelen;
    for (;;) {
        if (memcmp(p, needle, needlelen) == 0)
            return (char *) p;
        if (p == haystack)
            return NULL;
        --p;
    }
}


/* From http://stuff.mit.edu/afs/sipb/user/cordelia/Diplomacy/mapit/strrstr.c
 */
static char *cordelia(const char *s1, const char *s2)
{
 const char *sc1, *sc2, *psc1, *ps1;

 if (*s2 == '\0')
  return((char *)s1);

 ps1 = s1 + strlen(s1);

 while(ps1 != s1) {
  --ps1;
  for (psc1 = ps1, sc2 = s2; ; )
   if (*(psc1++) != *(sc2++))
    break;
   else if (*sc2 == '\0')
    return ((char *)ps1);
 }
 return ((char *)NULL);
}


/* From http://stackoverflow.com/questions/1634359/
   is-there-a-reverse-fn-for-strstr/1634398#1634398 (DigitalRoss). */
static char *reverse(const char *s)
{
  if (s == NULL)
    return NULL;
  size_t i, len = strlen(s);
  char *r = malloc(len + 1);

  for(i = 0; i < len; ++i)
    r[i] = s[len - i - 1];
  r[len] = 0;
  return r;
}
char *digitalross(const char *s1, const char *s2)
{
  size_t  s1len = strlen(s1);
  size_t  s2len = strlen(s2);
  const char *s;

  if (s2len == 0)
    return (char *) s1;
  if (s2len > s1len)
    return NULL;
  for (s = s1 + s1len - s2len; s >= s1; --s)
    if (strncmp(s, s2, s2len) == 0)
      return (char *) s;
  return NULL;
}


/* From http://stackoverflow.com/questions/1634359/
  is-there-a-reverse-fn-for-strstr/1634487#1634487 (Sinan Ünür). */

char *sinan(const char *source, const char *target)
{
    const char *current;
    const char *found = NULL;

    if (*target == '\0')
        return (char *) source;

    size_t target_length = strlen(target);
    current = source + strlen(source) - target_length;

    while ( current >= source ) {
        if ( (found = strstr(current, target)) ) {
            break;
        }
        current -= 1;
    }

    return (char *) found;
}


/* From http://stackoverflow.com/questions/1634359/
  is-there-a-reverse-fn-for-strstr/1634441#1634441 (Theo Spears). */
char *theo(const char* haystack, const char* needle)
{
  size_t needle_length = strlen(needle);
  const char* haystack_end = haystack + strlen(haystack) - needle_length;
  const char* p;
  size_t i;

  if (*needle == '\0')
    return (char *) haystack;
  for(p = haystack_end; p >= haystack; --p)
  {
    for(i = 0; i < needle_length; ++i) {
      if(p[i] != needle[i])
        goto next;
    }
    return (char *) p;

    next:;
  }
  return 0;
}


/*
 * The rest of this code is a test and timing harness for the various
 * implementations above.
 */


#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>


/* Check that the given function works. */
static bool works(const char *name, char *(*func)(const char *, const char *))
{
    struct {
        const char *haystack;
        const char *needle;
        int offset;
    } tests[] = {
        { "", "", 0 },
        { "", "x", -1 },
        { "x", "", 0 },
        { "x", "x", 0 },
        { "xy", "x", 0 },
        { "xy", "y", 1 },
        { "xyx", "x", 2 },
        { "xyx", "y", 1 },
        { "xyx", "z", -1 },
        { "xyx", "", 0 },
    };
    const int num_tests = sizeof(tests) / sizeof(tests[0]);
    bool ok = true;

    for (int i = 0; i < num_tests; ++i) {
        int offset;
        char *p = func(tests[i].haystack, tests[i].needle);
        if (p == NULL)
            offset = -1;
        else
            offset = p - tests[i].haystack;
        if (offset != tests[i].offset) {
            fprintf(stderr, "FAIL %s, test %d: returned %d, haystack = '%s', "
                            "needle = '%s', correct return %d\n",
                            name, i, offset, tests[i].haystack, tests[i].needle,
                            tests[i].offset);
            ok = false;
        }
    }
    return ok;
}


/* Dummy function for calibrating the measurement loop. */
static char *dummy(const char *haystack, const char *needle)
{
    return NULL;
}


/* Measure how long it will take to call the given function with the
   given arguments the given number of times. Return clock ticks. */
static clock_t repeat(char *(*func)(const char *, const char *),
                       const char *haystack, const char *needle,
                       long num_times)
{
    clock_t start, end;

    start = clock();
    for (long i = 0; i < num_times; ++i) {
        func(haystack, needle);
    }
    end = clock();
    return end - start;
}


static clock_t min(clock_t a, clock_t b)
{
    if (a < b)
        return a;
    else
        return b;
}


/* Measure the time to execute one call of a function, and return the
   number of CPU clock ticks (see clock(3)). */
static double timeit(char *(*func)(const char *, const char *))
{
    /* The arguments for the functions to be measured. We deliberately
       choose a case where the haystack is large and the needle is in
       the middle, rather than at either end. Obviously, any test data
       will favor some implementations over others. This is the weakest
       part of the benchmark. */

    const char haystack[] = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "b"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
    const char needle[] = "b";

    /* First we find out how many repeats we need to do to get a sufficiently
       long measurement time. These functions are so fast that measuring
       only a small number of repeats will give wrong results. However,
       we don't want to do a ridiculously long measurement, either, so 
       start with one repeat and multiply it by 10 until the total time is
       about 0.2 seconds. 

       Finally, we measure the dummy function the same number of times
       to get rid of the call overhead.

       */

    clock_t mintime = 0.2 * CLOCKS_PER_SEC;
    clock_t clocks;
    long repeats = 1;
    for (;;) {
        clocks = repeat(func, haystack, needle, repeats);
        if (clocks >= mintime)
            break;
        repeats *= 10;
    }

    clocks = min(clocks, repeat(func, haystack, needle, repeats));
    clocks = min(clocks, repeat(func, haystack, needle, repeats));

    clock_t dummy_clocks;

    dummy_clocks = repeat(dummy, haystack, needle, repeats);
    dummy_clocks = min(dummy_clocks, repeat(dummy, haystack, needle, repeats));
    dummy_clocks = min(dummy_clocks, repeat(dummy, haystack, needle, repeats));

    return (double) (clocks - dummy_clocks) / repeats / CLOCKS_PER_SEC;
}


/* Array of all functions. */
struct func {
    const char *name;
    char *(*func)(const char *, const char *);
    double secs;
} funcs[] = {
#define X(func) { #func, func, 0 }
    X(last_strstr),
    X(backwards_memcmp),
    X(cordelia),
    X(digitalross),
    X(sinan),
    X(theo),
#undef X
};
const int num_funcs = sizeof(funcs) / sizeof(funcs[0]);


/* Comparison function for qsort, comparing timings. */
int funcmp(const void *a, const void *b)
{
    const struct func *aa = a;
    const struct func *bb = b;

    if (aa->secs < bb->secs)
        return -1;
    else if (aa->secs > bb->secs)
        return 1;
    else
        return 0;
}


int main(void)
{

    bool ok = true;
    for (int i = 0; i < num_funcs; ++i) {
        if (!works(funcs[i].name, funcs[i].func)) {
            fprintf(stderr, "%s does not work\n", funcs[i].name);            
            ok = false;
        }
    }
    if (!ok)
        return EXIT_FAILURE;

    for (int i = 0; i < num_funcs; ++i)
        funcs[i].secs = timeit(funcs[i].func);
    qsort(funcs, num_funcs, sizeof(funcs[0]), funcmp);
    for (int i = 0; i < num_funcs; ++i)
        printf("#%d %.3f us %s\n", i+1, funcs[i].secs * 1e6, funcs[i].name);

    return 0;
}
Offshore answered 29/10, 2009 at 14:0 Comment(6)
Sorry about the length. All the interesting parts (the actual implementations of reverse strstr) are at the top of the code, so should be easy to find.Offshore
You should improve your tests. Haystack and needle too short, try it on large strings.Vday
If needle is empty string, shouldn't last_strstr return the end of haystack instead of the start?Cyruscyst
I prefer while (1) rather than for (;;), it's nicerMegaphone
This kept popping up in my feed, so I thought I’d give it a go. I produced a function that performs (on my PC) the same as theo’s, but I am still amazed that last_strstr() wins by such an impressive margin. (Maybe I’ll post my solution as a separate answer just so y’all have it anyway.)Farinose
So, I figured out why, and added an answer below with my own algorithm.Farinose
S
7

I don't know of one. One of the nice things about C is that if you write your own function, it's just as fast and efficient as the library ones. (This is totally not the case in many other languages.)

You could reverse the string and the substring, and then search.

Finally, the other thing people often do when the string library isn't good enough is to move to regular expressions.

Ok, I wrote both reverse() and rstrstr(), which might work if we are lucky. Get rid of __restrict for C++. You also might want to make the parameters const, but then you will need to cast the return value. To answer your comment question, you can get the index from the address of the substring by just substracting the original string pointer from it. OK:

#include <stdlib.h>
#include <string.h>

char *reverse(const char * __restrict const s)
{
  if (s == NULL)
    return NULL;
  size_t i, len = strlen(s);
  char *r = malloc(len + 1);

  for(i = 0; i < len; ++i)
    r[i] = s[len - i - 1];
  r[len] = 0;
  return r;
}

char *rstrstr(char *__restrict s1, char *__restrict s2)
{
  size_t  s1len = strlen(s1);
  size_t  s2len = strlen(s2);
  char *s;

  if (s2len > s1len)
    return NULL;
  for (s = s1 + s1len - s2len; s >= s1; --s)
    if (strncmp(s, s2, s2len) == 0)
      return s;
  return NULL;
}
Starry answered 27/10, 2009 at 23:54 Comment(8)
"it's just as fast and efficient as the library ones" That is not true, you can very easily write code in C that is much less efficient than the library one (for example completely botch up your own "qsort" routine and make O(n^2) and turn it into insertion sort.Daune
Obviously my statement implies "unless you botch the implementation", but we can't make any statements about anything without including that assumption. I mean, seriously now.Starry
Its easily possible for a library routine to be 10x faster than an algorithmically correct naive implementation. Try writing your own pow( ) function that delivers sub-ulp accuracy and see how you do on performance. Even in simpler functions, a naive memcpy( ) might use byte-by-byte copies, where a library implementation might use an unrolled loop of vector copies.Chanticleer
You guys are missing the overall point, which is that your routine is written in C, the library routine is written in C, everyone is on a level playing field. Let's see someone implement strcmp in Perl, Python, Ruby, or even Java. That's the, ahem, obvious, point.Starry
The library routine is not necessarily written in C. When I'm working on a standard library, I'm usually writing assembly. You could write a C library function in nearly any compiled language that knows about the C calling conventions on the target platform. (That said, I agree that all this is beside the point).Chanticleer
@Dan, you're wrong. It reverses the sequence of char values ignoring any larger multibyte character structure, which strstr would ignore anyway. As long as you translate the result back into an offset into the original string, it works just fine with multibyte character strings.Hancock
The standard library almost certainly does not use C; it either uses assembly or C with compiler specific intrinsics. It's basically impossible to make a strrstr as fast as a standard library strstr.Guzzle
@Guzzle GlibC strstr is pretty much plain C. Its speed result of a clever searching strategy that will even use a skip table if haystack is above a certain length. Using assembly hardly pays off today most of the time as compilers produce better assembly code from C than 9 of 10 devs could by hand and intrinsics only make sense in situations where the CPU can do something, that cannot be expressed in C (like counting the number of 1 bits in an int, almost any CPU can do that, C cannot; or rotate bits of an int) - but none of that is helpful for implementing strstr.Dekameter
P
4

If you can use C++, you can search strings like this:

std::string::iterator found=std::search(haystack.rbegin(), haystack.rend(), needle.rbegin(), needle.rend()).base();
// => yields haystack.begin() if not found, otherwise, an iterator past-the end of the occurence of needle
Peephole answered 27/10, 2009 at 23:59 Comment(2)
and std::w/string::find_last_ofCooley
and even std::string_view::find_last_ofStrephon
A
3

One possible, if not entirely elegant, implementation might look like:

#include "string.h"

const char* rstrstr(const char* haystack, const char* needle)
{
  int needle_length = strlen(needle);
  const char* haystack_end = haystack + strlen(haystack) - needle_length;
  const char* p;
  size_t i;

  for(p = haystack_end; p >= haystack; --p)
  {
    for(i = 0; i < needle_length; ++i) {
      if(p[i] != needle[i])
        goto next;
    }
    return p;

    next:;
  }
  return 0;
}
Adalard answered 28/10, 2009 at 0:9 Comment(4)
As a bit of nitpicking I would make all of those const char *. codepad.org/KzryjtREIves
@Kinopiko: Actually, that's not even nitpicking. Leaving the const out would make this function a pain to use for the caller if their "needle" or "haystack" is already const.Desperation
int should be size_t. But I almost want to upvote for the goto.Starry
You need to make haystack_end and p and the return value of the function const char * too. Please see my paste on codepad.org.Ives
G
3

Though non-standard, strrstr is widely supported and does exactly what you want.

Guzzle answered 8/3, 2015 at 21:53 Comment(1)
Hmmm, it needs to install a lib for such a trivial function. I wish libc had it!Corvin
I
2

No. This is one of the places that the C++ std::string class has an obvious advantage -- along with std::string::find(), there's also std::string::rfind().

Ioved answered 28/10, 2009 at 0:7 Comment(0)
S
2

Here is one. Testing it is an exercise I'll leave to you :)

Save answered 28/10, 2009 at 0:8 Comment(0)
H
2

I think you can still do it using library functions.

1.Use strrev function to reverse the string.

2.Use strstr function to do whatever you want to do.

3.You can find start index (from reverse ) of the search string by subtracting start index of the search string from the length of original string.

Hasid answered 28/10, 2009 at 7:21 Comment(3)
Going to try this out. Huh .. strrev not found.. looking where it might live.Modification
strrev does not exist in string.h on either linux or os/x #8534774Modification
Be careful doing this - you'll also need to reverse your needle string if you're going to do this.Punishment
C
1

Is there a C Library function to find the index to the last occurrence of a substring within a string?

Edit: As @hhafez notes in a comment below, the first solution I posted for this was inefficient and incorrect (because I advanced the pointer by target_length which worked fine in my silly test). You can find that version in the edit history.

Here is an implementation that starts at the end and works back:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

const char *
findlast(const char *source, const char *target) {
    const char *current;
    const char *found = NULL;

    size_t target_length = strlen(target);
    current = source + strlen(source) - target_length;

    while ( current >= source ) {
        if ( (found = strstr(current, target)) ) {
            break;
        }
        current -= 1;
    }

    return found;
}

int main(int argc, char *argv[]) {
    if ( argc != 3 ) {
        fputs("invoke with source and search strings as arguments", stderr);
        return EXIT_FAILURE;
    }

    const char *found = findlast(argv[1], argv[2]);

    if ( found ) {
        printf("Last occurence of '%s' in '%s' is at offset %d\n",
                argv[2], argv[1], found - argv[1]
                );
    }
    return 0;
}

Output:

C:\Temp> st "this is a test string that tests this" test
Last occurence of 'test' in 'this is a test string that tests this' is 
at offset 27
Cube answered 28/10, 2009 at 0:21 Comment(1)
but that is ugly compared to writing your own routine, what if the string is really long? Also don't forget that he is expecting to have multiple occurrences and he expects to find them towards the end, that's why he wants to start from the end and not from the start.Daune
L
1

Here is the most minimal simple implantation that I could come up with. Unlike other implementations of this function it avoids the initial strstr call that some other people like user3119703 had.

char * lastStrstr(const char * haystack,const char * needle){
    char*temp=haystack,*before=0;
    while(temp=strstr(temp,needle)) before=temp++;
    return before;
}
Lizbeth answered 8/6, 2014 at 2:56 Comment(1)
This looks like it is still starting from the beginning of the "haystack" when searching for the substring (as opposed to starting from the end and searching towards the beginning) as OP requested.Stuffing
D
0

I don't believe there is in the c string lib, but it would be trivial to write your own, On one condition, you know the length of the string or it is properly terminated.

Daune answered 27/10, 2009 at 23:50 Comment(0)
I
0

There isn't one in the standard C library. You may be able to find one on the web, or you may have to write your own.

Ives answered 27/10, 2009 at 23:51 Comment(0)
T
0

Long story short:

Nope - there is no function in the C-library that does what you need..

But as others have pointed out: It's not rocket-science to write such a function...

Tabina answered 28/10, 2009 at 0:5 Comment(1)
It's not rocket science to write a slow implementation, but making it fast requires some fancy algorithms.Hancock
H
0

Thanks for your answers! There is one more way which came from the MSDN forum. http://social.msdn.microsoft.com/Forums/en-US/vclanguage/thread/ed0f6ef9-8911-4879-accb-b3c778a09d94

Hierolatry answered 28/10, 2009 at 0:22 Comment(0)
A
0
char * strrstr(char *_Str, char *_SubStr){
    char *returnPointer, *p;

    //find 1st occurence. if not found, return NULL
    if ( (p=strstr(_Str, _SubStr))==NULL)
        return NULL;

    //loop around until no more occurences
    do{
        returnPointer=p;
        ++p;
    }while(p=strstr(p, _SubStr));

    return returnPointer;
}
Acierate answered 19/12, 2013 at 16:25 Comment(0)
I
0

You can use standard algorithm std::find_end for this purpose. For example

    char s[] = "What is the last word last";
    char t[] = "last";

    std::cout << std::find_end( s, s + sizeof( s ) - 1, t, t + sizeof( t ) -1 )
              << std::endl;
Ignaciaignacio answered 19/12, 2013 at 16:35 Comment(0)
C
0
char* strrstr(char * _Str, const char * _SubStr)
{
    const BYTE EQUAL=0;
    int i=0, src_len = strlen(_Str), find_len = strlen(_SubStr),
        tail_count=0;

    for(i=src_len; i>-1; i--)
    {
        if(_Str[i] == _SubStr[0] && tail_count >= find_len)
        {
            if(strncmp(&_Str[i], _SubStr, find_len) == EQUAL)
            {
                return &_Str[i];
            }
        }
        tail_count++;
    }
    return NULL;    
}
Chalco answered 24/6, 2016 at 11:19 Comment(2)
Explain it a bit tooJuliojulis
@TalhaIrfan With the length of the target string, since at the end to reduce the index to find the desired character string. - google translationChalco
B
0

This thread kept popping up in my feed and I was not particularly satisfied with the existing answers and thought I’d have a go at it myself. I ran a naïve test with the existing top-voted answer and came-up slightly better than theo’s algorithm, but not beating the last_strstr algorithm, which surprised me.

Taking another look at the test harness I realized that it was, alas, not the best possible test. In fact, it actually skews the test for the last_strstr algorithm. The offending code is this block:

    const char haystack[] = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "b"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
    const char needle[] = "b";

This is not a good test of everyone’s algorithms. So I modified the test harnass. The major changes are described in the code, but the major points are:

  • The haystack needs to be much larger.
  • The haystack needs to have many potential matches with the needle.
  • The needle also needs to be larger, and have as much overlap with potential matches in the haystack as possible.

Testing was divided into rounds, testing matches at the beginning, middle, and end of the haystack. Potential matches in the haystack are built from a goobered version of the needle, where the goobering is done at the beginning, middle, and end of the needle (to play to algorithms that match forward or backward with the needle).

Here is the modified test harness. My algorithm is added at the top.

#include <string.h>
#include <stdlib.h>


/* Michael Thomas Greer (Dúthomhas) */
// For some reason everyone thinks the empty string should match the
// _beginning_ of the haystack. IDK why... but since everyone else
// wrote their functions to work that way:
#define EMPTY_STRING_MATCHES_END_OF_SEARCH

// Search backward in an (inclusive,exclusive) range for a character
const char * rstrchr( const char * s_begin, const char * s_end, int c )
{
    while (s_end --> s_begin)
    {
        if (*s_end == c)
            return s_end;
    }
    return NULL;
}

// Search backwards in an (inclusive,exclusive) range for a string
const char * rstrstr( const char * s_begin, const char * s_end, const char * s )
{
    size_t n = strlen( s );
#ifdef EMPTY_STRING_MATCHES_END_OF_SEARCH
    if (!n) return s_begin;
#else
    if (!n) return s_end;
#endif

    int c = s[n-1];    // character to search is LAST character of substring
    s_begin += n - 1;
    while ((s_end = rstrchr( s_begin, s_end, c )))  // for every potential match
    {
        if (strncmp( s_end - n + 1, s, n ) == 0)
            return s_end - n + 1;
        s_end -= 1;
    }

    return NULL;
}

#if !defined(HAS_STRRSTR) && !defined(STRRSTR) && !defined(strrstr)
    char * strrstr( const char * haystack, const char * needle )
    {
        return (char *) rstrstr( haystack, strchr(haystack,'\0'), needle );
    }
    #define HAS_STRRSTR 1
    #define STRRSTR strrstr
    #define strrstr strrstr
#endif


/* By liw. */
static char *last_strstr(const char *haystack, const char *needle)
{
    if (*needle == '\0')
        return (char *) haystack;

    char *result = NULL;
    for (;;) {
        char *p = strstr(haystack, needle);
        if (p == NULL)
            break;
        result = p;
        haystack = p + 1;
    }

    return result;
}


/* By liw. */
static char *backwards_memcmp(const char *haystack, const char *needle)
{
    size_t haylen = strlen(haystack);

    if (*needle == '\0')
        return (char *) haystack;

    size_t needlelen = strlen(needle);
    if (needlelen > haylen)
        return NULL;

    const char *p = haystack + haylen - needlelen;
    for (;;) {
        if (memcmp(p, needle, needlelen) == 0)
            return (char *) p;
        if (p == haystack)
            return NULL;
        --p;
    }
}


/* From http://stuff.mit.edu/afs/sipb/user/cordelia/Diplomacy/mapit/strrstr.c
 */
static char *cordelia(const char *s1, const char *s2)
{
 const char /**sc1,*/ *sc2, *psc1, *ps1;

 if (*s2 == '\0')
  return((char *)s1);

 ps1 = s1 + strlen(s1);

 while(ps1 != s1) {
  --ps1;
  for (psc1 = ps1, sc2 = s2; ; )
   if (*(psc1++) != *(sc2++))
    break;
   else if (*sc2 == '\0')
    return ((char *)ps1);
 }
 return ((char *)NULL);
}


/* From http://stackoverflow.com/questions/1634359/
   is-there-a-reverse-fn-for-strstr/1634398#1634398 (DigitalRoss). */
#if 0
static char *reverse(const char *s)
{
  if (s == NULL)
    return NULL;
  size_t i, len = strlen(s);
  char *r = malloc(len + 1);

  for(i = 0; i < len; ++i)
    r[i] = s[len - i - 1];
  r[len] = 0;
  return r;
}
#endif
char *digitalross(const char *s1, const char *s2)
{
  size_t  s1len = strlen(s1);
  size_t  s2len = strlen(s2);
  const char *s;

  if (s2len == 0)
    return (char *) s1;
  if (s2len > s1len)
    return NULL;
  for (s = s1 + s1len - s2len; s >= s1; --s)
    if (strncmp(s, s2, s2len) == 0)
      return (char *) s;
  return NULL;
}


/* From http://stackoverflow.com/questions/1634359/
  is-there-a-reverse-fn-for-strstr/1634487#1634487 (Sinan Ünür). */

char *sinan(const char *source, const char *target)
{
    const char *current;
    const char *found = NULL;

    if (*target == '\0')
        return (char *) source;

    size_t target_length = strlen(target);
    current = source + strlen(source) - target_length;

    while ( current >= source ) {
        if ( (found = strstr(current, target)) ) {
            break;
        }
        current -= 1;
    }

    return (char *) found;
}


/* From http://stackoverflow.com/questions/1634359/
  is-there-a-reverse-fn-for-strstr/1634441#1634441 (Theo Spears). */
char *theo(const char* haystack, const char* needle)
{
  size_t needle_length = strlen(needle);
  const char* haystack_end = haystack + strlen(haystack) - needle_length;
  const char* p;
  size_t i;

  if (*needle == '\0')
    return (char *) haystack;
  for(p = haystack_end; p >= haystack; --p)
  {
    for(i = 0; i < needle_length; ++i) {
      if(p[i] != needle[i])
        goto next;
    }
    return (char *) p;

    next:;
  }
  return 0;
}


/*
 * The rest of this code is a test and timing harness for the various
 * implementations above.
 */


#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>


/* Check that the given function works. */
static bool works(const char *name, char *(*func)(const char *, const char *))
{
    struct {
        const char *haystack;
        const char *needle;
        int offset;
    } tests[] = {
        { "", "", 0 },
        { "", "x", -1 },
        { "x", "", 0 },
        { "x", "x", 0 },
        { "xy", "x", 0 },
        { "xy", "y", 1 },
        { "xyx", "x", 2 },
        { "xyx", "y", 1 },
        { "xyx", "z", -1 },
        { "xyx", "", 0 },
    };
    const int num_tests = sizeof(tests) / sizeof(tests[0]);
    bool ok = true;

    for (int i = 0; i < num_tests; ++i) {
        int offset;
        char *p = func(tests[i].haystack, tests[i].needle);
        if (p == NULL)
            offset = -1;
        else
            offset = p - tests[i].haystack;
        if (offset != tests[i].offset) {
            fprintf(stderr, "FAIL %s, test %d: returned %d, haystack = '%s', "
                            "needle = '%s', correct return %d\n",
                            name, i, offset, tests[i].haystack, tests[i].needle,
                            tests[i].offset);
            ok = false;
        }
    }
    return ok;
}


/* Dummy function for calibrating the measurement loop. */
static char *dummy(const char *haystack, const char *needle)
{
    (void)haystack;
    (void)needle;
    return NULL;
}


/* Measure how long it will take to call the given function with the
   given arguments the given number of times. Return clock ticks. */
static clock_t repeat(char *(*func)(const char *, const char *),
                       const char *haystack, const char *needle,
                       long num_times)
{
    clock_t start, end;

    start = clock();
    for (long i = 0; i < num_times; ++i) {
        func(haystack, needle);
    }
    end = clock();
    return end - start;
}


static clock_t min(clock_t a, clock_t b)
{
    if (a < b)
        return a;
    else
        return b;
}


// HERE ARE THE MAJOR CHANGES TO THE TEST HARNASS.
//
// First, we need a much larger haystack.
//
// Next, our needle should also be long and have potential matches inside it.
// (I suppose I could have made a much more devious potential match?)
//
// Life is complicated by having to have both the needle and the haystack have
// errors in the right spots. So... we mix it up.
//
// The haystack is just a bunch of unneedle repeats. The needle is placed in
// the haystack at the front, middle, or end of the haystack, depending on the
// argument to initialize_haystack().
//
// Likewise, the unneedle is exactly like the needle, but with ONE character
// out of place at the second, middle, and penultimate character indices,
// depending on the argument to initialize_unneedle().
//
// In both cases, the argument is just 0, 1, or 2 corresponding to front, middle,
// and end, respectively.
//
const char needle[] = "THIS IS MY MATCHING STRING IT IS SOMEWHAT LONG JUST TO MAKE THINGS DIFFICULT";
char haystack[4096];
char unneedle[sizeof(needle)];


static void initialize_haystack(size_t n)
{
    memset( haystack, 'a', sizeof(haystack) );
    haystack[sizeof(haystack)-1] = 0;

    char * p = haystack + sizeof(haystack) - sizeof(unneedle);
    while ((size_t)(p - haystack) > sizeof(unneedle)-1) {
        memcpy( p, unneedle, sizeof(unneedle)-1 );
        p -= sizeof(unneedle)-1;
    }

    switch (n) {
        case 1: n = (sizeof(haystack) - sizeof(needle)) / 2; break;
        case 2: n =  sizeof(haystack) - sizeof(needle);
    }
    memcpy(haystack+n, needle, sizeof(needle)-1);
}


static void initialize_unneedle(size_t n)
{
    strcpy( unneedle, needle );
    unneedle[n] = '^';
    printf( "unneedle = \"%s\"\n", unneedle );
}


/* Measure the time to execute one call of a function, and return the
   number of CPU clock ticks (see clock(3)). */
static double timeit(char *(*func)(const char *, const char *))
{
    /* The arguments for the functions to be measured. We deliberately
       choose a case where the haystack is large and the needle is in
       the middle, rather than at either end. Obviously, any test data
       will favor some implementations over others. This is the weakest
       part of the benchmark. */

#if 0
    const char haystack[] = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "b"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
    const char needle[] = "b";
#endif

    /* First we find out how many repeats we need to do to get a sufficiently
       long measurement time. These functions are so fast that measuring
       only a small number of repeats will give wrong results. However,
       we don't want to do a ridiculously long measurement, either, so
       start with one repeat and multiply it by 10 until the total time is
       about 0.2 seconds.

       Finally, we measure the dummy function the same number of times
       to get rid of the call overhead.

       */

    clock_t mintime = 0.2 * CLOCKS_PER_SEC;
    clock_t clocks;
    long repeats = 1;
    for (;;) {
        clocks = repeat(func, haystack, needle, repeats);
        if (clocks >= mintime)
            break;
        repeats *= 10;
    }

    clocks = min(clocks, repeat(func, haystack, needle, repeats));
    clocks = min(clocks, repeat(func, haystack, needle, repeats));

    clock_t dummy_clocks;

    dummy_clocks = repeat(dummy, haystack, needle, repeats);
    dummy_clocks = min(dummy_clocks, repeat(dummy, haystack, needle, repeats));
    dummy_clocks = min(dummy_clocks, repeat(dummy, haystack, needle, repeats));

    return (double) (clocks - dummy_clocks) / repeats / CLOCKS_PER_SEC;
}


/* Array of all functions. */
struct func {
    const char *name;
    char *(*func)(const char *, const char *);
    double secs;
    double tot_secs;
} funcs[] = {
#define X(func) { #func, func, 0., 0. }
    X(last_strstr),
    X(backwards_memcmp),
    X(cordelia),
    X(digitalross),
    X(sinan),
    X(theo),
    X(strrstr),
#undef X
};
const int num_funcs = sizeof(funcs) / sizeof(funcs[0]);


/* Comparison function for qsort, comparing timings. */
int funcmp(const void *a, const void *b)
{
    const struct func *aa = a;
    const struct func *bb = b;

    if (aa->secs < bb->secs)
        return -1;
    else if (aa->secs > bb->secs)
        return 1;
    else
        return 0;
}


// Comparison for TOTAL timings
int funcmp2(const void *a, const void *b)
{
    const struct func *aa = a;
    const struct func *bb = b;

    if (aa->tot_secs < bb->tot_secs)
        return -1;
    else if (aa->tot_secs > bb->tot_secs)
        return 1;
    else
        return 0;
}


int main(void)
{
    bool ok = true;
    for (int i = 0; i < num_funcs; ++i) {
        if (!works(funcs[i].name, funcs[i].func)) {
            fprintf(stderr, "%s does not work\n", funcs[i].name);
            ok = false;
        }
    }
    if (!ok)
        return EXIT_FAILURE;

    for (int k = 0; k < 3; ++k)    // for each haystack configuration
    for (int n = 0; n < 3; ++n) {  // for each unneedle configuration

        // Display the test number and data configuration for the test
        printf("test round %d\n", (k*3) + n+1);
        printf("haystack = \"%c..%c..%c\"\n",
            k==0 ? 'N' : '.',
            k==1 ? 'N' : '.',
            k==2 ? 'N' : '.');

        // Initialize the data properly
        initialize_unneedle(
            n == 2 ? sizeof(needle)-3 :
            n == 1 ? sizeof(needle)/2 : 1 );
        initialize_haystack(k);

        // Tests!
        for (int i = 0; i < num_funcs; ++i) {
            printf( "\r%-50c\r%s", ' ', funcs[i].name );
            fflush(stdout);
            funcs[i].secs = timeit(funcs[i].func);
            funcs[i].tot_secs += funcs[i].secs;
        }

        // Sort and display the results
        printf( "\r%50c\r", ' ' );
        qsort(funcs, num_funcs, sizeof(funcs[0]), funcmp);
        for (int i = 0; i < num_funcs; ++i)
            printf("#%d %.3f us %s\n", i+1, funcs[i].secs * 1e6, funcs[i].name);
        puts("");
    }

    // Sort and display the average results
    printf("averages:\n");
    qsort(funcs, num_funcs, sizeof(funcs[0]), funcmp2);
    for (int i = 0; i < num_funcs; ++i)
        printf("#%d %.3f us %s\n", i+1, funcs[i].tot_secs * 1e6 / 9, funcs[i].name);

    return 0;
}

I compiled it with clang -Wall -Wextra -Werror -pedantic-errors -O3 -Wno-unused-function. (I had to make a couple small modifications to the code as it was just to get rid of the compile errors for things like unused variables, IIRC.)

str[A-Za-z] is reserved by POSIX

My code adds strrstr() as a function using, AFAIK, all the right magic macros to make sure it gets along nicely with other properly-written code.

That way it can be used in such a way that should POSIX (or some other library writer) add a nice, highly-optimized strrstr() function to its C Library there should be no conflict.

Memory Access Order

I tested to make sure that the sorting of the results wasn’t affecting the results themselves. (It wasn’t.) The way I tested this was to simply not sort, and also to re-initialize the haystack every time through the test loop. Once no consequence was observed I removed it back out of the test loop and restored sorting in individual test rounds.

Results

The final results are more what I expected, though last_strstr() still kicked butt, which still surprises me:

averages:
#1 0.770 us strrstr
#2 0.817 us last_strstr
#3 1.526 us cordelia
#4 1.571 us theo
#5 2.123 us backwards_memcmp
#6 2.944 us digitalross
#7 724.793 us sinan

This was an average run for all the algorithms, neither best nor worst for any one of them. My PC is a 12th Gen Intel© Core™ i7-12700K running Linux Mint 21.3 Cinnamon 6.0.4 with 15.4 GiB available RAM.

strrstr

Here is my code separated out for use if you want to use it. License is either the Boost License or the standard StackOverflow CC-BY-SA license you agree to when posting code here, whichever you prefer.

strrstr.h

#ifndef DUTHOMHAS_STRRSTR_H
#define DUTHOMHAS_STRRSTR_H

// Copyright 2024 Michael Thomas Greer.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
//  https://www.boost.org/LICENSE_1_0.txt )

#include <string.h>

const char * rstrchr( const char * s_begin, const char * s_end, int c );
const char * rstrstr( const char * s_begin, const char * s_end, const char * s );
// Search backwards in a range (inclusive,exclusive) for a single byte or a byte string.
// Returns a pointer to the BEGINNING of the match if found, else NULL.

#if !defined(HAS_STRRSTR) && !defined(STRRSTR) && !defined(strrstr)
    char * strrstr( const char * haystack, const char * needle );
    #define HAS_STRRSTR 1
    #define STRRSTR strrstr
    #define strrstr strrstr
#endif

#endif

strrstr.c

// Copyright 2024 Michael Thomas Greer.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
//  https://www.boost.org/LICENSE_1_0.txt )

#include <string.h>


// For some reason everyone thinks the empty string should match the
// _beginning_ of the haystack. IDK why... 
//
// To get the IMHO correct behavior, make sure to #define the following somewhere:
//   DUTHOMHAS_STRRSTR_EMPTY_STRING_MATCHES_EVERYTHING


// Search backward in an (inclusive,exclusive) range for a character
const char * rstrchr( const char * s_begin, const char * s_end, int c )
{
    while (s_end --> s_begin)
    {
        if (*s_end == c)
            return s_end;
    }
    return NULL;
}


// Search backwards in an (inclusive,exclusive) range for a string
const char * rstrstr( const char * s_begin, const char * s_end, const char * s )
{
    size_t n = strlen( s );
#ifndef DUTHOMHAS_STRRSTR_EMPTY_STRING_MATCHES_EVERYTHING
    if (!n) return s_begin;
#else
    if (!n) return s_end;
#endif

    int c = s[n-1];    // character to search is LAST character of substring
    s_begin += n - 1;
    while ((s_end = rstrchr( s_begin, s_end, c )))  // for every potential match
    {
        if (strncmp( s_end - n + 1, s, n ) == 0)
            return s_end - n + 1;
        s_end -= 1;
    }

    return NULL;
}


#if !defined(HAS_STRRSTR) && !defined(STRRSTR) && !defined(strrstr)
    char * strrstr( const char * haystack, const char * needle )
    {
        return (char *) rstrstr( haystack, strchr(haystack,'\0'), needle );
    }
#endif

Here’s a little test program to play with it, even:

#include <stdio.h>
#include "strrchr.h"

#ifndef DUTHOMHAS_STRRSTR_EMPTY_STRING_MATCHES_EVERYTHING
    #error "You must #define DUTHOMHAS_STRRSTR_EMPTY_STRING_MATCHES_EVERYTHING when you compile for testing"
#endif

void ask( const char * prompt, char * s, size_t n )
{
        printf( "%s", prompt );
        fflush( stdout );
        fgets( s, n, stdin );
        char * p = strpbrk( s, "\n\r" );
        if (p) *p = '\0';
}

int main(void)
{
        char s[1000];  ask( "s? ",  s,  sizeof s  );
        char ss[100];  ask( "ss? ", ss, sizeof ss );
        
        const char * p = strrstr( s, ss );
        if (!p) printf( "%s\n", "not found" );
        else
        {
                printf( "\"%s\"\n", s );
                printf( "%*c^%zu\n", (int)(p-s)+1, ' ', (size_t)(p-s) );
        }
        
        puts( "\n" "See the code for details about the following tests:" );
        strcpy( s, "abcdefg" );
        //              ^s+4
        p = rstrstr( s, s+4, "ab" );  printf( "test 1: %s\n", p == s   ? "pass" : "fail" );  // match at head of range
        p = rstrstr( s, s+4, "de" );  printf( "test 2: %s\n", p        ? "fail" : "pass" );  // should NOT be found
        p = rstrstr( s, s+4, "cd" );  printf( "test 3: %s\n", p        ? "pass" : "fail" );  // match at end of range
        p = rstrstr( s, s+4, "" );    printf( "test 4: %s\n", p == s+4 ? "pass" : "fail" );  // ok: empty string @ end of range
        p = rstrstr( s, s+3, "abcd" );printf( "test 5: %s\n", p        ? "fail" : "pass" );  // ss is too large
        
        p = rstrchr( s, s+4, 0 );     printf( "test 6: %s\n", p        ? "fail" : "pass" );  // should NOT be found
        p = rstrchr( s, s+7, 0 );     printf( "test 7: %s\n", p        ? "fail" : "pass" );  // same
        p = rstrchr( s, s+8, 0 );     printf( "test 8: %s\n", p == s+7 ? "pass" : "fail" );  // ok to find
        
        return 0;
}

Make sure to compile with that big weird macro so that the testing works. This is significant because we want to test for all cases that matching does not violate the end of the range.

Some of this stuff I just typed-in. Let me know if I made any mistakes.

Bircher answered 24/5 at 8:29 Comment(5)
Rainy day to fill?? Points on Scrabble tiles are crude representation of (inverse of) letter frequency. Imagine haystack is 1 chapter of a book. Imagine needle is "equal". Imagine pausing on every 'e' to check if that e in the text is the 1st char of needle... Imagine pausing on every 'q' to check if that q in the text is part of a needle occurrence... Bet code would pause and examine "it this right??" somewhat less often if seaching for q... :-) Requires some "set up" code prior to searching, (maybe a LUT of target character to target as prime candidate value), but... :-)Spacious
@Spacious Now you are talking text searching algorithms like Boyer-Moore. This is just a linear strstr... in reverse! 😃Farinose
Killing time here... Linux seems to offer memrchr() that could replace the homebrew rstrchr()... Miss the chance to be flash with odd looking while (s_end --> s_begin)... Time for me to go read a book... Cheers! :-)Spacious
@Spacious Huh, I didn’t know that little function existed. Alas, it actually killed performance, though, taking my nice 0.076–0.077 µs times to 1.06 µs or so. The structure of my algorithm apparently optimizes nicely, but not with the structure of the memrchr() implementation thrown in the middle of it, it seems.Farinose
Ya know... +300ns... ~50% slower... Thanks for trying, and for reporting your results. Puzzling, but there ya go... :-) Not suggesting you have to do this, but... I wonder if a VERY LARGE haystack might give different results... It's the weekend... Get outside and go play... Cheers, and have a great weekend! :-)Spacious

© 2022 - 2024 — McMap. All rights reserved.