Checking if 2 printf()
format strings are compatible is an exercise in format parsing.
C, at least, has no standard run-time compare function such as:
int format_cmp(const char *f1, const char *f2); // Does not exist
Formats like "%d %f"
and "%i %e"
are obviously compatible in that both expect an int
and float/double
. Note: float
are promoted to double
as short
and signed char
are promoted to int
.
Formats "%*.*f"
and "%i %d %e"
are compatible, but not obvious: both expect an int
,int
and float/double
.
Formats "%hhd"
and "%d"
both expect an int
, even though the first will have its values cast to signed char
before printing.
Formats "%d"
and "%u"
are not compatible, even though many systems will behave as hoped. Note: Typically char
will promote to int
.
Formats "%d"
and "%ld"
are not strictly compatible. On a 32-bit system they are equivalent, but not in general. Of course code can be altered to accommodate this. OTOH "%lf"
and "%f"
are compatible due to the usual argument promotions of float
to double
.
Formats "%lu"
and "%zu"
may be compatible, but that depends on the implementation of unsigned long
and size_t
. Additions to code could allow this or related equivalences.
Some combinations of modifiers and specifiers are not defined like "%zp"
. The following does not dis-allow such esoteric combinations - but does compare them.
Modifiers like "$"
are extensions to standard C and are not implemented in the following.
The compatibility test for printf()
differs from scanf()
.
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
/*
 * Category of argument that one printf() conversion consumes.
 *
 * The low values name the basic category.  seek_specifier() merges the
 * length-modifier code into the bits above CHAR_BIT of the stored value.
 */
typedef enum {
  type_none = 0,        /* no further conversion in the format */
  type_int = 1,         /* d, i, c and '*' width/precision arguments */
  type_unsigned = 2,    /* o, u, x, X */
  type_float = 3,       /* f, F, e, E, g, G, a, A */
  type_charpointer = 4, /* s */
  type_voidpointer = 5, /* p */
  type_intpointer = 6,  /* n */
  type_unknown = 7,     /* unrecognised conversion specifier */
  /* Presumably present to widen the enum's range so the merged modifier
     bits fit -- confirm before removing. */
  type_type_N = 0xFFFFFF
} type_type;
/*
 * Parser state used while walking a single printf() format string.
 */
typedef struct {
/* Current read position in the format string; advanced by format_next(). */
const char *format;
/* Number of pending int arguments from '*' width/precision, as filled in by
   seek_width(); format_get() reports type_int while this is above zero. */
int int_queue;
/* Type code of the most recently parsed conversion, or type_none. */
type_type type;
} format_T;
/* Forward declarations of the parser-state helpers defined further down:
   initialise a state, read the current argument type, advance to the next. */
static void format_init(format_T *state, const char *format);
static type_type format_get(format_T *state);
static void format_next(format_T *state);
/*
 * Prepare `state` for walking `format` and position it on the first
 * conversion (or on type_none when the format contains none).
 */
void format_init(format_T *state, const char *format) {
  *state = (format_T){
    .format = format,
    .int_queue = 0,
    .type = type_none,
  };
  format_next(state);
}
/*
 * Report the type of the argument the parser is currently positioned on.
 * While '*' width/precision arguments are still queued, each of them is an
 * int and is reported before the conversion's own type.
 */
type_type format_get(format_T *state) {
  const int star_args_pending = state->int_queue > 0;
  return star_args_pending ? type_int : state->type;
}
/*
 * Skip the flag characters ('-', '+', ' ', '#', '0') at the start of a
 * conversion specification.
 *
 * format: points just after the '%' of a conversion.
 * Returns a pointer to the first character that is not a flag; this may be
 * the terminating null character of the format.
 */
const char *seek_flag(const char *format) {
  /* strchr() treats the terminator of its search string as a match, so the
     end of the format has to be tested explicitly; otherwise the scan would
     run past the end of the string. */
  while (*format != '\0' && strchr("-+ #0", *format) != NULL) {
    format++;
  }
  return format;
}
/* Skip either a single '*' (counting it in *stars) or a run of decimal
   digits, and return the position after it. */
static const char *seek_star_or_digits(const char *p, int *stars) {
  if (*p == '*') {
    (*stars)++;
    return p + 1;
  }
  while (isdigit((unsigned char)*p)) {
    p++;
  }
  return p;
}

/*
 * Skip the optional field width and optional ".precision" of a conversion
 * specification.
 *
 * format:    points at the width (or at whatever follows the flags).
 * int_queue: receives the number of '*' items seen (0, 1 or 2); each one
 *            consumes an extra int argument ahead of the converted value.
 * Returns a pointer to the first character after the width/precision.
 */
const char *seek_width(const char *format, int *int_queue) {
  *int_queue = 0;
  format = seek_star_or_digits(format, int_queue);
  if (*format == '.') {
    /* Step over the '.' first; without this the precision (and a '*'
       precision in particular) would never be recognised. */
    format++;
    format = seek_star_or_digits(format, int_queue);
  }
  return format;
}
/*
 * Skip the optional length modifier of a conversion specification.
 *
 * format: points at the modifier (or directly at the conversion specifier).
 * mod:    receives a code for the modifier: 0 for none and for "h"/"hh"
 *         (this code deliberately does not distinguish those from plain int),
 *         the character itself for l, j, z, t, L, and
 *         ('l' << CHAR_BIT) + 'l' for "ll".
 * Returns a pointer to the first character after the modifier.
 */
const char *seek_mod(const char *format, int *mod) {
  *mod = 0;
  if (format[0] == 'h') {
    /* "hh" and "h" are both skipped without being recorded. */
    return format + (format[1] == 'h' ? 2 : 1);
  }
  if (format[0] == 'l' && format[1] == 'l') {
    *mod = ('l' << CHAR_BIT) + 'l';
    return format + 2;
  }
  /* strchr() would also match the terminating null character, which would
     step past the end of the format; exclude it explicitly. */
  if (format[0] != '\0' && strchr("ljztL", format[0]) != NULL) {
    *mod = format[0];
    return format + 1;
  }
  return format;
}
/*
 * Classify the conversion specifier at `format` and store the resulting
 * type code in *type.
 *
 * format: points at the conversion specifier character.
 * mod:    modifier code produced by seek_mod().
 * type:   receives the basic category with `mod` merged into the bits above
 *         CHAR_BIT, or plain type_unknown for an unrecognised specifier.
 * Returns a pointer just past the specifier.  For an unrecognised specifier
 * (including the end of the string) `format` is returned unchanged, so the
 * caller never moves beyond the terminating null character.
 */
const char *seek_specifier(const char *format, int mod, type_type *type) {
  type_type base;

  switch (*format) {
    case 'd':
    case 'i':
    case 'c':
      base = type_int;
      break;
    case 'o':
    case 'u':
    case 'x':
    case 'X':
      base = type_unsigned;
      break;
    case 'f':
    case 'F':
    case 'e':
    case 'E':
    case 'g':
    case 'G':
    case 'a':
    case 'A':
      /* "%lf" takes the same double argument as "%f". */
      if (mod == 'l') {
        mod = 0;
      }
      base = type_float;
      break;
    case 's':
      base = type_charpointer;
      break;
    case 'p':
      base = type_voidpointer;
      break;
    case 'n':
      base = type_intpointer;
      break;
    default:
      /* Report the problem to the caller instead of terminating the
         process, and keep the value exactly type_unknown (no modifier bits)
         so that equality tests against type_unknown succeed. */
      *type = type_unknown;
      return format;
  }

  *type = (type_type)(base | (mod << CHAR_BIT)); /* bring in modifier */
  return format + 1;
}
/*
 * Advance `state` to the next argument the format string consumes.
 *
 * Queued '*' arguments are released one per call.  Once the queue is empty
 * the format is scanned for the next conversion, skipping literal text and
 * "%%".  When the string is exhausted the type becomes type_none.
 */
void format_next(format_T *state) {
  if (state->int_queue > 0) {
    state->int_queue--;
    return;
  }

  const char *cursor = state->format;
  while (*cursor != '\0') {
    if (*cursor != '%') {
      cursor++; /* ordinary character */
      continue;
    }
    cursor++; /* step over '%' */
    if (*cursor == '%') {
      cursor++; /* "%%" consumes no argument */
      continue;
    }

    int modifier;
    cursor = seek_flag(cursor);
    cursor = seek_width(cursor, &state->int_queue);
    cursor = seek_mod(cursor, &modifier);
    state->format = seek_specifier(cursor, modifier, &state->type);
    return;
  }

  state->format = cursor;
  state->type = type_none;
}
// 0 Compatible
// 1 Not Compatible
// 2 Not Comparable
int format_cmp(const char *f1, const char *f2) {
format_T state1;
format_init(&state1, f1);
format_T state2;
format_init(&state2, f2);
while (format_get(&state1) == format_get(&state2)) {
if (format_get(&state1) == type_none)
return 0;
if (format_get(&state1) == type_unknown)
return 2;
format_next(&state1);
format_next(&state2);
}
if (format_get(&state1) == type_unknown)
return 2;
if (format_get(&state2) == type_unknown)
return 2;
return 1;
}
Note: only minimal testing done. Lots of additional considerations could be added.
Known shortcomings: hh,h,l,ll,j,z,t
modifiers with n
. l
with s,c
.
[Edit]
OP comments about security concerns. This changes the nature of the post and the compare from an equality one to a security one. I'd imagine that one of the patterns (A) would be a reference pattern and the next (B) would be the test. The test would be "is B at least as secure as A?". Example A = "%.20s"
and B1 = "%.19s"
, B2 = "%.20s"
, B3 = "%.21s"
. B1
and B2
both pass the security test as they do not extract more than 20 char
. B3
is a problem as it goes past the reference limit of 20 char
. Further any non-width qualified with %s %[ %c
is a security problem - in the reference or test pattern. This answer's code does not address this issue.
As mentioned, code does not yet handle modifiers with "%n"
.
[2018 edit]
Concerning "Formats "%d"
and "%u"
are not compatible.": This is for values to be printed in general. For values in the [0..INT_MAX]
range, either format may work per C11dr §6.5.2.2 6.
NS_FORMAT_FUNCTION
to your will. Check this SO answer, as well as the Clang docs for__format__
. – Leatherbackparse_printf_format
looks cool. How do I import it? – Triphammer"%x %lo %f"
and"%d %lx %e"
regarded as similar? Since each takes values in the sequenceint
,long
,double
, I think they probably are. And presumably"%8.3f"
and"%+12.6f"
are similar? That is, I'm guessing that your intent is to ensure that using either format string will consume the same list of other arguments. I'll also observe that there isn't a standard function that'll do the job, so any answer inevitably involves (quite a lot of) code — more than fits comfortably into an SO answer. – Clubby$
in"Something %d and %f" and "Something %2$f and %1$d"
is not part of the C standard. This should result in a 3rd answer: "not comparable". – Djambi