Minimising string length
If you want to keep the string lengths to a minimum, you could create a string that is lexicographically halfway between the left and right strings, so that there is room to insert additional strings, and only create a longer string if absolutely necessary.
I will assume an alphabet [a-z], and a lexicographical ordering where an empty space comes before 'a', so that e.g. "ab" comes before "abc".
Basic case
You start by copying the characters from the beginning of the strings, until you encounter the first difference, which could be either two different characters, or the end of the left string:
abcde ~ abchi -> abc + d ~ h
abc ~ abchi -> abc + _ ~ h
The new string is then created by appending the character that is halfway in the alphabet between the left character (or the beginning of the alphabet) and the right character:
abcde ~ abchi -> abc + d ~ h -> abcf
abc ~ abchi -> abc + _ ~ h -> abcd
Consecutive characters
If the two different characters are lexicographically consecutive, first copy the left character, and then append the character halfway between the next character from the left string and the end of the alphabet:
abhs ~ abit -> ab + h ~ i -> abh + s ~ _ -> abhw
abh ~ abit -> ab + h ~ i -> abh + _ ~ _ -> abhn
If the next character(s) in the left string are one or more z's, then copy them and append the character halfway between the first non-z character and the end of the alphabet:
abhz ~ abit -> ab + h ~ i -> abh + z ~ _ -> abhz + _ ~ _ -> abhzn
abhzs ~ abit -> ab + h ~ i -> abh + z ~ _ -> abhz + s ~ _ -> abhzw
abhzz ~ abit -> ab + h ~ i -> abh + z ~ _ -> ... -> abhzz + _ ~ _ -> abhzzn
Right character is a or b
You should never create a string by appending an 'a' to the left string, because that would create two lexicographically consecutive strings, in between which no further strings could be added. The solution is to always append an additional character, halfway in between the beginning of the alphabet and the next character from the right string:
abc ~ abcah -> abc + _ ~ a -> abca + _ ~ h -> abcad
abc ~ abcab -> abc + _ ~ a -> abca + _ ~ b -> abcaa + _ ~ _ -> abcaan
abc ~ abcaah -> abc + _ ~ a -> abca + _ ~ a -> abcaa + _ ~ h -> abcaad
abc ~ abcb -> abc + _ ~ b -> abca + _ ~ _ -> abcan
Code examples
Below is a code snippet which demonstrates the method. It's a bit fiddly because JavaScript, but not actually complicated. To generate a first string, call the function with two empty strings; this will generate the string "n". To insert a string before the leftmost or after the rightmost string, call the function with that string and an empty string.
/**
 * Builds a key that sorts strictly between `prev` and `next`, keeping it as
 * short as possible while leaving room for further insertions.
 *
 * Keys use the alphabet [a-z]. An empty `prev` means "before everything" and
 * an empty `next` means "after everything"; two empty strings yield "n".
 *
 * @param {string} prev - left neighbour (expected to sort before `next`).
 * @param {string} next - right neighbour.
 * @returns {string} the new key.
 */
function midString(prev, next) {
    var BELOW_A = 96;   // virtual character just before 'a' (end of prev)
    var ABOVE_Z = 123;  // virtual character just after 'z' (end of next)
    var CODE_A = 97;
    var CODE_B = 98;
    var CODE_Z = 122;

    // Character code at index i, or the given sentinel once past the end.
    function codeAt(text, i, sentinel) {
        return i < text.length ? text.charCodeAt(i) : sentinel;
    }

    // Scan for the leftmost position where the two strings disagree.
    // On exit, idx is one past that position.
    var idx = 0;
    var left;
    var right;
    do {
        left = codeAt(prev, idx, BELOW_A);
        right = codeAt(next, idx, ABOVE_Z);
        idx++;
    } while (left === right);

    // Shared prefix of both strings.
    var result = prev.slice(0, idx - 1);

    if (left === BELOW_A) {
        // prev is a prefix of next. Mirror every 'a' of next, because a key
        // must never be formed by just appending 'a' to prev.
        while (right === CODE_A) {
            right = codeAt(next, idx, ABOVE_Z);
            idx++;
            result += 'a';
        }
        // A 'b' leaves no room below it except 'a', so take the 'a' and then
        // pick the middle of the whole alphabet.
        if (right === CODE_B) {
            result += 'a';
            right = ABOVE_Z;
        }
    } else if (left + 1 === right) {
        // Adjacent characters: keep prev's character and look for room
        // between the rest of prev and the end of the alphabet.
        result += String.fromCharCode(left);
        right = ABOVE_Z;
        left = codeAt(prev, idx, BELOW_A);
        while (left === CODE_Z) {
            // A 'z' has nothing above it; copy it and move on.
            result += 'z';
            idx++;
            left = codeAt(prev, idx, BELOW_A);
        }
    }

    // Append the character halfway between the two bounds (rounded up).
    var middle = Math.ceil((left + right) / 2);
    return result + String.fromCharCode(middle);
}
// Demo: start with the two empty-string sentinels (leftmost / rightmost) and
// keep inserting a new key into a randomly chosen gap until there are 100
// entries, printing the whole list after every insertion.
var strings = ["", ""];
do {
    // Pick a gap: index of the left neighbour, 0 .. strings.length - 2.
    var rnd = Math.floor(Math.random() * (strings.length - 1));
    strings.splice(rnd + 1, 0, midString(strings[rnd], strings[rnd + 1]));
    document.write(strings + "<br>");
} while (strings.length < 100);
Below is a straightforward translation into C. Call the function with empty null-terminated strings to generate the first string, or insert before the leftmost or after the rightmost string. The string buffer buf
should be large enough to hold one character more than the longer of the two input strings, plus the terminating null character.
// Writes into buf a null-terminated key over [a-z] that sorts between the
// null-terminated strings prev and next, and returns its length (excluding
// the terminator). An empty prev means "before everything", an empty next
// means "after everything".
// buf must have room for the characters written plus the terminator.
int midstring(const char *prev, const char *next, char *buf) {
    const char below_a = 'a' - 1;  // stands in for the end of prev
    const char above_z = 'z' + 1;  // stands in for the end of next
    char lo, hi;
    int pos = 0;

    // Copy the common prefix; stop at the first position that differs.
    for (;;) {
        lo = prev[pos] ? prev[pos] : below_a;
        hi = next[pos] ? next[pos] : above_z;
        if (lo != hi) break;
        buf[pos] = lo;
        pos++;
    }

    if (lo == below_a) {
        // prev ended: mirror each 'a' of next, then read the following char.
        for (; hi == 'a'; hi = next[pos] ? next[pos] : above_z) {
            buf[pos++] = 'a';
        }
        // Only 'a' fits below a 'b'; take it and aim for mid-alphabet.
        if (hi == 'b') {
            buf[pos++] = 'a';
            hi = above_z;
        }
    } else if (lo + 1 == hi) {
        // Adjacent characters: keep prev's character, then skip over z's
        // in the remainder of prev, copying them as we go.
        hi = above_z;
        buf[pos++] = lo;
        for (lo = prev[pos] ? prev[pos] : below_a;
             lo == 'z';
             lo = prev[pos] ? prev[pos] : below_a) {
            buf[pos++] = 'z';
        }
    }

    // Append the character halfway between the two bounds.
    buf[pos++] = hi - (hi - lo) / 2;
    buf[pos] = '\0';
    return pos;
}
Average string length
The best case is when the elements are inserted in random order. In practice, when generating 65,536 strings in pseudo-random order, the average string length is around 4.74 characters (the theoretical minimum, using every combination before moving to longer strings, would be 3.71).
The worst case is when inserting the elements in order, and always generating a new rightmost or leftmost string; this will lead to a recurring pattern:
n, u, x, z, zn, zu, zx, zz, zzn, zzu, zzx, zzz, zzzn, zzzu, zzzx, zzzz...
n, g, d, b, an, ag, ad, ab, aan, aag, aad, aab, aaan, aaag, aaad, aaab...
with an extra character being added after every fourth string.
If you have an existing ordered list for which you want to generate keys, generate lexicographically equally-spaced keys with an algorithm like the one below, and then use the algorithm described above to generate a new key when inserting a new element.
The code checks how many characters are needed, how many different characters are needed for the least significant digit, and then switches between two selections from the alphabet to get the right number of keys. E.g. keys with two characters can have 676 different values, so if you ask for 1600 keys, that is 1.37 extra keys per two-character combination, so after each two-character key, one (suffix 'n') or two (suffixes 'j','r') additional three-character keys are inserted, i.e.: aan ab abj abr ac acn ad adn ae aej aer af afn ...
(skipping the initial 'aa').
/**
 * Generates `num` sorted, roughly equally spaced keys over the alphabet
 * [a-z], suitable as initial keys for an existing ordered list.
 *
 * All keys share a "full" prefix of (chars - 1) letters; after each prefix a
 * varying number of keys with one extra letter is emitted so that the total
 * comes out at `num`. No key ends in 'a'.
 *
 * @param {number} num - number of keys wanted (expected to be an integer).
 * @returns {string[]} the keys in ascending order; an empty array when
 *     `num` is less than 1, NaN or infinite.
 */
function seqString(num) {
    // Guard: for num < 1 the recursion depth below would be non-positive
    // (or -Infinity for 0) and generateStrings would never terminate.
    if (!(num >= 1) || num === Infinity) return [];

    // chars = number of base-26 digits of num, prev = 26^(chars - 1).
    // Computed with exact integer multiplication; a ratio of two Math.log
    // results is not guaranteed to be exact at powers of 26.
    var chars = 1;
    var prev = 1;
    while (prev * 26 <= num) {
        prev *= 26;
        chars++;
    }

    // Average number of last-letter keys to emit per full prefix.
    var ratio = chars > 1 ? (num + 1 - prev) / prev : num;
    var part = Math.floor(ratio);
    // alpha[0] is used for prefixes that get `part` keys, alpha[1] for those
    // that get one more.
    var alpha = [partialAlphabet(part), partialAlphabet(part + 1)];
    // The fractional part of ratio is accumulated per prefix; every time the
    // running total passes 1, that prefix gets one extra key.
    var leapStep = ratio % 1;
    var leapTotal = 0.5;
    var first = true;
    var strings = [];
    generateStrings(chars - 1, "");
    return strings;

    // Recursively enumerates every prefix of `full` more letters appended to
    // `str`; once the prefix is complete, emits the keys belonging to it.
    function generateStrings(full, str) {
        var i;
        if (full) {
            for (i = 0; i < 26; i++) {
                generateStrings(full - 1, str + String.fromCharCode(97 + i));
            }
        }
        else {
            // The prefix itself is a key too (minus trailing a's), except for
            // the very first one, which would be empty.
            if (!first) strings.push(stripTrailingAs(str));
            else first = false;
            var leap = Math.floor(leapTotal += leapStep);
            leapTotal %= 1;
            for (i = 0; i < part + leap; i++) {
                strings.push(str + alpha[leap][i]);
            }
        }
    }

    // Removes all trailing 'a' characters from str.
    function stripTrailingAs(str) {
        var last = str.length - 1;
        while (str.charAt(last) === 'a') --last;
        return str.slice(0, last + 1);
    }

    // Returns `count` letters spread over 'b'..'z' ('a' is never used).
    // Each table entry is a bit mask: bit k set selects the letter with
    // character code 98 + k. Entry k selects k letters; for counts of 13 and
    // up, the complement of the mask for (25 - count) is used.
    function partialAlphabet(count) {
        var magic = [0, 4096, 65792, 528416, 1081872, 2167048, 2376776, 4756004,
                     4794660, 5411476, 9775442, 11097386, 11184810, 22369621];
        var bits = count < 13 ? magic[count] : 33554431 - magic[25 - count];
        var letters = [];
        for (var i = 1; i < 26; i++, bits >>= 1) {
            if (bits & 1) letters.push(String.fromCharCode(97 + i));
        }
        return letters;
    }
}
// Demo: generate 1600 keys and print them separated by spaces.
document.write(seqString(1600).join(' '));
<
" first, this question really hinges on that definition! – Kaykayaa < ax
,ax < b
, then appending a single char would be a trivial solution – Kaykaya