#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
using namespace std;
const int p[9] = {1, 10, 100,
1000, 10000, 100000,
1000000, 10000000, 100000000};
class MyTimer {
private:
std::chrono::time_point<std::chrono::steady_clock> starter;
public:
void startCounter() {
starter = std::chrono::steady_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
}
};
int convert1(const char *a) {
int res = 0;
for (int i=0; i<9; i++) res = res * 10 + a[i] - 48;
return res;
}
int convert2(const char *a) {
return (a[0] - 48) * p[8] + (a[1] - 48) * p[7] + (a[2] - 48) * p[6]
+ (a[3] - 48) * p[5] + (a[4] - 48) * p[4] + (a[5] - 48) * p[3]
+ (a[6] - 48) * p[2] + (a[7] - 48) * p[1] + (a[8] - 48) * p[0];
}
int convert3(const char *a) {
return (a[0] - 48) * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5]
+ a[4] * p[4] + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8]
- 533333328;
}
const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
100000000};
int convert4u(const char *aa) {
const unsigned char *a = (const unsigned char*) aa;
return a[0] * pu[8] + a[1] * pu[7] + a[2] * pu[6] + a[3] * pu[5] + a[4] * pu[4]
+ a[5] * pu[3] + a[6] * pu[2] + a[7] * pu[1] + a[8] - (unsigned) 5333333328u;
}
int convert5(const char* a) {
int val = 0;
for(size_t k =0;k <9;++k) {
val = (val << 3) + (val << 1) + (a[k]-'0');
}
return val;
}
const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};
int convert6u(const char *a) {
return a[0]*pu2[0] + a[1]*pu2[1] + a[2]*pu2[2] + a[3] * pu2[3] + a[4] * pu2[4] + a[5] * pu2[5] + a[6] * pu2[6] + a[7] * pu2[7] + a[8] - (unsigned) 5333333328u;
}
constexpr std::uint64_t zeros(char z) {
std::uint64_t result = 0;
for (int i = 0; i < sizeof(result); ++i) {
result = result*256 + z;
}
return result;
}
int convertX(const char *a) {
constexpr std::uint64_t offset = zeros('0');
constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;
std::uint64_t buffer;
std::memcpy(&buffer, a, sizeof(buffer));
const auto bytes = buffer - offset;
const auto b1 = (bytes & o1) >> 8;
const auto words = (bytes & ~o1) + 10*b1;
const auto w1 = (words & o2) >> 16;
const auto dwords = (words & ~o2) + 100*w1;
const auto d1 = (dwords & o3) >> 32;
const auto qwords = (dwords & ~o3) + 1000*d1;
const auto final = 10*static_cast<unsigned>(qwords) + (a[9] - '0');
return static_cast<int>(final);
}
//######################## ACCEPTED ANSWER
//########################
//########################
typedef struct { // for output into memory
alignas(16) unsigned hours;
unsigned minutes, seconds, nanos;
} hmsn;
void str2hmsn(hmsn *out, const char str[15]) // HHMMSSXXXXXXXXX 15 total, with 9-digit nanoseconds.
{ // 15 not including the terminating 0 (if any) which we don't read
//hmsn retval;
__m128i digs = _mm_loadu_si128((const __m128i*)str);
digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
__m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) )); // SSSE3 pairs of digits => 10s, 1s places.
__m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words); // SSE4.1 hours, minutes, seconds unpack from uint16_t to uint32
//_mm_storeu_si128((__m128i*)&retval, hms_unpacked); // store first 3 struct members; last to be written separately
_mm_storeu_si128((__m128i*)out, hms_unpacked);
// or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx
__m128i xwords = _mm_bsrli_si128(hms_x_words, 6); // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
// 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
// could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.
__m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1, 0,0,0,0)); // low/high uint32 chunks, discard the 9th x digit.
uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
uint32_t msd = 100*100 * (uint32_t)pair32; // most significant dword was at lower address (in printing order), so low half on little-endian x86. encourage compilers to use 32-bit operand-size for imul
uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0'); // total*10 + lowest digit
out->nanos = nanos;
//retval.nanos = nanos;
//return retval;
// returning the struct by value encourages compilers in the wrong direction
// into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}
hmsn mystruct;
int convertSIMD(const char* a)
{
str2hmsn(&mystruct, a);
return mystruct.nanos;
}
//########################
//########################
using ConvertFunc = int(const char*);
volatile int result = 0; // do something with the result of function to prevent unexpected optimization
void benchmark(ConvertFunc converter, string name, int numTest=1000) {
MyTimer timer;
const int N = 100000;
char *a = new char[9*N + 17];
int64_t runtime = 0;
for (int t=1; t<=numTest; t++) {
// change something to prevent unexpected optimization
for (int i=0; i<9*N; i++) a[i] = rand() % 10 + '0';
timer.startCounter();
for (int i=0; i<9*N; i+= 9) result = converter(a+i);
runtime += timer.getCounterNs();
}
cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
delete[] a;
}
int main() {
benchmark(convert1, "slow");
benchmark(convert2, "normal");
benchmark(convert3, "fast");
benchmark(convert4u, "unsigned");
benchmark(convert5, "shifting");
benchmark(convert6u, "reverse");
benchmark(convertX, "swar64");
benchmark(convertSIMD, "manualSIMD");
return 0;
}
I want to find the fastest way turn char a[9]
into an int
. The full problem is convert char a[15]
with form HHMMSSxxxxxxxxx timestamp to nanosecond, where ~50 bytes after the x
are allocated and can be safely read (but not write). We only care about the last 9 digits in this question.
Version 1 is basic, version 2,3 try to save some computation. I compile with -O3 flag, and storing power of 10s in array is fine because it is optimized away (checked using Godbolt).
How can I make this faster? Yes I know this sounds like premature optimization, but let's assume I need that final 2-3% boost.
Version 3 is decidedly faster, while version 2 is slower due to code size. But can we do better than version 3?
Invalid benchmarkEdit 2: the function can return unsigned int
instead of int
(i.e
unsigned convert1(char *a);
Edit 3: I noticed that the new code is an invalid benchmark, since convert(a) is only executed once. Using the original code, the difference is only ~1%.
Edit 4: New benchmark. using unsigned (convert4u, convert6u) is consistently 3-5% faster than using int. I will run a long (10+ min) benchmark to see if there's a winner. I've edited the code to use a new benchmark. It generates a large amount of data, then run the converter functions.
Edit 5: results: 4.19, 4.51, 3.82, 3.59, 7.64, 3.72
seconds. The unsigned version is fastest. Is it possible to use SIMD on just 9 bytes? If not, then I guess this is the best solution. I still hope there's a crazier solution, though
Edit 6: benchmark result on AMD Ryzen 4350G, gcc version 10.3, compile command gcc -o main main.cpp -std=c++17 -O3 -mavx -mavx2 -march=native
slow: 4.17794ns average
normal: 2.59945ns average
fast: 2.27917ns average
unsigned: 2.43814ns average
shifting: 4.72233ns average
reverse: 2.2274ns average
swar64: 2.17179ns average
manualSIMD: 1.55203ns average
The accepted answer does even more than the question require and compute HH/MM/SS/nanosec
, so it's even faster than this benchmark shows.
std::chrono::whatever
twice is probably more than the calculation itself. Unless you need to achieve a throughput of one trillion of integer conversions per second, I wouldn't worry about it. Anyway, using a global array for storing powers of 10 is pretty bad for performance. Just inline them in the formula. – Legendresult
and the compiler should optimize such thatconverter
isnt called at all – Lenientchar a[16]
with 9 real chars and 7 chars of padding? Dealing with 9 chars is possible but a pain. – Featherstonestd::from_chars()
? – Slipknot(a[8] - 48) * p[0]
with(a[8] - 48)
. – Aluminuma[i]
with random charactersrand()
instead of random digitsrand() % 10 + '0'
? Are you certain that is will not make a difference? – Aluminumvolatile
, then turning on optimizations will show you meaningful results – Lenientunsigned
math, then(a[0] - 48) * p[8] ... - 533333328
could becomea[0] * p[8] ... - (unsigned)5333333328u
. (Hope I got the constant correct) – Aluminumfor (int i=1; i<= 1000000; i++) result = converter(a);
loop and understand that the function always returns the same result for the same input. Note the instructionmov dword ptr [rip + result], eax
repeated 10 times in a row and then a jump right back to the start of the block to achieve a 1000000 repetition. The actual conversion is only performed once. (And the timing is x10 faster than with older compilers). – Drollerychar a[9];
-->char aa[9]; char * volatile a = aa;
would solve that so the callconverter(a);
is not assume-able to always result in the same value. – Aluminuma
than you actually use... – Drollerya[i] - 48
. Write that asa[i] - '0'
. That works for all character encodings. – Anatolaint convert_null(char* a) { return 0; }
. I doubt it is 0.0. – Aluminuma[i] - '0'
(anda[i] - 48
) is that the subtraction is anint
one. To encourage a lesser compiler into unsigned math, it may be useful to banish signed operations and usea[i] - (unsigned) '0'
. Still code should avoid magic numbers, even533333328
, which could be made into#define
as'0' + 10*'0' + 100*'0' ...
. – Aluminumch - '0'
is the canonical way to convert a digit character to the value that it represents.ch - 48
is simply wrong. Sure, if you have additional constraints, you might want to add some bells and whistles, but that has nothing to do with what I said. – Anatolaconst
pointers, all of the times dropped to approximately 400 ms. I haven't checked with Compiler Explorer, but my guess is that using non-const pointers blocked some compiler optimization because it couldn't prove there was no pointer aliasing. – SodomiteautoSIMD
is not an appropriate name for Adrian's answer, though: it's manual SWAR. (SIMD Within A Register). So maybe SWAR64bit as a name. – Lunitidalconst
makes a big difference, that makes me wonder if the benchmark might be getting partially optimized away, or if inlining details matter. But IIRC, 1.5 ns on OP's Zen (several clock cycles) seems reasonable throughput for mine, so if it's worse that might be an MSVC missed optimization. – Lunitidal-march=znver1
or/arch:AVX2
options. Both compile without inlining thebenchmark()
function calls or constant-propagating function pointers into it, so they don't have any opportunity to hoist redundant work out of the loop, or even hoist constants out of loops like you might hope a normal use-case would allow. So anyway, the benchmark looks fine here. Our actual functions compile basically the same too – Lunitidalconst
from theconst char*
args (godbolt.org/z/GhvY89hhq), but I didn't try actually running the code. (I don't have a convenient way to run MSVC output, and didn't port the asm to run on Linux.) The critical parts, thebenchmark
loop itself, and theconvert...
functions, look to have compiled about the same. You didn't maybe test a debug build in your version withoutconst
? Theconst
version still avoids any optimizations that would defeat the benchmark, same as non-const. – Lunitidal