Fast input/output in competitive programming

I have come across this particular snippet of code many times in solutions to competitive programming contests. I understand that its basic purpose is to beat time limits, but I want to understand it more deeply. I know that unistd.h gives access to system-call wrapper functions such as fork, pipe and I/O primitives (read, write, ...).

It would also be great if someone could explain it or point me to resources that can help me understand it further.

#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
class FastInput {
public:
    FastInput() {
        m_dataOffset = 0;
        m_dataSize = 0;
        m_v = 0x80000000;
    }
    uint32_t ReadNext() {
        if (m_dataOffset == m_dataSize) {
            int r = read(0, m_buffer, sizeof(m_buffer));
            if (r <= 0) return m_v;
            m_dataOffset = 0;
            m_dataSize = 0;
            int i = 0;
            if (m_buffer[0] < '0') {
                if (m_v != 0x80000000) {
                    m_data[m_dataSize++] = m_v;
                    m_v = 0x80000000;
                }
                for (; (i < r) && (m_buffer[i] < '0'); ++i);
            }
            for (; i < r;) {
                if (m_buffer[i] >= '0') {
                    m_v = m_v * 10 + m_buffer[i] - 48;
                    ++i;
                } else {
                    m_data[m_dataSize++] = m_v;
                    m_v = 0x80000000;
                    for (i = i + 1; (i < r) && (m_buffer[i] < '0'); ++i);
                }
            }
        }
        return m_data[m_dataOffset++];
    }
public:
    uint8_t m_buffer[32768];
    uint32_t m_data[16384];
    size_t m_dataOffset, m_dataSize;
    uint32_t m_v;
};
class FastOutput {
public:
    FastOutput() {
        m_dataOffset = 0;
    }
    ~FastOutput() {
    }
    void Flush() {
        if (m_dataOffset) {
            if (write(1, m_data, m_dataOffset));
            m_dataOffset = 0;
        }
    }
    void PrintUint(uint32_t v, char d) {
        if (m_dataOffset + 11 > sizeof(m_data)) Flush();
        if (v < 100000) {
            if (v < 1000) {
                if (v < 10) {
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 1;
                } else if (v < 100) {
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 2;
                } else {
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 3;
                }
            } else {
                if (v < 10000) {
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 4;
                } else {
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 5;
                }
            }
        } else {
            if (v < 100000000) {
                if (v < 1000000) {
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 6;
                } else if (v < 10000000) {
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 7;
                } else {
                    m_data[m_dataOffset + 7] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 8;
                }
            } else {
                if (v < 1000000000) {
                    m_data[m_dataOffset + 8] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 7] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 9;
                } else {
                    m_data[m_dataOffset + 9] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 8] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 7] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 10;
                }
            }
        }
        m_data[m_dataOffset++] = d;
    }
    void PrintChar(char d) {
        if (m_dataOffset + 1 > sizeof(m_data)) Flush();
        m_data[m_dataOffset++] = d;
    }
    void ReplaceChar(int offset, char d) {
        m_data[m_dataOffset + offset] = d;
    }
public:
    uint8_t m_data[32768];
    size_t m_dataOffset;
};

One more thing: Is it good practice to employ similar techniques in production level code?

Carven answered 17/3, 2012 at 11:29 Comment(10)
Have you really seen it that often? I have been competing for many years and have seen a lot of code, but never this ugliness. I wouldn't agree that competitive programming is about hacking the system: problems are always set up so that normal I/O functions and good algorithms will pass. Code like that is used only by "cheaters" who can't figure out the most optimized code but still try to pass (sometimes they succeed). As for production-level code, it is a matter of priorities: code legibility versus performance.Michiko
What aspect of it, specifically, are you having trouble understanding?Thereat
@BorisStrandjev I should have been more specific: I have seen it in CodeChef programs more often than on any other site.Carven
@OliCharlesworth How are the I/O functions used here, and what makes this faster? What is the logic behind it?Carven
@s2n: The author of this code is basically reimplementing buffered I/O, along with integer serialisation/deserialisation. He/she is assuming that this is faster than using the standard I/O wrappers and printf/atoi/etc. (A simplified sketch of this idea follows the comment thread.)Thereat
@OliCharlesworth I don't know much about these concepts (integer (de)serialization, I/O wrappers). Can you suggest some resources that can help me delve deeper into these techniques?Carven
If you need performance, use the stdio routines from C instead of the slow C++ iostream routines. In production code, I would avoid reinventing the wheel or any other over-optimization, since the probability of introducing hidden bugs is usually much higher than the performance gain.Dorfman
@Dorfman I don't know about using C routines for speed. Here's what I found when I compared cstdio to iostream.Spontaneity
@Spontaneity Of course, it depends heavily on the implementation and on the task you want to perform. For reading or writing simple strings, C++ may be faster, but as far as I know, fscanf and fprintf are faster if you need to read/write numbers or do formatting. I will do some tests and write about my results here. See e.g. #1042610Dorfman
I've now done some interesting tests (gcc 4.6.2, Linux 64-bit). Reading 10E7 floating-point numbers: iostream (>> parsing): 4.5 s; iostream (getline, strtod): 2.0 s; cstdio (fscanf): 2.9 s; cstdio (fgets, strtod): 2.2 s. So in the implementation I use, iostream is faster at reading lines, but very slow if anything needs to be converted or parsed.Dorfman
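
Below is a simplified sketch of the idea the comments describe: refill a large buffer with a single read() system call, then parse unsigned integers byte by byte. The names (NextByte, ReadUInt) and the 64 KiB buffer size are illustrative choices, not taken from the original snippet, and the carry-over state machine of FastInput (the 0x80000000 sentinel) is deliberately left out to keep the sketch short.

#include <unistd.h>   // read()
#include <cstdint>
#include <cstdio>

// One big read() call refills the buffer; parsing then works on plain bytes.
static uint8_t buf[1 << 16];
static ssize_t buf_len = 0, buf_pos = 0;

static int NextByte() {
    if (buf_pos == buf_len) {                   // buffer exhausted: refill
        buf_len = read(0, buf, sizeof(buf));    // one syscall per 64 KiB
        buf_pos = 0;
        if (buf_len <= 0) { buf_len = 0; return -1; }   // EOF or read error
    }
    return buf[buf_pos++];
}

// Skip non-digit separators, then accumulate digits into an unsigned value.
// Returns false when the input is exhausted.
static bool ReadUInt(uint32_t &out) {
    int c = NextByte();
    while (c != -1 && (c < '0' || c > '9')) c = NextByte();
    if (c == -1) return false;
    uint32_t v = 0;
    while (c >= '0' && c <= '9') {
        v = v * 10 + uint32_t(c - '0');
        c = NextByte();
    }
    out = v;
    return true;
}

int main() {
    uint32_t x, sum = 0;
    while (ReadUInt(x)) sum += x;   // e.g. sum every integer on stdin
    std::printf("%u\n", sum);
}

Most of the speedup over scanf/cin comes from avoiding per-token library overhead (locking, format-string parsing) rather than from the read() call itself, which stdio also buffers.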

Is it good practice to employ similar techniques in production level code?

No. Reimplementing the wheel leads to bugs. Bugs require extra development time and cost money.

can help me understand it further.

If you don't understand the code, the code is poorly written. Code is written by humans, and for humans. If another programmer doesn't understand code quickly, there may be a big problem. The rationale behind this thinking ("written for humans") is simple: development time costs a lot, and unreadable code increases development time.

The code fragment in question utilizes several bad coding practices:

  1. Hungarian notation (there is no need for it in a case-sensitive language, and especially not in C++);
  2. Cryptically short member names (can you tell what m_v means without reading the rest of the program, for example?);
  3. Hard-coded magic values (+ 48, + 11);
  4. (subjective) Mixing of signed/unsigned ints/chars (mingw/gcc will annoy the hell out of you while compiling);
  5. Copy-pasted code (the repeated v /= 10 blocks; C++ has macros/templates, damn it, so if you want to unroll a loop by hand, use them!);
  6. Needlessly deep if/else nesting.

Unless you want to become worse at programming, I'd advise to avoid trying to "understand" this code fragment. It is bad.

I seriously doubt that this particular design was the result of profiling. The most likely scenario is that some "genius" assumed his code fragment would outperform the built-in functions.

When you want performance, you follow this pattern:

  1. Write initial version.
  2. Repeat until performance gain is no longer worth it or until there's no solution:
    1. Do not make too many assumptions about what will improve performance. You're human, and humans make mistakes; by Murphy's law, your assumptions will be incorrect.
    2. Consider algorithmic optimization first.
    3. Run the code through a profiler (or at least time the suspect routine; see the sketch after this list).
    4. Locate the bottlenecks.
    5. Investigate the total performance gain you would get if the time spent in this particular routine were reduced to zero.
    6. If the gain is worth the time and cost, optimize the routine. Otherwise, ignore it.
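
A minimal sketch of the "time the suspect routine" step referenced above, assuming a hypothetical parse_input() routine (the scanf loop is only a placeholder for whatever the profiler actually points at):

#include <chrono>
#include <cstdio>

// Placeholder for the routine under suspicion; the scanf loop merely stands
// in for "read and parse the whole input".
static void parse_input() {
    unsigned x, sum = 0;
    while (std::scanf("%u", &x) == 1) sum += x;
    std::fprintf(stderr, "checksum: %u\n", sum);
}

int main() {
    using clock = std::chrono::steady_clock;
    const auto start = clock::now();
    parse_input();
    const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                        clock::now() - start).count();
    // Crude wall-clock timing, not a real profiler, but it answers the key
    // question: does this routine dominate the total run time at all?
    std::fprintf(stderr, "parse_input: %lld ms\n", static_cast<long long>(ms));
    return 0;
}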
Edbert answered 17/3, 2012 at 13:8 Comment(0)

Try this for faster I/O:

ios_base::sync_with_stdio(false); cin.tie(NULL);

sync_with_stdio sets whether the standard C++ streams are synchronized with the standard C streams after each input/output operation; by default, iostream objects and cstdio streams are synchronized. cin.tie(NULL) additionally unties cin from cout, so cin no longer flushes cout before every read.
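
A minimal sketch of where these two lines typically go; the sum-of-integers body is only for illustration:

#include <iostream>

int main() {
    // Decouple the C++ streams from C stdio and untie cin from cout,
    // so that cin no longer flushes cout before every read.
    std::ios_base::sync_with_stdio(false);
    std::cin.tie(nullptr);

    long long n, x, sum = 0;
    std::cin >> n;                                   // number of values
    for (long long i = 0; i < n && std::cin >> x; ++i)
        sum += x;
    std::cout << sum << '\n';   // '\n' instead of std::endl avoids extra flushes
    return 0;
}

One caveat: once synchronization is turned off, mixing scanf/printf with cin/cout on the same stream is no longer safe, because the two sets of buffers are no longer kept in step.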

Toxicity answered 24/8, 2015 at 17:50 Comment(0)

In the PrintUint function, he's basically just unrolling a loop by hand. Unrolling loops is sometimes a good thing to do; however, the compiler already does it, and will do it better than you most of the time.

To plug my favorite language feature: it would be better implemented using templates. A simple implementation (cleverer ones probably exist) would look like this:

// I'm sure the compiler can figure out the inline part, but I'll add it anyways
template<unsigned int N> 
inline void print_uint_inner(uint32_t v) {
    m_data[m_dataOffset + N] = v - v / 10 * 10 + 48;
    print_uint_inner<N-1>(v / 10);
}

// Explicitly specialize the base case to terminate the recursion.
template<>
inline void print_uint_inner<0>(uint32_t v) {
    m_data[m_dataOffset] = v - v / 10 * 10 + 48;
}

template<unsigned int N>
inline void print_uint_helper(uint32_t v) {
    print_uint_inner<N-1>(v);
    m_dataOffset += N;
}

// We could generate the compile-time binary search with templates too, rather than by hand.
void PrintUint(uint32_t v, char d) {
    if (m_dataOffset + 11 > sizeof(m_data)) Flush();
    if (v < 100000) {
        if (v < 1000) {
            if (v < 10) {
                print_uint_helper<1>(v);
            } else if (v < 100) {
                print_uint_helper<2>(v);
            } else {
                print_uint_helper<3>(v);
            }
        } else {
            if (v < 10000) {
                print_uint_helper<4>(v);
            } else {
                print_uint_helper<5>(v);
            }
        }
    } else {
        if (v < 100000000) {
            if (v < 1000000) {
                print_uint_helper<6>(v);
            } else if (v < 10000000) {
                print_uint_helper<7>(v);
            } else {
                print_uint_helper<8>(v);
            }
        } else {
            if (v < 1000000000) {
                print_uint_helper<9>(v);
            } else {
                print_uint_helper<10>(v);
            }
        }
    }
    m_data[m_dataOffset++] = d;
}

Is doing things like this good coding practice in general? Yes, but only if all of the following criteria are satisfied:

  • You've already written the obvious, easy-to-understand, simple version.
  • You've profiled your program, so that you know this stretch of code is costing enough time to be worth the effort.
  • You're willing to go through the extra work to ensure the more complex version is actually correct.
  • You've profiled the revised program, so that you know the rewrite actually improved your run-time.

Also, you should probably retain the ability to switch back to the simple version, either using compile-time constants or pre-processor directives (a minimal sketch of such a toggle follows the list below). This will be important for two reasons:

  • When you're debugging, the ability to switch back to the simple version will help to narrow down places where there could be problems
  • When you try running on a different computer (or even the same computer under different conditions), you may find the complicated version is no longer faster than the simple version.
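
For example, a toggle of the kind described above could look like the sketch below; the USE_FAST_PRINT macro and the PrintUintLine function are hypothetical names chosen for illustration, not part of the answer's code.

#include <cstdint>
#include <cstdio>

// Hypothetical toggle: build with -DUSE_FAST_PRINT for the hand-rolled
// formatter, or without it to fall back to the obvious printf version.
#ifdef USE_FAST_PRINT
void PrintUintLine(uint32_t v) {
    // Write digits into a small buffer from the end, then emit one fwrite.
    char buf[12];
    int pos = sizeof(buf);
    buf[--pos] = '\n';
    do { buf[--pos] = char('0' + v % 10); v /= 10; } while (v);
    std::fwrite(buf + pos, 1, sizeof(buf) - pos, stdout);
}
#else
void PrintUintLine(uint32_t v) {
    std::printf("%u\n", v);     // simple, obviously correct reference version
}
#endif

int main() {
    for (uint32_t i = 1; i <= 5; ++i)
        PrintUintLine(i * 123456789u);   // identical output either way
    return 0;
}

Keeping the plain version as the default also gives you a correctness oracle: run both builds on the same input and diff the output.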
Empery answered 17/3, 2012 at 12:44 Comment(1)
+1. The only bit I'd be cautious about is the idea of using the preprocessor to select between two versions of the code. In my experience, that's a sure-fire way to allow compile-time errors to creep into the unused code, unless you're regularly testing both variants of the code.Thereat
