I just follow mike.dld's answer in this question, and add the printf
support for the UTF-8
string.
As mkluwe mentioned in his answer that by default, printf
function will output to the console one by one byte, while the console can't handle single byte correctly. My method is quite simple, I use the snprintf
function to print the whole content to a internal string buffer, and then dump the buffer to std::cout
.
Here is the full testing code:
#include <iostream>
#include <locale>
#include <windows.h>
#include <cstdlib>
using namespace std;
// https://mcmap.net/q/15777/-convert-wstring-to-string-encoded-in-utf-8
#include <codecvt>
#include <string>
// convert UTF-8 string to wstring
std::wstring utf8_to_wstring (const std::string& str)
{
std::wstring_convert<std::codecvt_utf8<wchar_t>> myconv;
return myconv.from_bytes(str);
}
// convert wstring to UTF-8 string
std::string wstring_to_utf8 (const std::wstring& str)
{
std::wstring_convert<std::codecvt_utf8<wchar_t>> myconv;
return myconv.to_bytes(str);
}
// https://mcmap.net/q/15587/-utf-8-output-on-windows-console
// mike.dld's answer
class ConsoleStreamBufWin32 : public std::streambuf
{
public:
ConsoleStreamBufWin32(DWORD handleId, bool isInput);
protected:
// std::basic_streambuf
virtual std::streambuf* setbuf(char_type* s, std::streamsize n);
virtual int sync();
virtual int_type underflow();
virtual int_type overflow(int_type c = traits_type::eof());
private:
HANDLE const m_handle;
bool const m_isInput;
std::string m_buffer;
};
ConsoleStreamBufWin32::ConsoleStreamBufWin32(DWORD handleId, bool isInput) :
m_handle(::GetStdHandle(handleId)),
m_isInput(isInput),
m_buffer()
{
if (m_isInput)
{
setg(0, 0, 0);
}
}
std::streambuf* ConsoleStreamBufWin32::setbuf(char_type* /*s*/, std::streamsize /*n*/)
{
return 0;
}
int ConsoleStreamBufWin32::sync()
{
if (m_isInput)
{
::FlushConsoleInputBuffer(m_handle);
setg(0, 0, 0);
}
else
{
if (m_buffer.empty())
{
return 0;
}
std::wstring const wideBuffer = utf8_to_wstring(m_buffer);
DWORD writtenSize;
::WriteConsoleW(m_handle, wideBuffer.c_str(), wideBuffer.size(), &writtenSize, NULL);
}
m_buffer.clear();
return 0;
}
ConsoleStreamBufWin32::int_type ConsoleStreamBufWin32::underflow()
{
if (!m_isInput)
{
return traits_type::eof();
}
if (gptr() >= egptr())
{
wchar_t wideBuffer[128];
DWORD readSize;
if (!::ReadConsoleW(m_handle, wideBuffer, ARRAYSIZE(wideBuffer) - 1, &readSize, NULL))
{
return traits_type::eof();
}
wideBuffer[readSize] = L'\0';
m_buffer = wstring_to_utf8(wideBuffer);
setg(&m_buffer[0], &m_buffer[0], &m_buffer[0] + m_buffer.size());
if (gptr() >= egptr())
{
return traits_type::eof();
}
}
return sgetc();
}
ConsoleStreamBufWin32::int_type ConsoleStreamBufWin32::overflow(int_type c)
{
if (m_isInput)
{
return traits_type::eof();
}
m_buffer += traits_type::to_char_type(c);
return traits_type::not_eof(c);
}
template<typename StreamT>
inline void FixStdStream(DWORD handleId, bool isInput, StreamT& stream)
{
if (::GetFileType(::GetStdHandle(handleId)) == FILE_TYPE_CHAR)
{
stream.rdbuf(new ConsoleStreamBufWin32(handleId, isInput));
}
}
// some code are from this blog
// https://blog.csdn.net/witton/article/details/108087135
#define printf(fmt, ...) __fprint(stdout, fmt, ##__VA_ARGS__ )
int __vfprint(FILE *fp, const char *fmt, va_list va)
{
// https://mcmap.net/q/15780/-which-of-sprintf-snprintf-is-more-secure
size_t nbytes = snprintf(NULL, 0, fmt, va) + 1; /* +1 for the '\0' */
char *str = (char*)malloc(nbytes);
snprintf(str, nbytes, fmt, va);
std::cout << str;
free(str);
return nbytes;
}
int __fprint(FILE *fp, const char *fmt, ...)
{
va_list va;
va_start(va, fmt);
int n = __vfprint(fp, fmt, va);
va_end(va);
return n;
}
int main()
{
FixStdStream(STD_INPUT_HANDLE, true, std::cin);
FixStdStream(STD_OUTPUT_HANDLE, false, std::cout);
FixStdStream(STD_ERROR_HANDLE, false, std::cerr);
// ...
std::cout << "\xc3\xbc" << std::endl;
printf("\xc3\xbc");
// ...
return 0;
}
The source code is saved in UTF-8
format, and build under Msys2's GCC and run under Windows 7 64bit. Here is the result
ü
ü
fputs
) and I can type UTF-8 encoded files with thetype
command (after having donechcp 65001
). Thus I thought it can handle this encoding… – Soviet