Characters printed differently in R.app/RStudio/reprex
With R 4.4.0 on a MacBook, and nothing locale- or encoding-related in .Rprofile or .Renviron, Sys.getlocale() in a fresh session returns "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8" in both R.app and RStudio.

KOI8-R is a Cyrillic encoding that uses one byte per character. When using reprex from RStudio, this is my output, which matches my expectations:

Note: this is using the reprex addin, which runs reprex::reprex(), which by default takes its input code from the clipboard.
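
(For reference, the addin call is roughly equivalent to the following sketch; passing the code as an expression is just an alternative to the default clipboard input.)

reprex::reprex({
  ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
  Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
  ch256
})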

ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
#> [1] "ru_RU.KOI8-R"
ch256
#>   [1] ""     "\001" "\002" "\003" "\004" "\005" "\006" "\a"   "\b"   "\t"  
#>  [11] "\n"   "\v"   "\f"   "\r"   "\016" "\017" "\020" "\021" "\022" "\023"
#>  [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
#>  [31] "\036" "\037" " "    "!"    "\""   "#"    "$"    "%"    "&"    "'"   
#>  [41] "("    ")"    "*"    "+"    ","    "-"    "."    "/"    "0"    "1"   
#>  [51] "2"    "3"    "4"    "5"    "6"    "7"    "8"    "9"    ":"    ";"   
#>  [61] "<"    "="    ">"    "?"    "@"    "A"    "B"    "C"    "D"    "E"   
#>  [71] "F"    "G"    "H"    "I"    "J"    "K"    "L"    "M"    "N"    "O"   
#>  [81] "P"    "Q"    "R"    "S"    "T"    "U"    "V"    "W"    "X"    "Y"   
#>  [91] "Z"    "["    "\\"   "]"    "^"    "_"    "`"    "a"    "b"    "c"   
#> [101] "d"    "e"    "f"    "g"    "h"    "i"    "j"    "k"    "l"    "m"   
#> [111] "n"    "o"    "p"    "q"    "r"    "s"    "t"    "u"    "v"    "w"   
#> [121] "x"    "y"    "z"    "{"    "|"    "}"    "~"    "\177" "─" "│"
#> [131] "┌" "┐" "└" "┘" "├" "┤" "┬" "┴" "┼" "▀"
#> [141] "▄" "█" "▌" "▐" "░" "▒" "▓" "⌠" "■" "∙"
#> [151] "√" "≈" "≤" "≥" " " "⌡" "°" "²" "·" "÷"
#> [161] "═" "║" "╒" "ё" "╓" "╔" "╕" "╖" "╗" "╘"
#> [171] "╙" "╚" "╛" "╜" "╝" "╞" "╟" "╠" "╡" "Ё"
#> [181] "╢" "╣" "╤" "╥" "╦" "╧" "╨" "╩" "╪" "╫"
#> [191] "╬" "©" "ю" "а" "б" "ц" "д" "е" "ф" "г"
#> [201] "х" "и" "й" "к" "л" "м" "н" "о" "п" "я"
#> [211] "р" "с" "т" "у" "ж" "в" "ь" "ы" "з" "ш"
#> [221] "э" "щ" "ч" "ъ" "Ю" "А" "Б" "Ц" "Д" "Е"
#> [231] "Ф" "Г" "Х" "И" "Й" "К" "Л" "М" "Н" "О"
#> [241] "П" "Я" "Р" "С" "Т" "У" "Ж" "В" "Ь" "Ы"
#> [251] "З" "Ш" "Э" "Щ" "Ч" "Ъ"

However, running the same code in my RStudio console prints something different (fake reprex built by copying and pasting the output):

ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
ch256
#> [1] ""     "\001" "\002" "\003" "\004" "\005" "\006" "\a"   "\b"   "\t"  
#> [11] "\n"   "\v"   "\f"   "\r"   "\016" "\017" "\020" "\021" "\022" "\023"
#> [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
#> [31] "\036" "\037" " "    "!"    "\""   "#"    "$"    "%"    "&"    "'"   
#> [41] "("    ")"    "*"    "+"    ","    "-"    "."    "/"    "0"    "1"   
#> [51] "2"    "3"    "4"    "5"    "6"    "7"    "8"    "9"    ":"    ";"   
#> [61] "<"    "="    ">"    "?"    "@"    "A"    "B"    "C"    "D"    "E"   
#> [71] "F"    "G"    "H"    "I"    "J"    "K"    "L"    "M"    "N"    "O"   
#> [81] "P"    "Q"    "R"    "S"    "T"    "U"    "V"    "W"    "X"    "Y"   
#> [91] "Z"    "["    "\\"   "]"    "^"    "_"    "`"    "a"    "b"    "c"   
#> [101] "d"    "e"    "f"    "g"    "h"    "i"    "j"    "k"    "l"    "m"   
#> [111] "n"    "o"    "p"    "q"    "r"    "s"    "t"    "u"    "v"    "w"   
#> [121] "x"    "y"    "z"    "{"    "|"    "}"    "~"    "\177" "�" "�"
#> [131] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [141] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [151] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [161] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [171] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [181] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [191] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [201] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [211] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [221] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [231] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [241] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [251] "�" "�" "�" "�" "�" "�"

In the R for macOS GUI (R.app) it's different again: the encoding appears to be ignored and latin1-looking characters are printed (fake reprex built by copying and pasting the output):

ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
#> [1] "ru_RU.KOI8-R"
ch256
#> [1] ""     "\001" "\002" "\003" "\004" "\005" "\006" "\a"   "\b"   "\t"  
#> [11] "\n"   "\v"   "\f"   "\r"   "\016" "\017" "\020" "\021" "\022" "\023"
#> [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
#> [31] "\036" "\037" " "    "!"    "\""   "#"    "$"    "%"    "&"    "'"   
#> [41] "("    ")"    "*"    "+"    ","    "-"    "."    "/"    "0"    "1"   
#> [51] "2"    "3"    "4"    "5"    "6"    "7"    "8"    "9"    ":"    ";"   
#> [61] "<"    "="    ">"    "?"    "@"    "A"    "B"    "C"    "D"    "E"   
#> [71] "F"    "G"    "H"    "I"    "J"    "K"    "L"    "M"    "N"    "O"   
#> [81] "P"    "Q"    "R"    "S"    "T"    "U"    "V"    "W"    "X"    "Y"   
#> [91] "Z"    "["    "\\"   "]"    "^"    "_"    "`"    "a"    "b"    "c"   
#> [101] "d"    "e"    "f"    "g"    "h"    "i"    "j"    "k"    "l"    "m"   
#> [111] "n"    "o"    "p"    "q"    "r"    "s"    "t"    "u"    "v"    "w"   
#> [121] "x"    "y"    "z"    "{"    "|"    "}"    "~"    "\177" "Ä" "Å"
#> [131] "Ç" "É" "Ñ" "Ö" "Ü" "á" "à" "â" "ä" "ã"
#> [141] "å" "ç" "é" "è" "ê" "ë" "í" "ì" "î" "ï"
#> [151] "ñ" "ó" "ò" "ô" "ö" "õ" "ú" "ù" "û" "ü"
#> [161] "†" "°" "¢" "£" "§" "•" "¶" "ß" "®" "�"
#> [171] "™" "´" "¨" "≠" "Æ" "Ø" "∞" "±" "≤" "≥"
#> [181] "¥" "µ" "∂" "∑" "∏" "π" "∫" "ª" "º" "Ω"
#> [191] "æ" "ø" "¿" "¡" "¬" "√" "ƒ" "≈" "∆" "«"
#> [201] "»" "…" " " "À" "Ã" "Õ" "Œ" "œ" "–" "—"
#> [211] "“" "”" "‘" "’" "÷" "◊" "ÿ" "Ÿ" "⁄" "€"
#> [221] "‹" "›" "fi" "fl" "‡" "·" "‚" "„" "‰" "Â"
#> [231] "Ê" "Á" "Ë" "È" "Í" "Î" "Ï" "Ì" "Ó" "Ô"
#> [241] "" "Ò" "Ú" "Û" "Ù" "ı" "ˆ" "˜" "¯" "˘"
#> [251] "˙" "˚" "¸" "˝" "˛" "ˇ"

In fact I can reproduce the above with the ISO8859-1 (latin1) encoding as well: this time R.app prints those characters correctly, like reprex, but the RStudio output is still wrong.
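
(A sketch of that latin1 variant, assuming the locale name is en_US.ISO8859-1 on this machine; check locale -a for the exact spelling.)

ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "en_US.ISO8859-1")
ch256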

I know that making everything UTF-8 fixes everything, but I really want to understand:

  • What's happening here?
  • Is it possible to get the correct output everywhere?
  • Is this output different on different systems?
Birddog answered 2/5, 2024 at 7:48 Comment(16)
I'll put a 200 bounty on this as soon as I am allowed to (already answered or not)Birddog
I had something similar on Windows at one point where one or more of the systems refused to change the locale. If I remember correctly, it also differed between running R itself and running R from the command line. Never figured out why.Conception
You have not stated exactly how you have used package reprex. What was the function call?Lempres
It could also help to know the locale settings of the processes that launched R and RStudio. Are they the same or different?Lempres
Confirmation that no startup files (.Rprofile, .Renviron, ...) were used would also be helpful.Lempres
Finally, if you are using Terminal on macOS, then have you considered its encoding? You probably want to set it to KOI8-R in Terminal > Settings > Profiles > Advanced > International. I'm not sure if RStudio has a similar setting for its console ...Lempres
Thanks Mikael, I've updated the question. I'm not using the terminal but the native R console or RStudio, and I haven't found such options for those. Shouldn't my first call Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R") take precedence over those anyway?Birddog
Well, it is documented in help("Sys.setlocale") that attempts to change the character set of an already launched R process "may not work and are likely to lead to some confusion". It depends on the platform and on the application embedding R. You have to read the corresponding documentation.Lempres
For RStudio, that's here. For R.app (what I think you mean by "native R console"), that's here. For RStudio, it seems like you can specify the encoding of the text editor but not that of the console which seems stuck on UTF-8 (happy to be wrong though, because I'm not sure). For R.app, I think that you are seeing the application fall back to the Mac OS Roman (as documented), not to ISO Latin 1.Lempres
My guess is that, somewhere in the quite long reprex call stack, the output of the code snippet is translated from KOI8-R to UTF-8. The translated output displays correctly in UTF-8, which is the encoding that rmarkdown assumes when rendering to HTML. How it knows to translate from KOI8-R and not some other encoding is beyond me ...Lempres
Thanks this is very helpful! I've edited my question to reference R.app correctly.Birddog
My wider goal is to be able to reliably test a package with different locales. The R.app doc says: "Please note that you must always use ‘.UTF-8’ version of the locale, otherwise R.APP will not work properly.". The RStudio doc says: "If you call Sys.setlocale with "LC_CTYPE" or "LC_ALL" to change the system locale while RStudio is running, you may run into some minor issues as RStudio assumes the system encoding doesn't change". Thus it appears that it is just not possible to reliably test different native encodings from a single machine, is that right?Birddog
The reprex situation hints at why my testthat snapshot tests show a different encoding from what I see in my console.Birddog
It really depends on what you are trying to test. If you care about the raw bytes in a string, then you can do byte comparison. If you care about display, then maybe the thing to do is to translate the string (using iconv or maybe enc2native) to the system encoding so that it displays correctly in the system encoding. But there can be portability issues there too. IMO you do not need to be testing display extensively if that is not the purpose of your software.Lempres
Do you want to post an answer? Otherwise I'm happy to self answer using your insights and references.Birddog
I'll try to write something up over the weekend.Lempres

I'm not a macOS or locale expert by any means, but this issue seems to boil down to the documented limitations of Sys.setlocale (a simple wrapper around setlocale from the Standard C Library; see man setlocale). help("Sys.setlocale") says:

Attempts to change the character set (by Sys.setlocale("LC_CTYPE", ), if that implies a different character set) during a session may not work and are likely to lead to some confusion.

IIUC that is because the application embedding R, which handles the output stream, may not be written to honor changes to the character set by the embedded R. So you really need to be reading the documentation of the application embedding R.

The R for macOS FAQ says:

By default R.APP uses UTF-8 for newly created documents and for the console. When opening new documents R.APP assumes UTF-8 and only if the document violates UTF-8 rules, it will try to fallback to legacy encoding, usually Mac Roman.

Indeed, your output from R.APP seems consistent with Mac OS Roman.
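
One quick way to convince yourself of that (a sketch; "macintosh" is the alias for Mac OS Roman that my iconv accepts, yours may spell it differently) is to decode the high bytes as Mac OS Roman and compare with the R.app listing:

# Decode bytes 128:255 as if they were Mac OS Roman ("macintosh" alias may vary)
iconv(sapply(128:255, function(x) rawToChar(as.raw(x))),
      from = "macintosh", to = "UTF-8")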

This Posit Support article says:

If you call Sys.setlocale with "LC_CTYPE" or "LC_ALL" to change the system locale while RStudio is running, you may run into some minor issues as RStudio assumes the system encoding doesn't change.

suggesting that the character set used by the RStudio console is fixed at startup based on the environment at that time. Well, if we dig around in the RStudio sources, we find that it effectively requires UTF-8 even if the environment indicates a different, macOS-supported character set. (And, on my macOS, locale -a indicates that KOI8-R is supported.)

That leaves Terminal.app, which I tend to use instead of R.app because I tend to want a shell. The encoding there can be set under Settings > Profiles > Advanced > International. If that is set to UTF-8, then we see output similar to RStudio's. But if it is set to KOI8-R, then we see the "expected" output for bytes 0 through 255. Nice.

To answer some of the remaining questions:

How do you get "expected" output under every application?

If you know that the source encoding is KOI8-R and that the system encoding is UTF-8, then use iconv to translate the strings to the system encoding instead of trying to change the character set to match the source encoding.

iconv(ch256, from = "KOI8-R", to = "UTF-8")

If you don't know that the system encoding is UTF-8, then you could try using to = l10n_info()[["codeset"]]. I'm not sure if that is general or portable, though ...
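
Spelled out, that might look like this (still a sketch; I have not checked that the codeset names returned by l10n_info are always valid iconv targets):

iconv(ch256, from = "KOI8-R", to = l10n_info()[["codeset"]])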

Why are bytes 128 through 255 rendered as "�"?

Under section "Single-byte locales", help("print.default") says:

If a non-printable character is encountered during output, it is represented as one of the ANSI escape sequences (\a, \b, \f, \n, \r, \t, \v, \\ and \0: see Quotes), or failing that as a 3-digit octal code: for example the UK currency pound sign in the C locale (if implemented correctly) is printed as \243. Which characters are non-printable depends on the locale.

Under section "Unicode and other multi-byte locales", it says:

It is possible to have a character string in a character vector that is not valid in the current locale. If a byte is encountered that is not part of a valid character it is printed in hex in the form \xab and this is repeated until the start of a valid character. (This will rapidly recover from minor errors in UTF-8.)

You told R to use a single-byte encoding, namely KOI8-R. In that encoding, bytes 128 through 255 are printable characters, so print.default does not attempt to format them as 3-digit octal escapes. It leaves the original, single bytes alone. But those bytes do not represent valid characters in the UTF-8 encoding used by the application embedding R, so they are ultimately rendered as the standard multi-byte replacement character "�". You do not see the hex "\xab" because (again) R thinks that you are using a single-byte encoding. It has no way of knowing that the application embedding R is actually using a multi-byte encoding, where "\xab" would be more informative than "�".
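
A minimal sketch of that difference (how the lone byte is rendered in the first case depends on the application displaying the output):

x <- rawToChar(as.raw(0xC1))            # the byte for "а" in KOI8-R
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
print(x)   # R emits the raw byte; a UTF-8 console shows "�"
Sys.setlocale("LC_CTYPE", "en_US.UTF-8")
print(x)   # R now treats the byte as invalid and prints the hex escape "\xc1"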

Why does reprex produce "expected" output?

I don't really know. reprex uses rmarkdown to render output, and rmarkdown seems to use UTF-8 unconditionally. My guess is that somewhere in the reprex call stack the output containing bytes 128 through 255 is translated from KOI8-R to UTF-8. But how would rmarkdown know to translate from KOI8-R? Does it somehow record the encoding in use before the R subprocess terminates? The messages emitted by this augmented code block are suggestive ...

reprex::reprex({
                   Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
                   sapply(0:255, function(x) rawToChar(as.raw(x)))
                   Sys.setlocale("LC_CTYPE", "ru_RU.UTF-8")
                   sapply(0:255, function(x) rawToChar(as.raw(x)))
               },
               std_out_err = TRUE)
Quitting from lines  at lines 20-24 [unnamed-chunk-2] (soot-cub_reprex.spin.Rmd)
Error in gsub("[\n]{2,}$", "\n", x) : input string 1 is invalid
In addition: Warning messages:
1: In grepl("^\\s*$", x) :
  unable to translate '  [1] ""     "\001" "\002" "\003" "\004" "\005" "\006" "\a"   "\b"   "\t"  
 [11] "\n"   "\v"   "\f"   "\r"   "\016" "\017" "\020" "\021" "\022" "\023"
 [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
 [31] "\036" "\037" " " ...' to a wide string
2: In grepl("^\\s*$", x) : input string 1 is invalid
3: In gsub("[\n]{2,}$", "\n", x) :
  unable to translate '  [1] ""     "\001" "\002" "\003" "\004" "\005" "\006" "\a"   "\b"   "\t"  
 [11] "\n"   "\v"   "\f"   "\r"   "\016" "\017" "\020" "\021" "\022" "\023"
 [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
 [31] "\036" "\037" " " ...' to a wide string

Maybe one of the functions in the stack should be passing useBytes = TRUE to grep and friends. Or maybe not. It would be nice to see the traceback ...
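
A tiny sketch of the useBytes idea (the exact warning or error text varies by R version):

x <- rawToChar(as.raw(0xC1))          # a lone byte that is invalid in UTF-8
grepl("^\\s*$", x)                    # warns that the string cannot be translated
grepl("^\\s*$", x, useBytes = TRUE)   # FALSE: matched byte-wise, no translation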

Lempres answered 14/5, 2024 at 17:48 Comment(0)
