Byte order mark screws up file reading in Java

I'm trying to read CSV files using Java. Some of the files may have a byte order mark at the beginning, but not all. When present, the byte order mark gets read along with the rest of the first line, causing problems with string comparisons.

Is there an easy way to skip the byte order mark when it is present?

Isolt answered 2/12, 2009 at 20:4 Comment(1)
maybe: rgagnon.com/javadetails/java-handle-utf8-file-with-bom.htmlLaliberte

EDIT: I've made a proper release on GitHub: https://github.com/gpakosz/UnicodeBOMInputStream


Here is a class I coded a while ago; I just edited the package name before pasting. Nothing special, it is quite similar to the solutions posted in Sun's bug database. Incorporate it into your code and you're fine.

/* ____________________________________________________________________________
 * 
 * File:    UnicodeBOMInputStream.java
 * Author:  Gregory Pakosz.
 * Date:    02 - November - 2005    
 * ____________________________________________________________________________
 */
package com.stackoverflow.answer;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * The <code>UnicodeBOMInputStream</code> class wraps any
 * <code>InputStream</code> and detects the presence of any Unicode BOM
 * (Byte Order Mark) at its beginning, as defined by
 * <a href="http://www.faqs.org/rfcs/rfc3629.html">RFC 3629 - UTF-8, a transformation format of ISO 10646</a>
 * 
 * <p>The
 * <a href="http://www.unicode.org/unicode/faq/utf_bom.html">Unicode FAQ</a>
 * defines 5 types of BOMs:<ul>
 * <li><pre>00 00 FE FF  = UTF-32, big-endian</pre></li>
 * <li><pre>FF FE 00 00  = UTF-32, little-endian</pre></li>
 * <li><pre>FE FF        = UTF-16, big-endian</pre></li>
 * <li><pre>FF FE        = UTF-16, little-endian</pre></li>
 * <li><pre>EF BB BF     = UTF-8</pre></li>
 * </ul></p>
 * 
 * <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
 * or not.
 * </p>
 * <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
 * wrapped <code>InputStream</code> object.</p>
 */
public class UnicodeBOMInputStream extends InputStream
{
  /**
   * Type safe enumeration class that describes the different types of Unicode
   * BOMs.
   */
  public static final class BOM
  {
    /**
     * NONE.
     */
    public static final BOM NONE = new BOM(new byte[]{},"NONE");

    /**
     * UTF-8 BOM (EF BB BF).
     */
    public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF,
                                                       (byte)0xBB,
                                                       (byte)0xBF},
                                            "UTF-8");

    /**
     * UTF-16, little-endian (FF FE).
     */
    public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF,
                                                            (byte)0xFE},
                                                "UTF-16 little-endian");

    /**
     * UTF-16, big-endian (FE FF).
     */
    public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE,
                                                            (byte)0xFF},
                                                "UTF-16 big-endian");

    /**
     * UTF-32, little-endian (FF FE 00 00).
     */
    public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF,
                                                            (byte)0xFE,
                                                            (byte)0x00,
                                                            (byte)0x00},
                                                "UTF-32 little-endian");

    /**
     * UTF-32, big-endian (00 00 FE FF).
     */
    public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00,
                                                            (byte)0x00,
                                                            (byte)0xFE,
                                                            (byte)0xFF},
                                                "UTF-32 big-endian");

    /**
     * Returns a <code>String</code> representation of this <code>BOM</code>
     * value.
     */
    public final String toString()
    {
      return description;
    }

    /**
     * Returns the bytes corresponding to this <code>BOM</code> value.
     */
    public final byte[] getBytes()
    {
      final int     length = bytes.length;
      final byte[]  result = new byte[length];

      // Make a defensive copy
      System.arraycopy(bytes,0,result,0,length);

      return result;
    }

    private BOM(final byte bom[], final String description)
    {
      assert(bom != null)               : "invalid BOM: null is not allowed";
      assert(description != null)       : "invalid description: null is not allowed";
      assert(description.length() != 0) : "invalid description: empty string is not allowed";

      this.bytes          = bom;
      this.description  = description;
    }

            final byte    bytes[];
    private final String  description;

  } // BOM

  /**
   * Constructs a new <code>UnicodeBOMInputStream</code> that wraps the
   * specified <code>InputStream</code>.
   * 
   * @param inputStream an <code>InputStream</code>.
   * 
   * @throws NullPointerException when <code>inputStream</code> is
   * <code>null</code>.
   * @throws IOException on reading from the specified <code>InputStream</code>
   * when trying to detect the Unicode BOM.
   */
  public UnicodeBOMInputStream(final InputStream inputStream) throws  NullPointerException,
                                                                      IOException
  {
    if (inputStream == null)
      throw new NullPointerException("invalid input stream: null is not allowed");

    in = new PushbackInputStream(inputStream,4);

    final byte  bom[] = new byte[4];
    final int   read  = in.read(bom);

    switch(read)
    {
      case 4:
        if ((bom[0] == (byte)0xFF) &&
            (bom[1] == (byte)0xFE) &&
            (bom[2] == (byte)0x00) &&
            (bom[3] == (byte)0x00))
        {
          this.bom = BOM.UTF_32_LE;
          break;
        }
        else
        if ((bom[0] == (byte)0x00) &&
            (bom[1] == (byte)0x00) &&
            (bom[2] == (byte)0xFE) &&
            (bom[3] == (byte)0xFF))
        {
          this.bom = BOM.UTF_32_BE;
          break;
        }
        // intentional fall-through: a 4-byte read may still hold a shorter BOM

      case 3:
        if ((bom[0] == (byte)0xEF) &&
            (bom[1] == (byte)0xBB) &&
            (bom[2] == (byte)0xBF))
        {
          this.bom = BOM.UTF_8;
          break;
        }
        // intentional fall-through: a 3-byte read may still hold a 2-byte BOM

      case 2:
        if ((bom[0] == (byte)0xFF) &&
            (bom[1] == (byte)0xFE))
        {
          this.bom = BOM.UTF_16_LE;
          break;
        }
        else
        if ((bom[0] == (byte)0xFE) &&
            (bom[1] == (byte)0xFF))
        {
          this.bom = BOM.UTF_16_BE;
          break;
        }
        // intentional fall-through: no BOM matched

      default:
        this.bom = BOM.NONE;
        break;
    }

    if (read > 0)
      in.unread(bom,0,read);
  }

  /**
   * Returns the <code>BOM</code> that was detected in the wrapped
   * <code>InputStream</code> object.
   * 
   * @return a <code>BOM</code> value.
   */
  public final BOM getBOM()
  {
    // BOM type is immutable.
    return bom;
  }

  /**
   * Skips the <code>BOM</code> that was found in the wrapped
   * <code>InputStream</code> object.
   * 
   * @return this <code>UnicodeBOMInputStream</code>.
   * 
   * @throws IOException when trying to skip the BOM from the wrapped
   * <code>InputStream</code> object.
   */
  public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
  {
    if (!skipped)
    {
      in.skip(bom.bytes.length);
      skipped = true;
    }
    return this;
  }

  /**
   * {@inheritDoc}
   */
  public int read() throws IOException
  {
    return in.read();
  }

  /**
   * {@inheritDoc}
   */
  public int read(final byte b[]) throws  IOException,
                                          NullPointerException
  {
    return in.read(b,0,b.length);
  }

  /**
   * {@inheritDoc}
   */
  public int read(final byte b[],
                  final int off,
                  final int len) throws IOException,
                                        NullPointerException
  {
    return in.read(b,off,len);
  }

  /**
   * {@inheritDoc}
   */
  public long skip(final long n) throws IOException
  {
    return in.skip(n);
  }

  /**
   * {@inheritDoc}
   */
  public int available() throws IOException
  {
    return in.available();
  }

  /**
   * {@inheritDoc}
   */
  public void close() throws IOException
  {
    in.close();
  }

  /**
   * {@inheritDoc}
   */
  public synchronized void mark(final int readlimit)
  {
    in.mark(readlimit);
  }

  /**
   * {@inheritDoc}
   */
  public synchronized void reset() throws IOException
  {
    in.reset();
  }

  /**
   * {@inheritDoc}
   */
  public boolean markSupported() 
  {
    return in.markSupported();
  }

  private final PushbackInputStream in;
  private final BOM                 bom;
  private       boolean             skipped = false;

} // UnicodeBOMInputStream

You use it this way:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;

public final class UnicodeBOMInputStreamUsage
{
  public static void main(final String[] args) throws Exception
  {
    FileInputStream fis = new FileInputStream("test/offending_bom.txt");
    UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);

    System.out.println("detected BOM: " + ubis.getBOM());

    System.out.print("Reading the content of the file without skipping the BOM: ");
    InputStreamReader isr = new InputStreamReader(ubis);
    BufferedReader br = new BufferedReader(isr);

    System.out.println(br.readLine());

    br.close();
    isr.close();
    ubis.close();
    fis.close();

    fis = new FileInputStream("test/offending_bom.txt");
    ubis = new UnicodeBOMInputStream(fis);
    isr = new InputStreamReader(ubis);
    br = new BufferedReader(isr);

    ubis.skipBOM();

    System.out.print("Reading the content of the file after skipping the BOM: ");
    System.out.println(br.readLine());

    br.close();
    isr.close();
    ubis.close();
    fis.close();
  }

} // UnicodeBOMInputStreamUsage
Melvamelvena answered 2/12, 2009 at 20:20 Comment(12)
Sorry for the long scrolling areas, too bad there is no attachment featureMelvamelvena
You're welcome. I remember I discovered this problem after editing XML configuration files with the most widespread XML editor in the world: Notepad.exe which inserts a BOM when saving back a file that contains Unicode characters :)Melvamelvena
Great decorator! It may be a good idea to delegate the BOM recognition to the BOM class, too, though. Chain of responsibility, someone?Scornik
yeah well the great chain of design patterns... ;)Melvamelvena
This should be in core Java APITequilater
Why not add javaCharset key as a member in UnicodeBOMInputStream whose value can be used to read file accordingly in InputStreamReader isr = new InputStreamReader(ubis, ubis.getCharsetKey()) where getCharsetKey return the Java.charset values as per the BOM found.Recipe
10 years have passed and I'm still receiving karma for this :D I'm looking at you Java!Melvamelvena
Using your code I managed to solve this BOM problem... but strangely your code as written didn't work for me: int read = in.read( bom ) in fact returned 4 for me, not 3, so everything went wrong, despite the fact that this was a UTF-8 BOM. I followed up with in.skip( 3 )... and was then able to SAX parse my file. Strange that no-one else has mentioned this. NB offending BOM characters: "". Also, int casts of the bytes at the start of the line came out at: "-17, -69, -65, 60, 63, 120, 109, 108, 32, 118, 101, 114, 115, 105, 111, ...". This might be of some help...Vanbuskirk
Which byte array, by the way, comes out as "EFBBBF3C3F786D6C2076657..." using, for example, org.apache.commons.codec.binary.Hex.encodeHexString( bytes )Vanbuskirk
Not worked for me!! I don't know why but maybe my file in in locale "fa_IR"Proprietress
I think Java is following a "lazy" pattern: it does things reactively, assuming all the data input is in good order and format. Same happens with Java way of reading keystore and build cert chain. Java: pillar in the "lazy" world, yes you are!Raincoat
Upvoted because answer provides history regarding why file input stream does not provide the option to discard BOM by default.Woodborer

The Apache Commons IO library has an InputStream that can detect and discard BOMs: BOMInputStream (javadoc):

BOMInputStream bomIn = new BOMInputStream(in);
int firstNonBOMByte = bomIn.read(); // Skips BOM
if (bomIn.hasBOM()) {
    // has a UTF-8 BOM
}

If you need to detect different encodings, it can also distinguish among the various byte-order marks, e.g. UTF-8 vs. UTF-16 big- and little-endian; details at the doc link above. You can then use the detected ByteOrderMark to choose a Charset to decode the stream. (There's probably a more streamlined way to do this if you need all of that functionality, such as the UnicodeReader in BalusC's answer.) Note that, in general, there's no reliable way to detect the encoding of arbitrary bytes, but if the stream starts with a BOM, that is a strong hint.
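
Under the hood, such detection is just a longest-match table over the first few bytes. Here is a minimal standalone sketch (the `BomDetect` helper below is hypothetical, not part of Commons IO); note that the 4-byte UTF-32LE mark must be tested before its 2-byte UTF-16LE prefix:

```java
import java.util.Optional;

public class BomDetect {
    /** Returns the charset name for a leading BOM, or empty if none. (Hypothetical helper.) */
    static Optional<String> charsetOf(byte[] head) {
        // Longest marks first: UTF-32LE (FF FE 00 00) starts with the UTF-16LE mark (FF FE).
        if (head.length >= 4 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE
                && head[2] == 0x00 && head[3] == 0x00) return Optional.of("UTF-32LE");
        if (head.length >= 4 && head[0] == 0x00 && head[1] == 0x00
                && head[2] == (byte) 0xFE && head[3] == (byte) 0xFF) return Optional.of("UTF-32BE");
        if (head.length >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
                && head[2] == (byte) 0xBF) return Optional.of("UTF-8");
        if (head.length >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) return Optional.of("UTF-16LE");
        if (head.length >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) return Optional.of("UTF-16BE");
        return Optional.empty();
    }

    public static void main(String[] args) {
        byte[] head = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'x'};
        System.out.println(charsetOf(head).orElse("none")); // UTF-8
    }
}
```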

Edit: If you need to detect the BOM in UTF-16, UTF-32, etc, then the constructor should be:

new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
        ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)

Upvote @martin-charlesworth's comment :)

Spirochete answered 12/9, 2011 at 15:28 Comment(5)
Just skips the BOM. Should be the perfect solution for 99% of the use cases.Brancusi
I used this answer successfully. However, I would respectfully add the boolean arg for specifying whether to include or exclude the BOM. Example: BOMInputStream bomIn = new BOMInputStream(in, false); // don't include the BOMCompote
I would also add that this only detects UTF-8 BOM. If you want to detect all the utf-X BOMs then you need to pass them in to the BOMInputStream constructor. BOMInputStream bomIn = new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE);Swinger
As for the comment of @KevinMeredith, I want to stress that the constructor with boolean is clearer, but the default constructor has already got rid of UTF-8 BOM, as the JavaDoc suggests: BOMInputStream(InputStream delegate) Constructs a new BOM InputStream that excludes a ByteOrderMark.UTF_8 BOM.Raincoat
Skipping solves most of my problems. If my file starts with a BOM UTF_16BE, can I create an InputReader by skipping the BOM and reading the file as UTF_8? So far it works, I want to understand if there is any edge case? Thanks in advance.Mulligrubs

A simpler solution:

import java.io.IOException;
import java.io.Reader;

public class BOMSkipper
{
    public static void skip(Reader reader) throws IOException
    {
        // The reader must support mark()/reset() (e.g. a BufferedReader).
        reader.mark(1);
        char[] possibleBOM = new char[1];
        reader.read(possibleBOM);

        if (possibleBOM[0] != '\ufeff')
        {
            reader.reset();
        }
    }
}

Usage sample:

BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), fileExpectedCharset));
BOMSkipper.skip(input);
//Now UTF prefix not present:
input.readLine();
...

It works with all 5 UTF encodings!
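
The reason it works: when the stream is decoded with the correct charset, every BOM variant, whatever its byte length, comes out as the single character U+FEFF. A quick stdlib check:

```java
import java.nio.charset.StandardCharsets;

public class BomAsChar {
    public static void main(String[] args) {
        // UTF-8 BOM bytes EF BB BF decode to the single character U+FEFF...
        byte[] utf8Bom = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
        String s1 = new String(utf8Bom, StandardCharsets.UTF_8);
        // ...and the 2-byte UTF-16LE BOM FF FE decodes to the same character.
        byte[] utf16leBom = {(byte) 0xFF, (byte) 0xFE};
        String s2 = new String(utf16leBom, StandardCharsets.UTF_16LE);
        System.out.println(s1.equals("\uFEFF") && s2.equals("\uFEFF")); // true
    }
}
```

So reading a single char and comparing it against '\ufeff' covers all encodings, provided the reader was opened with the file's actual charset.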

Indefeasible answered 16/8, 2013 at 13:57 Comment(8)
Very nice Andrei. But could you explain why it works? How does the pattern 0xFEFF successfully match UTF-8 files which seem to have a different pattern and 3 bytes instead of 2? And how can that pattern match both endians of UTF16 and UTF32?Knurl
As you can see - I don't use byte stream but character stream opened with expected charset. So if the first character from this stream is BOM - I skip it. BOM can have different byte representation for each encoding, but this is one character. Please read this article, it helps me: joelonsoftware.com/articles/Unicode.htmlIndefeasible
Nice solution, just make sure to check if file is not empty to avoid IOException in skip method before reading. You may do that by calling if (reader.ready()){ reader.read(possibleBOM) ... }Cittern
I see you have covered 0xFE 0xFF, which is the byte order mark for UTF-16BE. But what if the first 3 bytes are 0xEF 0xBB 0xBF (the byte order mark for UTF-8)? You claim that this works for all UTF formats. Which could be true (I haven't tested your code), but then how does it work?Flagman
See my answer to Vahid: I open not the byte stream but character stream and read one character from it. Never mind what utf encoding used for file - bom prefix can represented by different count of bytes, but in terms of characters it's just one characterIndefeasible
Great solution, Andrei! Thank you very much!Gyromagnetic
The mark() method mark a position in the input to which the stream can be "reset" by calling the reset() method. It needed for future reads, after BOM skippingIndefeasible
If you're trying to mark the second index, you should call it after reading.Jovian

Google Data API has an UnicodeReader which automagically detects the encoding.

You can use it instead of InputStreamReader. Here's a slightly compacted extract of its source, which is pretty straightforward:

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;

public class UnicodeReader extends Reader {
    private static final int BOM_SIZE = 4;
    private final InputStreamReader reader;

    /**
     * Construct UnicodeReader
     * @param in Input stream.
     * @param defaultEncoding Default encoding to be used if BOM is not found,
     * or <code>null</code> to use system default encoding.
     * @throws IOException If an I/O error occurs.
     */
    public UnicodeReader(InputStream in, String defaultEncoding) throws IOException {
        byte bom[] = new byte[BOM_SIZE];
        String encoding;
        int unread;
        PushbackInputStream pushbackStream = new PushbackInputStream(in, BOM_SIZE);
        int n = pushbackStream.read(bom, 0, bom.length);

        // Read ahead four bytes and check for BOM marks.
        // The 4-byte marks must be tested before their 2-byte prefixes:
        // UTF-32LE (FF FE 00 00) starts with the UTF-16LE mark (FF FE).
        if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            unread = n - 3;
        } else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
            encoding = "UTF-32BE";
            unread = n - 4;
        } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
            encoding = "UTF-32LE";
            unread = n - 4;
        } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            unread = n - 2;
        } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            unread = n - 2;
        } else {
            encoding = defaultEncoding;
            unread = n;
        }

        // Unread bytes if necessary and skip BOM marks.
        if (unread > 0) {
            pushbackStream.unread(bom, (n - unread), unread);
        } else if (unread < -1) {
            pushbackStream.unread(bom, 0, 0);
        }

        // Use given encoding.
        if (encoding == null) {
            reader = new InputStreamReader(pushbackStream);
        } else {
            reader = new InputStreamReader(pushbackStream, encoding);
        }
    }

    public String getEncoding() {
        return reader.getEncoding();
    }

    public int read(char[] cbuf, int off, int len) throws IOException {
        return reader.read(cbuf, off, len);
    }

    public void close() throws IOException {
        reader.close();
    }
}
Bema answered 2/12, 2009 at 20:29 Comment(4)
It seems that the link says Google Data API is deprecated ? Where should one look for the Google Data API now ?Headsail
@XichenLi: The GData API has been deprecated for its intended purpose. I didn't mean to suggest using the GData API directly (the OP isn't using any GData service), but rather to take over its source code as an example for your own implementation. That's also why I included it in my answer, ready for copy-paste.Bema
There's a bug in this. The UTF-32LE case is unreachable. In order for (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00) to be true, then the UTF-16LE case ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) would have already matched.Dubiety
Since this code is from the Google Data API, I posted issue 471 about it.Dubiety

The Apache Commons IO Library's BOMInputStream has already been mentioned by @rescdsk, but I did not see it mention how to get an InputStream without the BOM.

Here's how I did it in Scala.

 import java.io._
 val file = new File(path_to_xml_file_with_BOM)
 val fileInpStream = new FileInputStream(file)   
 val bomIn = new BOMInputStream(fileInpStream, 
         false); // false means don't include BOM
Compote answered 24/9, 2013 at 15:58 Comment(2)
Single arg constructor does it: public BOMInputStream(InputStream delegate) { this(delegate, false, ByteOrderMark.UTF_8); }. It excludes UTF-8 BOM by default.Osteoclasis
Good point, Vladimir. I see that in its docs - commons.apache.org/proper/commons-io/javadocs/api-2.2/org/…: Constructs a new BOM InputStream that excludes a ByteOrderMark.UTF_8 BOM.Compote

To simply remove the BOM characters from your file, I recommend using Apache Commons IO's BOMInputStream:

public BOMInputStream(InputStream delegate,
              boolean include)
Constructs a new BOM InputStream that detects a ByteOrderMark.UTF_8 and optionally includes it.
Parameters:
delegate - the InputStream to delegate to
include - true to include the UTF-8 BOM or false to exclude it

Set include to false and your BOM characters will be excluded.

Unorthodox answered 26/11, 2015 at 19:31 Comment(0)

Regrettably not. You'll have to identify and skip it yourself. This page details what you have to watch for. Also see this SO question for more details.
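
Part of the confusion is that Java's decoders are inconsistent about BOMs: the generic UTF-16 charset consumes a leading BOM (using it to select the byte order), while the UTF-8 decoder passes it through as U+FEFF. A small in-memory check (file I/O omitted for brevity):

```java
import java.nio.charset.StandardCharsets;

public class JavaBomBehavior {
    public static void main(String[] args) {
        // The generic UTF-16 decoder consumes the BOM and uses it for byte order...
        byte[] utf16 = {(byte) 0xFE, (byte) 0xFF, 0x00, 0x41};
        String a = new String(utf16, StandardCharsets.UTF_16);
        // ...but the UTF-8 decoder passes the BOM through as U+FEFF.
        byte[] utf8 = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 0x41};
        String b = new String(utf8, StandardCharsets.UTF_8);
        System.out.println(a);          // A
        System.out.println(b.length()); // 2 (U+FEFF followed by 'A')
    }
}
```

This asymmetry is why BOM trouble usually bites with UTF-8 files specifically.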

Fabiano answered 2/12, 2009 at 20:9 Comment(0)

Here is my code to read CSV files in most charsets. It should cover 99% of situations.

try (InputStream inputStream = new FileInputStream(csvFile)) {
    BOMInputStream bomInputStream = new BOMInputStream(inputStream,
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
            ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
    Charset charset;
    if (!bomInputStream.hasBOM()) charset = StandardCharsets.UTF_8;
    else if (bomInputStream.hasBOM(ByteOrderMark.UTF_8)) charset = StandardCharsets.UTF_8;
    else if (bomInputStream.hasBOM(ByteOrderMark.UTF_16LE)) charset = StandardCharsets.UTF_16LE;
    else if (bomInputStream.hasBOM(ByteOrderMark.UTF_16BE)) charset = StandardCharsets.UTF_16BE;
    else throw new Exception("The charset of the file " + csvFile + " is not supported.");

    try (Reader streamReader = new InputStreamReader(bomInputStream, charset);
         BufferedReader bufferedReader = new BufferedReader(streamReader)) {
        for (String line; (line = bufferedReader.readLine()) != null; ) {
            String[] columns = line.split(",");
            // read csv columns
        }
    }
}
Logical answered 6/2, 2021 at 2:35 Comment(1)
IMO the best answer (and coding example), except that it falls back to UTF-8 if there is no BOM. See also my general answer below.Amour

IMO none of the given answers is really satisfying. Just skipping the BOM and then reading the rest of the stream in the current platform's default encoding is definitely wrong. Remember: the platform defaults on Unix/Linux and Windows differ: the former is UTF-8, the latter is ANSI. Such a solution only works if the rest of the stream (after the BOM) contains only 7-bit ASCII characters (which, admittedly, is true of most programmer-oriented files such as configuration files). But as soon as there are non-ASCII characters, this approach will fail.
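
The failure mode is easy to demonstrate: the same non-ASCII bytes decode to different text under different charsets (a toy illustration, not tied to any particular answer above):

```java
import java.nio.charset.StandardCharsets;

public class WrongCharsetDemo {
    public static void main(String[] args) {
        // "é" encoded as UTF-8 is the two bytes C3 A9.
        byte[] utf8 = "é".getBytes(StandardCharsets.UTF_8);
        String right = new String(utf8, StandardCharsets.UTF_8);
        String wrong = new String(utf8, StandardCharsets.ISO_8859_1); // decodes byte-per-char
        System.out.println(right); // é
        System.out.println(wrong); // Ã©
    }
}
```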

That's why all Java classes/methods that convert byte arrays/streams to strings (and vice versa) take a second parameter indicating the encoding to be used (Reader, Writer, Scanner, String.getBytes(), etc.).

There are many character encodings in the world, not only the UTF family. And still, in the current year 2021, there are plenty of encoding problems between end-user applications, especially when they run on different platforms (iOS, Windows, Unix). All these problems exist only because the programmer was too lazy to learn how character encoding works.

Thus, it's an absolute MUST to first determine the encoding to be used, and then perform the string/stream conversion using that encoding. Consulting the respective specification(s) is the first step. Only if you cannot be sure which encoding you will encounter while reading a stream do you have to evaluate it yourself. But beware: such an evaluation will always be only a 'best guess'; there is no algorithm that can cover all possibilities.

In this sense, Lee's answer (and coding example) from Feb 6, 2021 is IMO the best one, except that it falls back to UTF-8 if there is no BOM.

Amour answered 4/12, 2021 at 12:54 Comment(0)

I had the same problem, and because I wasn't reading in a bunch of files I used a simpler solution. I think my encoding was UTF-8, because when I printed out the offending character with the help of this page: Get unicode value of a character, I found that it was \ufeff. I used the code System.out.println( "\\u" + Integer.toHexString(str.charAt(0) | 0x10000).substring(1) ); to print out the offending Unicode value.

Once I had the offending unicode value, I replaced it in the first line of my file before I went on reading. The business logic of that section:

String str = reader.readLine().trim();
str = str.replace("\ufeff", "");

This fixed my problem, and I was able to go on processing the file without issue. I added trim() in case of leading or trailing whitespace; use it or not, depending on your specific needs.
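
The one-liner can be checked in isolation (a string literal stands in below for the first line read from the file):

```java
import java.nio.charset.StandardCharsets;

public class StripBomDemo {
    public static void main(String[] args) {
        // A first line as it would be decoded from a UTF-8 file with a BOM.
        String line = new String(new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'a', 'b'},
                StandardCharsets.UTF_8);
        System.out.println(line.length());                       // 3 (U+FEFF + "ab")
        System.out.println(line.replace("\ufeff", "").length()); // 2
    }
}
```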

Eubanks answered 17/1, 2018 at 19:23 Comment(1)
That did not work for me, but I used .replaceFirst("\u00EF\u00BB\u00BF", "") which did.Allveta

Notepad++ is a good tool for converting between UTF-8 and UTF-8-BOM encodings.

https://notepad-plus-plus.org/downloads/

UTF8BOMTester.java

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

public class UTF8BOMTester {

    public static void main(String[] args) throws FileNotFoundException, IOException {
        File file = new File("test.txt");
        boolean same = UTF8BOMInputStream.isSameEncodingType(file);
        System.out.println(same);
        if (same) {
            UTF8BOMInputStream is = new UTF8BOMInputStream(file);
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            System.out.println(br.readLine());
        }
    }

    static void bytesPrint(byte[] b) {
        for (byte a : b)
            System.out.printf("%x ", a);
    }
}

UTF8BOMInputStream.java

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

public class UTF8BOMInputStream extends InputStream {

    byte[] SYMBLE_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
    FileInputStream fis;
    final boolean isSameEncodingType;

    // Note: the stream is only usable when the file actually starts with a
    // UTF-8 BOM; otherwise fis stays null (check isSameEncodingType first).
    public UTF8BOMInputStream(File file) throws IOException {
        FileInputStream fis = new FileInputStream(file);
        byte[] symble = new byte[3];
        fis.read(symble);
        bytesPrint(symble);
        isSameEncodingType = isSameEncodingType(symble);
        if (isSameEncodingType)
            this.fis = fis;
        else
            this.fis = null;
    }

    @Override
    public int read() throws IOException {
        return fis.read();
    }

    void bytesPrint(byte[] b) {
        for (byte a : b)
            System.out.printf("%x ", a);
    }

    boolean bytesCompare(byte[] a, byte[] b) {
        if (a.length != b.length)
            return false;
        for (int i = 0; i < a.length; i++) {
            if (a[i] != b[i])
                return false;
        }
        return true;
    }

    boolean isSameEncodingType(byte[] symble) {
        return bytesCompare(symble, SYMBLE_BOM);
    }

    public static boolean isSameEncodingType(File file) throws IOException {
        return (new UTF8BOMInputStream(file)).isSameEncodingType;
    }
}
Excrescence answered 2/9, 2020 at 12:0 Comment(0)
