/*
 * version: 1.1 / 2007-01-25
 *  - changed BOM recognition ordering (longer BOMs first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;

/**
 * Generic Unicode text reader which uses a leading byte order mark (BOM) to
 * identify the encoding of the stream. If no BOM is found, the given default
 * encoding (or the platform default when that is {@code null}) is used.
 *
 * <p>The BOM itself is consumed and never returned to the caller; all other
 * bytes that were read ahead during detection are pushed back.
 */
public class UnicodeReader extends Reader {
    PushbackInputStream internalIn;
    InputStreamReader internalIn2 = null;
    String defaultEnc;

    /** Length in bytes of the longest BOM that is recognised (UTF-32). */
    private static final int BOM_SIZE = 4;

    /**
     * Creates a reader over {@code in}. No bytes are read until the first
     * call to {@link #init()} or {@link #read(char[], int, int)}.
     *
     * @param in
     *            input stream to be read
     * @param defaultEnc
     *            encoding to use if the stream does not start with a BOM;
     *            {@code null} selects the platform default encoding
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the fallback encoding passed to the constructor.
     *
     * @return the default encoding name, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding actually used for decoding.
     *
     * @return the encoding name as reported by the underlying
     *         {@link InputStreamReader}, or {@code null} if the stream has
     *         not been initialised yet (call {@link #init()} or
     *         {@code read()} first)
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
     * bytes are skipped; any further bytes read ahead are pushed back onto
     * the stream. Calling this method more than once has no further effect.
     *
     * <p>This blocks until four bytes are available or the end of the stream
     * is reached, because a shorter prefix cannot reliably tell a UTF-16LE
     * BOM ({@code FF FE}) from a UTF-32LE BOM ({@code FF FE 00 00}).
     *
     * @throws IOException
     *             if reading from the underlying stream fails or the selected
     *             encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = readAhead(bom);

        // Longer BOMs must be tested first: the UTF-16LE mark is a prefix of
        // the UTF-32LE mark.
        String encoding;
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // No BOM found: fall back to the default and keep every byte.
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back everything that was read ahead but is not part of the BOM.
        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Fills {@code buf} from the underlying stream, retrying after short
     * reads, until the buffer is full or the stream ends.
     *
     * @return the number of bytes actually stored in {@code buf} (0 at EOF)
     */
    private int readAhead(byte[] buf) throws IOException {
        int total = 0;
        while (total < buf.length) {
            int r = internalIn.read(buf, total, buf.length - total);
            if (r <= 0) {
                // End of stream (or a non-conforming zero-length read).
                break;
            }
            total += r;
        }
        return total;
    }

    /**
     * Tells whether the first {@code length} valid bytes of {@code data}
     * begin with the byte sequence {@code mark}.
     */
    private static boolean startsWith(byte[] data, int length, int... mark) {
        if (length < mark.length) {
            return false;
        }
        for (int i = 0; i < mark.length; i++) {
            if (data[i] != (byte) mark[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the reader and the underlying stream. If the reader was never
     * initialised, the stream is closed directly without reading from it.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into {@code cbuf}, performing BOM detection
     * on the first call.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset in {@code cbuf} at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}