summaryrefslogtreecommitdiff
path: root/src/com/nis/nmsclient/util/io/UnicodeInputStream.java
diff options
context:
space:
mode:
authorchenjinsong <[email protected]>2018-09-27 16:11:54 +0800
committerchenjinsong <[email protected]>2018-09-27 16:11:54 +0800
commit56d71f261a8bd6031e47e2bf80867049a2aa13da (patch)
treef09257b2143782a333a9eda3395137837d9bdad1 /src/com/nis/nmsclient/util/io/UnicodeInputStream.java
initial commit
Diffstat (limited to 'src/com/nis/nmsclient/util/io/UnicodeInputStream.java')
-rw-r--r--src/com/nis/nmsclient/util/io/UnicodeInputStream.java118
1 files changed, 118 insertions, 0 deletions
diff --git a/src/com/nis/nmsclient/util/io/UnicodeInputStream.java b/src/com/nis/nmsclient/util/io/UnicodeInputStream.java
new file mode 100644
index 0000000..9e61d42
--- /dev/null
+++ b/src/com/nis/nmsclient/util/io/UnicodeInputStream.java
@@ -0,0 +1,118 @@
+package com.nis.nmsclient.util.io;
+/**
+ version: 1.1 / 2007-01-25
+ - changed BOM recognition ordering (longer boms first)
+
+ Original pseudocode : Thomas Weidenfeller
+ Implementation tweaked: Aki Nieminen
+
+ http://www.unicode.org/unicode/faq/utf_bom.html
+ BOMs in byte length ordering:
+ 00 00 FE FF = UTF-32, big-endian
+ FF FE 00 00 = UTF-32, little-endian
+ EF BB BF = UTF-8,
+ FE FF = UTF-16, big-endian
+ FF FE = UTF-16, little-endian
+
+ Win2k Notepad:
+ Unicode format = UTF-16LE
+ ***/
+
+import java.io.*;
+
+/**
+ * An {@link InputStream} wrapper that recognizes a leading Unicode byte order
+ * mark (BOM) and removes it from the stream.
+ *
+ * <p>Detection only happens when {@link #getEncoding()} is called before the
+ * first read. Once any read method (or {@link #close()}) has been called,
+ * detection is disabled: no bytes are skipped and {@link #getEncoding()}
+ * returns whatever was determined up to that point, which is {@code null} if
+ * detection never ran.
+ *
+ * <p>Usage pattern:
+ * <pre>
+ * String enc = "ISO-8859-1"; // or null to use the system default
+ * FileInputStream fis = new FileInputStream(file);
+ * UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
+ * enc = uin.getEncoding(); // check for and skip possible BOM bytes
+ * InputStreamReader in;
+ * if (enc == null)
+ *     in = new InputStreamReader(uin);
+ * else
+ *     in = new InputStreamReader(uin, enc);
+ * </pre>
+ */
+public class UnicodeInputStream extends InputStream {
+    PushbackInputStream internalIn;
+    boolean isInited = false;
+    String defaultEnc;
+    String encoding;
+
+    /** Length of the longest recognized BOM, and size of the pushback buffer. */
+    private static final int BOM_SIZE = 4;
+
+    // Recognized BOMs. The 4-byte marks must be tested before the 2-byte ones
+    // because FF FE is a prefix of the UTF-32LE mark.
+    private static final byte[] BOM_UTF32_BE = { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF };
+    private static final byte[] BOM_UTF32_LE = { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 };
+    private static final byte[] BOM_UTF8 = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
+    private static final byte[] BOM_UTF16_BE = { (byte) 0xFE, (byte) 0xFF };
+    private static final byte[] BOM_UTF16_LE = { (byte) 0xFF, (byte) 0xFE };
+
+    /**
+     * Wraps the given stream.
+     *
+     * @param in         stream to read from
+     * @param defaultEnc encoding reported by {@link #getEncoding()} when no
+     *                   BOM is found; may be {@code null}
+     */
+    UnicodeInputStream(InputStream in, String defaultEnc) {
+        internalIn = new PushbackInputStream(in, BOM_SIZE);
+        this.defaultEnc = defaultEnc;
+    }
+
+    /**
+     * @return the default encoding passed to the constructor
+     */
+    public String getDefaultEncoding() {
+        return defaultEnc;
+    }
+
+    /**
+     * Runs BOM detection if it has not run (or been disabled) yet and returns
+     * the resulting encoding name.
+     *
+     * @return the encoding indicated by the BOM, the default encoding if no
+     *         BOM was found, or {@code null} if detection was disabled by an
+     *         earlier read or close
+     * @throws IllegalStateException if reading the first bytes fails; the
+     *                               underlying {@link IOException} is the cause
+     */
+    public String getEncoding() {
+        if (!isInited) {
+            try {
+                init();
+            } catch (IOException ex) {
+                // Chain the real I/O failure. Passing the new exception to its
+                // own initCause() is rejected with IllegalArgumentException.
+                throw new IllegalStateException("Init method failed.", ex);
+            }
+        }
+        return encoding;
+    }
+
+    /**
+     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
+     * bytes are consumed; any other bytes read ahead are pushed back.
+     *
+     * <p>Reading continues until four bytes are available or the end of the
+     * stream is reached, so this call may block on a stream that delivers
+     * fewer than four bytes without signalling end of stream.
+     *
+     * @throws IOException if reading from or pushing back to the stream fails
+     */
+    protected void init() throws IOException {
+        if (isInited)
+            return;
+
+        byte[] bom = new byte[BOM_SIZE];
+
+        // A single read() may legally return fewer bytes than requested, so
+        // keep reading until the buffer is full or the stream ends.
+        int n = 0;
+        while (n < bom.length) {
+            int r = internalIn.read(bom, n, bom.length - n);
+            if (r < 0)
+                break;
+            n += r;
+        }
+
+        // Number of leading bytes that belong to the BOM and must be dropped.
+        int skip;
+        if (startsWith(bom, n, BOM_UTF32_BE)) {
+            encoding = "UTF-32BE";
+            skip = BOM_UTF32_BE.length;
+        } else if (startsWith(bom, n, BOM_UTF32_LE)) {
+            encoding = "UTF-32LE";
+            skip = BOM_UTF32_LE.length;
+        } else if (startsWith(bom, n, BOM_UTF8)) {
+            encoding = "UTF-8";
+            skip = BOM_UTF8.length;
+        } else if (startsWith(bom, n, BOM_UTF16_BE)) {
+            encoding = "UTF-16BE";
+            skip = BOM_UTF16_BE.length;
+        } else if (startsWith(bom, n, BOM_UTF16_LE)) {
+            encoding = "UTF-16LE";
+            skip = BOM_UTF16_LE.length;
+        } else {
+            // Unicode BOM mark not found, unread all bytes
+            encoding = defaultEnc;
+            skip = 0;
+        }
+
+        if (n > skip)
+            internalIn.unread(bom, skip, n - skip);
+
+        isInited = true;
+    }
+
+    /**
+     * Tells whether the first {@code len} bytes of {@code buf} begin with
+     * {@code prefix}. Bytes at index {@code len} and beyond were never read
+     * from the stream and are therefore not compared.
+     */
+    private static boolean startsWith(byte[] buf, int len, byte[] prefix) {
+        if (len < prefix.length)
+            return false;
+        for (int i = 0; i < prefix.length; i++) {
+            if (buf[i] != prefix[i])
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Closes the wrapped stream. BOM detection is disabled afterwards.
+     *
+     * @throws IOException if closing the wrapped stream fails
+     */
+    public void close() throws IOException {
+        isInited = true;
+        internalIn.close();
+    }
+
+    /**
+     * Reads one byte. If called before {@link #getEncoding()}, BOM detection
+     * is disabled and any BOM bytes are returned as ordinary data.
+     *
+     * @return the next byte as a value from 0 to 255, or -1 at end of stream
+     * @throws IOException if the wrapped stream fails
+     */
+    public int read() throws IOException {
+        isInited = true;
+        return internalIn.read();
+    }
+
+    /**
+     * Reads up to {@code len} bytes in one call to the wrapped stream instead
+     * of falling back to the inherited byte-at-a-time loop. Like
+     * {@link #read()}, it disables BOM detection when called before
+     * {@link #getEncoding()}. Fewer than {@code len} bytes may be returned.
+     *
+     * @param b   destination buffer
+     * @param off offset in {@code b} at which to store the first byte
+     * @param len maximum number of bytes to read
+     * @return the number of bytes read, or -1 at end of stream
+     * @throws IOException if the wrapped stream fails
+     */
+    public int read(byte[] b, int off, int len) throws IOException {
+        isInited = true;
+        return internalIn.read(b, off, len);
+    }
+}