summaryrefslogtreecommitdiff
path: root/src/com/nis/nmsclient/util/io
diff options
context:
space:
mode:
Diffstat (limited to 'src/com/nis/nmsclient/util/io')
-rw-r--r--src/com/nis/nmsclient/util/io/UnicodeInputStream.java118
-rw-r--r--src/com/nis/nmsclient/util/io/UnicodeReader.java120
2 files changed, 238 insertions, 0 deletions
diff --git a/src/com/nis/nmsclient/util/io/UnicodeInputStream.java b/src/com/nis/nmsclient/util/io/UnicodeInputStream.java
new file mode 100644
index 0000000..9e61d42
--- /dev/null
+++ b/src/com/nis/nmsclient/util/io/UnicodeInputStream.java
@@ -0,0 +1,118 @@
+package com.nis.nmsclient.util.io;
+/**
+ version: 1.1 / 2007-01-25
+ - changed BOM recognition ordering (longer boms first)
+
+ Original pseudocode : Thomas Weidenfeller
+ Implementation tweaked: Aki Nieminen
+
+ http://www.unicode.org/unicode/faq/utf_bom.html
+ BOMs in byte length ordering:
+ 00 00 FE FF = UTF-32, big-endian
+ FF FE 00 00 = UTF-32, little-endian
+ EF BB BF = UTF-8,
+ FE FF = UTF-16, big-endian
+ FF FE = UTF-16, little-endian
+
+ Win2k Notepad:
+ Unicode format = UTF-16LE
+ ***/
+
+import java.io.*;
+
+/**
+ * This inputstream will recognize unicode BOM marks and will skip bytes if
+ * getEncoding() method is called before any of the read(...) methods.
+ *
+ * Usage pattern: String enc = "ISO-8859-1"; // or NULL to use systemdefault
+ * FileInputStream fis = new FileInputStream(file); UnicodeInputStream uin = new
+ * UnicodeInputStream(fis, enc); enc = uin.getEncoding(); // check and skip
+ * possible BOM bytes InputStreamReader in; if (enc == null) in = new
+ * InputStreamReader(uin); else in = new InputStreamReader(uin, enc);
+ */
+public class UnicodeInputStream extends InputStream {
+ PushbackInputStream internalIn;
+ boolean isInited = false;
+ String defaultEnc;
+ String encoding;
+
+ private static final int BOM_SIZE = 4;
+
+ UnicodeInputStream(InputStream in, String defaultEnc) {
+ internalIn = new PushbackInputStream(in, BOM_SIZE);
+ this.defaultEnc = defaultEnc;
+ }
+
+ public String getDefaultEncoding() {
+ return defaultEnc;
+ }
+
+ public String getEncoding() {
+ if (!isInited) {
+ try {
+ init();
+ } catch (IOException ex) {
+ IllegalStateException ise = new IllegalStateException(
+ "Init method failed.");
+ ise.initCause(ise);
+ throw ise;
+ }
+ }
+ return encoding;
+ }
+
+ /**
+ * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
+ * back to the stream, only BOM bytes are skipped.
+ */
+ protected void init() throws IOException {
+ if (isInited)
+ return;
+
+ byte bom[] = new byte[BOM_SIZE];
+ int n, unread;
+ n = internalIn.read(bom, 0, bom.length);
+
+ if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
+ && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
+ encoding = "UTF-32BE";
+ unread = n - 4;
+ } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
+ && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
+ encoding = "UTF-32LE";
+ unread = n - 4;
+ } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
+ && (bom[2] == (byte) 0xBF)) {
+ encoding = "UTF-8";
+ unread = n - 3;
+ } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
+ encoding = "UTF-16BE";
+ unread = n - 2;
+ } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
+ encoding = "UTF-16LE";
+ unread = n - 2;
+ } else {
+ // Unicode BOM mark not found, unread all bytes
+ encoding = defaultEnc;
+ unread = n;
+ }
+ // System.out.println("read=" + n + ", unread=" + unread);
+
+ if (unread > 0)
+ internalIn.unread(bom, (n - unread), unread);
+
+ isInited = true;
+ }
+
+ public void close() throws IOException {
+ // init();
+ isInited = true;
+ internalIn.close();
+ }
+
+ public int read() throws IOException {
+ // init();
+ isInited = true;
+ return internalIn.read();
+ }
+}
diff --git a/src/com/nis/nmsclient/util/io/UnicodeReader.java b/src/com/nis/nmsclient/util/io/UnicodeReader.java
new file mode 100644
index 0000000..c4169b5
--- /dev/null
+++ b/src/com/nis/nmsclient/util/io/UnicodeReader.java
@@ -0,0 +1,120 @@
+package com.nis.nmsclient.util.io;
+/**
+ version: 1.1 / 2007-01-25
+ - changed BOM recognition ordering (longer boms first)
+
+ Original pseudocode : Thomas Weidenfeller
+ Implementation tweaked: Aki Nieminen
+
+ http://www.unicode.org/unicode/faq/utf_bom.html
+ BOMs:
+ 00 00 FE FF = UTF-32, big-endian
+ FF FE 00 00 = UTF-32, little-endian
+ EF BB BF = UTF-8,
+ FE FF = UTF-16, big-endian
+ FF FE = UTF-16, little-endian
+
+ Win2k Notepad:
+ Unicode format = UTF-16LE
+ ***/
+
+import java.io.*;
+
+/**
+ * Generic unicode textreader, which will use BOM mark to identify the encoding
+ * to be used. If BOM is not found then use a given default or system encoding.
+ */
+public class UnicodeReader extends Reader {
+ PushbackInputStream internalIn;
+ InputStreamReader internalIn2 = null;
+ String defaultEnc;
+
+ private static final int BOM_SIZE = 4;
+
+ /**
+ *
+ * @param in
+ * inputstream to be read
+ * @param defaultEnc
+ * default encoding if stream does not have BOM marker. Give NULL
+ * to use system-level default.
+ */
+ public UnicodeReader(InputStream in, String defaultEnc) {
+ internalIn = new PushbackInputStream(in, BOM_SIZE);
+ this.defaultEnc = defaultEnc;
+ }
+
+ public String getDefaultEncoding() {
+ return defaultEnc;
+ }
+
+ /**
+ * Get stream encoding or NULL if stream is uninitialized. Call init() or
+ * read() method to initialize it.
+ */
+ public String getEncoding() {
+ if (internalIn2 == null)
+ return null;
+ return internalIn2.getEncoding();
+ }
+
+ /**
+ * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
+ * back to the stream, only BOM bytes are skipped.
+ */
+ protected void init() throws IOException {
+ if (internalIn2 != null)
+ return;
+
+ String encoding;
+ byte bom[] = new byte[BOM_SIZE];
+ int n, unread;
+ n = internalIn.read(bom, 0, bom.length);
+
+ if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
+ && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
+ encoding = "UTF-32BE";
+ unread = n - 4;
+ } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
+ && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
+ encoding = "UTF-32LE";
+ unread = n - 4;
+ } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
+ && (bom[2] == (byte) 0xBF)) {
+ encoding = "UTF-8";
+ unread = n - 3;
+ } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
+ encoding = "UTF-16BE";
+ unread = n - 2;
+ } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
+ encoding = "UTF-16LE";
+ unread = n - 2;
+ } else {
+ // Unicode BOM mark not found, unread all bytes
+ encoding = defaultEnc;
+ unread = n;
+ }
+ // System.out.println("read=" + n + ", unread=" + unread);
+
+ if (unread > 0)
+ internalIn.unread(bom, (n - unread), unread);
+
+ // Use given encoding
+ if (encoding == null) {
+ internalIn2 = new InputStreamReader(internalIn);
+ } else {
+ internalIn2 = new InputStreamReader(internalIn, encoding);
+ }
+ }
+
+ public void close() throws IOException {
+ init();
+ internalIn2.close();
+ }
+
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ init();
+ return internalIn2.read(cbuf, off, len);
+ }
+
+} \ No newline at end of file