diff options
Diffstat (limited to 'src/com/nis/nmsclient/util/io')
| -rw-r--r-- | src/com/nis/nmsclient/util/io/UnicodeInputStream.java | 118 | ||||
| -rw-r--r-- | src/com/nis/nmsclient/util/io/UnicodeReader.java | 120 |
2 files changed, 238 insertions, 0 deletions
/*
 * UnicodeInputStream.java
 * (src/com/nis/nmsclient/util/io, package com.nis.nmsclient.util.io)
 *
 * version: 1.1 / 2007-01-25
 * - changed BOM recognition ordering (longer boms first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs in byte length ordering:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8,
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * Input stream that recognizes a Unicode byte order mark (BOM) and skips it,
 * provided {@link #getEncoding()} is called before any {@code read(...)} or
 * {@link #close()} call. Once a read or close has happened without a prior
 * {@code getEncoding()}, no BOM detection is performed and all bytes are
 * passed through unchanged.
 *
 * <p>Usage pattern:
 * <pre>
 *   String enc = "ISO-8859-1"; // or null to use the system default
 *   FileInputStream fis = new FileInputStream(file);
 *   UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 *   enc = uin.getEncoding(); // check and skip possible BOM bytes
 *   InputStreamReader in;
 *   if (enc == null) in = new InputStreamReader(uin);
 *   else in = new InputStreamReader(uin, enc);
 * </pre>
 */
public class UnicodeInputStream extends InputStream {
    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /** Length of the longest BOM (UTF-32); also the push-back capacity. */
    private static final int BOM_SIZE = 4;

    /**
     * Wraps the given stream.
     *
     * @param in
     *            stream to read from
     * @param defaultEnc
     *            encoding reported by {@link #getEncoding()} when no BOM is
     *            found; may be null
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the default encoding given to the constructor (may be null)
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding of the stream, running BOM detection first if
     * nothing has initialized the stream yet.
     *
     * @return the encoding named by the BOM, or the default encoding when no
     *         BOM was found; null if the stream was read or closed before
     *         detection ran
     * @throws IllegalStateException
     *             if reading the look-ahead bytes fails; the original
     *             {@link IOException} is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Attach the real failure. Calling initCause with the
                // exception itself is rejected by Throwable
                // (self-causation) and would mask the problem.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks for BOM marks. Extra bytes are
     * unread back to the stream, only BOM bytes are skipped.
     *
     * @throws IOException
     *             if reading from or pushing back to the stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = readAhead(bom);
        int bomLength;

        // Every test is guarded by the number of bytes really read, so the
        // zero padding of a short look-ahead is never taken for BOM bytes
        // (a lone FF FE must not be reported as UTF-32LE).
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        isInited = true;
    }

    /**
     * Fills the buffer from the wrapped stream, repeating the read until the
     * buffer is full or the end of the stream is reached, because a single
     * read may deliver fewer bytes than requested.
     *
     * @return number of bytes stored in the buffer (0 at end of stream)
     */
    private int readAhead(byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = internalIn.read(buffer, total, buffer.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Closes the wrapped stream. BOM detection is not run; the stream is
     * simply marked as initialized.
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte from the wrapped stream. BOM detection is not run
     * here; if {@link #getEncoding()} was not called earlier, BOM bytes are
     * returned like any other data.
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }
}
/*
 * UnicodeReader.java
 * (src/com/nis/nmsclient/util/io, package com.nis.nmsclient.util.io)
 *
 * version: 1.1 / 2007-01-25
 * - changed BOM recognition ordering (longer boms first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8,
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;

/**
 * Generic unicode textreader, which will use BOM mark to identify the encoding
 * to be used. If BOM is not found then use a given default or system encoding.
 */
public class UnicodeReader extends Reader {
    PushbackInputStream internalIn;
    InputStreamReader internalIn2 = null;
    String defaultEnc;

    /** Length of the longest BOM (UTF-32); also the push-back capacity. */
    private static final int BOM_SIZE = 4;

    /**
     * Wraps the given stream; nothing is read until {@link #init()},
     * {@link #read(char[], int, int)} or {@link #close()} is called.
     *
     * @param in
     *            inputstream to be read
     * @param defaultEnc
     *            default encoding if stream does not have BOM marker. Give
     *            NULL to use system-level default.
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the default encoding given to the constructor (may be null)
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized. Call init() or
     * read() method to initialize it.
     *
     * @return whatever the underlying {@link InputStreamReader#getEncoding()}
     *         reports, or null before initialization
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks for BOM marks. Extra bytes are
     * unread back to the stream, only BOM bytes are skipped. Afterwards the
     * decoding reader is created with the detected (or default) encoding.
     *
     * @throws IOException
     *             if reading from the stream fails or the encoding is not
     *             supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = readAhead(bom);
        String encoding;
        int bomLength;

        // Every test is guarded by the number of bytes really read, so the
        // zero padding of a short look-ahead is never taken for BOM bytes
        // (a lone FF FE must not be reported as UTF-32LE).
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        // Use given encoding
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Fills the buffer from the wrapped stream, repeating the read until the
     * buffer is full or the end of the stream is reached, because a single
     * read may deliver fewer bytes than requested.
     *
     * @return number of bytes stored in the buffer (0 at end of stream)
     */
    private int readAhead(byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = internalIn.read(buffer, total, buffer.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Initializes the reader if necessary and closes it. If initialization
     * fails, the wrapped stream is closed directly so it does not leak, and
     * the initialization failure is rethrown.
     */
    @Override
    public void close() throws IOException {
        try {
            init();
        } catch (IOException ex) {
            // No decoding reader exists yet, so close the raw stream here.
            internalIn.close();
            throw ex;
        }
        internalIn2.close();
    }

    /**
     * Initializes the reader if necessary, then delegates to the decoding
     * reader.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }

}
\ No newline at end of file |
