001 /*
002 * $Id: CharsetToolkit.java,v 1.2 2004/07/11 19:41:25 glaforge Exp $
003 *
004 * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
005 *
006 * Redistribution and use of this software and associated documentation
007 * ("Software"), with or without modification, are permitted provided that the
008 * following conditions are met:
009 * 1. Redistributions of source code must retain copyright statements and
010 * notices. Redistributions must also contain a copy of this document.
011 * 2. Redistributions in binary form must reproduce the above copyright
012 * notice, this list of conditions and the following disclaimer in the
013 * documentation and/or other materials provided with the distribution.
014 * 3. The name "groovy" must not be used to endorse or promote products
015 * derived from this Software without prior written permission of The Codehaus.
016 * For written permission, please contact info@codehaus.org.
017 * 4. Products derived from this Software may not be called "groovy" nor may
018 * "groovy" appear in their names without prior written permission of The
019 * Codehaus. "groovy" is a registered trademark of The Codehaus.
020 * 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
021 *
022 * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
023 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
024 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
025 * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
026 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
027 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
028 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
029 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
030 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
031 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
032 * DAMAGE.
033 *
034 */
035
036 package groovy.util;
037
038 import java.io.*;
039 import java.nio.charset.Charset;
040 import java.util.*;
041
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>Only the first 4KB of the file are inspected, which is usually sufficient to be able
 * to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of bytes read from the beginning of the file to guess its encoding. */
    private static final int BUFFER_SIZE = 4096;

    /** The first bytes of the file (at most {@link #BUFFER_SIZE}), read once by the constructor. */
    private final byte[] buffer;
    /** Charset returned when the content looks like an 8-bit encoding (or plain ASCII with enforce8Bit). */
    private Charset defaultCharset;
    /** Lazily computed result of {@link #guessEncoding()}; <code>null</code> until first requested. */
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads up to the first 4096 bytes of the file into an internal buffer and closes the file again.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        byte[] bytes = new byte[BUFFER_SIZE];
        int total = 0;
        InputStream input = new FileInputStream(file);
        try {
            // a single read() is allowed to return fewer bytes than requested,
            // so keep reading until the buffer is full or the end of the file is reached
            while (total < bytes.length) {
                int read = input.read(bytes, total, bytes.length - total);
                if (read == -1) {
                    break;
                }
                total += read;
            }
        }
        finally {
            // the stream is only needed to fill the buffer: never leave it open
            input.close();
        }
        if (total == bytes.length) {
            this.buffer = bytes;
        }
        else {
            // shrink the buffer to the bytes actually read (possibly none for an empty file)
            byte[] bytesToGuess = new byte[total];
            System.arraycopy(bytes, 0, bytesToGuess, 0, total);
            this.buffer = bytesToGuess;
        }
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * <p>The guessed charset is cached by {@link #getCharset()}, so this method has to be
     * called before the first call to <code>getCharset()</code> or <code>getReader()</code>
     * to have an effect on the result.</p>
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> restores the default
     * charset of the system.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        }
        else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Returns the charset guessed for the file. The guess is computed on the first call
     * and the same value is returned by subsequent calls.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when the file looks like an 8-bit encoded file.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.
     * If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is inspected. A multi-byte sequence cut off by the end of the buffer
     * is accepted only when the buffer is full (the file may continue beyond the buffer);
     * when the buffer is smaller than its maximum size it holds the whole file, and an
     * incomplete sequence then means the content is not UTF-8.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        int length = buffer.length;
        // a full buffer may end in the middle of a character, a partially filled one may not
        boolean mayBeCutOff = (length == BUFFER_SIZE);

        int i = 0;
        while (i < length && validU8Char) {
            byte lead = buffer[i];
            if (lead >= 0) {
                // plain 7-bit character
                i++;
                continue;
            }
            // a high order bit was encountered, thus the encoding is not US-ASCII
            // it may be either an 8-bit encoding or UTF-8
            highOrderBit = true;

            int expected = continuationBytesFor(lead);
            if (expected == 0) {
                // a stray continuation byte, 0xFE or 0xFF: cannot start a UTF-8 character
                validU8Char = false;
                break;
            }
            // every following byte of the sequence that is present in the buffer
            // must be a continuation byte of the form 10xxxxxx
            int available = Math.min(expected, length - i - 1);
            for (int k = 1; k <= available; k++) {
                if (!isContinuationChar(buffer[i + k])) {
                    validU8Char = false;
                    break;
                }
            }
            if (available < expected && !mayBeCutOff) {
                // the file ends in the middle of a sequence: not a valid UTF-8 construct
                validU8Char = false;
            }
            i += expected + 1;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Gives the number of continuation bytes that must follow the given first byte
     * of a multi-byte sequence.
     *
     * @param lead a byte with its most significant bit set.
     * @return a number between 1 and 5, or 0 if the byte cannot start a multi-byte sequence.
     */
    private static int continuationBytesFor(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return (b & 0xE0) == 0xC0;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return (b & 0xF0) == 0xE0;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return (b & 0xF8) == 0xF0;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return (b & 0xFC) == 0xF8;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return (b & 0xFE) == 0xFC;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        return buffer.length >= 3
                && buffer[0] == (byte) 0xEF
                && buffer[1] == (byte) 0xBB
                && buffer[2] == (byte) 0xBF;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian (bytes 0xFF 0xFE).
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        return buffer.length >= 2
                && buffer[0] == (byte) 0xFF
                && buffer[1] == (byte) 0xFE;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian (bytes 0xFE 0xFF).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        return buffer.length >= 2
                && buffer[0] == (byte) 0xFE
                && buffer[1] == (byte) 0xFF;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>. When the file starts with a Byte Order Marker, the first
     * character is consumed so that the caller does not see the marker.
     *
     * <p>The caller is responsible for closing the returned reader.</p>
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader =
                new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // skip the character the BOM is decoded to
                reader.read();
            }
            catch (IOException ignored) {
                // Deliberately best effort: this method may only throw FileNotFoundException,
                // and a reader that cannot be read from will report the failure again
                // on the caller's first read.
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        // the cast is required by pre-generics compilers and harmless for newer ones
        return (Charset[]) Charset.availableCharsets().values().toArray(new Charset[0]);
    }
}