Return the Unicode char which is coded in the bytes at position 0.
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 *
 */

/**
 * Various string manipulation methods that are more efficient than chaining
 * string operations: all is done in the same buffer without creating a bunch of
 * string objects.
 *
 * @author <a href="mailto:dev@labs.apache.org">Dungeon Project</a>
 */
public class Main
{
    /** Bit that is set in the lead byte of every multi-byte sequence. */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask / expected value pairs identifying the lead byte of an N-byte sequence. */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;

    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;

    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;

    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;

    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** Low six bits of a continuation byte (10zz-zzzz) carry the payload. */
    private static final int CONTINUATION_PAYLOAD_MASK = 0x3F;

    /** Value returned when nothing can be decoded: ( char ) -1, i.e. 0xFFFF. */
    private static final char NO_CHAR = ( char ) -1;


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes
     *            The byte[] representation of an Unicode string.
     * @return The first char found, or ( char ) -1 if no char can be decoded.
     */
    public static final char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * The lead byte selects a sequence of one to six bytes. The payload bits of
     * the lead byte and of each following byte (low six bits) are concatenated
     * and the result is cast to <code>char</code>, so only the low 16 bits of
     * the decoded value are returned. Following bytes are not checked for the
     * <code>10xx-xxxx</code> continuation pattern.
     * <p>
     * Note that ( char ) -1 is 0xFFFF, so a sequence that decodes to 0xFFFF
     * cannot be told apart from the "no char" result.
     *
     * @param bytes
     *            The byte[] representation of an Unicode string.
     * @param pos
     *            The current position to start decoding the char
     * @return The decoded char, or ( char ) -1 if no char can be decoded:
     *         <code>bytes</code> is null, <code>pos</code> is outside the
     *         array, the lead byte is not a valid lead byte, or the array ends
     *         before the sequence is complete.
     *         TODO : Should stop after the third byte, as a char is only 2
     *         bytes long.
     */
    public static final char bytesToChar( byte[] bytes, int pos )
    {
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            return NO_CHAR;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            // 0zzz-zzzz : plain 7 bits char
            return ( char ) lead;
        }

        int length = sequenceLength( lead );

        // Written as a subtraction so that a huge pos cannot overflow.
        if ( ( length == 0 ) || ( bytes.length - pos < length ) )
        {
            return NO_CHAR;
        }

        // An N bytes lead byte keeps ( 7 - N ) payload bits :
        // 110x-xxxx -> 0x1F, 1110-xxxx -> 0x0F, 1111-0xxx -> 0x07,
        // 1111-10xx -> 0x03, 1111-110x -> 0x01
        int value = lead & ( 0x7F >> length );

        for ( int i = 1; i < length; i++ )
        {
            value = ( value << 6 ) | ( bytes[pos + i] & CONTINUATION_PAYLOAD_MASK );
        }

        // The cast keeps the low 16 bits only.
        return ( char ) value;
    }


    /**
     * Tell how many bytes the multi-byte sequence introduced by the given lead
     * byte is made of.
     *
     * @param lead
     *            The lead byte (only its low 8 bits are examined)
     * @return A length between 2 and 6, or 0 if the byte matches none of the
     *         multi-byte lead patterns.
     */
    private static int sequenceLength( int lead )
    {
        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }

        return 0;
    }
}