Return the Unicode char which is coded in the bytes at the given position.
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 *
 */

/**
 * Various string manipulation methods that are more efficient than chaining
 * string operations: all is done in the same buffer without creating a bunch of
 * string objects.
 *
 * @author <a href="mailto:dev@labs.apache.org">Dungeon Project</a>
 */
public class Main
{
    /** Bits that must all be zero for a char to fit in one UTF-8 byte (7 payload bits). */
    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;

    /** Bits that must all be zero for a char to fit in two UTF-8 bytes (11 payload bits). */
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;


    /**
     * Encode a single Java char as its UTF-8 style byte sequence.
     * <p>
     * The result is one byte for U+0000..U+007F, two bytes for
     * U+0080..U+07FF and three bytes for U+0800..U+FFFF. A char is a single
     * 16 bits code unit, so a surrogate is encoded on its own as a three
     * bytes sequence: surrogate pairs are not combined by this method.
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char (1 to 3 bytes long)
     */
    public static final byte[] charToBytes( char car )
    {
        byte[] bytes = new byte[countNbBytesPerChar( car )];

        // Branch on the computed length so that the array size and the
        // number of bytes written can never disagree.
        if ( bytes.length == 1 )
        {
            // Single byte char : 0xxxxxxx
            bytes[0] = ( byte ) car;
        }
        else if ( bytes.length == 2 )
        {
            // Two bytes char : 110xxxxx 10xxxxxx
            bytes[0] = ( byte ) ( 0x00C0 | ( car >> 6 ) );
            bytes[1] = ( byte ) ( 0x0080 | ( car & 0x3F ) );
        }
        else
        {
            // Three bytes char : 1110xxxx 10xxxxxx 10xxxxxx
            bytes[0] = ( byte ) ( 0x00E0 | ( car >> 12 ) );
            bytes[1] = ( byte ) ( 0x0080 | ( ( car >> 6 ) & 0x3F ) );
            bytes[2] = ( byte ) ( 0x0080 | ( car & 0x3F ) );
        }

        return bytes;
    }


    /**
     * Return the number of bytes needed to encode a char.
     * <p>
     * As a char is only 16 bits wide, the result is always 1, 2 or 3.
     *
     * @param car The character to be encoded
     *
     * @return The number of bytes needed to hold the char : 1, 2 or 3
     */
    public static final int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }

        // Every remaining 16 bits value (U+0800..U+FFFF) needs three bytes.
        return 3;
    }
}