Count the number of bytes needed to encode a Unicode char. This can be from 1 to 6.
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 *
 */

/**
 * Various string manipulation methods that are more efficient than chaining
 * string operations: all is done in the same buffer without creating a bunch of
 * string objects.
 *
 * @author <a href="mailto:dev@labs.apache.org">Dungeon Project</a>
 */
public class Main
{
    /** High bit of the lead byte: when it is clear, the char is encoded on a single byte. */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask selecting the three high bits of the lead byte. */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;

    /** Masked lead byte value (110x xxxx) announcing a two-byte sequence. */
    private static final int UTF8_TWO_BYTES = 0x00C0;

    /** Mask selecting the four high bits of the lead byte. */
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;

    /** Masked lead byte value (1110 xxxx) announcing a three-byte sequence. */
    private static final int UTF8_THREE_BYTES = 0x00E0;

    /** Mask selecting the five high bits of the lead byte. */
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;

    /** Masked lead byte value (1111 0xxx) announcing a four-byte sequence. */
    private static final int UTF8_FOUR_BYTES = 0x00F0;

    /** Mask selecting the six high bits of the lead byte. */
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;

    /** Masked lead byte value (1111 10xx) announcing a five-byte sequence. */
    private static final int UTF8_FIVE_BYTES = 0x00F8;

    /** Mask selecting the seven high bits of the lead byte. */
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;

    /** Masked lead byte value (1111 110x) announcing a six-byte sequence. */
    private static final int UTF8_SIX_BYTES = 0x00FC;


    /**
     * Count the number of bytes needed to return a Unicode char. This can be
     * from 1 to 6.
     * <p>
     * Only the byte at <code>pos</code> is inspected: the length is deduced
     * from its high-order bits. The following bytes are neither read nor
     * validated.
     *
     * @param bytes
     *            The bytes to read
     * @param pos
     *            Position to start counting. It must be a valid start of an
     *            encoded char !
     * @return The number of bytes to create a char, or -1 if the encoding is
     *         wrong, if <code>bytes</code> is <code>null</code>, or if
     *         <code>pos</code> is not a valid index in <code>bytes</code>.
     *         TODO : Should stop after the third byte, as a char is only
     *         2 bytes long.
     */
    public static final int countBytesPerChar( byte[] bytes, int pos )
    {
        // Report every unusable input the same way instead of letting an
        // ArrayIndexOutOfBoundsException escape for an empty array or a bad position.
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            return -1;
        }

        // The sign extension of the byte is harmless: every mask only keeps
        // bits from the low-order 8 bits.
        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }
        else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }
        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }
        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }
        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }
        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }
        else
        {
            // Continuation byte (10xx xxxx) or 0xFE / 0xFF: not a valid start of char.
            return -1;
        }
    }
}