package com.jclark.xml.tok;
/**
* An Encoding
object corresponds to a possible
* encoding (a mapping from characters to sequences of bytes).
* It provides operations on byte arrays
* that represent all or part of a parsed XML entity in that encoding.
*
* The set of ASCII characters excluding $@\^`{}~
* have a special status; these are called XML significant
* characters.
*
* This class imposes certain restrictions on an encoding: *
* Several methods operate on byte subarrays. The subarray is specified
* by a byte array buf
and two integers,
* off
and end
; off
* gives the index in buf
of the first byte of the subarray
* and end
gives the
* index in buf
of the byte immediately after the last byte.
*
* Use the getInitialEncoding
method to get an
* Encoding
object to use to start parsing an entity.
*
* The main operations provided by Encoding
are
* tokenizeProlog
, tokenizeContent
and
* tokenizeCdataSection
;
* these are used to divide up an XML entity into tokens.
* tokenizeProlog
is used for the prolog of an XML document
* as well as for the external subset and parameter entities (except
* when referenced in an EntityValue
);
* it can also be used for parsing the Misc
* that follows
* the document element.
* tokenizeContent
is used for the document element and for
* parsed general entities that are referenced in content
* except for CDATA sections.
* tokenizeCdataSection
is used for CDATA sections, following
* the <![CDATA[
up to and including the ]]>
.
*
* tokenizeAttributeValue
and tokenizeEntityValue
* are used to further divide up tokens returned by tokenizeProlog
* and tokenizeContent
; they are also used to divide up entities
* referenced in attribute values or entity values.
* @version $Revision$ $Date$
*/
public abstract class Encoding {
/**
* Represents one or more characters of data.
*/
public static final int TOK_DATA_CHARS = 0;
/**
* Represents a newline (CR, LF or CR followed by LF) in data.
*/
public static final int TOK_DATA_NEWLINE = TOK_DATA_CHARS + 1;
/**
* Represents a complete start-tag {@code <name>}
* that doesn't have any attribute specifications.
*/
public static final int TOK_START_TAG_NO_ATTS = TOK_DATA_NEWLINE + 1;
/**
* Represents a complete start-tag {@code <name att="val">}
* that contains one or more attribute specifications.
*/
public static final int TOK_START_TAG_WITH_ATTS = TOK_START_TAG_NO_ATTS + 1;
/**
* Represents an empty element tag {@code <name/>}
* that doesn't have any attribute specifications.
*/
public static final int TOK_EMPTY_ELEMENT_NO_ATTS = TOK_START_TAG_WITH_ATTS + 1;
/**
* Represents an empty element tag {@code <name att="val"/>}
* that contains one or more attribute specifications.
*/
public static final int TOK_EMPTY_ELEMENT_WITH_ATTS = TOK_EMPTY_ELEMENT_NO_ATTS + 1;
/**
* Represents a complete end-tag {@code </name>}.
*/
public static final int TOK_END_TAG = TOK_EMPTY_ELEMENT_WITH_ATTS + 1;
/**
* Represents the start of a CDATA section {@code <![CDATA[}.
*/
public static final int TOK_CDATA_SECT_OPEN = TOK_END_TAG + 1;
/**
* Represents the end of a CDATA section {@code ]]>}.
*/
public static final int TOK_CDATA_SECT_CLOSE = TOK_CDATA_SECT_OPEN + 1;
/**
* Represents a general entity reference.
*/
public static final int TOK_ENTITY_REF = TOK_CDATA_SECT_CLOSE + 1;
/**
* Represents a general entity reference to one of the 5 predefined
* entities {@code amp}, {@code lt}, {@code gt},
* {@code quot}, {@code apos}.
*/
public static final int TOK_MAGIC_ENTITY_REF = TOK_ENTITY_REF + 1;
/**
* Represents a numeric character reference (decimal or hexadecimal),
* when the referenced character is less than or equal to 0xFFFF
* and so is represented by a single char.
*/
public static final int TOK_CHAR_REF = TOK_MAGIC_ENTITY_REF + 1;
/**
* Represents a numeric character reference (decimal or hexadecimal),
* when the referenced character is greater than 0xFFFF and so is
* represented by a pair of chars.
*/
public static final int TOK_CHAR_PAIR_REF = TOK_CHAR_REF + 1;
/**
* Represents a processing instruction.
*/
public static final int TOK_PI = TOK_CHAR_PAIR_REF + 1;
/**
* Represents an XML declaration or text declaration (a processing
* instruction whose target is {@code xml}).
*/
public static final int TOK_XML_DECL = TOK_PI + 1;
/**
* Represents a comment {@code <!-- comment -->}.
* This can occur both in the prolog and in content.
*/
public static final int TOK_COMMENT = TOK_XML_DECL + 1;
/**
* Represents a white space character in an attribute value,
* excluding white space characters that are part of line boundaries.
*/
public static final int TOK_ATTRIBUTE_VALUE_S = TOK_COMMENT + 1;
/**
* Represents a parameter entity reference in the prolog.
*/
public static final int TOK_PARAM_ENTITY_REF = TOK_ATTRIBUTE_VALUE_S + 1;
/**
* Represents whitespace in the prolog.
* The token contains one or more whitespace characters.
*/
public static final int TOK_PROLOG_S = TOK_PARAM_ENTITY_REF + 1;
/**
* Represents {@code <!NAME} in the prolog.
*/
public static final int TOK_DECL_OPEN = TOK_PROLOG_S + 1;
/**
* Represents {@code >} in the prolog.
*/
public static final int TOK_DECL_CLOSE = TOK_DECL_OPEN + 1;
/**
* Represents a name in the prolog.
*/
public static final int TOK_NAME = TOK_DECL_CLOSE + 1;
/**
* Represents a name token in the prolog that is not a name.
*/
public static final int TOK_NMTOKEN = TOK_NAME + 1;
/**
* Represents {@code #NAME} in the prolog.
*/
public static final int TOK_POUND_NAME = TOK_NMTOKEN + 1;
/**
* Represents {@code |} in the prolog.
*/
public static final int TOK_OR = TOK_POUND_NAME + 1;
/**
* Represents a {@code %} in the prolog that does not start
* a parameter entity reference.
* This can occur in an entity declaration.
*/
public static final int TOK_PERCENT = TOK_OR + 1;
/**
* Represents a {@code (} in the prolog.
*/
public static final int TOK_OPEN_PAREN = TOK_PERCENT + 1;
/**
* Represents a {@code )} in the prolog that is not
* followed immediately by any of
* {@code *}, {@code +} or {@code ?}.
*/
public static final int TOK_CLOSE_PAREN = TOK_OPEN_PAREN + 1;
/**
* Represents {@code [} in the prolog.
*/
public static final int TOK_OPEN_BRACKET = TOK_CLOSE_PAREN + 1;
/**
* Represents {@code ]} in the prolog.
*/
public static final int TOK_CLOSE_BRACKET = TOK_OPEN_BRACKET + 1;
/**
* Represents a literal (EntityValue, AttValue, SystemLiteral or
* PubidLiteral).
*/
public static final int TOK_LITERAL = TOK_CLOSE_BRACKET + 1;
/**
* Represents a name followed immediately by {@code ?}.
*/
public static final int TOK_NAME_QUESTION = TOK_LITERAL + 1;
/**
* Represents a name followed immediately by {@code *}.
*/
public static final int TOK_NAME_ASTERISK = TOK_NAME_QUESTION + 1;
/**
* Represents a name followed immediately by {@code +}.
*/
public static final int TOK_NAME_PLUS = TOK_NAME_ASTERISK + 1;
/**
* Represents {@code <![} in the prolog.
*/
public static final int TOK_COND_SECT_OPEN = TOK_NAME_PLUS + 1;
/**
* Represents {@code ]]>} in the prolog.
*/
public static final int TOK_COND_SECT_CLOSE = TOK_COND_SECT_OPEN + 1;
/**
* Represents {@code )?} in the prolog.
*/
public static final int TOK_CLOSE_PAREN_QUESTION = TOK_COND_SECT_CLOSE + 1;
/**
* Represents {@code )*} in the prolog.
*/
public static final int TOK_CLOSE_PAREN_ASTERISK = TOK_CLOSE_PAREN_QUESTION + 1;
/**
* Represents {@code )+} in the prolog.
*/
public static final int TOK_CLOSE_PAREN_PLUS = TOK_CLOSE_PAREN_ASTERISK + 1;
/**
* Represents {@code ,} in the prolog.
*/
public static final int TOK_COMMA = TOK_CLOSE_PAREN_PLUS + 1;
/**
* Converts bytes to characters.
* The bytes in {@code sourceBuf} between {@code sourceStart}
* and {@code sourceEnd} are converted to characters and stored
* in {@code targetBuf} starting at {@code targetStart}.
* {@code (targetBuf.length - targetStart) * getMinBytesPerChar()}
* must be greater than or equal to
* {@code sourceEnd - sourceStart}.
* If {@code getFixedBytesPerChar} returns a value greater than 0,
* then the return value will be equal to
* {@code (sourceEnd - sourceStart)/getFixedBytesPerChar()}.
*
* @param sourceBuf the array holding the bytes to convert
* @param sourceStart index in {@code sourceBuf} of the first byte to convert
* @param sourceEnd index in {@code sourceBuf} just past the last byte to convert
* @param targetBuf the array that receives the converted characters
* @param targetStart index in {@code targetBuf} at which storing starts
* @return the number of characters stored into {@code targetBuf}
* @see #getFixedBytesPerChar
*/
public abstract int convert(byte[] sourceBuf, int sourceStart, int sourceEnd,
char[] targetBuf, int targetStart);
/**
* Returns the number of bytes required to represent each {@code char},
* or zero if different {@code char}s are represented by different
* numbers of bytes. The value returned will be 0, 1, 2, or 4.
*
* @return the fixed number of bytes per {@code char}, or 0 if it varies
*/
public abstract int getFixedBytesPerChar();
// Shared Encoding instances, one per supported encoding. Each starts out
// null and is created on first request by getEncoding(byte).
private static Encoding utf8Encoding;
private static Encoding utf16LittleEndianEncoding;
private static Encoding utf16BigEndianEncoding;
private static Encoding internalEncoding;
private static Encoding iso8859_1Encoding;
private static Encoding asciiEncoding;
private static Encoding windows1250Encoding;
private static Encoding iso8859_2Encoding;
// Selector codes accepted by getEncoding(byte); each code picks the
// like-named instance field above.
private static final byte UTF8_ENCODING = 0;
private static final byte UTF16_LITTLE_ENDIAN_ENCODING = 1;
private static final byte UTF16_BIG_ENDIAN_ENCODING = 2;
private static final byte INTERNAL_ENCODING = 3;
private static final byte ISO8859_1_ENCODING = 4;
private static final byte ASCII_ENCODING = 5;
private static final byte WINDOWS1250_ENCODING = 6;
private static final byte ISO8859_2_ENCODING = 7;
// Encoding vector for the windows-1250 encoding, which is used by Windows
// for Central European languages.
// The string holds 256 chars, one per byte value 0x00-0xFF in order
// (16 per source line); each char is the Unicode character that the byte
// stands for. Bytes that windows-1250 leaves unassigned (0x81, 0x83, 0x88,
// 0x90 and 0x98) are mapped to U+FFFD.
// Byte 0xAC is U+00AC (NOT SIGN); it must not be marked as unassigned.
private static final String windows1250MapString =
"\0\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f" +
"\u0020\u0021\"\u0023\u0024\u0025\u0026\'\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f" +
"\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039\u003a\u003b\u003c\u003d\u003e\u003f" +
"\u0040\u0041\u0042\u0043\u0044\u0045\u0046\u0047\u0048\u0049\u004a\u004b\u004c\u004d\u004e\u004f" +
"\u0050\u0051\u0052\u0053\u0054\u0055\u0056\u0057\u0058\u0059\u005a\u005b\\\u005d\u005e\u005f" +
"\u0060\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068\u0069\u006a\u006b\u006c\u006d\u006e\u006f" +
"\u0070\u0071\u0072\u0073\u0074\u0075\u0076\u0077\u0078\u0079\u007a\u007b\u007c\u007d\u007e\u007f" +
"\u20ac\ufffd\u201a\ufffd\u201e\u2026\u2020\u2021\ufffd\u2030\u0160\u2039\u015a\u0164\u017d\u0179" +
"\ufffd\u2018\u2019\u201c\u201d\u2022\u2013\u2014\ufffd\u2122\u0161\u203a\u015b\u0165\u017e\u017a" +
"\u00a0\u02c7\u02d8\u0141\u00a4\u0104\u00a6\u00a7\u00a8\u00a9\u015e\u00ab\u00ac\u00ad\u00ae\u017b" +
"\u00b0\u00b1\u02db\u0142\u00b4\u00b5\u00b6\u00b7\u00b8\u0105\u015f\u00bb\u013d\u02dd\u013e\u017c" +
"\u0154\u00c1\u00c2\u0102\u00c4\u0139\u0106\u00c7\u010c\u00c9\u0118\u00cb\u011a\u00cd\u00ce\u010e" +
"\u0110\u0143\u0147\u00d3\u00d4\u0150\u00d6\u00d7\u0158\u016e\u00da\u0170\u00dc\u00dd\u0162\u00df" +
"\u0155\u00e1\u00e2\u0103\u00e4\u013a\u0107\u00e7\u010d\u00e9\u0119\u00eb\u011b\u00ed\u00ee\u010f" +
"\u0111\u0144\u0148\u00f3\u00f4\u0151\u00f6\u00f7\u0159\u016f\u00fa\u0171\u00fc\u00fd\u0163\u02d9";
// Encoding vector for the ISO 8859-2 encoding, the ISO standard for
// Central European languages.
// The string holds 256 chars, one per byte value 0x00-0xFF in order
// (16 per source line); each char is the Unicode character that the byte
// stands for. Bytes 0x00-0x9F map to the code points with the same value.
private static final String iso8859_2MapString =
"\0\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f" +
"\u0020\u0021\"\u0023\u0024\u0025\u0026\'\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f" +
"\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039\u003a\u003b\u003c\u003d\u003e\u003f" +
"\u0040\u0041\u0042\u0043\u0044\u0045\u0046\u0047\u0048\u0049\u004a\u004b\u004c\u004d\u004e\u004f" +
"\u0050\u0051\u0052\u0053\u0054\u0055\u0056\u0057\u0058\u0059\u005a\u005b\\\u005d\u005e\u005f" +
"\u0060\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068\u0069\u006a\u006b\u006c\u006d\u006e\u006f" +
"\u0070\u0071\u0072\u0073\u0074\u0075\u0076\u0077\u0078\u0079\u007a\u007b\u007c\u007d\u007e\u007f" +
"\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f" +
"\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f" +
"\u00a0\u0104\u02d8\u0141\u00a4\u013d\u015a\u00a7\u00a8\u0160\u015e\u0164\u0179\u00ad\u017d\u017b" +
"\u00b0\u0105\u02db\u0142\u00b4\u013e\u015b\u02c7\u00b8\u0161\u015f\u0165\u017a\u02dd\u017e\u017c" +
"\u0154\u00c1\u00c2\u0102\u00c4\u0139\u0106\u00c7\u010c\u00c9\u0118\u00cb\u011a\u00cd\u00ce\u010e" +
"\u0110\u0143\u0147\u00d3\u00d4\u0150\u00d6\u00d7\u0158\u016e\u00da\u0170\u00dc\u00dd\u0162\u00df" +
"\u0155\u00e1\u00e2\u0103\u00e4\u013a\u0107\u00e7\u010d\u00e9\u0119\u00eb\u011b\u00ed\u00ee\u010f" +
"\u0111\u0144\u0148\u00f3\u00f4\u0151\u00f6\u00f7\u0159\u016f\u00fa\u0171\u00fc\u00fd\u0163\u02d9";
// Returns the shared Encoding instance selected by enc, creating it on
// first use. The method is synchronized, so each instance is created at
// most once. Returns null when enc is not one of the *_ENCODING codes.
private static synchronized Encoding getEncoding(byte enc) {
  if (enc == UTF8_ENCODING) {
    if (utf8Encoding == null) {
      utf8Encoding = new UTF8Encoding();
    }
    return utf8Encoding;
  }
  if (enc == UTF16_LITTLE_ENDIAN_ENCODING) {
    if (utf16LittleEndianEncoding == null) {
      utf16LittleEndianEncoding = new UTF16LittleEndianEncoding();
    }
    return utf16LittleEndianEncoding;
  }
  if (enc == UTF16_BIG_ENDIAN_ENCODING) {
    if (utf16BigEndianEncoding == null) {
      utf16BigEndianEncoding = new UTF16BigEndianEncoding();
    }
    return utf16BigEndianEncoding;
  }
  if (enc == INTERNAL_ENCODING) {
    if (internalEncoding == null) {
      internalEncoding = new InternalEncoding();
    }
    return internalEncoding;
  }
  if (enc == ISO8859_1_ENCODING) {
    if (iso8859_1Encoding == null) {
      iso8859_1Encoding = new ISO8859_1Encoding();
    }
    return iso8859_1Encoding;
  }
  if (enc == ASCII_ENCODING) {
    if (asciiEncoding == null) {
      asciiEncoding = new ASCIIEncoding();
    }
    return asciiEncoding;
  }
  if (enc == WINDOWS1250_ENCODING) {
    // Single-byte encodings share one class and differ only in their map.
    if (windows1250Encoding == null) {
      windows1250Encoding = new SingleByteEncoding(windows1250MapString);
    }
    return windows1250Encoding;
  }
  if (enc == ISO8859_2_ENCODING) {
    if (iso8859_2Encoding == null) {
      iso8859_2Encoding = new SingleByteEncoding(iso8859_2MapString);
    }
    return iso8859_2Encoding;
  }
  // Unknown selector code.
  return null;
}
// Returns the shared big-endian UTF-16 encoding instance.
Encoding getUTF16Encoding() {
  final Encoding bigEndian = getEncoding(UTF16_BIG_ENDIAN_ENCODING);
  return bigEndian;
}
// Byte type codes, as returned by byteType() and stored in asciiTypeTable.
// The character examples below are the ASCII characters that
// asciiTypeTable assigns to each code.
//
// Bytes with type < 0 may not be data in content.
// The negation of the lead byte type gives the total number of bytes.
static final int BT_LEAD2 = -2;               // lead byte of a 2-byte character
static final int BT_LEAD3 = BT_LEAD2 - 1;     // lead byte of a 3-byte character
static final int BT_LEAD4 = BT_LEAD3 - 1;     // lead byte of a 4-byte character
static final int BT_NONXML = BT_LEAD4 - 1;    // e.g. ASCII controls other than tab, LF, CR
static final int BT_MALFORM = BT_NONXML - 1;  // not used for any ASCII character; presumably a malformed byte sequence
static final int BT_LT = BT_MALFORM - 1;      // '<'
static final int BT_AMP = BT_LT - 1;          // '&'
static final int BT_RSQB = BT_AMP - 1;        // ']'
static final int BT_CR = BT_RSQB - 1;         // carriage return (0x0D)
static final int BT_LF = BT_CR - 1;           // line feed (0x0A)
// Bytes with type >= 0 are treated as data in content.
static final int BT_GT = 0;                   // '>'
static final int BT_QUOT = BT_GT + 1;         // '"'
static final int BT_APOS = BT_QUOT + 1;       // apostrophe
static final int BT_EQUALS = BT_APOS + 1;     // '='
static final int BT_QUEST = BT_EQUALS + 1;    // '?'
static final int BT_EXCL = BT_QUEST + 1;      // '!'
static final int BT_SOL = BT_EXCL + 1;        // '/'
static final int BT_SEMI = BT_SOL + 1;        // ';'
static final int BT_NUM = BT_SEMI + 1;        // '#'
static final int BT_LSQB = BT_NUM + 1;        // '['
static final int BT_S = BT_LSQB + 1;          // space and tab
static final int BT_NMSTRT = BT_S + 1;        // letters, '_' and ':'
static final int BT_NAME = BT_NMSTRT + 1;     // digits and '.'
static final int BT_MINUS = BT_NAME + 1;      // '-'
static final int BT_OTHER = BT_MINUS + 1;     // remaining ASCII, e.g. '$', '@', '^', '{', '}', '~'
static final int BT_PERCNT = BT_OTHER + 1;    // '%'
static final int BT_LPAR = BT_PERCNT + 1;     // '('
static final int BT_RPAR = BT_LPAR + 1;       // ')'
static final int BT_AST = BT_RPAR + 1;        // '*'
static final int BT_PLUS = BT_AST + 1;        // '+'
static final int BT_COMMA = BT_PLUS + 1;      // ','
static final int BT_VERBAR = BT_COMMA + 1;    // '|'
// Byte type (one of the BT_ constants) for each of the 128 ASCII code
// points, in code-point order. The comment that starts each row gives the
// code of that row's first entry; each row holds four entries.
final static byte[] asciiTypeTable = {
/* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML,
/* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
/* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x1C */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x20 */ BT_S, BT_EXCL, BT_QUOT, BT_NUM,
/* 0x24 */ BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS,
/* 0x28 */ BT_LPAR, BT_RPAR, BT_AST, BT_PLUS,
/* 0x2C */ BT_COMMA, BT_MINUS, BT_NAME, BT_SOL,
/* 0x30 */ BT_NAME, BT_NAME, BT_NAME, BT_NAME,
/* 0x34 */ BT_NAME, BT_NAME, BT_NAME, BT_NAME,
/* 0x38 */ BT_NAME, BT_NAME, BT_NMSTRT, BT_SEMI,
/* 0x3C */ BT_LT, BT_EQUALS, BT_GT, BT_QUEST,
/* 0x40 */ BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x44 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x48 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x4C */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x50 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x54 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x58 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_LSQB,
/* 0x5C */ BT_OTHER, BT_RSQB, BT_OTHER, BT_NMSTRT,
/* 0x60 */ BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x64 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x68 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x6C */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x70 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x74 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
/* 0x78 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER,
/* 0x7C */ BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER
};
// The minimum number of bytes per character.
// The scanners advance by this amount over every character that is not
// introduced by a BT_LEAD2/BT_LEAD3/BT_LEAD4 byte.
private /* final */ int minBPC;
// Creates an encoding whose characters occupy at least minBPC bytes each.
Encoding(int minBPC) {
this.minBPC = minBPC;
}
// Returns the byte type (one of the BT_ constants) of the character that
// starts at off.
// There are guaranteed to be minBPC available bytes starting at off.
abstract int byteType(byte[] buf, int off);
// Returns the character that starts at off as an int the scanners compare
// against ASCII character literals; the result for non-ASCII characters is
// defined by the subclass.
abstract int byteToAscii(byte[] buf, int off);
// Tells whether the character that starts at off is c.
// This must only be called when c is an (XML significant) ASCII character.
abstract boolean charMatches(byte[] buf, int off, char c);
// Returns the byte type of the 2-byte character that starts at off.
// This default implementation classifies every such character as BT_OTHER.
// Called only when byteType(buf, off) == BT_LEAD2
int byteType2(byte[] buf, int off) {
return BT_OTHER;
}
// Returns the byte type of the 3-byte character that starts at off.
// This default implementation classifies every such character as BT_OTHER.
// Called only when byteType(buf, off) == BT_LEAD3
int byteType3(byte[] buf, int off) {
return BT_OTHER;
}
// Returns the byte type of the 4-byte character that starts at off.
// This default implementation classifies every such character as BT_OTHER.
// Called only when byteType(buf, off) == BT_LEAD4
int byteType4(byte[] buf, int off) {
return BT_OTHER;
}
// Hook the scanners call on a 2-byte character at off before stepping over
// it; it may reject the character by throwing InvalidTokenException.
// This default implementation accepts everything.
void check2(byte[] buf, int off) throws InvalidTokenException { }
// Hook the scanners call on a 3-byte character at off before stepping over
// it; it may reject the character by throwing InvalidTokenException.
// This default implementation accepts everything.
void check3(byte[] buf, int off) throws InvalidTokenException { }
// Hook the scanners call on a 4-byte character at off before stepping over
// it; it may reject the character by throwing InvalidTokenException.
// This default implementation accepts everything.
void check4(byte[] buf, int off) throws InvalidTokenException { }
/**
* Moves a position forward.
* On entry, {@code pos} gives the position of the byte at index
* {@code off} in {@code buf}.
* On exit, {@code pos} will give the position of the byte at index
* {@code end}, which must be greater than or equal to {@code off}.
* The bytes between {@code off} and {@code end} must encode
* one or more complete characters.
* A carriage return followed by a line feed will be treated as a single
* line delimiter provided that they are given to {@code movePosition}
* together.
*
* @param buf the array holding the bytes to move over
* @param off index in {@code buf} that {@code pos} describes on entry
* @param end index in {@code buf} that {@code pos} describes on exit
* @param pos the position to update
*/
public abstract void movePosition(byte[] buf, int off, int end, Position pos);
// end encoding specific part
// Requires the character at off to be c; otherwise reports the offset as
// the start of an invalid token. As with charMatches, c must be an
// XML significant ASCII character.
private final
void checkCharMatches(byte[] buf, int off, char c) throws InvalidTokenException {
  final boolean matched = charMatches(buf, off, c);
  if (matched) {
    return;
  }
  throw new InvalidTokenException(off);
}
/* off points to character following "');
token.tokenEnd = off + minBPC;
return TOK_COMMENT;
}
break;
default:
off += minBPC;
break;
}
}
}
throw new PartialTokenException();
}
/* off points to character following " */
switch (byteType(buf, off + minBPC)) {
case BT_S:
case BT_CR:
case BT_LF:
case BT_PERCNT:
throw new InvalidTokenException(off);
}
/* fall through */
case BT_S:
case BT_CR:
case BT_LF:
token.tokenEnd = off;
return TOK_DECL_OPEN;
case BT_NMSTRT:
off += minBPC;
break;
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
}
// Tells whether the name in buf[off..end) is exactly "xml".
// A name that spells "xml" but uses at least one upper-case letter
// (e.g. "XML", "Xml") is rejected with an InvalidTokenException whose
// offset is that of the name's last character. Any other name yields false.
private final
boolean targetIsXml(byte[] buf, int off, int end) throws InvalidTokenException {
  // The name must be exactly three characters long.
  if (end - off != minBPC*3) {
    return false;
  }
  final String lowerCase = "xml";
  final String upperCase = "XML";
  boolean sawUpperCase = false;
  for (int i = 0; i < 3; i++) {
    final int ch = byteToAscii(buf, off + i * minBPC);
    if (ch == upperCase.charAt(i)) {
      sawUpperCase = true;
    }
    else if (ch != lowerCase.charAt(i)) {
      return false;
    }
  }
  if (sawUpperCase) {
    throw new InvalidTokenException(off + 2 * minBPC, InvalidTokenException.XML_TARGET);
  }
  return true;
}
/*
 * Scans a processing instruction (or XML/text declaration).
 * off points to character following "<?".
 *
 * On success token.nameEnd is the offset just past the target name,
 * token.tokenEnd is the offset just past the closing "?>", and the result
 * is TOK_XML_DECL when the target is "xml" and TOK_PI otherwise.
 *
 * Throws PartialTokenException when the data ends before the closing "?>"
 * (PartialCharException when it ends inside a multi-byte character), and
 * InvalidTokenException for an illegal target name, an illegal character,
 * or a target that spells "xml" with upper-case letters.
 */
private final
int scanPi(byte[] buf, int off, int end, Token token)
throws PartialTokenException, InvalidTokenException {
// Remember where the target name starts so that it can be compared to "xml".
int target = off;
if (off == end)
throw new PartialTokenException();
// The first character of the target must be a name start character.
switch (byteType(buf, off)) {
case BT_NMSTRT:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (byteType2(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (byteType3(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (byteType4(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 4;
break;
default:
throw new InvalidTokenException(off);
}
// Rest of the target name, up to white space or '?'.
while (off != end) {
switch (byteType(buf, off)) {
case BT_NMSTRT:
case BT_NAME:
case BT_MINUS:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (!isNameChar2(buf, off))
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (!isNameChar3(buf, off))
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (!isNameChar4(buf, off))
throw new InvalidTokenException(off);
off += 4;
break;
case BT_S:
case BT_CR:
case BT_LF:
// White space ends the target. Classify the target right away, so a
// wrongly-cased "xml" target is reported before the body is scanned.
boolean isXml = targetIsXml(buf, target, off);
token.nameEnd = off;
off += minBPC;
// Scan the instruction body, looking for the closing "?>".
while (off != end) {
switch (byteType(buf, off)) {
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
check2(buf, off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
check3(buf, off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
check4(buf, off);
off += 4;
break;
case BT_NONXML:
case BT_MALFORM:
throw new InvalidTokenException(off);
case BT_QUEST:
off += minBPC;
if (off == end)
throw new PartialTokenException();
if (charMatches(buf, off, '>')) {
token.tokenEnd = off + minBPC;
if (isXml)
return TOK_XML_DECL;
else
return TOK_PI;
}
// Not "?>": the character after '?' is examined again by the loop,
// so a run such as "??>" is still recognized.
break;
default:
off += minBPC;
break;
}
}
throw new PartialTokenException();
case BT_QUEST:
// '?' directly after the target: the instruction has no body, so the
// next character must be '>'.
token.nameEnd = off;
off += minBPC;
if (off == end)
throw new PartialTokenException();
checkCharMatches(buf, off, '>');
token.tokenEnd = off + minBPC;
return (targetIsXml(buf, target, token.nameEnd)
? TOK_XML_DECL
: TOK_PI);
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
}
/**
* Scans the first token of a byte subarray that holds the content of a
* CDATA section, and returns one of the following token types:
*
* TOK_DATA_CHARS
* TOK_DATA_NEWLINE
* TOK_CDATA_SECT_CLOSE
*
* Information about the token is stored in {@code token}.
*
* After {@code TOK_CDATA_SECT_CLOSE} is returned, the application
* should use {@code tokenizeContent}.
*
* @exception EmptyTokenException if the subarray is empty
* @exception PartialTokenException if the subarray contains only part of
* a legal token
* @exception InvalidTokenException if the subarray does not start
* with a legal token or part of one
* @exception ExtensibleTokenException if the subarray encodes just a carriage
* return ('\r')
*
* @see #TOK_DATA_CHARS
* @see #TOK_DATA_NEWLINE
* @see #TOK_CDATA_SECT_CLOSE
* @see Token
* @see EmptyTokenException
* @see PartialTokenException
* @see InvalidTokenException
* @see ExtensibleTokenException
* @see #tokenizeContent
*/
public final int tokenizeCdataSection(byte[] buf, int off, int end, Token token) throws EmptyTokenException, PartialTokenException, InvalidTokenException, ExtensibleTokenException {
  // For multi-byte code units the usable end is whatever adjustEnd reports.
  final int limit = (minBPC > 1) ? adjustEnd(off, end) : end;
  if (off == limit) {
    throw new EmptyTokenException();
  }
  int pos = off;
  switch (byteType(buf, pos)) {
  case BT_NONXML:
  case BT_MALFORM:
    throw new InvalidTokenException(pos);
  case BT_LF:
    token.tokenEnd = pos + minBPC;
    return TOK_DATA_NEWLINE;
  case BT_CR:
    pos += minBPC;
    // A lone CR at the end might be followed by LF in the next chunk.
    if (pos == limit) {
      throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
    }
    token.tokenEnd = (byteType(buf, pos) == BT_LF) ? pos + minBPC : pos;
    return TOK_DATA_NEWLINE;
  case BT_RSQB:
    pos += minBPC;
    if (pos == limit) {
      throw new PartialTokenException();
    }
    if (charMatches(buf, pos, ']')) {
      final int third = pos + minBPC;
      if (third == limit) {
        throw new PartialTokenException();
      }
      if (charMatches(buf, third, '>')) {
        token.tokenEnd = third + minBPC;
        return TOK_CDATA_SECT_CLOSE;
      }
      // "]]" not followed by '>': only the first ']' is data here; the
      // second one stays at pos and starts the next token.
    }
    break;
  case BT_LEAD2:
    if (limit - pos < 2) {
      throw new PartialCharException(pos);
    }
    check2(buf, pos);
    pos += 2;
    break;
  case BT_LEAD3:
    if (limit - pos < 3) {
      throw new PartialCharException(pos);
    }
    check3(buf, pos);
    pos += 3;
    break;
  case BT_LEAD4:
    if (limit - pos < 4) {
      throw new PartialCharException(pos);
    }
    check4(buf, pos);
    pos += 4;
    break;
  default:
    pos += minBPC;
    break;
  }
  // At least one data character was consumed; take as many more as possible.
  token.tokenEnd = extendCdata(buf, pos, limit);
  return TOK_DATA_CHARS;
}
// Extends a run of CDATA data characters that currently ends at off.
// Returns the offset of the first character that cannot belong to the run:
// a ']', a line-end character, an illegal character, a multi-byte character
// that is cut off by end, or end itself.
// Complete multi-byte characters are validated with check2/3/4, which may
// throw InvalidTokenException.
int extendCdata(final byte[] buf, int off, final int end) throws InvalidTokenException {
  int pos = off;
  while (pos != end) {
    int step;
    switch (byteType(buf, pos)) {
    case BT_RSQB:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_CR:
    case BT_LF:
      return pos;
    case BT_LEAD2:
      if (end - pos < 2) {
        return pos;
      }
      check2(buf, pos);
      step = 2;
      break;
    case BT_LEAD3:
      if (end - pos < 3) {
        return pos;
      }
      check3(buf, pos);
      step = 3;
      break;
    case BT_LEAD4:
      if (end - pos < 4) {
        return pos;
      }
      check4(buf, pos);
      step = 4;
      break;
    default:
      step = minBPC;
      break;
    }
    pos += step;
  }
  return pos;
}
/*
 * Scans an end-tag.
 * off points to character following "</".
 *
 * On success token.nameEnd is the offset just past the element name,
 * token.tokenEnd is the offset just past the closing '>', and the result
 * is TOK_END_TAG. White space is allowed between the name and the '>'.
 *
 * Throws PartialTokenException when the data ends before the '>'
 * (PartialCharException when it ends inside a multi-byte character), and
 * InvalidTokenException for an illegal name or any other character before
 * the '>'.
 */
private final
int scanEndTag(byte[] buf, int off, int end, Token token)
throws PartialTokenException, InvalidTokenException {
if (off == end)
throw new PartialTokenException();
// The first character must be a name start character.
switch (byteType(buf, off)) {
case BT_NMSTRT:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (byteType2(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (byteType3(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (byteType4(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 4;
break;
default:
throw new InvalidTokenException(off);
}
// Rest of the name, up to white space or '>'.
while (off != end) {
switch (byteType(buf, off)) {
case BT_NMSTRT:
case BT_NAME:
case BT_MINUS:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (!isNameChar2(buf, off))
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (!isNameChar3(buf, off))
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (!isNameChar4(buf, off))
throw new InvalidTokenException(off);
off += 4;
break;
case BT_S:
case BT_CR:
case BT_LF:
// The name is over; only white space may precede the closing '>'.
token.nameEnd = off;
for (off += minBPC; off != end; off += minBPC) {
switch (byteType(buf, off)) {
case BT_S:
case BT_CR:
case BT_LF:
break;
case BT_GT:
token.tokenEnd = off + minBPC;
return TOK_END_TAG;
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
case BT_GT:
token.nameEnd = off;
token.tokenEnd = off + minBPC;
return TOK_END_TAG;
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
}
/*
 * Scans the digits of a hexadecimal character reference.
 * off points to character following "&#x".
 *
 * At least one hex digit is required before the terminating ';'. On
 * success token.tokenEnd is the offset just past the ';' and the result is
 * the token code chosen by setRefChar. A value of 0x110000 or more, or any
 * character other than a hex digit or ';', is an InvalidTokenException;
 * running out of data before the ';' is a PartialTokenException.
 */
private final
int scanHexCharRef(byte[] buf, int off, int end, Token token)
throws PartialTokenException, InvalidTokenException {
  int value = 0;
  boolean atFirstChar = true;
  for (int pos = off; pos != end; pos += minBPC) {
    final int c = byteToAscii(buf, pos);
    final int digit;
    if (c >= '0' && c <= '9') {
      digit = c - '0';
    }
    else if (c >= 'A' && c <= 'F') {
      digit = c - 'A' + 10;
    }
    else if (c >= 'a' && c <= 'f') {
      digit = c - 'a' + 10;
    }
    else if (c == ';' && !atFirstChar) {
      token.tokenEnd = pos + minBPC;
      return setRefChar(value, token);
    }
    else {
      throw new InvalidTokenException(pos);
    }
    value = (value << 4) + digit;
    // Checking after every digit also keeps value from overflowing.
    if (value >= 0x110000) {
      throw new InvalidTokenException(pos);
    }
    atFirstChar = false;
  }
  throw new PartialTokenException();
}
/*
 * Scans a numeric character reference.
 * off points to character following "&#".
 *
 * An 'x' hands over to scanHexCharRef; otherwise one or more decimal
 * digits followed by ';' are expected. On success token.tokenEnd is the
 * offset just past the ';' and the result is the token code chosen by
 * setRefChar. A value of 0x110000 or more, or any unexpected character, is
 * an InvalidTokenException; running out of data before the ';' is a
 * PartialTokenException.
 */
private final
int scanCharRef(byte[] buf, int off, int end, Token token)
throws PartialTokenException, InvalidTokenException {
  if (off == end) {
    throw new PartialTokenException();
  }
  final int first = byteToAscii(buf, off);
  if (first == 'x') {
    return scanHexCharRef(buf, off + minBPC, end, token);
  }
  if (first < '0' || first > '9') {
    throw new InvalidTokenException(off);
  }
  int value = first - '0';
  for (int pos = off + minBPC; pos != end; pos += minBPC) {
    final int c = byteToAscii(buf, pos);
    if (c == ';') {
      token.tokenEnd = pos + minBPC;
      return setRefChar(value, token);
    }
    if (c < '0' || c > '9') {
      throw new InvalidTokenException(pos);
    }
    value = value * 10 + (c - '0');
    // Checking after every digit also keeps value from overflowing.
    if (value >= 0x110000) {
      throw new InvalidTokenException(pos);
    }
  }
  throw new PartialTokenException();
}
/*
 * Stores the character with code num in token and returns the token code.
 * num is known to be < 0x110000.
 *
 * Codes below 0x10000 are stored in token.refChar1 and give TOK_CHAR_REF,
 * unless charTypeTable classifies the character as BT_NONXML, BT_LEAD4 or
 * BT_MALFORM, in which case the reference is rejected at the offset of its
 * last character (token.tokenEnd - minBPC).
 * Larger codes are stored as a surrogate pair in token.refChar1 and
 * token.refChar2 and give TOK_CHAR_PAIR_REF.
 */
private final int setRefChar(int num, Token token)
throws InvalidTokenException {
  if (num >= 0x10000) {
    final int offset = num - 0x10000;
    token.refChar1 = (char)(0xD800 + (offset >> 10));
    token.refChar2 = (char)(0xDC00 + (offset & 0x3FF));
    return TOK_CHAR_PAIR_REF;
  }
  final int type = charTypeTable[num >> 8][num & 0xFF];
  if (type == BT_NONXML || type == BT_LEAD4 || type == BT_MALFORM) {
    throw new InvalidTokenException(token.tokenEnd - minBPC);
  }
  token.refChar1 = (char)num;
  return TOK_CHAR_REF;
}
// Tells whether buf[off..end) starts with the name of one of the five
// predefined entities followed by ';' (amp; apos; lt; gt; quot;).
// off must point at an available character. On a match, token.tokenEnd is
// set just past the ';' and token.refChar1 to the character the entity
// stands for; otherwise token is left untouched.
private final
boolean isMagicEntityRef(byte[] buf, int off, int end, Token token) {
  final int width = minBPC;
  final int avail = end - off;
  final int first = byteToAscii(buf, off);
  if (first == 'a') {
    // Both "amp;" and "apos;" need at least four characters.
    if (avail < width*4) {
      return false;
    }
    final int second = byteToAscii(buf, off + width);
    if (second == 'm') {
      if (charMatches(buf, off + width*2, 'p')
          && charMatches(buf, off + width*3, ';')) {
        token.tokenEnd = off + width*4;
        token.refChar1 = '&';
        return true;
      }
    }
    else if (second == 'p') {
      if (avail >= width*5
          && charMatches(buf, off + width*2, 'o')
          && charMatches(buf, off + width*3, 's')
          && charMatches(buf, off + width*4, ';')) {
        token.tokenEnd = off + width*5;
        token.refChar1 = '\'';
        return true;
      }
    }
    return false;
  }
  if (first == 'l' || first == 'g') {
    // "lt;" and "gt;" differ only in their first letter.
    if (avail >= width*3
        && charMatches(buf, off + width, 't')
        && charMatches(buf, off + width*2, ';')) {
      token.tokenEnd = off + width*3;
      token.refChar1 = (first == 'l') ? '<' : '>';
      return true;
    }
    return false;
  }
  if (first == 'q') {
    if (avail >= width*5
        && charMatches(buf, off + width, 'u')
        && charMatches(buf, off + width*2, 'o')
        && charMatches(buf, off + width*3, 't')
        && charMatches(buf, off + width*4, ';')) {
      token.tokenEnd = off + width*5;
      token.refChar1 = '"';
      return true;
    }
  }
  return false;
}
/*
 * Scans a reference.
 * off points to character following "&".
 *
 * Returns TOK_MAGIC_ENTITY_REF for a reference to a predefined entity,
 * the result of scanCharRef for a character reference ("&#..."), and
 * TOK_ENTITY_REF for any other entity reference. For TOK_ENTITY_REF,
 * token.nameEnd is the offset of the ';' and token.tokenEnd the offset
 * just past it.
 *
 * Throws PartialTokenException when the data ends before the ';'
 * (PartialCharException when it ends inside a multi-byte character), and
 * InvalidTokenException for an illegal name character.
 */
private final
int scanRef(byte[] buf, int off, int end, Token token)
throws PartialTokenException, InvalidTokenException {
if (off == end)
throw new PartialTokenException();
// The predefined entities are recognized first; isMagicEntityRef fills in
// token when it matches.
if (isMagicEntityRef(buf, off, end, token))
return TOK_MAGIC_ENTITY_REF;
// First character: a name start character, or '#' for a character reference.
switch (byteType(buf, off)) {
case BT_NMSTRT:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (byteType2(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (byteType3(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (byteType4(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 4;
break;
case BT_NUM:
return scanCharRef(buf, off + minBPC, end, token);
default:
throw new InvalidTokenException(off);
}
// Rest of the entity name, up to the terminating ';'.
while (off != end) {
switch (byteType(buf, off)) {
case BT_NMSTRT:
case BT_NAME:
case BT_MINUS:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (!isNameChar2(buf, off))
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (!isNameChar3(buf, off))
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (!isNameChar4(buf, off))
throw new InvalidTokenException(off);
off += 4;
break;
case BT_SEMI:
token.nameEnd = off;
token.tokenEnd = off + minBPC;
return TOK_ENTITY_REF;
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
}
/**
 * Scans the attributes of a start-tag or empty-element tag and records
 * each one in <code>token</code> with <code>appendAttribute</code>.
 * On entry <code>off</code> points to the character following the first
 * character of the first attribute name.
 *
 * @param nameStart the index of the first character of the first
 * attribute name
 * @return <code>TOK_START_TAG_WITH_ATTS</code> if the tag ends with a
 * character of type <code>BT_GT</code>, or
 * <code>TOK_EMPTY_ELEMENT_WITH_ATTS</code> if it ends with a character of
 * type <code>BT_SOL</code> followed by "&gt;"; in both cases
 * <code>token.tokenEnd</code> is the index following the tag
 * @exception PartialTokenException if the subarray ends before the tag
 * is complete
 * @exception InvalidTokenException if a character is found that is not
 * allowed at its position
 */
private final
int scanAtts(int nameStart, byte[] buf, int off, int end, ContentToken token)
throws PartialTokenException, InvalidTokenException {
/* -1 means the end of the current attribute name has not been seen yet */
int nameEnd = -1;
while (off != end) {
switch (byteType(buf, off)) {
case BT_NMSTRT:
case BT_NAME:
case BT_MINUS:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (!isNameChar2(buf, off))
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (!isNameChar3(buf, off))
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (!isNameChar4(buf, off))
throw new InvalidTokenException(off);
off += 4;
break;
case BT_S:
case BT_CR:
case BT_LF:
/* whitespace ends the name; only more whitespace or "=" may follow */
nameEnd = off;
loop:
for (;;) {
off += minBPC;
if (off == end)
throw new PartialTokenException();
switch (byteType(buf, off)) {
case BT_EQUALS:
break loop;
case BT_S:
case BT_LF:
case BT_CR:
break;
default:
throw new InvalidTokenException(off);
}
}
/* fall through */
case BT_EQUALS:
{
/* name ended directly at the "=" when no whitespace preceded it */
if (nameEnd < 0)
nameEnd = off;
/* type of the opening quote; the value ends at the next character of
the same type */
int open;
for (;;) {
off += minBPC;
if (off == end)
throw new PartialTokenException();
open = byteType(buf, off);
if (open == BT_QUOT || open == BT_APOS)
break;
switch (open) {
case BT_S:
case BT_LF:
case BT_CR:
break;
default:
throw new InvalidTokenException(off);
}
}
off += minBPC;
int valueStart = off;
/* cleared as soon as the value is seen to contain a reference, a CR or
LF, a whitespace character other than ' ', or a ' ' that is leading,
doubled or immediately before the closing quote */
boolean normalized = true;
/* in attribute value */
for (;;) {
int t;
if (off == end)
throw new PartialTokenException();
t = byteType(buf, off);
if (t == open)
break;
switch (t) {
case BT_NONXML:
case BT_MALFORM:
throw new InvalidTokenException(off);
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
check2(buf, off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
check3(buf, off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
check4(buf, off);
off += 4;
break;
case BT_AMP:
{
normalized = false;
/* scanRef may overwrite token.nameEnd, which holds the end of the
element name; only token.tokenEnd is wanted from it here */
int saveNameEnd = token.nameEnd;
scanRef(buf, off + minBPC, end, token);
token.nameEnd = saveNameEnd;
off = token.tokenEnd;
break;
}
case BT_S:
if (normalized
&& (off == valueStart
|| byteToAscii(buf, off) != ' '
|| (off + minBPC != end
&& (byteToAscii(buf, off + minBPC) == ' '
|| byteType(buf, off + minBPC) == open))))
normalized = false;
off += minBPC;
break;
case BT_LT:
throw new InvalidTokenException(off);
case BT_LF:
case BT_CR:
normalized = false;
/* fall through */
default:
off += minBPC;
break;
}
}
/* off points to the closing quote */
token.appendAttribute(nameStart, nameEnd, valueStart, off,
normalized);
off += minBPC;
if (off == end)
throw new PartialTokenException();
/* the closing quote must be followed by whitespace or the end of the tag */
int t = byteType(buf, off);
switch (t) {
case BT_S:
case BT_CR:
case BT_LF:
off += minBPC;
if (off == end)
throw new PartialTokenException();
t = byteType(buf, off);
break;
case BT_GT:
case BT_SOL:
break;
default:
throw new InvalidTokenException(off);
}
/* off points to the character following the closing quote, or to the
character following one whitespace character after it; t is its type */
skipToName:
for (;;) {
switch (t) {
case BT_NMSTRT:
nameStart = off;
off += minBPC;
break skipToName;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (byteType2(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
nameStart = off;
off += 2;
break skipToName;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (byteType3(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
nameStart = off;
off += 3;
break skipToName;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (byteType4(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
nameStart = off;
off += 4;
break skipToName;
case BT_S:
case BT_CR:
case BT_LF:
break;
case BT_GT:
token.checkAttributeUniqueness(buf);
token.tokenEnd = off + minBPC;
return TOK_START_TAG_WITH_ATTS;
case BT_SOL:
off += minBPC;
if (off == end)
throw new PartialTokenException();
checkCharMatches(buf, off, '>');
token.checkAttributeUniqueness(buf);
token.tokenEnd = off + minBPC;
return TOK_EMPTY_ELEMENT_WITH_ATTS;
default:
throw new InvalidTokenException(off);
}
/* only reached for whitespace: step over it and classify the next character */
off += minBPC;
if (off == end)
throw new PartialTokenException();
t = byteType(buf, off);
}
/* a new attribute name has started; its end is not known yet */
nameEnd = -1;
break;
}
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
}
/**
 * Scans a content token that starts with "&lt;".
 * On entry <code>off</code> points to the character following the "&lt;".
 * Depending on that character the work is handed to
 * <code>scanComment</code>, <code>scanCdataSection</code>,
 * <code>scanPi</code> or <code>scanEndTag</code>; otherwise a start-tag or
 * empty-element tag is scanned here, with <code>scanAtts</code> taking
 * over as soon as the first attribute name starts.
 *
 * @return the token type; for tags without attributes this is
 * <code>TOK_START_TAG_NO_ATTS</code> or
 * <code>TOK_EMPTY_ELEMENT_NO_ATTS</code>, with <code>token.nameEnd</code>
 * set to the index following the element name and
 * <code>token.tokenEnd</code> to the index following the tag
 * @exception PartialTokenException if the subarray ends before the token
 * is complete
 * @exception InvalidTokenException if a character is found that is not
 * allowed at its position
 */
private final
int scanLt(byte[] buf, int off, int end, ContentToken token)
throws PartialTokenException, InvalidTokenException {
if (off == end)
throw new PartialTokenException();
switch (byteType(buf, off)) {
case BT_NMSTRT:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (byteType2(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (byteType3(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (byteType4(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
off += 4;
break;
case BT_EXCL:
/* the character after the BT_EXCL one selects comment or CDATA section */
if ((off += minBPC) == end)
throw new PartialTokenException();
switch (byteType(buf, off)) {
case BT_MINUS:
return scanComment(buf, off + minBPC, end, token);
case BT_LSQB:
return scanCdataSection(buf, off + minBPC, end, token);
}
throw new InvalidTokenException(off);
case BT_QUEST:
return scanPi(buf, off + minBPC, end, token);
case BT_SOL:
return scanEndTag(buf, off + minBPC, end, token);
default:
throw new InvalidTokenException(off);
}
/* we have a start-tag */
/* -1 means the end of the element name has not been recorded yet */
token.nameEnd = -1;
token.clearAttributes();
while (off != end) {
switch (byteType(buf, off)) {
case BT_NMSTRT:
case BT_NAME:
case BT_MINUS:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (!isNameChar2(buf, off))
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (!isNameChar3(buf, off))
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (!isNameChar4(buf, off))
throw new InvalidTokenException(off);
off += 4;
break;
case BT_S:
case BT_CR:
case BT_LF:
/* whitespace ends the element name; skip it and see what follows */
token.nameEnd = off;
off += minBPC;
loop:
for (;;) {
if (off == end)
throw new PartialTokenException();
switch (byteType(buf, off)) {
case BT_NMSTRT:
return scanAtts(off, buf, off + minBPC, end, token);
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (byteType2(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
return scanAtts(off, buf, off + 2, end, token);
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (byteType3(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
return scanAtts(off, buf, off + 3, end, token);
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (byteType4(buf, off) != BT_NMSTRT)
throw new InvalidTokenException(off);
return scanAtts(off, buf, off + 4, end, token);
case BT_GT:
case BT_SOL:
/* leave off on this character; the outer loop handles the end of the
tag, and token.nameEnd is already set so it is not overwritten */
break loop;
case BT_S:
case BT_CR:
case BT_LF:
off += minBPC;
break;
default:
throw new InvalidTokenException(off);
}
}
break;
case BT_GT:
if (token.nameEnd < 0)
token.nameEnd = off;
token.tokenEnd = off + minBPC;
return TOK_START_TAG_NO_ATTS;
case BT_SOL:
if (token.nameEnd < 0)
token.nameEnd = off;
off += minBPC;
if (off == end)
throw new PartialTokenException();
checkCharMatches(buf, off, '>');
token.tokenEnd = off + minBPC;
return TOK_EMPTY_ELEMENT_NO_ATTS;
default:
throw new InvalidTokenException(off);
}
}
throw new PartialTokenException();
}
/**
 * Trims <code>end</code> so that the subarray length is a multiple of
 * <code>minBPC</code>, guaranteeing that scanning never stops in the
 * middle of a minimum-width code unit.
 * The masking assumes <code>minBPC</code> is a power of two.
 *
 * @return <code>end</code> if the length is already a multiple of
 * <code>minBPC</code>, otherwise the largest smaller index that makes
 * it one
 * @exception PartialCharException if trimming would leave nothing to scan
 */
private final int adjustEnd(int off, int end) throws PartialCharException {
  final int mask = minBPC - 1;
  final int length = end - off;
  if ((length & mask) == 0)
    return end;
  final int whole = length & ~mask;
  if (whole == 0)
    throw new PartialCharException(off);
  return off + whole;
}
/**
 * Scans the first token of a byte subarray that contains content.
 * Returns one of the following integers according to the type of token
 * that the subarray starts with:
 * <ul>
 * <li><code>TOK_START_TAG_NO_ATTS</code>
 * <li><code>TOK_START_TAG_WITH_ATTS</code>
 * <li><code>TOK_EMPTY_ELEMENT_NO_ATTS</code>
 * <li><code>TOK_EMPTY_ELEMENT_WITH_ATTS</code>
 * <li><code>TOK_END_TAG</code>
 * <li><code>TOK_DATA_CHARS</code>
 * <li><code>TOK_DATA_NEWLINE</code>
 * <li><code>TOK_CDATA_SECT_OPEN</code>
 * <li><code>TOK_ENTITY_REF</code>
 * <li><code>TOK_MAGIC_ENTITY_REF</code>
 * <li><code>TOK_CHAR_REF</code>
 * <li><code>TOK_CHAR_PAIR_REF</code>
 * <li><code>TOK_PI</code>
 * <li><code>TOK_XML_DECL</code>
 * <li><code>TOK_COMMENT</code>
 * </ul>
 * <p>
 * Information about the token is stored in <code>token</code>.
 * <p>
 * When <code>TOK_CDATA_SECT_OPEN</code> is returned,
 * <code>tokenizeCdataSection</code> should be called until
 * it returns <code>TOK_CDATA_SECT</code>.
 *
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception ExtensibleTokenException if the subarray encodes just a carriage
 * return ('\r'), or just "]" or "]]", so that the token cannot be
 * delimited without seeing the bytes that follow
 *
 * @see #TOK_START_TAG_NO_ATTS
 * @see #TOK_START_TAG_WITH_ATTS
 * @see #TOK_EMPTY_ELEMENT_NO_ATTS
 * @see #TOK_EMPTY_ELEMENT_WITH_ATTS
 * @see #TOK_END_TAG
 * @see #TOK_DATA_CHARS
 * @see #TOK_DATA_NEWLINE
 * @see #TOK_CDATA_SECT_OPEN
 * @see #TOK_ENTITY_REF
 * @see #TOK_MAGIC_ENTITY_REF
 * @see #TOK_CHAR_REF
 * @see #TOK_CHAR_PAIR_REF
 * @see #TOK_PI
 * @see #TOK_XML_DECL
 * @see #TOK_COMMENT
 * @see ContentToken
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 * @see #tokenizeCdataSection
 */
public final int tokenizeContent(byte[] buf, int off, int end, ContentToken token)
throws PartialTokenException, InvalidTokenException, EmptyTokenException, ExtensibleTokenException {
if (minBPC > 1)
end = adjustEnd(off, end);
if (off == end)
throw new EmptyTokenException();
switch (byteType(buf, off)) {
case BT_LT:
return scanLt(buf, off + minBPC, end, token);
case BT_AMP:
return scanRef(buf, off + minBPC, end, token);
case BT_CR:
off += minBPC;
/* a CR at the very end might be the first half of a CR/LF pair */
if (off == end)
throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
if (byteType(buf, off) == BT_LF)
off += minBPC;
token.tokenEnd = off;
return TOK_DATA_NEWLINE;
case BT_LF:
token.tokenEnd = off + minBPC;
return TOK_DATA_NEWLINE;
case BT_RSQB:
/* "]]>" is not allowed in content; "]" and "]]" on their own are data */
off += minBPC;
if (off == end)
throw new ExtensibleTokenException(TOK_DATA_CHARS);
if (!charMatches(buf, off, ']'))
break;
off += minBPC;
if (off == end)
throw new ExtensibleTokenException(TOK_DATA_CHARS);
if (!charMatches(buf, off, '>')) {
/* back up to the second "]" so that it is examined again as the
possible start of "]]>" */
off -= minBPC;
break;
}
throw new InvalidTokenException(off);
case BT_NONXML:
case BT_MALFORM:
throw new InvalidTokenException(off);
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
check2(buf, off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
check3(buf, off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
check4(buf, off);
off += 4;
break;
default:
off += minBPC;
break;
}
/* at least one data character has been accepted; take as many more as
possible */
token.tokenEnd = extendData(buf, off, end);
return TOK_DATA_CHARS;
}
/**
 * Extends a run of data characters as far as possible.
 * Scanning stops, without throwing, at the first character that cannot
 * simply be added to the run: one of type <code>BT_RSQB</code>,
 * <code>BT_AMP</code>, <code>BT_LT</code>, <code>BT_NONXML</code>,
 * <code>BT_MALFORM</code>, <code>BT_CR</code> or <code>BT_LF</code>,
 * or a multi-byte character that is not completely contained in the
 * subarray.
 *
 * @return the index at which scanning stopped
 * @exception InvalidTokenException declared for the
 * <code>check2</code>, <code>check3</code> and <code>check4</code> calls
 * made on complete multi-byte characters
 */
int extendData(final byte[] buf, int off, final int end) throws InvalidTokenException {
  int pos = off;
  while (pos != end) {
    int width;
    switch (byteType(buf, pos)) {
    case BT_RSQB:
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_CR:
    case BT_LF:
      return pos;
    case BT_LEAD2:
      if (end - pos < 2)
        return pos;
      check2(buf, pos);
      width = 2;
      break;
    case BT_LEAD3:
      if (end - pos < 3)
        return pos;
      check3(buf, pos);
      width = 3;
      break;
    case BT_LEAD4:
      if (end - pos < 4)
        return pos;
      check4(buf, pos);
      width = 4;
      break;
    default:
      width = minBPC;
      break;
    }
    pos += width;
  }
  return pos;
}
/**
 * Scans a prolog token that starts with "%".
 * On entry <code>off</code> is the index of the character following
 * the "%".
 *
 * @return <code>TOK_PERCENT</code>, with <code>token.tokenEnd</code> equal
 * to <code>off</code>, if the "%" is followed by whitespace or by a
 * character of type <code>BT_PERCNT</code>; otherwise
 * <code>TOK_PARAM_ENTITY_REF</code>, with <code>token.nameEnd</code> set to
 * the index of the terminating character of type <code>BT_SEMI</code> and
 * <code>token.tokenEnd</code> to the index following it
 * @exception PartialTokenException if the subarray ends before the token
 * is complete
 * @exception InvalidTokenException if a character that is not allowed at
 * its position is found
 */
private final
int scanPercent(byte[] buf, int off, int end, Token token)
     throws PartialTokenException, InvalidTokenException {
  if (off == end)
    throw new PartialTokenException();

  int pos = off;

  /* First character: a name start, or something showing the "%" stands
     alone. */
  switch (byteType(buf, pos)) {
  case BT_S:
  case BT_LF:
  case BT_CR:
  case BT_PERCNT:
    token.tokenEnd = pos;
    return TOK_PERCENT;
  case BT_NMSTRT:
    pos += minBPC;
    break;
  case BT_LEAD2:
    if (end - pos < 2)
      throw new PartialCharException(pos);
    if (byteType2(buf, pos) != BT_NMSTRT)
      throw new InvalidTokenException(pos);
    pos += 2;
    break;
  case BT_LEAD3:
    if (end - pos < 3)
      throw new PartialCharException(pos);
    if (byteType3(buf, pos) != BT_NMSTRT)
      throw new InvalidTokenException(pos);
    pos += 3;
    break;
  case BT_LEAD4:
    if (end - pos < 4)
      throw new PartialCharException(pos);
    if (byteType4(buf, pos) != BT_NMSTRT)
      throw new InvalidTokenException(pos);
    pos += 4;
    break;
  default:
    throw new InvalidTokenException(pos);
  }

  /* Rest of the name, up to the terminator. */
  while (pos != end) {
    int width;
    switch (byteType(buf, pos)) {
    case BT_SEMI:
      token.nameEnd = pos;
      token.tokenEnd = pos + minBPC;
      return TOK_PARAM_ENTITY_REF;
    case BT_NMSTRT:
    case BT_NAME:
    case BT_MINUS:
      width = minBPC;
      break;
    case BT_LEAD2:
      if (end - pos < 2)
        throw new PartialCharException(pos);
      if (!isNameChar2(buf, pos))
        throw new InvalidTokenException(pos);
      width = 2;
      break;
    case BT_LEAD3:
      if (end - pos < 3)
        throw new PartialCharException(pos);
      if (!isNameChar3(buf, pos))
        throw new InvalidTokenException(pos);
      width = 3;
      break;
    case BT_LEAD4:
      if (end - pos < 4)
        throw new PartialCharException(pos);
      if (!isNameChar4(buf, pos))
        throw new InvalidTokenException(pos);
      width = 4;
      break;
    default:
      throw new InvalidTokenException(pos);
    }
    pos += width;
  }
  throw new PartialTokenException();
}
/**
 * Scans a name that is preceded by a character of type
 * <code>BT_NUM</code> in the prolog.
 * On entry <code>off</code> is the index of the first character of the
 * name.
 *
 * @return <code>TOK_POUND_NAME</code>, with <code>token.tokenEnd</code>
 * set to the index of the delimiter that follows the name (whitespace or
 * a character of type <code>BT_RPAR</code>, <code>BT_GT</code>,
 * <code>BT_PERCNT</code> or <code>BT_VERBAR</code>)
 * @exception PartialTokenException if the subarray ends before the first
 * character of the name
 * @exception InvalidTokenException if a character that is not allowed at
 * its position is found
 * @exception ExtensibleTokenException if the subarray ends inside the
 * name, so that following bytes could still belong to it
 */
private final
int scanPoundName(byte[] buf, int off, int end, Token token)
     throws PartialTokenException, InvalidTokenException, ExtensibleTokenException {
  if (off == end)
    throw new PartialTokenException();

  int pos = off;

  /* First character must be a name start character. */
  switch (byteType(buf, pos)) {
  case BT_NMSTRT:
    pos += minBPC;
    break;
  case BT_LEAD2:
    if (end - pos < 2)
      throw new PartialCharException(pos);
    if (byteType2(buf, pos) != BT_NMSTRT)
      throw new InvalidTokenException(pos);
    pos += 2;
    break;
  case BT_LEAD3:
    if (end - pos < 3)
      throw new PartialCharException(pos);
    if (byteType3(buf, pos) != BT_NMSTRT)
      throw new InvalidTokenException(pos);
    pos += 3;
    break;
  case BT_LEAD4:
    if (end - pos < 4)
      throw new PartialCharException(pos);
    if (byteType4(buf, pos) != BT_NMSTRT)
      throw new InvalidTokenException(pos);
    pos += 4;
    break;
  default:
    throw new InvalidTokenException(pos);
  }

  /* Rest of the name, up to (but not including) a delimiter. */
  while (pos != end) {
    int width;
    switch (byteType(buf, pos)) {
    case BT_CR:
    case BT_LF:
    case BT_S:
    case BT_RPAR:
    case BT_GT:
    case BT_PERCNT:
    case BT_VERBAR:
      token.tokenEnd = pos;
      return TOK_POUND_NAME;
    case BT_NMSTRT:
    case BT_NAME:
    case BT_MINUS:
      width = minBPC;
      break;
    case BT_LEAD2:
      if (end - pos < 2)
        throw new PartialCharException(pos);
      if (!isNameChar2(buf, pos))
        throw new InvalidTokenException(pos);
      width = 2;
      break;
    case BT_LEAD3:
      if (end - pos < 3)
        throw new PartialCharException(pos);
      if (!isNameChar3(buf, pos))
        throw new InvalidTokenException(pos);
      width = 3;
      break;
    case BT_LEAD4:
      if (end - pos < 4)
        throw new PartialCharException(pos);
      if (!isNameChar4(buf, pos))
        throw new InvalidTokenException(pos);
      width = 4;
      break;
    default:
      throw new InvalidTokenException(pos);
    }
    pos += width;
  }
  throw new ExtensibleTokenException(TOK_POUND_NAME);
}
/**
 * Scans a quoted literal in the prolog.
 * On entry <code>off</code> is the index of the character following the
 * opening quote.
 *
 * @param open the byte type of the opening quote (<code>BT_QUOT</code> or
 * <code>BT_APOS</code>); the literal ends at the next character of that
 * type
 * @return <code>TOK_LITERAL</code>, with <code>token.tokenEnd</code> set to
 * the index following the closing quote
 * @exception PartialTokenException if the subarray ends before the
 * closing quote
 * @exception InvalidTokenException if the literal contains an illegal
 * character, or the closing quote is followed by a character other than
 * whitespace or one of type <code>BT_GT</code>, <code>BT_PERCNT</code> or
 * <code>BT_LSQB</code>
 * @exception ExtensibleTokenException if the subarray ends immediately
 * after the closing quote
 */
private final
int scanLit(int open, byte[] buf, int off, int end, Token token)
     throws PartialTokenException, InvalidTokenException, ExtensibleTokenException {
  int pos = off;
  while (pos != end) {
    final int type = byteType(buf, pos);

    if (type == BT_QUOT || type == BT_APOS) {
      pos += minBPC;
      /* the other kind of quote is ordinary literal content */
      if (type != open)
        continue;
      if (pos == end)
        throw new ExtensibleTokenException(TOK_LITERAL);
      /* the character after the closing quote must delimit the literal */
      switch (byteType(buf, pos)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
      case BT_GT:
      case BT_PERCNT:
      case BT_LSQB:
        token.tokenEnd = pos;
        return TOK_LITERAL;
      default:
        throw new InvalidTokenException(pos);
      }
    }

    switch (type) {
    case BT_NONXML:
    case BT_MALFORM:
      throw new InvalidTokenException(pos);
    case BT_LEAD2:
      if (end - pos < 2)
        throw new PartialTokenException();
      check2(buf, pos);
      pos += 2;
      break;
    case BT_LEAD3:
      if (end - pos < 3)
        throw new PartialTokenException();
      check3(buf, pos);
      pos += 3;
      break;
    case BT_LEAD4:
      if (end - pos < 4)
        throw new PartialTokenException();
      check4(buf, pos);
      pos += 4;
      break;
    default:
      pos += minBPC;
      break;
    }
  }
  throw new PartialTokenException();
}
/**
 * Returns an encoding object to be used to start parsing an external entity.
 * The encoding is chosen from the initial bytes of the entity; this
 * implementation looks at no more than the first two.
 * A byte order mark, or a "&lt;" encoded in two bytes, selects one of the
 * UTF-16 encodings; anything else selects UTF-8.
 * Returns null if only a single byte is available and that byte has its
 * high bit set.
 *
 * @param buf the byte array containing the initial bytes of the entity
 * @param off the index in <code>buf</code> of the first byte of the entity
 * @param end the index in <code>buf</code> following the last available
 * byte of the entity; <code>end - off</code> must be greater than or equal
 * to 4 unless the entity has fewer than 4 bytes, in which case it must
 * be equal to the length of the entity
 * @param token receives information about the presence of a byte order
 * mark; if the entity starts with a byte order mark
 * then <code>token.getTokenEnd()</code>
 * will return <code>off + 2</code>, otherwise it will return
 * <code>off</code>
 *
 * @see TextDecl
 * @see XmlDecl
 * @see #TOK_XML_DECL
 * @see #getEncoding
 * @see #getInternalEncoding
 */
public static final
Encoding getInitialEncoding(byte[] buf, int off, int end, Token token) {
  token.tokenEnd = off;
  final int available = end - off;
  if (available == 1) {
    if (buf[off] < 0)
      return null;
  }
  else if (available != 0) {
    final int prefix = ((buf[off] & 0xFF) << 8) | (buf[off + 1] & 0xFF);
    /* a byte order mark is consumed; a bare "<" is not */
    if (prefix == 0xFEFF || prefix == 0xFFFE)
      token.tokenEnd = off + 2;
    /* a "<" without a byte order mark is not legal, but not a fatal error */
    if (prefix == 0xFEFF || prefix == '<')
      return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
    if (prefix == 0xFFFE || prefix == ('<' << 8))
      return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
  }
  return getEncoding(UTF8_ENCODING);
}
/**
 * Returns an <code>Encoding</code> corresponding to
 * the specified IANA character set name.
 * Returns this <code>Encoding</code> if the name is null.
 * Returns null if the specified encoding is not supported.
 * The names recognized are UTF-8, UTF-16, ISO-8859-1, US-ASCII,
 * windows-1250 and ISO-8859-2.
 * Note that there are two distinct <code>Encoding</code> objects
 * associated with the name <code>UTF-16</code>, one for
 * each possible byte order; if this <code>Encoding</code>
 * is UTF-16 with little-endian byte ordering, then
 * <code>getEncoding("UTF-16")</code> will return this,
 * otherwise it will return an <code>Encoding</code> for
 * UTF-16 with big-endian byte ordering.
 * @param name a string specifying the IANA name of the encoding; this is
 * case insensitive
 */
public final
Encoding getEncoding(String name) {
  if (name == null)
    return this;
  final Encoding found;
  if (name.equalsIgnoreCase("UTF-8"))
    found = getEncoding(UTF8_ENCODING);
  else if (name.equalsIgnoreCase("UTF-16"))
    found = getUTF16Encoding();
  else if (name.equalsIgnoreCase("ISO-8859-1"))
    found = getEncoding(ISO8859_1_ENCODING);
  else if (name.equalsIgnoreCase("US-ASCII"))
    found = getEncoding(ASCII_ENCODING);
  else if (name.equalsIgnoreCase("windows-1250"))
    found = getEncoding(WINDOWS1250_ENCODING);
  else if (name.equalsIgnoreCase("ISO-8859-2"))
    found = getEncoding(ISO8859_2_ENCODING);
  else
    found = null;
  return found;
}
/**
 * Returns an <code>Encoding</code> for entities encoded with
 * a single-byte encoding (an encoding in which each byte represents
 * exactly one character).
 * A new <code>SingleByteEncoding</code> is created on every call.
 * @param map a string specifying the character represented by each byte;
 * the string must have a length of 256; <code>map.charAt(b)</code>
 * specifies the character encoded by byte <code>b</code>; bytes that do
 * not represent any character should be mapped to <code>\uFFFD</code>
 */
public final
Encoding getSingleByteEncoding(String map) {
  final Encoding singleByte = new SingleByteEncoding(map);
  return singleByte;
}
/**
 * Returns an <code>Encoding</code> object for use with internal entities.
 * This is a UTF-16 big endian encoding, except that newlines
 * are assumed to have been normalized into line feed,
 * so carriage return is treated like a space.
 */
public final static
Encoding getInternalEncoding() {
  final Encoding internal = getEncoding(INTERNAL_ENCODING);
  return internal;
}
/**
 * Scans the first token of a byte subarray that contains part of a
 * prolog.
 * Returns one of the following integers according to the type of token
 * that the subarray starts with:
 * <ul>
 * <li><code>TOK_PI</code>
 * <li><code>TOK_XML_DECL</code>
 * <li><code>TOK_COMMENT</code>
 * <li><code>TOK_PARAM_ENTITY_REF</code>
 * <li><code>TOK_PROLOG_S</code>
 * <li><code>TOK_DECL_OPEN</code>
 * <li><code>TOK_DECL_CLOSE</code>
 * <li><code>TOK_NAME</code>
 * <li><code>TOK_NMTOKEN</code>
 * <li><code>TOK_POUND_NAME</code>
 * <li><code>TOK_OR</code>
 * <li><code>TOK_PERCENT</code>
 * <li><code>TOK_OPEN_PAREN</code>
 * <li><code>TOK_CLOSE_PAREN</code>
 * <li><code>TOK_OPEN_BRACKET</code>
 * <li><code>TOK_CLOSE_BRACKET</code>
 * <li><code>TOK_LITERAL</code>
 * <li><code>TOK_NAME_QUESTION</code>
 * <li><code>TOK_NAME_ASTERISK</code>
 * <li><code>TOK_NAME_PLUS</code>
 * <li><code>TOK_COND_SECT_OPEN</code>
 * <li><code>TOK_COND_SECT_CLOSE</code>
 * <li><code>TOK_CLOSE_PAREN_QUESTION</code>
 * <li><code>TOK_CLOSE_PAREN_ASTERISK</code>
 * <li><code>TOK_CLOSE_PAREN_PLUS</code>
 * <li><code>TOK_COMMA</code>
 * </ul>
 * <p>
 * Information about the token is stored in <code>token</code>.
 *
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception EndOfPrologException if the subarray starts with "&lt;"
 * followed by a name start character or by the lead byte of a multi-byte
 * character, that is, with what looks like a start-tag;
 * <code>token.tokenEnd</code> is set to the index of the "&lt;" and
 * <code>tokenizeContent</code> should be used on the remainder
 * of the entity
 * @exception ExtensibleTokenException if the subarray is a legal token
 * but subsequent bytes in the same entity could be part of the token
 * @see #TOK_PI
 * @see #TOK_XML_DECL
 * @see #TOK_COMMENT
 * @see #TOK_PARAM_ENTITY_REF
 * @see #TOK_PROLOG_S
 * @see #TOK_DECL_OPEN
 * @see #TOK_DECL_CLOSE
 * @see #TOK_NAME
 * @see #TOK_NMTOKEN
 * @see #TOK_POUND_NAME
 * @see #TOK_OR
 * @see #TOK_PERCENT
 * @see #TOK_OPEN_PAREN
 * @see #TOK_CLOSE_PAREN
 * @see #TOK_OPEN_BRACKET
 * @see #TOK_CLOSE_BRACKET
 * @see #TOK_LITERAL
 * @see #TOK_NAME_QUESTION
 * @see #TOK_NAME_ASTERISK
 * @see #TOK_NAME_PLUS
 * @see #TOK_COND_SECT_OPEN
 * @see #TOK_COND_SECT_CLOSE
 * @see #TOK_CLOSE_PAREN_QUESTION
 * @see #TOK_CLOSE_PAREN_ASTERISK
 * @see #TOK_CLOSE_PAREN_PLUS
 * @see #TOK_COMMA
 * @see ContentToken
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 * @see EndOfPrologException
 */
public final
int tokenizeProlog(byte[] buf, int off, int end, Token token)
throws PartialTokenException,
InvalidTokenException,
EmptyTokenException,
ExtensibleTokenException,
EndOfPrologException {
/* TOK_NAME or TOK_NMTOKEN, decided by the first character and used by
the name-scanning loop at the end of the method */
int tok;
if (minBPC > 1)
end = adjustEnd(off, end);
if (off == end)
throw new EmptyTokenException();
switch (byteType(buf, off)) {
case BT_QUOT:
return scanLit(BT_QUOT, buf, off + minBPC, end, token);
case BT_APOS:
return scanLit(BT_APOS, buf, off + minBPC, end, token);
case BT_LT:
{
off += minBPC;
if (off == end)
throw new PartialTokenException();
switch (byteType(buf, off)) {
case BT_EXCL:
return scanDecl(buf, off + minBPC, end, token);
case BT_QUEST:
return scanPi(buf, off + minBPC, end, token);
case BT_NMSTRT:
case BT_LEAD2:
case BT_LEAD3:
case BT_LEAD4:
/* looks like a start-tag: the prolog ends just before the "<" */
token.tokenEnd = off - minBPC;
throw new EndOfPrologException();
}
throw new InvalidTokenException(off);
}
case BT_CR:
/* a lone CR at the end might be the first half of a CR/LF pair */
if (off + minBPC == end)
throw new ExtensibleTokenException(TOK_PROLOG_S);
/* fall through */
case BT_S:
case BT_LF:
/* collect a run of whitespace */
for (;;) {
off += minBPC;
if (off == end)
break;
switch (byteType(buf, off)) {
case BT_S:
case BT_LF:
break;
case BT_CR:
/* don't split CR/LF pair */
if (off + minBPC != end)
break;
/* fall through */
default:
token.tokenEnd = off;
return TOK_PROLOG_S;
}
}
token.tokenEnd = off;
return TOK_PROLOG_S;
case BT_PERCNT:
return scanPercent(buf, off + minBPC, end, token);
case BT_COMMA:
token.tokenEnd = off + minBPC;
return TOK_COMMA;
case BT_LSQB:
token.tokenEnd = off + minBPC;
return TOK_OPEN_BRACKET;
case BT_RSQB:
/* either a single "]" or the start of "]]>" */
off += minBPC;
if (off == end)
throw new ExtensibleTokenException(TOK_CLOSE_BRACKET);
if (charMatches(buf, off, ']')) {
if (off + minBPC == end)
throw new PartialTokenException();
if (charMatches(buf, off + minBPC, '>')) {
token.tokenEnd = off + 2*minBPC;
return TOK_COND_SECT_CLOSE;
}
}
token.tokenEnd = off;
return TOK_CLOSE_BRACKET;
case BT_LPAR:
token.tokenEnd = off + minBPC;
return TOK_OPEN_PAREN;
case BT_RPAR:
/* the character after the parenthesis may be an occurrence indicator
that belongs to the same token */
off += minBPC;
if (off == end)
throw new ExtensibleTokenException(TOK_CLOSE_PAREN);
switch (byteType(buf, off)) {
case BT_AST:
token.tokenEnd = off + minBPC;
return TOK_CLOSE_PAREN_ASTERISK;
case BT_QUEST:
token.tokenEnd = off + minBPC;
return TOK_CLOSE_PAREN_QUESTION;
case BT_PLUS:
token.tokenEnd = off + minBPC;
return TOK_CLOSE_PAREN_PLUS;
case BT_CR:
case BT_LF:
case BT_S:
case BT_GT:
case BT_COMMA:
case BT_VERBAR:
case BT_RPAR:
token.tokenEnd = off;
return TOK_CLOSE_PAREN;
}
throw new InvalidTokenException(off);
case BT_VERBAR:
token.tokenEnd = off + minBPC;
return TOK_OR;
case BT_GT:
token.tokenEnd = off + minBPC;
return TOK_DECL_CLOSE;
case BT_NUM:
return scanPoundName(buf, off + minBPC, end, token);
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
switch (byteType2(buf, off)) {
case BT_NMSTRT:
off += 2;
tok = TOK_NAME;
break;
case BT_NAME:
off += 2;
tok = TOK_NMTOKEN;
break;
default:
throw new InvalidTokenException(off);
}
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
switch (byteType3(buf, off)) {
case BT_NMSTRT:
off += 3;
tok = TOK_NAME;
break;
case BT_NAME:
off += 3;
tok = TOK_NMTOKEN;
break;
default:
throw new InvalidTokenException(off);
}
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
switch (byteType4(buf, off)) {
case BT_NMSTRT:
off += 4;
tok = TOK_NAME;
break;
case BT_NAME:
off += 4;
tok = TOK_NMTOKEN;
break;
default:
throw new InvalidTokenException(off);
}
break;
case BT_NMSTRT:
tok = TOK_NAME;
off += minBPC;
break;
case BT_NAME:
case BT_MINUS:
tok = TOK_NMTOKEN;
off += minBPC;
break;
default:
throw new InvalidTokenException(off);
}
/* scan the rest of a name or name token; off points to the character
following its first character */
while (off != end) {
switch (byteType(buf, off)) {
case BT_NMSTRT:
case BT_NAME:
case BT_MINUS:
off += minBPC;
break;
case BT_LEAD2:
if (end - off < 2)
throw new PartialCharException(off);
if (!isNameChar2(buf, off))
throw new InvalidTokenException(off);
off += 2;
break;
case BT_LEAD3:
if (end - off < 3)
throw new PartialCharException(off);
if (!isNameChar3(buf, off))
throw new InvalidTokenException(off);
off += 3;
break;
case BT_LEAD4:
if (end - off < 4)
throw new PartialCharException(off);
if (!isNameChar4(buf, off))
throw new InvalidTokenException(off);
off += 4;
break;
case BT_GT:
case BT_RPAR:
case BT_COMMA:
case BT_VERBAR:
case BT_LSQB:
case BT_PERCNT:
case BT_S:
case BT_CR:
case BT_LF:
/* a delimiter ends the token without being part of it */
token.tokenEnd = off;
return tok;
case BT_PLUS:
/* an occurrence indicator is only accepted after a name, and is
included in the token */
if (tok != TOK_NAME)
throw new InvalidTokenException(off);
token.tokenEnd = off + minBPC;
return TOK_NAME_PLUS;
case BT_AST:
if (tok != TOK_NAME)
throw new InvalidTokenException(off);
token.tokenEnd = off + minBPC;
return TOK_NAME_ASTERISK;
case BT_QUEST:
if (tok != TOK_NAME)
throw new InvalidTokenException(off);
token.tokenEnd = off + minBPC;
return TOK_NAME_QUESTION;
default:
throw new InvalidTokenException(off);
}
}
/* the subarray ended inside the name; more name characters may follow */
throw new ExtensibleTokenException(tok);
}
/**
 * Skips over an ignored conditional section.
 * The subarray starts following the <code>&lt;![ IGNORE [</code>.
 * Conditional sections nested inside the ignored section are tracked:
 * every <code>&lt;![</code> increases the nesting level and every
 * <code>]]&gt;</code> decreases it, and only the <code>]]&gt;</code> seen
 * at nesting level zero ends the section.
 *
 * @return the index of the character following the closing
 * <code>]]&gt;</code>
 *
 * @exception PartialTokenException if the subarray does not contain the
 * complete ignored conditional section
 * @exception InvalidTokenException if the ignored conditional section
 * contains illegal characters
 */
public final
int skipIgnoreSect(byte[] buf, int off, int end) throws PartialTokenException, InvalidTokenException {
  if (minBPC > 1)
    end = adjustEnd(off, end);
  int level = 0;
  loop:
  while (off != end) {
    switch (byteType(buf, off)) {
    case BT_LEAD2:
      if (end - off < 2)
        throw new PartialCharException(off);
      check2(buf, off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        throw new PartialCharException(off);
      check3(buf, off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        throw new PartialCharException(off);
      check4(buf, off);
      off += 4;
      break;
    case BT_NONXML:
    case BT_MALFORM:
      throw new InvalidTokenException(off);
    case BT_LT:
      /* look for "<![", which opens a nested conditional section; on a
         mismatch the current character is left to be examined again */
      off += minBPC;
      if (off == end)
        break loop;
      if (!charMatches(buf, off, '!'))
        break;
      off += minBPC;
      if (off == end)
        break loop;
      if (!charMatches(buf, off, '['))
        break;
      level++;
      off += minBPC;
      break;
    case BT_RSQB:
      /* look for "]]>" */
      off += minBPC;
      if (off == end)
        break loop;
      if (!charMatches(buf, off, ']'))
        break;
      off += minBPC;
      if (off == end)
        break loop;
      if (!charMatches(buf, off, '>')) {
        /* "]]" followed by something else.  Back up to the second "]" so
           that it is reconsidered as the possible start of "]]>" (this is
           what makes "]]]>" close the section) and so that the following
           character goes through the normal checks instead of being
           skipped by a fixed minBPC step. */
        off -= minBPC;
        break;
      }
      if (level == 0)
        return off + minBPC;
      level--;
      off += minBPC;
      break;
    default:
      off += minBPC;
      break;
    }
  }
  throw new PartialTokenException();
}
/**
 * Checks that a literal contained in the specified byte subarray
 * is a legal public identifier and returns a string with
 * the normalized content of the public id.
 * The subarray includes the opening and closing quotes.
 * Normalization collapses every run of whitespace into a single space
 * and removes leading and trailing whitespace; all other characters are
 * copied unchanged.
 * @exception InvalidTokenException if it is not a legal public identifier
 */
public final
String getPublicId(byte[] buf, int off, int end) throws InvalidTokenException {
  StringBuffer sbuf = new StringBuffer();
  /* step over the opening and closing quotes */
  off += minBPC;
  end -= minBPC;
  for (; off != end; off += minBPC) {
    char c = (char)byteToAscii(buf, off);
    switch (byteType(buf, off)) {
    case BT_MINUS:
    case BT_APOS:
    case BT_LPAR:
    case BT_RPAR:
    case BT_PLUS:
    case BT_COMMA:
    case BT_SOL:
    case BT_EQUALS:
    case BT_QUEST:
    case BT_SEMI:
    case BT_EXCL:
    case BT_AST:
    case BT_PERCNT:
    case BT_NUM:
      sbuf.append(c);
      break;
    case BT_S:
      /* a tab is not allowed in a public identifier */
      if (charMatches(buf, off, '\t'))
        throw new InvalidTokenException(off);
      /* fall through */
    case BT_CR:
    case BT_LF:
      /* emit at most one space per run, and none at the start */
      if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) != ' ')
        sbuf.append(' ');
      break;
    case BT_NAME:
    case BT_NMSTRT:
      /* only ASCII name characters are accepted here */
      if ((c & ~0x7f) == 0) {
        sbuf.append(c);
        break;
      }
      // fall through
    default:
      switch (c) {
      case '$':
      case '@':
        /* legal in a public identifier, so they must be kept in the
           result rather than silently dropped */
        sbuf.append(c);
        break;
      default:
        throw new InvalidTokenException(off);
      }
      break;
    }
  }
  /* drop the single trailing space left by trailing whitespace */
  if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) == ' ')
    sbuf.setLength(sbuf.length() - 1);
  return sbuf.toString();
}
/**
 * Returns true if the specified byte subarray is equal to the string.
 * The string must contain only XML significant characters.
 * Every character of the string is expected to occupy exactly
 * <code>minBPC</code> bytes, so a subarray of any other length never
 * matches.
 */
public final
boolean matchesXMLString(byte[] buf, int off, int end, String str) {
  final int count = str.length();
  if (end - off != count*minBPC)
    return false;
  int pos = off;
  for (int i = 0; i < count; i++) {
    if (!charMatches(buf, pos, str.charAt(i)))
      return false;
    pos += minBPC;
  }
  return true;
}
/**
 * Skips over XML whitespace characters at the start of the specified
 * subarray.
 *
 * @return the index of the first non-whitespace character, or an index
 * not less than <code>end</code> if the subarray is all whitespace
 */
public final
int skipS(byte[] buf, int off, int end) {
  for (; off < end; off += minBPC) {
    switch (byteType(buf, off)) {
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* whitespace: keep going */
      continue;
    default:
      return off;
    }
  }
  return off;
}
/**
 * Returns true if the 2-byte character at <code>off</code> is classified
 * by <code>byteType2</code> as a name character or a name start character.
 */
private final boolean isNameChar2(byte[] buf, int off) {
  switch (byteType2(buf, off)) {
  case BT_NAME:
  case BT_NMSTRT:
    return true;
  default:
    return false;
  }
}
/**
 * Returns true if the 3-byte character at <code>off</code> is classified
 * by <code>byteType3</code> as a name character or a name start character.
 */
private final boolean isNameChar3(byte[] buf, int off) {
  switch (byteType3(buf, off)) {
  case BT_NAME:
  case BT_NMSTRT:
    return true;
  default:
    return false;
  }
}
/**
 * Returns true if the 4-byte character at <code>off</code> is classified
 * by <code>byteType4</code> as a name character or a name start character.
 */
private final boolean isNameChar4(byte[] buf, int off) {
  switch (byteType4(buf, off)) {
  case BT_NAME:
  case BT_NMSTRT:
    return true;
  default:
    return false;
  }
}
/*
 * Individual characters listed one per position, beginning with ':'
 * (\u003a) and '_' (\u005f).
 * NOTE(review): judging by the name these are the name start characters
 * that do not belong to any range in nameStartRanges; the code that
 * consumes this string is outside this part of the file - confirm there.
 */
private static final String nameStartSingles =
"\u003a\u005f\u0386\u038c\u03da\u03dc\u03de\u03e0\u0559\u06d5\u093d\u09b2" +
"\u0a5e\u0a8d\u0abd\u0ae0\u0b3d\u0b9c\u0cde\u0e30\u0e84\u0e8a\u0e8d\u0ea5" +
"\u0ea7\u0eb0\u0ebd\u1100\u1109\u113c\u113e\u1140\u114c\u114e\u1150\u1159" +
"\u1163\u1165\u1167\u1169\u1175\u119e\u11a8\u11ab\u11ba\u11eb\u11f0\u11f9" +
"\u1f59\u1f5b\u1f5d\u1fbe\u2126\u212e\u3007";
/*
 * Characters listed in pairs; the first pairs are \u0041\u005a ('A', 'Z')
 * and \u0061\u007a ('a', 'z').
 * NOTE(review): each pair presumably gives the first and last character
 * of an inclusive range of name start characters; the code that consumes
 * this string is outside this part of the file - confirm there.
 */
private static final String nameStartRanges =
"\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u00ff\u0100\u0131" +
"\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5" +
"\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u0388\u038a\u038e\u03a1\u03a3\u03ce" +
"\u03d0\u03d6\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481" +
"\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9" +
"\u0531\u0556\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a" +
"\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06e5\u06e6\u0905\u0939" +
"\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b6\u09b9" +
"\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28" +
"\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a72\u0a74" +
"\u0a85\u0a8b\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9" +
"\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39" +
"\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a" +
"\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0c05\u0c0c" +
"\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c" +
"\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0ce0\u0ce1\u0d05\u0d0c" +
"\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e32\u0e33" +
"\u0e40\u0e45\u0e81\u0e82\u0e87\u0e88\u0e94\u0e97\u0e99\u0e9f\u0ea1\u0ea3" +
"\u0eaa\u0eab\u0ead\u0eae\u0eb2\u0eb3\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69" +
"\u10a0\u10c5\u10d0\u10f6\u1102\u1103\u1105\u1107\u110b\u110c\u110e\u1112" +
"\u1154\u1155\u115f\u1161\u116d\u116e\u1172\u1173\u11ae\u11af\u11b7\u11b8" +
"\u11bc\u11c2\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d\u1f20\u1f45" +
"\u1f48\u1f4d\u1f50\u1f57\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fc2\u1fc4" +
"\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc" +
"\u212a\u212b\u2180\u2182\u3041\u3094\u30a1\u30fa\u3105\u312c\uac00\ud7a3" +
"\u4e00\u9fa5\u3021\u3029";
/*
 * Individual characters that the static initializer of this class marks
 * as BT_NAME, one table entry per character of the string. The two
 * leading entries are below 0x80 and are therefore skipped by
 * setCharType; the first 128 entries of page 0 are copied from
 * asciiTypeTable instead.
 */
private static final String nameSingles =
"\u002d\u002e\u05bf\u05c4\u0670\u093c\u094d\u09bc\u09be\u09bf\u09d7\u0a02" +
"\u0a3c\u0a3e\u0a3f\u0abc\u0b3c\u0bd7\u0d57\u0e31\u0eb1\u0f35\u0f37\u0f39" +
"\u0f3e\u0f3f\u0f97\u0fb9\u20e1\u3099\u309a\u00b7\u02d0\u02d1\u0387\u0640" +
"\u0e46\u0ec6\u3005";
/*
 * Character ranges that the static initializer of this class marks as
 * BT_NAME. The string is read two characters at a time: the first
 * character of each pair is the start of a range and the second is its
 * end, both inclusive. Characters below 0x80 listed here (the ASCII
 * digits) are skipped by setCharType; the first 128 entries of page 0
 * are copied from asciiTypeTable instead.
 */
private static final String nameRanges =
"\u0300\u0345\u0360\u0361\u0483\u0486\u0591\u05a1\u05a3\u05b9\u05bb\u05bd" +
"\u05c1\u05c2\u064b\u0652\u06d6\u06dc\u06dd\u06df\u06e0\u06e4\u06e7\u06e8" +
"\u06ea\u06ed\u0901\u0903\u093e\u094c\u0951\u0954\u0962\u0963\u0981\u0983" +
"\u09c0\u09c4\u09c7\u09c8\u09cb\u09cd\u09e2\u09e3\u0a40\u0a42\u0a47\u0a48" +
"\u0a4b\u0a4d\u0a70\u0a71\u0a81\u0a83\u0abe\u0ac5\u0ac7\u0ac9\u0acb\u0acd" +
"\u0b01\u0b03\u0b3e\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b82\u0b83" +
"\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd\u0c01\u0c03\u0c3e\u0c44\u0c46\u0c48" +
"\u0c4a\u0c4d\u0c55\u0c56\u0c82\u0c83\u0cbe\u0cc4\u0cc6\u0cc8\u0cca\u0ccd" +
"\u0cd5\u0cd6\u0d02\u0d03\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d\u0e34\u0e3a" +
"\u0e47\u0e4e\u0eb4\u0eb9\u0ebb\u0ebc\u0ec8\u0ecd\u0f18\u0f19\u0f71\u0f84" +
"\u0f86\u0f8b\u0f90\u0f95\u0f99\u0fad\u0fb1\u0fb7\u20d0\u20dc\u302a\u302f" +
"\u0030\u0039\u0660\u0669\u06f0\u06f9\u0966\u096f\u09e6\u09ef\u0a66\u0a6f" +
"\u0ae6\u0aef\u0b66\u0b6f\u0be7\u0bef\u0c66\u0c6f\u0ce6\u0cef\u0d66\u0d6f" +
"\u0e50\u0e59\u0ed0\u0ed9\u0f20\u0f29\u3031\u3035\u309d\u309e\u30fc\u30fe";
/*
 * Two-level character type table. The first index is the high byte of a
 * char (c >> 8) and the second is its low byte (c & 0xFF); each entry
 * holds a BT_* type code. The table is built by the static initializer
 * of this class. Several first-level slots may refer to the same
 * 256-byte array, so a page must not be modified through one slot on
 * the assumption that it is private to that slot.
 * NOTE(review): the field is not actually declared final; presumably it
 * is only assigned in the static initializer -- confirm before relying
 * on that.
 */
/* final */ static byte[][] charTypeTable;
/**
 * Records the type of a single character in charTypeTable.
 * Characters below 0x80 are ignored. The 256-entry page for the
 * character's high byte is created on demand and pre-filled with
 * BT_OTHER before the entry for the character is written.
 * NOTE: if the page is an array shared between several high-byte slots
 * (as installed by the range overload), the write is visible through
 * every slot that shares it.
 *
 * @param c the character whose type is being set
 * @param type the BT_* code to store; it is narrowed to a byte
 */
private static void setCharType(char c, int type) {
  if (c >= 0x80) {
    final int page = c >> 8;
    byte[] row = charTypeTable[page];
    if (row == null) {
      // First entry on this page: start from "everything is BT_OTHER".
      row = new byte[256];
      for (int k = 0; k < row.length; k++) {
        row[k] = BT_OTHER;
      }
      charTypeTable[page] = row;
    }
    row[c & 0xFF] = (byte)type;
  }
}
/**
 * Records the same type for every character from min to max, both
 * inclusive. Whenever the current position is at the start of a page
 * (low byte zero) and the whole page lies inside the range, the page
 * slot is pointed at a single array filled with the type; that array is
 * created once per call and reused for every such page. The remaining
 * characters are set one at a time through the single-character
 * overload, which ignores characters below 0x80.
 *
 * @param min first character of the range
 * @param max last character of the range; expected to be >= min
 * @param type the BT_* code to store; it is narrowed to a byte
 */
private static void setCharType(char min, char max, int type) {
  byte[] fullPage = null;
  char cur = min;
  while (true) {
    if ((cur & 0xFF) == 0) {
      // cur + 0xFF is computed as an int, so it cannot wrap here.
      while (cur + 0xFF <= max) {
        if (fullPage == null) {
          fullPage = new byte[256];
          for (int k = 0; k < fullPage.length; k++) {
            fullPage[k] = (byte)type;
          }
        }
        charTypeTable[cur >> 8] = fullPage;
        if (cur + 0xFF == max) {
          // The range ends exactly on this page boundary.
          return;
        }
        cur += 0x100;
      }
    }
    setCharType(cur, type);
    if (cur == max) {
      return;
    }
    cur++;
  }
}
/*
 * Builds charTypeTable. The order of the steps matters: later writes
 * replace earlier ones, so name-start entries take precedence over name
 * entries wherever both tables list a character.
 */
static {
  charTypeTable = new byte[256][];

  // Name characters: singles, then inclusive (first, last) pairs.
  for (int pos = 0; pos < nameSingles.length(); pos++) {
    setCharType(nameSingles.charAt(pos), BT_NAME);
  }
  for (int pos = 0; pos < nameRanges.length(); pos += 2) {
    char first = nameRanges.charAt(pos);
    char last = nameRanges.charAt(pos + 1);
    setCharType(first, last, BT_NAME);
  }

  // Name-start characters: singles, then inclusive (first, last) pairs.
  for (int pos = 0; pos < nameStartSingles.length(); pos++) {
    setCharType(nameStartSingles.charAt(pos), BT_NMSTRT);
  }
  for (int pos = 0; pos < nameStartRanges.length(); pos += 2) {
    char first = nameStartRanges.charAt(pos);
    char last = nameStartRanges.charAt(pos + 1);
    setCharType(first, last, BT_NMSTRT);
  }

  // Surrogate code units and the last two code units of the 16-bit range.
  setCharType('\uD800', '\uDBFF', BT_LEAD4);
  setCharType('\uDC00', '\uDFFF', BT_MALFORM);
  setCharType('\uFFFE', '\uFFFF', BT_NONXML);

  // Every page that received no entry shares one all-BT_OTHER array.
  byte[] unassigned = new byte[256];
  for (int k = 0; k < unassigned.length; k++) {
    unassigned[k] = BT_OTHER;
  }
  for (int page = 0; page < 256; page++) {
    if (charTypeTable[page] == null) {
      charTypeTable[page] = unassigned;
    }
  }

  // Overlay the ASCII half of page 0 with asciiTypeTable. Page 0 was
  // allocated privately above (nameSingles contains U+00B7), so this does
  // not write into the shared all-BT_OTHER array.
  System.arraycopy(asciiTypeTable, 0, charTypeTable[0], 0, 128);
}
/**
 * Reports the smallest number of bytes that this encoding uses to
 * represent one character.
 *
 * @return the minimum bytes per character; the value will be 1, 2 or 4
 */
public final int getMinBytesPerChar() {
  return this.minBPC;
}
}