Context Navigation

regcomp.c

Visit:

Last change on this file was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 110.1 KB

Rev	Line
[3076]	1	/* Extended regular expression matching and search library.
	2	Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
	3	This file is part of the GNU C Library.
	4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
	17	License along with the GNU C Library; if not, write to the Free
	18	Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	19	02110-1301 USA. */
	20
	21	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
	22	int length, reg_syntax_t syntax);
	23	static void re_compile_fastmap_iter (regex_t *bufp,
	24	const re_dfastate_t *init_state,
	25	char *fastmap);
	26	static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
	27	static void init_word_char (re_dfa_t *dfa);
	28	#ifdef RE_ENABLE_I18N
	29	static void free_charset (re_charset_t *cset);
	30	#endif /* RE_ENABLE_I18N */
	31	static void free_workarea_compile (regex_t *preg);
	32	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
	33	#ifdef RE_ENABLE_I18N
	34	static void optimize_utf8 (re_dfa_t *dfa);
	35	#endif
	36	static reg_errcode_t analyze (regex_t *preg);
	37	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
	38	static reg_errcode_t preorder (bin_tree_t *root,
	39	reg_errcode_t (fn (void , bin_tree_t )),
	40	void *extra);
	41	static reg_errcode_t postorder (bin_tree_t *root,
	42	reg_errcode_t (fn (void , bin_tree_t )),
	43	void *extra);
	44	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
	45	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
	46	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
	47	bin_tree_t *node);
	48	static reg_errcode_t calc_first (void extra, bin_tree_t node);
	49	static reg_errcode_t calc_next (void extra, bin_tree_t node);
	50	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
	51	static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
	52	int top_clone_node, int root_node,
	53	unsigned int constraint);
	54	static reg_errcode_t duplicate_node (int new_idx, re_dfa_t dfa, int org_idx,
	55	unsigned int constraint);
	56	static int search_duplicated_node (re_dfa_t *dfa, int org_node,
	57	unsigned int constraint);
	58	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
	59	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
	60	int node, int root);
	61	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
	62	static int fetch_number (re_string_t input, re_token_t token,
	63	reg_syntax_t syntax);
	64	static void fetch_token (re_token_t result, re_string_t input,
	65	reg_syntax_t syntax);
	66	static int peek_token (re_token_t token, re_string_t input,
	67	reg_syntax_t syntax);
	68	static int peek_token_bracket (re_token_t token, re_string_t input,
	69	reg_syntax_t syntax);
	70	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
	71	reg_syntax_t syntax, reg_errcode_t *err);
	72	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
	73	re_token_t *token, reg_syntax_t syntax,
	74	int nest, reg_errcode_t *err);
	75	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
	76	re_token_t *token, reg_syntax_t syntax,
	77	int nest, reg_errcode_t *err);
	78	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
	79	re_token_t *token, reg_syntax_t syntax,
	80	int nest, reg_errcode_t *err);
	81	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
	82	re_token_t *token, reg_syntax_t syntax,
	83	int nest, reg_errcode_t *err);
	84	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
	85	re_dfa_t dfa, re_token_t token,
	86	reg_syntax_t syntax, reg_errcode_t *err);
	87	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
	88	re_token_t *token, reg_syntax_t syntax,
	89	reg_errcode_t *err);
	90	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
	91	re_string_t *regexp,
	92	re_token_t *token, int token_len,
	93	re_dfa_t *dfa,
	94	reg_syntax_t syntax,
	95	int accept_hyphen);
	96	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
	97	re_string_t *regexp,
	98	re_token_t *token);
	99	#ifndef _LIBC
	100	# ifdef RE_ENABLE_I18N
	101	static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
	102	re_charset_t mbcset, int range_alloc,
	103	bracket_elem_t *start_elem,
	104	bracket_elem_t *end_elem);
	105	static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
	106	re_charset_t *mbcset,
	107	int *coll_sym_alloc,
	108	const unsigned char *name);
	109	# else /* not RE_ENABLE_I18N */
	110	static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
	111	bracket_elem_t *start_elem,
	112	bracket_elem_t *end_elem);
	113	static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
	114	const unsigned char *name);
	115	# endif /* not RE_ENABLE_I18N */
	116	#endif /* not _LIBC */
	117	#ifdef RE_ENABLE_I18N
	118	static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
	119	re_charset_t *mbcset,
	120	int *equiv_class_alloc,
	121	const unsigned char *name);
	122	static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
	123	re_bitset_ptr_t sbcset,
	124	re_charset_t *mbcset,
	125	int *char_class_alloc,
	126	const char *class_name,
	127	reg_syntax_t syntax);
	128	#else /* not RE_ENABLE_I18N */
	129	static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
	130	const unsigned char *name);
	131	static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
	132	re_bitset_ptr_t sbcset,
	133	const char *class_name,
	134	reg_syntax_t syntax);
	135	#endif /* not RE_ENABLE_I18N */
	136	static bin_tree_t build_charclass_op (re_dfa_t dfa,
	137	unsigned RE_TRANSLATE_TYPE trans,
	138	const char *class_name,
	139	const char *extra,
	140	int non_match, reg_errcode_t *err);
	141	static bin_tree_t create_tree (re_dfa_t dfa,
	142	bin_tree_t left, bin_tree_t right,
	143	re_token_type_t type);
	144	static bin_tree_t create_token_tree (re_dfa_t dfa,
	145	bin_tree_t left, bin_tree_t right,
	146	const re_token_t *token);
	147	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
	148	static void free_token (re_token_t *node);
	149	static reg_errcode_t free_tree (void extra, bin_tree_t node);
	150	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
	151
	152
	153	/* This table gives an error message for each of the error codes listed
	154	in regex.h. Obviously the order here has to be same as there.
	155	POSIX doesn't require that we do anything for REG_NOERROR,
	156	but why not be nice? */
	157
	158	const ERRMSG_TYPE __re_error_msgid[] attribute_hidden =
	159	{
	160	#define REG_NOERROR_IDX 0
	161	gettext_noop ("Success") /* REG_NOERROR */
	162	ERRMSG_SEPARATOR
	163	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
	164	gettext_noop ("No match") /* REG_NOMATCH */
	165	ERRMSG_SEPARATOR
	166	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
	167	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
	168	ERRMSG_SEPARATOR
	169	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
	170	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
	171	ERRMSG_SEPARATOR
	172	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
	173	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
	174	ERRMSG_SEPARATOR
	175	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
	176	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
	177	ERRMSG_SEPARATOR
	178	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
	179	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
	180	ERRMSG_SEPARATOR
	181	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
	182	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
	183	ERRMSG_SEPARATOR
	184	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
	185	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
	186	ERRMSG_SEPARATOR
	187	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
	188	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
	189	ERRMSG_SEPARATOR
	190	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
	191	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
	192	ERRMSG_SEPARATOR
	193	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
	194	gettext_noop ("Invalid range end") /* REG_ERANGE */
	195	ERRMSG_SEPARATOR
	196	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
	197	gettext_noop ("Memory exhausted") /* REG_ESPACE */
	198	ERRMSG_SEPARATOR
	199	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
	200	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
	201	ERRMSG_SEPARATOR
	202	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
	203	gettext_noop ("Premature end of regular expression") /* REG_EEND */
	204	ERRMSG_SEPARATOR
	205	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
	206	gettext_noop ("Regular expression too big") /* REG_ESIZE */
	207	ERRMSG_SEPARATOR
	208	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
	209	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
	210	ERRMSG_SEPARATOR
	211	};
	212
	213	const size_t __re_error_msgid_idx[] attribute_hidden =
	214	{
	215	REG_NOERROR_IDX,
	216	REG_NOMATCH_IDX,
	217	REG_BADPAT_IDX,
	218	REG_ECOLLATE_IDX,
	219	REG_ECTYPE_IDX,
	220	REG_EESCAPE_IDX,
	221	REG_ESUBREG_IDX,
	222	REG_EBRACK_IDX,
	223	REG_EPAREN_IDX,
	224	REG_EBRACE_IDX,
	225	REG_BADBR_IDX,
	226	REG_ERANGE_IDX,
	227	REG_ESPACE_IDX,
	228	REG_BADRPT_IDX,
	229	REG_EEND_IDX,
	230	REG_ESIZE_IDX,
	231	REG_ERPAREN_IDX
	232	};
	233
	234
	235	/* Entry points for GNU code. */
	236
	237	/* re_compile_pattern is the GNU regular expression compiler: it
	238	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
	239	Returns 0 if the pattern was valid, otherwise an error string.
	240
	241	Assumes the `allocated' (and perhaps `buffer') and `translate' fields
	242	are set in BUFP on entry. */
	243
	244	const char *
	245	re_compile_pattern (pattern, length, bufp)
	246	const char *pattern;
	247	size_t length;
	248	struct re_pattern_buffer *bufp;
	249	{
	250	reg_errcode_t ret;
	251
	252	/* And GNU code determines whether or not to get register information
	253	by passing null for the REGS argument to re_match, etc., not by
	254	setting no_sub, unless RE_NO_SUB is set. */
	255	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
	256
	257	/* Match anchors at newline. */
	258	bufp->newline_anchor = 1;
	259
	260	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
	261
	262	if (!ret)
	263	return NULL;
	264	return gettext (RE_ERRMSG(ret));
	265	}
	266	#ifdef _LIBC
	267	weak_alias (__re_compile_pattern, re_compile_pattern)
	268	#endif
	269
	270	/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
	271	also be assigned to arbitrarily: each pattern buffer stores its own
	272	syntax, so it can be changed between regex compilations. */
	273	/* This has no initializer because initialized variables in Emacs
	274	become read-only after dumping. */
	275	reg_syntax_t re_syntax_options;
	276
	277
	278	/* Specify the precise syntax of regexps for compilation. This provides
	279	for compatibility for various utilities which historically have
	280	different, incompatible syntaxes.
	281
	282	The argument SYNTAX is a bit mask comprised of the various bits
	283	defined in regex.h. We return the old syntax. */
	284
	285	reg_syntax_t
	286	re_set_syntax (syntax)
	287	reg_syntax_t syntax;
	288	{
	289	reg_syntax_t ret = re_syntax_options;
	290
	291	re_syntax_options = syntax;
	292	return ret;
	293	}
	294	#ifdef _LIBC
	295	weak_alias (__re_set_syntax, re_set_syntax)
	296	#endif
	297
	298	int
	299	re_compile_fastmap (bufp)
	300	struct re_pattern_buffer *bufp;
	301	{
	302	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
	303	char *fastmap = bufp->fastmap;
	304
	305	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
	306	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
	307	if (dfa->init_state != dfa->init_state_word)
	308	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
	309	if (dfa->init_state != dfa->init_state_nl)
	310	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
	311	if (dfa->init_state != dfa->init_state_begbuf)
	312	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
	313	bufp->fastmap_accurate = 1;
	314	return 0;
	315	}
	316	#ifdef _LIBC
	317	weak_alias (__re_compile_fastmap, re_compile_fastmap)
	318	#endif
	319
	320	static inline void
	321	__attribute ((always_inline))
	322	re_set_fastmap (char *fastmap, int icase, int ch)
	323	{
	324	fastmap[ch] = 1;
	325	if (icase)
	326	fastmap[tolower (ch)] = 1;
	327	}
	328
	329	/* Helper function for re_compile_fastmap.
	330	Compile fastmap for the initial_state INIT_STATE. */
	331
	332	static void
	333	re_compile_fastmap_iter (bufp, init_state, fastmap)
	334	regex_t *bufp;
	335	const re_dfastate_t *init_state;
	336	char *fastmap;
	337	{
	338	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
	339	int node_cnt;
	340	int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
	341	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
	342	{
	343	int node = init_state->nodes.elems[node_cnt];
	344	re_token_type_t type = dfa->nodes[node].type;
	345
	346	if (type == CHARACTER)
	347	{
	348	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
	349	#ifdef RE_ENABLE_I18N
	350	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	351	{
	352	unsigned char buf = re_malloc (unsigned char, dfa->mb_cur_max), p;
	353	wchar_t wc;
	354	mbstate_t state;
	355
	356	p = buf;
	357	*p++ = dfa->nodes[node].opr.c;
	358	while (++node < dfa->nodes_len
	359	&& dfa->nodes[node].type == CHARACTER
	360	&& dfa->nodes[node].mb_partial)
	361	*p++ = dfa->nodes[node].opr.c;
	362	memset (&state, 0, sizeof (state));
	363	if (mbrtowc (&wc, (const char *) buf, p - buf,
	364	&state) == p - buf
	365	&& __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
	366	re_set_fastmap (fastmap, 0, buf[0]);
	367	re_free (buf);
	368	}
	369	#endif
	370	}
	371	else if (type == SIMPLE_BRACKET)
	372	{
	373	int i, j, ch;
	374	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
	375	for (j = 0; j < UINT_BITS; ++j, ++ch)
	376	if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
	377	re_set_fastmap (fastmap, icase, ch);
	378	}
	379	#ifdef RE_ENABLE_I18N
	380	else if (type == COMPLEX_BRACKET)
	381	{
	382	int i;
	383	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
	384	if (cset->non_match \|\| cset->ncoll_syms \|\| cset->nequiv_classes
	385	\|\| cset->nranges \|\| cset->nchar_classes)
	386	{
	387	# ifdef _LIBC
	388	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
	389	{
	390	/* In this case we want to catch the bytes which are
	391	the first byte of any collation elements.
	392	e.g. In da_DK, we want to catch 'a' since "aa"
	393	is a valid collation element, and don't catch
	394	'b' since 'b' is the only collation element
	395	which starts from 'b'. */
	396	int j, ch;
	397	const int32_t table = (const int32_t )
	398	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	399	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
	400	for (j = 0; j < UINT_BITS; ++j, ++ch)
	401	if (table[ch] < 0)
	402	re_set_fastmap (fastmap, icase, ch);
	403	}
	404	# else
	405	if (dfa->mb_cur_max > 1)
	406	for (i = 0; i < SBC_MAX; ++i)
	407	if (__btowc (i) == WEOF)
	408	re_set_fastmap (fastmap, icase, i);
	409	# endif /* not _LIBC */
	410	}
	411	for (i = 0; i < cset->nmbchars; ++i)
	412	{
	413	char buf[256];
	414	mbstate_t state;
	415	memset (&state, '\0', sizeof (state));
	416	__wcrtomb (buf, cset->mbchars[i], &state);
	417	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
	418	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	419	{
	420	__wcrtomb (buf, towlower (cset->mbchars[i]), &state);
	421	re_set_fastmap (fastmap, 0, (unsigned char ) buf);
	422	}
	423	}
	424	}
	425	#endif /* RE_ENABLE_I18N */
	426	else if (type == OP_PERIOD
	427	#ifdef RE_ENABLE_I18N
	428	\|\| type == OP_UTF8_PERIOD
	429	#endif /* RE_ENABLE_I18N */
	430	\|\| type == END_OF_RE)
	431	{
	432	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
	433	if (type == END_OF_RE)
	434	bufp->can_be_null = 1;
	435	return;
	436	}
	437	}
	438	}
	439
	440
	441	/* Entry point for POSIX code. */
	442	/* regcomp takes a regular expression as a string and compiles it.
	443
	444	PREG is a regex_t *. We do not expect any fields to be initialized,
	445	since POSIX says we shouldn't. Thus, we set
	446
	447	`buffer' to the compiled pattern;
	448	`used' to the length of the compiled pattern;
	449	`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
	450	REG_EXTENDED bit in CFLAGS is set; otherwise, to
	451	RE_SYNTAX_POSIX_BASIC;
	452	`newline_anchor' to REG_NEWLINE being set in CFLAGS;
	453	`fastmap' to an allocated space for the fastmap;
	454	`fastmap_accurate' to zero;
	455	`re_nsub' to the number of subexpressions in PATTERN.
	456
	457	PATTERN is the address of the pattern string.
	458
	459	CFLAGS is a series of bits which affect compilation.
	460
	461	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
	462	use POSIX basic syntax.
	463
	464	If REG_NEWLINE is set, then . and [^...] don't match newline.
	465	Also, regexec will try a match beginning after every newline.
	466
	467	If REG_ICASE is set, then we considers upper- and lowercase
	468	versions of letters to be equivalent when matching.
	469
	470	If REG_NOSUB is set, then when PREG is passed to regexec, that
	471	routine will report only success or failure, and nothing about the
	472	registers.
	473
	474	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
	475	the return codes and their meanings.) */
	476
	477	int
	478	regcomp (preg, pattern, cflags)
	479	regex_t *__restrict preg;
	480	const char *__restrict pattern;
	481	int cflags;
	482	{
	483	reg_errcode_t ret;
	484	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
	485	: RE_SYNTAX_POSIX_BASIC);
	486
	487	preg->buffer = NULL;
	488	preg->allocated = 0;
	489	preg->used = 0;
	490
	491	/* Try to allocate space for the fastmap. */
	492	preg->fastmap = re_malloc (char, SBC_MAX);
	493	if (BE (preg->fastmap == NULL, 0))
	494	return REG_ESPACE;
	495
	496	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
	497
	498	/* If REG_NEWLINE is set, newlines are treated differently. */
	499	if (cflags & REG_NEWLINE)
	500	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
	501	syntax &= ~RE_DOT_NEWLINE;
	502	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
	503	/* It also changes the matching behavior. */
	504	preg->newline_anchor = 1;
	505	}
	506	else
	507	preg->newline_anchor = 0;
	508	preg->no_sub = !!(cflags & REG_NOSUB);
	509	preg->translate = NULL;
	510
	511	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
	512
	513	/* POSIX doesn't distinguish between an unmatched open-group and an
	514	unmatched close-group: both are REG_EPAREN. */
	515	if (ret == REG_ERPAREN)
	516	ret = REG_EPAREN;
	517
	518	/* We have already checked preg->fastmap != NULL. */
	519	if (BE (ret == REG_NOERROR, 1))
	520	/* Compute the fastmap now, since regexec cannot modify the pattern
	521	buffer. This function never fails in this implementation. */
	522	(void) re_compile_fastmap (preg);
	523	else
	524	{
	525	/* Some error occurred while compiling the expression. */
	526	re_free (preg->fastmap);
	527	preg->fastmap = NULL;
	528	}
	529
	530	return (int) ret;
	531	}
	532	#ifdef _LIBC
	533	weak_alias (__regcomp, regcomp)
	534	#endif
	535
	536	/* Returns a message corresponding to an error code, ERRCODE, returned
	537	from either regcomp or regexec. We don't use PREG here. */
	538
	539	size_t
	540	regerror (errcode, preg, errbuf, errbuf_size)
	541	int errcode;
	542	const regex_t *preg;
	543	char *errbuf;
	544	size_t errbuf_size;
	545	{
	546	const char *msg;
	547	size_t msg_size;
	548
	549	if (BE (errcode < 0
	550	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
	551	/ sizeof (__re_error_msgid_idx[0])), 0))
	552	/* Only error codes returned by the rest of the code should be passed
	553	to this routine. If we are given anything else, or if other regex
	554	code generates an invalid error code, then the program has a bug.
	555	Dump core so we can fix it. */
	556	abort ();
	557
	558	msg = gettext (RE_ERRMSG(errcode));
	559
	560	msg_size = strlen (msg) + 1; /* Includes the null. */
	561
	562	if (BE (errbuf_size != 0, 1))
	563	{
	564	if (BE (msg_size > errbuf_size, 0))
	565	{
	566	memcpy (errbuf, msg, errbuf_size - 1);
	567	errbuf[errbuf_size - 1] = 0;
	568	}
	569	else
	570	memcpy (errbuf, msg, msg_size);
	571	}
	572
	573	return msg_size;
	574	}
	575	#ifdef _LIBC
	576	weak_alias (__regerror, regerror)
	577	#endif
	578
	579
	580	#ifdef RE_ENABLE_I18N
	581	/* This static array is used for the map to single-byte characters when
	582	UTF-8 is used. Otherwise we would allocate memory just to initialize
	583	it the same all the time. UTF-8 is the preferred encoding so this is
	584	a worthwhile optimization. */
	585	static const bitset utf8_sb_map =
	586	{
	587	/* Set the first 128 bits. */
	588	# if UINT_MAX == 0xffffffff
	589	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
	590	# else
	591	# error "Add case for new unsigned int size"
	592	# endif
	593	};
	594	#endif
	595
	596
	597	static void
	598	free_dfa_content (re_dfa_t *dfa)
	599	{
	600	int i, j;
	601
	602	if (dfa->nodes)
	603	for (i = 0; i < dfa->nodes_len; ++i)
	604	free_token (dfa->nodes + i);
	605	re_free (dfa->nexts);
	606	for (i = 0; i < dfa->nodes_len; ++i)
	607	{
	608	if (dfa->eclosures != NULL)
	609	re_node_set_free (dfa->eclosures + i);
	610	if (dfa->inveclosures != NULL)
	611	re_node_set_free (dfa->inveclosures + i);
	612	if (dfa->edests != NULL)
	613	re_node_set_free (dfa->edests + i);
	614	}
	615	re_free (dfa->edests);
	616	re_free (dfa->eclosures);
	617	re_free (dfa->inveclosures);
	618	re_free (dfa->nodes);
	619
	620	if (dfa->state_table)
	621	for (i = 0; i <= dfa->state_hash_mask; ++i)
	622	{
	623	struct re_state_table_entry *entry = dfa->state_table + i;
	624	for (j = 0; j < entry->num; ++j)
	625	{
	626	re_dfastate_t *state = entry->array[j];
	627	free_state (state);
	628	}
	629	re_free (entry->array);
	630	}
	631	re_free (dfa->state_table);
	632	#ifdef RE_ENABLE_I18N
	633	if (dfa->sb_char != utf8_sb_map)
	634	re_free (dfa->sb_char);
	635	#endif
	636	re_free (dfa->subexp_map);
	637	#ifdef DEBUG
	638	re_free (dfa->re_str);
	639	#endif
	640
	641	re_free (dfa);
	642	}
	643
	644
	645	/* Free dynamically allocated space used by PREG. */
	646
	647	void
	648	regfree (preg)
	649	regex_t *preg;
	650	{
	651	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	652	if (BE (dfa != NULL, 1))
	653	free_dfa_content (dfa);
	654	preg->buffer = NULL;
	655	preg->allocated = 0;
	656
	657	re_free (preg->fastmap);
	658	preg->fastmap = NULL;
	659
	660	re_free (preg->translate);
	661	preg->translate = NULL;
	662	}
	663	#ifdef _LIBC
	664	weak_alias (__regfree, regfree)
	665	#endif
	666
	667
	668	/* Entry points compatible with 4.2 BSD regex library. We don't define
	669	them unless specifically requested. */
	670
	671	#if defined _REGEX_RE_COMP \|\| defined _LIBC
	672
	673	/* BSD has one and only one pattern buffer. */
	674	static struct re_pattern_buffer re_comp_buf;
	675
	676	char *
	677	# ifdef _LIBC
	678	/* Make these definitions weak in libc, so POSIX programs can redefine
	679	these names if they don't use our functions, and still use
	680	regcomp/regexec above without link errors. */
	681	weak_function
	682	# endif
	683	re_comp (s)
	684	const char *s;
	685	{
	686	reg_errcode_t ret;
	687	char *fastmap;
	688
	689	if (!s)
	690	{
	691	if (!re_comp_buf.buffer)
	692	return gettext ("No previous regular expression");
	693	return 0;
	694	}
	695
	696	if (re_comp_buf.buffer)
	697	{
	698	fastmap = re_comp_buf.fastmap;
	699	re_comp_buf.fastmap = NULL;
	700	__regfree (&re_comp_buf);
	701	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
	702	re_comp_buf.fastmap = fastmap;
	703	}
	704
	705	if (re_comp_buf.fastmap == NULL)
	706	{
	707	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
	708	if (re_comp_buf.fastmap == NULL)
	709	return (char *) gettext (RE_ERRMSG(REG_ESPACE_IDX));
	710	}
	711
	712	/* Since `re_exec' always passes NULL for the `regs' argument, we
	713	don't need to initialize the pattern buffer fields which affect it. */
	714
	715	/* Match anchors at newlines. */
	716	re_comp_buf.newline_anchor = 1;
	717
	718	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
	719
	720	if (!ret)
	721	return NULL;
	722
	723	/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
	724	return (char *) gettext (RE_ERRMSG(ret));
	725	}
	726
	727	#ifdef _LIBC
	728	libc_freeres_fn (free_mem)
	729	{
	730	__regfree (&re_comp_buf);
	731	}
	732	#endif
	733
	734	#endif /* _REGEX_RE_COMP */
	735
	736
	737	/* Internal entry point.
	738	Compile the regular expression PATTERN, whose length is LENGTH.
	739	SYNTAX indicate regular expression's syntax. */
	740
	741	static reg_errcode_t
	742	re_compile_internal (preg, pattern, length, syntax)
	743	regex_t *preg;
	744	const char * pattern;
	745	int length;
	746	reg_syntax_t syntax;
	747	{
	748	reg_errcode_t err = REG_NOERROR;
	749	re_dfa_t *dfa;
	750	re_string_t regexp;
	751
	752	/* Initialize the pattern buffer. */
	753	preg->fastmap_accurate = 0;
	754	preg->syntax = syntax;
	755	preg->not_bol = preg->not_eol = 0;
	756	preg->used = 0;
	757	preg->re_nsub = 0;
	758	preg->can_be_null = 0;
	759	preg->regs_allocated = REGS_UNALLOCATED;
	760
	761	/* Initialize the dfa. */
	762	dfa = (re_dfa_t *) preg->buffer;
	763	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
	764	{
	765	/* If zero allocated, but buffer is non-null, try to realloc
	766	enough space. This loses if buffer's address is bogus, but
	767	that is the user's responsibility. If ->buffer is NULL this
	768	is a simple allocation. */
	769	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
	770	if (dfa == NULL)
	771	return REG_ESPACE;
	772	preg->allocated = sizeof (re_dfa_t);
	773	preg->buffer = (unsigned char *) dfa;
	774	}
	775	preg->used = sizeof (re_dfa_t);
	776
	777	err = init_dfa (dfa, length);
	778	if (BE (err != REG_NOERROR, 0))
	779	{
	780	free_dfa_content (dfa);
	781	preg->buffer = NULL;
	782	preg->allocated = 0;
	783	return err;
	784	}
	785	#ifdef DEBUG
	786	dfa->re_str = re_malloc (char, length + 1);
	787	strncpy (dfa->re_str, pattern, length + 1);
	788	#endif
	789
	790	err = re_string_construct (&regexp, pattern, length, preg->translate,
	791	syntax & RE_ICASE, dfa);
	792	if (BE (err != REG_NOERROR, 0))
	793	{
	794	re_compile_internal_free_return:
	795	free_workarea_compile (preg);
	796	re_string_destruct (&regexp);
	797	free_dfa_content (dfa);
	798	preg->buffer = NULL;
	799	preg->allocated = 0;
	800	return err;
	801	}
	802
	803	/* Parse the regular expression, and build a structure tree. */
	804	preg->re_nsub = 0;
	805	dfa->str_tree = parse (&regexp, preg, syntax, &err);
	806	if (BE (dfa->str_tree == NULL, 0))
	807	goto re_compile_internal_free_return;
	808
	809	/* Analyze the tree and create the nfa. */
	810	err = analyze (preg);
	811	if (BE (err != REG_NOERROR, 0))
	812	goto re_compile_internal_free_return;
	813
	814	#ifdef RE_ENABLE_I18N
	815	/* If possible, do searching in single byte encoding to speed things up. */
	816	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
	817	optimize_utf8 (dfa);
	818	#endif
	819
	820	/* Then create the initial state of the dfa. */
	821	err = create_initial_state (dfa);
	822
	823	/* Release work areas. */
	824	free_workarea_compile (preg);
	825	re_string_destruct (&regexp);
	826
	827	if (BE (err != REG_NOERROR, 0))
	828	{
	829	free_dfa_content (dfa);
	830	preg->buffer = NULL;
	831	preg->allocated = 0;
	832	}
	833
	834	return err;
	835	}
	836
	837	/* Initialize DFA. We use the length of the regular expression PAT_LEN
	838	as the initial length of some arrays. */
	839
	840	static reg_errcode_t
	841	init_dfa (dfa, pat_len)
	842	re_dfa_t *dfa;
	843	int pat_len;
	844	{
	845	int table_size;
	846	#ifndef _LIBC
	847	char *codeset_name;
	848	#endif
	849
	850	memset (dfa, '\0', sizeof (re_dfa_t));
	851
	852	/* Force allocation of str_tree_storage the first time. */
	853	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	854
	855	dfa->nodes_alloc = pat_len + 1;
	856	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
	857
	858	dfa->states_alloc = pat_len + 1;
	859
	860	/* table_size = 2 ^ ceil(log pat_len) */
	861	for (table_size = 1; table_size > 0; table_size <<= 1)
	862	if (table_size > pat_len)
	863	break;
	864
	865	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
	866	dfa->state_hash_mask = table_size - 1;
	867
	868	dfa->mb_cur_max = MB_CUR_MAX;
	869	#ifdef _LIBC
	870	if (dfa->mb_cur_max == 6
	871	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
	872	dfa->is_utf8 = 1;
	873	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
	874	!= 0);
	875	#else
	876	# ifdef HAVE_LANGINFO_CODESET
	877	codeset_name = nl_langinfo (CODESET);
	878	# else
	879	codeset_name = getenv ("LC_ALL");
	880	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
	881	codeset_name = getenv ("LC_CTYPE");
	882	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
	883	codeset_name = getenv ("LANG");
	884	if (codeset_name == NULL)
	885	codeset_name = "";
	886	else if (strchr (codeset_name, '.') != NULL)
	887	codeset_name = strchr (codeset_name, '.') + 1;
	888	# endif
	889
	890	/* strcasecmp isn't a standard interface. brute force check */
	891	#if 0
	892	if (strcasecmp (codeset_name, "UTF-8") == 0
	893	\|\| strcasecmp (codeset_name, "UTF8") == 0)
	894	dfa->is_utf8 = 1;
	895	#else
	896	if ( (codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
	897	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
	898	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
	899	&& (codeset_name[3] == '-'
	900	? codeset_name[4] == '8' && codeset_name[5] == '\0'
	901	: codeset_name[3] == '8' && codeset_name[4] == '\0'))
	902	dfa->is_utf8 = 1;
	903	#endif
	904
	905	/* We check exhaustively in the loop below if this charset is a
	906	superset of ASCII. */
	907	dfa->map_notascii = 0;
	908	#endif
	909
	910	#ifdef RE_ENABLE_I18N
	911	if (dfa->mb_cur_max > 1)
	912	{
	913	if (dfa->is_utf8)
	914	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
	915	else
	916	{
	917	int i, j, ch;
	918
	919	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
	920	if (BE (dfa->sb_char == NULL, 0))
	921	return REG_ESPACE;
	922
	923	/* Clear all bits by, then set those corresponding to single
	924	byte chars. */
	925	bitset_empty (dfa->sb_char);
	926
	927	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
	928	for (j = 0; j < UINT_BITS; ++j, ++ch)
	929	{
	930	wchar_t wch = __btowc (ch);
	931	if (wch != WEOF)
	932	dfa->sb_char[i] \|= 1 << j;
	933	# ifndef _LIBC
	934	if (isascii (ch) && wch != (wchar_t) ch)
	935	dfa->map_notascii = 1;
	936	# endif
	937	}
	938	}
	939	}
	940	#endif
	941
	942	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
	943	return REG_ESPACE;
	944	return REG_NOERROR;
	945	}
	946
	947	/* Initialize WORD_CHAR table, which indicate which character is
	948	"word". In this case "word" means that it is the word construction
	949	character used by some operators like "\<", "\>", etc. */
	950
	951	static void
	952	init_word_char (dfa)
	953	re_dfa_t *dfa;
	954	{
	955	int i, j, ch;
	956	dfa->word_ops_used = 1;
	957	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
	958	for (j = 0; j < UINT_BITS; ++j, ++ch)
	959	if (isalnum (ch) \|\| ch == '_')
	960	dfa->word_char[i] \|= 1 << j;
	961	}
	962
	963	/* Free the work area which are only used while compiling. */
	964
	965	static void
	966	free_workarea_compile (preg)
	967	regex_t *preg;
	968	{
	969	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	970	bin_tree_storage_t storage, next;
	971	for (storage = dfa->str_tree_storage; storage; storage = next)
	972	{
	973	next = storage->next;
	974	re_free (storage);
	975	}
	976	dfa->str_tree_storage = NULL;
	977	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	978	dfa->str_tree = NULL;
	979	re_free (dfa->org_indices);
	980	dfa->org_indices = NULL;
	981	}
	982
	983	/* Create initial states for all contexts. */
	984
	985	static reg_errcode_t
	986	create_initial_state (dfa)
	987	re_dfa_t *dfa;
	988	{
	989	int first, i;
	990	reg_errcode_t err;
	991	re_node_set init_nodes;
	992
	993	/* Initial states have the epsilon closure of the node which is
	994	the first node of the regular expression. */
	995	first = dfa->str_tree->first->node_idx;
	996	dfa->init_node = first;
	997	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
	998	if (BE (err != REG_NOERROR, 0))
	999	return err;
	1000
	1001	/* The back-references which are in initial states can epsilon transit,
	1002	since in this case all of the subexpressions can be null.
	1003	Then we add epsilon closures of the nodes which are the next nodes of
	1004	the back-references. */
	1005	if (dfa->nbackref > 0)
	1006	for (i = 0; i < init_nodes.nelem; ++i)
	1007	{
	1008	int node_idx = init_nodes.elems[i];
	1009	re_token_type_t type = dfa->nodes[node_idx].type;
	1010
	1011	int clexp_idx;
	1012	if (type != OP_BACK_REF)
	1013	continue;
	1014	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
	1015	{
	1016	re_token_t *clexp_node;
	1017	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
	1018	if (clexp_node->type == OP_CLOSE_SUBEXP
	1019	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
	1020	break;
	1021	}
	1022	if (clexp_idx == init_nodes.nelem)
	1023	continue;
	1024
	1025	if (type == OP_BACK_REF)
	1026	{
	1027	int dest_idx = dfa->edests[node_idx].elems[0];
	1028	if (!re_node_set_contains (&init_nodes, dest_idx))
	1029	{
	1030	re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
	1031	i = 0;
	1032	}
	1033	}
	1034	}
	1035
	1036	/* It must be the first time to invoke acquire_state. */
	1037	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
	1038	/* We don't check ERR here, since the initial state must not be NULL. */
	1039	if (BE (dfa->init_state == NULL, 0))
	1040	return err;
	1041	if (dfa->init_state->has_constraint)
	1042	{
	1043	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
	1044	CONTEXT_WORD);
	1045	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
	1046	CONTEXT_NEWLINE);
	1047	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
	1048	&init_nodes,
	1049	CONTEXT_NEWLINE
	1050	\| CONTEXT_BEGBUF);
	1051	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
	1052	\|\| dfa->init_state_begbuf == NULL, 0))
	1053	return err;
	1054	}
	1055	else
	1056	dfa->init_state_word = dfa->init_state_nl
	1057	= dfa->init_state_begbuf = dfa->init_state;
	1058
	1059	re_node_set_free (&init_nodes);
	1060	return REG_NOERROR;
	1061	}
	1062
	1063
	1064	#ifdef RE_ENABLE_I18N
	1065	/* If it is possible to do searching in single byte encoding instead of UTF-8
	1066	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
	1067	DFA nodes where needed. */
	1068
	1069	static void
	1070	optimize_utf8 (dfa)
	1071	re_dfa_t *dfa;
	1072	{
	1073	int node, i, mb_chars = 0, has_period = 0;
	1074
	1075	for (node = 0; node < dfa->nodes_len; ++node)
	1076	switch (dfa->nodes[node].type)
	1077	{
	1078	case CHARACTER:
	1079	if (dfa->nodes[node].opr.c >= 0x80)
	1080	mb_chars = 1;
	1081	break;
	1082	case ANCHOR:
	1083	switch (dfa->nodes[node].opr.idx)
	1084	{
	1085	case LINE_FIRST:
	1086	case LINE_LAST:
	1087	case BUF_FIRST:
	1088	case BUF_LAST:
	1089	break;
	1090	default:
	1091	/* Word anchors etc. cannot be handled. */
	1092	return;
	1093	}
	1094	break;
	1095	case OP_PERIOD:
	1096	has_period = 1;
	1097	break;
	1098	case OP_BACK_REF:
	1099	case OP_ALT:
	1100	case END_OF_RE:
	1101	case OP_DUP_ASTERISK:
	1102	case OP_OPEN_SUBEXP:
	1103	case OP_CLOSE_SUBEXP:
	1104	break;
	1105	case COMPLEX_BRACKET:
	1106	return;
	1107	case SIMPLE_BRACKET:
	1108	/* Just double check. */
	1109	for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
	1110	if (dfa->nodes[node].opr.sbcset[i])
	1111	return;
	1112	break;
	1113	default:
	1114	abort ();
	1115	}
	1116
	1117	if (mb_chars \|\| has_period)
	1118	for (node = 0; node < dfa->nodes_len; ++node)
	1119	{
	1120	if (dfa->nodes[node].type == CHARACTER
	1121	&& dfa->nodes[node].opr.c >= 0x80)
	1122	dfa->nodes[node].mb_partial = 0;
	1123	else if (dfa->nodes[node].type == OP_PERIOD)
	1124	dfa->nodes[node].type = OP_UTF8_PERIOD;
	1125	}
	1126
	1127	/* The search can be in single byte locale. */
	1128	dfa->mb_cur_max = 1;
	1129	dfa->is_utf8 = 0;
	1130	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
	1131	}
	1132	#endif
	1133
	1134
	1135	/* Analyze the structure tree, and calculate "first", "next", "edest",
	1136	"eclosure", and "inveclosure". */
	1137
	1138	static reg_errcode_t
	1139	analyze (preg)
	1140	regex_t *preg;
	1141	{
	1142	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	1143	reg_errcode_t ret;
	1144
	1145	/* Allocate arrays. */
	1146	dfa->nexts = re_malloc (int, dfa->nodes_alloc);
	1147	dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
	1148	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
	1149	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
	1150	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
	1151	\|\| dfa->eclosures == NULL, 0))
	1152	return REG_ESPACE;
	1153
	1154	dfa->subexp_map = re_malloc (int, preg->re_nsub);
	1155	if (dfa->subexp_map != NULL)
	1156	{
	1157	int i;
	1158	for (i = 0; i < preg->re_nsub; i++)
	1159	dfa->subexp_map[i] = i;
	1160	preorder (dfa->str_tree, optimize_subexps, dfa);
	1161	for (i = 0; i < preg->re_nsub; i++)
	1162	if (dfa->subexp_map[i] != i)
	1163	break;
	1164	if (i == preg->re_nsub)
	1165	{
	1166	free (dfa->subexp_map);
	1167	dfa->subexp_map = NULL;
	1168	}
	1169	}
	1170
	1171	ret = postorder (dfa->str_tree, lower_subexps, preg);
	1172	if (BE (ret != REG_NOERROR, 0))
	1173	return ret;
	1174	ret = postorder (dfa->str_tree, calc_first, dfa);
	1175	if (BE (ret != REG_NOERROR, 0))
	1176	return ret;
	1177	preorder (dfa->str_tree, calc_next, dfa);
	1178	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
	1179	if (BE (ret != REG_NOERROR, 0))
	1180	return ret;
	1181	ret = calc_eclosure (dfa);
	1182	if (BE (ret != REG_NOERROR, 0))
	1183	return ret;
	1184
	1185	/* We only need this during the prune_impossible_nodes pass in regexec.c;
	1186	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
	1187	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
	1188	\|\| dfa->nbackref)
	1189	{
	1190	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
	1191	if (BE (dfa->inveclosures == NULL, 0))
	1192	return REG_ESPACE;
	1193	ret = calc_inveclosure (dfa);
	1194	}
	1195
	1196	return ret;
	1197	}
	1198
	1199	/* Our parse trees are very unbalanced, so we cannot use a stack to
	1200	implement parse tree visits. Instead, we use parent pointers and
	1201	some hairy code in these two functions. */
	1202	static reg_errcode_t
	1203	postorder (root, fn, extra)
	1204	bin_tree_t *root;
	1205	reg_errcode_t (fn (void , bin_tree_t ));
	1206	void *extra;
	1207	{
	1208	bin_tree_t node, prev;
	1209
	1210	for (node = root; ; )
	1211	{
	1212	/* Descend down the tree, preferably to the left (or to the right
	1213	if that's the only child). */
	1214	while (node->left \|\| node->right)
	1215	if (node->left)
	1216	node = node->left;
	1217	else
	1218	node = node->right;
	1219
	1220	do
	1221	{
	1222	reg_errcode_t err = fn (extra, node);
	1223	if (BE (err != REG_NOERROR, 0))
	1224	return err;
	1225	if (node->parent == NULL)
	1226	return REG_NOERROR;
	1227	prev = node;
	1228	node = node->parent;
	1229	}
	1230	/* Go up while we have a node that is reached from the right. */
	1231	while (node->right == prev \|\| node->right == NULL);
	1232	node = node->right;
	1233	}
	1234	}
	1235
	1236	static reg_errcode_t
	1237	preorder (root, fn, extra)
	1238	bin_tree_t *root;
	1239	reg_errcode_t (fn (void , bin_tree_t ));
	1240	void *extra;
	1241	{
	1242	bin_tree_t *node;
	1243
	1244	for (node = root; ; )
	1245	{
	1246	reg_errcode_t err = fn (extra, node);
	1247	if (BE (err != REG_NOERROR, 0))
	1248	return err;
	1249
	1250	/* Go to the left node, or up and to the right. */
	1251	if (node->left)
	1252	node = node->left;
	1253	else
	1254	{
	1255	bin_tree_t *prev = NULL;
	1256	while (node->right == prev \|\| node->right == NULL)
	1257	{
	1258	prev = node;
	1259	node = node->parent;
	1260	if (!node)
	1261	return REG_NOERROR;
	1262	}
	1263	node = node->right;
	1264	}
	1265	}
	1266	}
	1267
	1268	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
	1269	re_search_internal to map the inner one's opr.idx to this one's. Adjust
	1270	backreferences as well. Requires a preorder visit. */
	1271	static reg_errcode_t
	1272	optimize_subexps (extra, node)
	1273	void *extra;
	1274	bin_tree_t *node;
	1275	{
	1276	re_dfa_t dfa = (re_dfa_t ) extra;
	1277
	1278	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
	1279	{
	1280	int idx = node->token.opr.idx;
	1281	node->token.opr.idx = dfa->subexp_map[idx];
	1282	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
	1283	}
	1284
	1285	else if (node->token.type == SUBEXP
	1286	&& node->left && node->left->token.type == SUBEXP)
	1287	{
	1288	int other_idx = node->left->token.opr.idx;
	1289
	1290	node->left = node->left->left;
	1291	if (node->left)
	1292	node->left->parent = node;
	1293
	1294	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
	1295	if (other_idx < 8 * sizeof (dfa->used_bkref_map))
	1296	dfa->used_bkref_map &= ~(1 << other_idx);
	1297	}
	1298
	1299	return REG_NOERROR;
	1300	}
	1301
	1302	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
	1303	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
	1304	static reg_errcode_t
	1305	lower_subexps (extra, node)
	1306	void *extra;
	1307	bin_tree_t *node;
	1308	{
	1309	regex_t preg = (regex_t ) extra;
	1310	reg_errcode_t err = REG_NOERROR;
	1311
	1312	if (node->left && node->left->token.type == SUBEXP)
	1313	{
	1314	node->left = lower_subexp (&err, preg, node->left);
	1315	if (node->left)
	1316	node->left->parent = node;
	1317	}
	1318	if (node->right && node->right->token.type == SUBEXP)
	1319	{
	1320	node->right = lower_subexp (&err, preg, node->right);
	1321	if (node->right)
	1322	node->right->parent = node;
	1323	}
	1324
	1325	return err;
	1326	}
	1327
	1328	static bin_tree_t *
	1329	lower_subexp (err, preg, node)
	1330	reg_errcode_t *err;
	1331	regex_t *preg;
	1332	bin_tree_t *node;
	1333	{
	1334	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	1335	bin_tree_t *body = node->left;
	1336	bin_tree_t op, cls, tree1, tree;
	1337
	1338	if (preg->no_sub
	1339	&& (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
	1340	\|\| !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
	1341	return node->left;
	1342
	1343	/* Convert the SUBEXP node to the concatenation of an
	1344	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
	1345	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
	1346	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
	1347	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
	1348	tree = create_tree (dfa, op, tree1, CONCAT);
	1349	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
	1350	{
	1351	*err = REG_ESPACE;
	1352	return NULL;
	1353	}
	1354
	1355	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
	1356	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
	1357	return tree;
	1358	}
	1359
	1360	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
	1361	nodes. Requires a postorder visit. */
	1362	static reg_errcode_t
	1363	calc_first (extra, node)
	1364	void *extra;
	1365	bin_tree_t *node;
	1366	{
	1367	re_dfa_t dfa = (re_dfa_t ) extra;
	1368	if (node->token.type == CONCAT)
	1369	{
	1370	node->first = node->left->first;
	1371	node->node_idx = node->left->node_idx;
	1372	}
	1373	else
	1374	{
	1375	node->first = node;
	1376	node->node_idx = re_dfa_add_node (dfa, node->token);
	1377	if (BE (node->node_idx == -1, 0))
	1378	return REG_ESPACE;
	1379	}
	1380	return REG_NOERROR;
	1381	}
	1382
	1383	/* Pass 2: compute NEXT on the tree. Preorder visit. */
	1384	static reg_errcode_t
	1385	calc_next (extra, node)
	1386	void *extra;
	1387	bin_tree_t *node;
	1388	{
	1389	switch (node->token.type)
	1390	{
	1391	case OP_DUP_ASTERISK:
	1392	node->left->next = node;
	1393	break;
	1394	case CONCAT:
	1395	node->left->next = node->right->first;
	1396	node->right->next = node->next;
	1397	break;
	1398	default:
	1399	if (node->left)
	1400	node->left->next = node->next;
	1401	if (node->right)
	1402	node->right->next = node->next;
	1403	break;
	1404	}
	1405	return REG_NOERROR;
	1406	}
	1407
	1408	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
	1409	static reg_errcode_t
	1410	link_nfa_nodes (extra, node)
	1411	void *extra;
	1412	bin_tree_t *node;
	1413	{
	1414	re_dfa_t dfa = (re_dfa_t ) extra;
	1415	int idx = node->node_idx;
	1416	reg_errcode_t err = REG_NOERROR;
	1417
	1418	switch (node->token.type)
	1419	{
	1420	case CONCAT:
	1421	break;
	1422
	1423	case END_OF_RE:
	1424	assert (node->next == NULL);
	1425	break;
	1426
	1427	case OP_DUP_ASTERISK:
	1428	case OP_ALT:
	1429	{
	1430	int left, right;
	1431	dfa->has_plural_match = 1;
	1432	if (node->left != NULL)
	1433	left = node->left->first->node_idx;
	1434	else
	1435	left = node->next->node_idx;
	1436	if (node->right != NULL)
	1437	right = node->right->first->node_idx;
	1438	else
	1439	right = node->next->node_idx;
	1440	assert (left > -1);
	1441	assert (right > -1);
	1442	err = re_node_set_init_2 (dfa->edests + idx, left, right);
	1443	}
	1444	break;
	1445
	1446	case ANCHOR:
	1447	case OP_OPEN_SUBEXP:
	1448	case OP_CLOSE_SUBEXP:
	1449	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
	1450	break;
	1451
	1452	case OP_BACK_REF:
	1453	dfa->nexts[idx] = node->next->node_idx;
	1454	if (node->token.type == OP_BACK_REF)
	1455	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
	1456	break;
	1457
	1458	default:
	1459	assert (!IS_EPSILON_NODE (node->token.type));
	1460	dfa->nexts[idx] = node->next->node_idx;
	1461	break;
	1462	}
	1463
	1464	return err;
	1465	}
	1466
	1467	/* Duplicate the epsilon closure of the node ROOT_NODE.
	1468	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
	1469	to their own constraint. */
	1470
	1471	static reg_errcode_t
	1472	duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
	1473	init_constraint)
	1474	re_dfa_t *dfa;
	1475	int top_org_node, top_clone_node, root_node;
	1476	unsigned int init_constraint;
	1477	{
	1478	reg_errcode_t err;
	1479	int org_node, clone_node, ret;
	1480	unsigned int constraint = init_constraint;
	1481	for (org_node = top_org_node, clone_node = top_clone_node;;)
	1482	{
	1483	int org_dest, clone_dest;
	1484	if (dfa->nodes[org_node].type == OP_BACK_REF)
	1485	{
	1486	/* If the back reference epsilon-transit, its destination must
	1487	also have the constraint. Then duplicate the epsilon closure
	1488	of the destination of the back reference, and store it in
	1489	edests of the back reference. */
	1490	org_dest = dfa->nexts[org_node];
	1491	re_node_set_empty (dfa->edests + clone_node);
	1492	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
	1493	if (BE (err != REG_NOERROR, 0))
	1494	return err;
	1495	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1496	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1497	if (BE (ret < 0, 0))
	1498	return REG_ESPACE;
	1499	}
	1500	else if (dfa->edests[org_node].nelem == 0)
	1501	{
	1502	/* In case of the node can't epsilon-transit, don't duplicate the
	1503	destination and store the original destination as the
	1504	destination of the node. */
	1505	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1506	break;
	1507	}
	1508	else if (dfa->edests[org_node].nelem == 1)
	1509	{
	1510	/* In case of the node can epsilon-transit, and it has only one
	1511	destination. */
	1512	org_dest = dfa->edests[org_node].elems[0];
	1513	re_node_set_empty (dfa->edests + clone_node);
	1514	if (dfa->nodes[org_node].type == ANCHOR)
	1515	{
	1516	/* In case of the node has another constraint, append it. */
	1517	if (org_node == root_node && clone_node != org_node)
	1518	{
	1519	/* ...but if the node is root_node itself, it means the
	1520	epsilon closure have a loop, then tie it to the
	1521	destination of the root_node. */
	1522	ret = re_node_set_insert (dfa->edests + clone_node,
	1523	org_dest);
	1524	if (BE (ret < 0, 0))
	1525	return REG_ESPACE;
	1526	break;
	1527	}
	1528	constraint \|= dfa->nodes[org_node].opr.ctx_type;
	1529	}
	1530	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
	1531	if (BE (err != REG_NOERROR, 0))
	1532	return err;
	1533	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1534	if (BE (ret < 0, 0))
	1535	return REG_ESPACE;
	1536	}
	1537	else /* dfa->edests[org_node].nelem == 2 */
	1538	{
	1539	/* In case of the node can epsilon-transit, and it has two
	1540	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
	1541	org_dest = dfa->edests[org_node].elems[0];
	1542	re_node_set_empty (dfa->edests + clone_node);
	1543	/* Search for a duplicated node which satisfies the constraint. */
	1544	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
	1545	if (clone_dest == -1)
	1546	{
	1547	/* There are no such a duplicated node, create a new one. */
	1548	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
	1549	if (BE (err != REG_NOERROR, 0))
	1550	return err;
	1551	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1552	if (BE (ret < 0, 0))
	1553	return REG_ESPACE;
	1554	err = duplicate_node_closure (dfa, org_dest, clone_dest,
	1555	root_node, constraint);
	1556	if (BE (err != REG_NOERROR, 0))
	1557	return err;
	1558	}
	1559	else
	1560	{
	1561	/* There are a duplicated node which satisfy the constraint,
	1562	use it to avoid infinite loop. */
	1563	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1564	if (BE (ret < 0, 0))
	1565	return REG_ESPACE;
	1566	}
	1567
	1568	org_dest = dfa->edests[org_node].elems[1];
	1569	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
	1570	if (BE (err != REG_NOERROR, 0))
	1571	return err;
	1572	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1573	if (BE (ret < 0, 0))
	1574	return REG_ESPACE;
	1575	}
	1576	org_node = org_dest;
	1577	clone_node = clone_dest;
	1578	}
	1579	return REG_NOERROR;
	1580	}
	1581
	1582	/* Search for a node which is duplicated from the node ORG_NODE, and
	1583	satisfies the constraint CONSTRAINT. */
	1584
	1585	static int
	1586	search_duplicated_node (dfa, org_node, constraint)
	1587	re_dfa_t *dfa;
	1588	int org_node;
	1589	unsigned int constraint;
	1590	{
	1591	int idx;
	1592	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
	1593	{
	1594	if (org_node == dfa->org_indices[idx]
	1595	&& constraint == dfa->nodes[idx].constraint)
	1596	return idx; /* Found. */
	1597	}
	1598	return -1; /* Not found. */
	1599	}
	1600
	1601	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
	1602	The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
	1603	otherwise return the error code. */
	1604
	1605	static reg_errcode_t
	1606	duplicate_node (new_idx, dfa, org_idx, constraint)
	1607	re_dfa_t *dfa;
	1608	int *new_idx, org_idx;
	1609	unsigned int constraint;
	1610	{
	1611	int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
	1612	if (BE (dup_idx == -1, 0))
	1613	return REG_ESPACE;
	1614	dfa->nodes[dup_idx].constraint = constraint;
	1615	if (dfa->nodes[org_idx].type == ANCHOR)
	1616	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].opr.ctx_type;
	1617	dfa->nodes[dup_idx].duplicated = 1;
	1618
	1619	/* Store the index of the original node. */
	1620	dfa->org_indices[dup_idx] = org_idx;
	1621	*new_idx = dup_idx;
	1622	return REG_NOERROR;
	1623	}
	1624
	1625	static reg_errcode_t
	1626	calc_inveclosure (dfa)
	1627	re_dfa_t *dfa;
	1628	{
	1629	int src, idx, ret;
	1630	for (idx = 0; idx < dfa->nodes_len; ++idx)
	1631	re_node_set_init_empty (dfa->inveclosures + idx);
	1632
	1633	for (src = 0; src < dfa->nodes_len; ++src)
	1634	{
	1635	int *elems = dfa->eclosures[src].elems;
	1636	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
	1637	{
	1638	ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
	1639	if (BE (ret == -1, 0))
	1640	return REG_ESPACE;
	1641	}
	1642	}
	1643
	1644	return REG_NOERROR;
	1645	}
	1646
	1647	/* Calculate "eclosure" for all the node in DFA. */
	1648
	1649	static reg_errcode_t
	1650	calc_eclosure (dfa)
	1651	re_dfa_t *dfa;
	1652	{
	1653	int node_idx, incomplete;
	1654	#ifdef DEBUG
	1655	assert (dfa->nodes_len > 0);
	1656	#endif
	1657	incomplete = 0;
	1658	/* For each nodes, calculate epsilon closure. */
	1659	for (node_idx = 0; ; ++node_idx)
	1660	{
	1661	reg_errcode_t err;
	1662	re_node_set eclosure_elem;
	1663	if (node_idx == dfa->nodes_len)
	1664	{
	1665	if (!incomplete)
	1666	break;
	1667	incomplete = 0;
	1668	node_idx = 0;
	1669	}
	1670
	1671	#ifdef DEBUG
	1672	assert (dfa->eclosures[node_idx].nelem != -1);
	1673	#endif
	1674
	1675	/* If we have already calculated, skip it. */
	1676	if (dfa->eclosures[node_idx].nelem != 0)
	1677	continue;
	1678	/* Calculate epsilon closure of `node_idx'. */
	1679	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
	1680	if (BE (err != REG_NOERROR, 0))
	1681	return err;
	1682
	1683	if (dfa->eclosures[node_idx].nelem == 0)
	1684	{
	1685	incomplete = 1;
	1686	re_node_set_free (&eclosure_elem);
	1687	}
	1688	}
	1689	return REG_NOERROR;
	1690	}
	1691
	1692	/* Calculate epsilon closure of NODE. */
	1693
	1694	static reg_errcode_t
	1695	calc_eclosure_iter (new_set, dfa, node, root)
	1696	re_node_set *new_set;
	1697	re_dfa_t *dfa;
	1698	int node, root;
	1699	{
	1700	reg_errcode_t err;
	1701	unsigned int constraint;
	1702	int i, incomplete;
	1703	re_node_set eclosure;
	1704	incomplete = 0;
	1705	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
	1706	if (BE (err != REG_NOERROR, 0))
	1707	return err;
	1708
	1709	/* This indicates that we are calculating this node now.
	1710	We reference this value to avoid infinite loop. */
	1711	dfa->eclosures[node].nelem = -1;
	1712
	1713	constraint = ((dfa->nodes[node].type == ANCHOR)
	1714	? dfa->nodes[node].opr.ctx_type : 0);
	1715	/* If the current node has constraints, duplicate all nodes.
	1716	Since they must inherit the constraints. */
	1717	if (constraint
	1718	&& dfa->edests[node].nelem
	1719	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
	1720	{
	1721	int org_node, cur_node;
	1722	org_node = cur_node = node;
	1723	err = duplicate_node_closure (dfa, node, node, node, constraint);
	1724	if (BE (err != REG_NOERROR, 0))
	1725	return err;
	1726	}
	1727
	1728	/* Expand each epsilon destination nodes. */
	1729	if (IS_EPSILON_NODE(dfa->nodes[node].type))
	1730	for (i = 0; i < dfa->edests[node].nelem; ++i)
	1731	{
	1732	re_node_set eclosure_elem;
	1733	int edest = dfa->edests[node].elems[i];
	1734	/* If calculating the epsilon closure of `edest' is in progress,
	1735	return intermediate result. */
	1736	if (dfa->eclosures[edest].nelem == -1)
	1737	{
	1738	incomplete = 1;
	1739	continue;
	1740	}
	1741	/* If we haven't calculated the epsilon closure of `edest' yet,
	1742	calculate now. Otherwise use calculated epsilon closure. */
	1743	if (dfa->eclosures[edest].nelem == 0)
	1744	{
	1745	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
	1746	if (BE (err != REG_NOERROR, 0))
	1747	return err;
	1748	}
	1749	else
	1750	eclosure_elem = dfa->eclosures[edest];
	1751	/* Merge the epsilon closure of `edest'. */
	1752	re_node_set_merge (&eclosure, &eclosure_elem);
	1753	/* If the epsilon closure of `edest' is incomplete,
	1754	the epsilon closure of this node is also incomplete. */
	1755	if (dfa->eclosures[edest].nelem == 0)
	1756	{
	1757	incomplete = 1;
	1758	re_node_set_free (&eclosure_elem);
	1759	}
	1760	}
	1761
	1762	/* Epsilon closures include itself. */
	1763	re_node_set_insert (&eclosure, node);
	1764	if (incomplete && !root)
	1765	dfa->eclosures[node].nelem = 0;
	1766	else
	1767	dfa->eclosures[node] = eclosure;
	1768	*new_set = eclosure;
	1769	return REG_NOERROR;
	1770	}
	1771
	1772
	1773	/* Functions for token which are used in the parser. */
	1774
	1775	/* Fetch a token from INPUT.
	1776	We must not use this function inside bracket expressions. */
	1777
	1778	static void
	1779	fetch_token (result, input, syntax)
	1780	re_token_t *result;
	1781	re_string_t *input;
	1782	reg_syntax_t syntax;
	1783	{
	1784	re_string_skip_bytes (input, peek_token (result, input, syntax));
	1785	}
	1786
	1787	/* Peek a token from INPUT, and return the length of the token.
	1788	We must not use this function inside bracket expressions. */
	1789
	1790	static int
	1791	peek_token (token, input, syntax)
	1792	re_token_t *token;
	1793	re_string_t *input;
	1794	reg_syntax_t syntax;
	1795	{
	1796	unsigned char c;
	1797
	1798	if (re_string_eoi (input))
	1799	{
	1800	token->type = END_OF_RE;
	1801	return 0;
	1802	}
	1803
	1804	c = re_string_peek_byte (input, 0);
	1805	token->opr.c = c;
	1806
	1807	token->word_char = 0;
	1808	#ifdef RE_ENABLE_I18N
	1809	token->mb_partial = 0;
	1810	if (input->mb_cur_max > 1 &&
	1811	!re_string_first_byte (input, re_string_cur_idx (input)))
	1812	{
	1813	token->type = CHARACTER;
	1814	token->mb_partial = 1;
	1815	return 1;
	1816	}
	1817	#endif
	1818	if (c == '\\')
	1819	{
	1820	unsigned char c2;
	1821	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
	1822	{
	1823	token->type = BACK_SLASH;
	1824	return 1;
	1825	}
	1826
	1827	c2 = re_string_peek_byte_case (input, 1);
	1828	token->opr.c = c2;
	1829	token->type = CHARACTER;
	1830	#ifdef RE_ENABLE_I18N
	1831	if (input->mb_cur_max > 1)
	1832	{
	1833	wint_t wc = re_string_wchar_at (input,
	1834	re_string_cur_idx (input) + 1);
	1835	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1836	}
	1837	else
	1838	#endif
	1839	token->word_char = IS_WORD_CHAR (c2) != 0;
	1840
	1841	switch (c2)
	1842	{
	1843	case '\|':
	1844	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
	1845	token->type = OP_ALT;
	1846	break;
	1847	case '1': case '2': case '3': case '4': case '5':
	1848	case '6': case '7': case '8': case '9':
	1849	if (!(syntax & RE_NO_BK_REFS))
	1850	{
	1851	token->type = OP_BACK_REF;
	1852	token->opr.idx = c2 - '1';
	1853	}
	1854	break;
	1855	case '<':
	1856	if (!(syntax & RE_NO_GNU_OPS))
	1857	{
	1858	token->type = ANCHOR;
	1859	token->opr.ctx_type = WORD_FIRST;
	1860	}
	1861	break;
	1862	case '>':
	1863	if (!(syntax & RE_NO_GNU_OPS))
	1864	{
	1865	token->type = ANCHOR;
	1866	token->opr.ctx_type = WORD_LAST;
	1867	}
	1868	break;
	1869	case 'b':
	1870	if (!(syntax & RE_NO_GNU_OPS))
	1871	{
	1872	token->type = ANCHOR;
	1873	token->opr.ctx_type = WORD_DELIM;
	1874	}
	1875	break;
	1876	case 'B':
	1877	if (!(syntax & RE_NO_GNU_OPS))
	1878	{
	1879	token->type = ANCHOR;
	1880	token->opr.ctx_type = NOT_WORD_DELIM;
	1881	}
	1882	break;
	1883	case 'w':
	1884	if (!(syntax & RE_NO_GNU_OPS))
	1885	token->type = OP_WORD;
	1886	break;
	1887	case 'W':
	1888	if (!(syntax & RE_NO_GNU_OPS))
	1889	token->type = OP_NOTWORD;
	1890	break;
	1891	#ifndef GAWK
	1892	case 's':
	1893	if (!(syntax & RE_NO_GNU_OPS))
	1894	token->type = OP_SPACE;
	1895	break;
	1896	case 'S':
	1897	if (!(syntax & RE_NO_GNU_OPS))
	1898	token->type = OP_NOTSPACE;
	1899	break;
	1900	#endif
	1901	case '`':
	1902	if (!(syntax & RE_NO_GNU_OPS))
	1903	{
	1904	token->type = ANCHOR;
	1905	token->opr.ctx_type = BUF_FIRST;
	1906	}
	1907	break;
	1908	case '\'':
	1909	if (!(syntax & RE_NO_GNU_OPS))
	1910	{
	1911	token->type = ANCHOR;
	1912	token->opr.ctx_type = BUF_LAST;
	1913	}
	1914	break;
	1915	case '(':
	1916	if (!(syntax & RE_NO_BK_PARENS))
	1917	token->type = OP_OPEN_SUBEXP;
	1918	break;
	1919	case ')':
	1920	if (!(syntax & RE_NO_BK_PARENS))
	1921	token->type = OP_CLOSE_SUBEXP;
	1922	break;
	1923	case '+':
	1924	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1925	token->type = OP_DUP_PLUS;
	1926	break;
	1927	case '?':
	1928	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1929	token->type = OP_DUP_QUESTION;
	1930	break;
	1931	case '{':
	1932	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1933	token->type = OP_OPEN_DUP_NUM;
	1934	break;
	1935	case '}':
	1936	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1937	token->type = OP_CLOSE_DUP_NUM;
	1938	break;
	1939	default:
	1940	break;
	1941	}
	1942	return 2;
	1943	}
	1944
	1945	token->type = CHARACTER;
	1946	#ifdef RE_ENABLE_I18N
	1947	if (input->mb_cur_max > 1)
	1948	{
	1949	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
	1950	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1951	}
	1952	else
	1953	#endif
	1954	token->word_char = IS_WORD_CHAR (token->opr.c);
	1955
	1956	switch (c)
	1957	{
	1958	case '\n':
	1959	if (syntax & RE_NEWLINE_ALT)
	1960	token->type = OP_ALT;
	1961	break;
	1962	case '\|':
	1963	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
	1964	token->type = OP_ALT;
	1965	break;
	1966	case '*':
	1967	token->type = OP_DUP_ASTERISK;
	1968	break;
	1969	case '+':
	1970	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1971	token->type = OP_DUP_PLUS;
	1972	break;
	1973	case '?':
	1974	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1975	token->type = OP_DUP_QUESTION;
	1976	break;
	1977	case '{':
	1978	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1979	token->type = OP_OPEN_DUP_NUM;
	1980	break;
	1981	case '}':
	1982	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1983	token->type = OP_CLOSE_DUP_NUM;
	1984	break;
	1985	case '(':
	1986	if (syntax & RE_NO_BK_PARENS)
	1987	token->type = OP_OPEN_SUBEXP;
	1988	break;
	1989	case ')':
	1990	if (syntax & RE_NO_BK_PARENS)
	1991	token->type = OP_CLOSE_SUBEXP;
	1992	break;
	1993	case '[':
	1994	token->type = OP_OPEN_BRACKET;
	1995	break;
	1996	case '.':
	1997	token->type = OP_PERIOD;
	1998	break;
	1999	case '^':
	2000	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
	2001	re_string_cur_idx (input) != 0)
	2002	{
	2003	char prev = re_string_peek_byte (input, -1);
	2004	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
	2005	break;
	2006	}
	2007	token->type = ANCHOR;
	2008	token->opr.ctx_type = LINE_FIRST;
	2009	break;
	2010	case '$':
	2011	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
	2012	re_string_cur_idx (input) + 1 != re_string_length (input))
	2013	{
	2014	re_token_t next;
	2015	re_string_skip_bytes (input, 1);
	2016	peek_token (&next, input, syntax);
	2017	re_string_skip_bytes (input, -1);
	2018	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
	2019	break;
	2020	}
	2021	token->type = ANCHOR;
	2022	token->opr.ctx_type = LINE_LAST;
	2023	break;
	2024	default:
	2025	break;
	2026	}
	2027	return 1;
	2028	}
	2029
	2030	/* Peek a token from INPUT, and return the length of the token.
	2031	We must not use this function out of bracket expressions. */
	2032
	2033	static int
	2034	peek_token_bracket (token, input, syntax)
	2035	re_token_t *token;
	2036	re_string_t *input;
	2037	reg_syntax_t syntax;
	2038	{
	2039	unsigned char c;
	2040	if (re_string_eoi (input))
	2041	{
	2042	token->type = END_OF_RE;
	2043	return 0;
	2044	}
	2045	c = re_string_peek_byte (input, 0);
	2046	token->opr.c = c;
	2047
	2048	#ifdef RE_ENABLE_I18N
	2049	if (input->mb_cur_max > 1 &&
	2050	!re_string_first_byte (input, re_string_cur_idx (input)))
	2051	{
	2052	token->type = CHARACTER;
	2053	return 1;
	2054	}
	2055	#endif /* RE_ENABLE_I18N */
	2056
	2057	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
	2058	&& re_string_cur_idx (input) + 1 < re_string_length (input))
	2059	{
	2060	/* In this case, '\' escape a character. */
	2061	unsigned char c2;
	2062	re_string_skip_bytes (input, 1);
	2063	c2 = re_string_peek_byte (input, 0);
	2064	token->opr.c = c2;
	2065	token->type = CHARACTER;
	2066	return 1;
	2067	}
	2068	if (c == '[') /* '[' is a special char in a bracket exps. */
	2069	{
	2070	unsigned char c2;
	2071	int token_len;
	2072	if (re_string_cur_idx (input) + 1 < re_string_length (input))
	2073	c2 = re_string_peek_byte (input, 1);
	2074	else
	2075	c2 = 0;
	2076	token->opr.c = c2;
	2077	token_len = 2;
	2078	switch (c2)
	2079	{
	2080	case '.':
	2081	token->type = OP_OPEN_COLL_ELEM;
	2082	break;
	2083	case '=':
	2084	token->type = OP_OPEN_EQUIV_CLASS;
	2085	break;
	2086	case ':':
	2087	if (syntax & RE_CHAR_CLASSES)
	2088	{
	2089	token->type = OP_OPEN_CHAR_CLASS;
	2090	break;
	2091	}
	2092	/* else fall through. */
	2093	default:
	2094	token->type = CHARACTER;
	2095	token->opr.c = c;
	2096	token_len = 1;
	2097	break;
	2098	}
	2099	return token_len;
	2100	}
	2101	switch (c)
	2102	{
	2103	case '-':
	2104	token->type = OP_CHARSET_RANGE;
	2105	break;
	2106	case ']':
	2107	token->type = OP_CLOSE_BRACKET;
	2108	break;
	2109	case '^':
	2110	token->type = OP_NON_MATCH_LIST;
	2111	break;
	2112	default:
	2113	token->type = CHARACTER;
	2114	}
	2115	return 1;
	2116	}
	2117
	2118
	2119	/* Functions for parser. */
	2120
	2121	/* Entry point of the parser.
	2122	Parse the regular expression REGEXP and return the structure tree.
	2123	If an error is occured, ERR is set by error code, and return NULL.
	2124	This function build the following tree, from regular expression <reg_exp>:
	2125	CAT
	2126	/ \
	2127	/ \
	2128	<reg_exp> EOR
	2129
	2130	CAT means concatenation.
	2131	EOR means end of regular expression. */
	2132
	2133	static bin_tree_t *
	2134	parse (regexp, preg, syntax, err)
	2135	re_string_t *regexp;
	2136	regex_t *preg;
	2137	reg_syntax_t syntax;
	2138	reg_errcode_t *err;
	2139	{
	2140	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	2141	bin_tree_t tree, eor, *root;
	2142	re_token_t current_token;
	2143	dfa->syntax = syntax;
	2144	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2145	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
	2146	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2147	return NULL;
	2148	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
	2149	if (tree != NULL)
	2150	root = create_tree (dfa, tree, eor, CONCAT);
	2151	else
	2152	root = eor;
	2153	if (BE (eor == NULL \|\| root == NULL, 0))
	2154	{
	2155	*err = REG_ESPACE;
	2156	return NULL;
	2157	}
	2158	return root;
	2159	}
	2160
	2161	/* This function build the following tree, from regular expression
	2162	<branch1>\|<branch2>:
	2163	ALT
	2164	/ \
	2165	/ \
	2166	<branch1> <branch2>
	2167
	2168	ALT means alternative, which represents the operator `\|'. */
	2169
	2170	static bin_tree_t *
	2171	parse_reg_exp (regexp, preg, token, syntax, nest, err)
	2172	re_string_t *regexp;
	2173	regex_t *preg;
	2174	re_token_t *token;
	2175	reg_syntax_t syntax;
	2176	int nest;
	2177	reg_errcode_t *err;
	2178	{
	2179	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	2180	bin_tree_t tree, branch = NULL;
	2181	tree = parse_branch (regexp, preg, token, syntax, nest, err);
	2182	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2183	return NULL;
	2184
	2185	while (token->type == OP_ALT)
	2186	{
	2187	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2188	if (token->type != OP_ALT && token->type != END_OF_RE
	2189	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2190	{
	2191	branch = parse_branch (regexp, preg, token, syntax, nest, err);
	2192	if (BE (*err != REG_NOERROR && branch == NULL, 0))
	2193	return NULL;
	2194	}
	2195	else
	2196	branch = NULL;
	2197	tree = create_tree (dfa, tree, branch, OP_ALT);
	2198	if (BE (tree == NULL, 0))
	2199	{
	2200	*err = REG_ESPACE;
	2201	return NULL;
	2202	}
	2203	}
	2204	return tree;
	2205	}
	2206
	2207	/* This function build the following tree, from regular expression
	2208	<exp1><exp2>:
	2209	CAT
	2210	/ \
	2211	/ \
	2212	<exp1> <exp2>
	2213
	2214	CAT means concatenation. */
	2215
	2216	static bin_tree_t *
	2217	parse_branch (regexp, preg, token, syntax, nest, err)
	2218	re_string_t *regexp;
	2219	regex_t *preg;
	2220	re_token_t *token;
	2221	reg_syntax_t syntax;
	2222	int nest;
	2223	reg_errcode_t *err;
	2224	{
	2225	bin_tree_t tree, exp;
	2226	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	2227	tree = parse_expression (regexp, preg, token, syntax, nest, err);
	2228	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2229	return NULL;
	2230
	2231	while (token->type != OP_ALT && token->type != END_OF_RE
	2232	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2233	{
	2234	exp = parse_expression (regexp, preg, token, syntax, nest, err);
	2235	if (BE (*err != REG_NOERROR && exp == NULL, 0))
	2236	{
	2237	return NULL;
	2238	}
	2239	if (tree != NULL && exp != NULL)
	2240	{
	2241	tree = create_tree (dfa, tree, exp, CONCAT);
	2242	if (tree == NULL)
	2243	{
	2244	*err = REG_ESPACE;
	2245	return NULL;
	2246	}
	2247	}
	2248	else if (tree == NULL)
	2249	tree = exp;
	2250	/* Otherwise exp == NULL, we don't need to create new tree. */
	2251	}
	2252	return tree;
	2253	}
	2254
	2255	/* This function build the following tree, from regular expression a*:
	2256	*
	2257	\|
	2258	a
	2259	*/
	2260
	2261	static bin_tree_t *
	2262	parse_expression (regexp, preg, token, syntax, nest, err)
	2263	re_string_t *regexp;
	2264	regex_t *preg;
	2265	re_token_t *token;
	2266	reg_syntax_t syntax;
	2267	int nest;
	2268	reg_errcode_t *err;
	2269	{
	2270	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	2271	bin_tree_t *tree;
	2272	switch (token->type)
	2273	{
	2274	case CHARACTER:
	2275	tree = create_token_tree (dfa, NULL, NULL, token);
	2276	if (BE (tree == NULL, 0))
	2277	{
	2278	*err = REG_ESPACE;
	2279	return NULL;
	2280	}
	2281	#ifdef RE_ENABLE_I18N
	2282	if (dfa->mb_cur_max > 1)
	2283	{
	2284	while (!re_string_eoi (regexp)
	2285	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
	2286	{
	2287	bin_tree_t *mbc_remain;
	2288	fetch_token (token, regexp, syntax);
	2289	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
	2290	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
	2291	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
	2292	{
	2293	*err = REG_ESPACE;
	2294	return NULL;
	2295	}
	2296	}
	2297	}
	2298	#endif
	2299	break;
	2300	case OP_OPEN_SUBEXP:
	2301	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
	2302	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2303	return NULL;
	2304	break;
	2305	case OP_OPEN_BRACKET:
	2306	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
	2307	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2308	return NULL;
	2309	break;
	2310	case OP_BACK_REF:
	2311	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
	2312	{
	2313	*err = REG_ESUBREG;
	2314	return NULL;
	2315	}
	2316	dfa->used_bkref_map \|= 1 << token->opr.idx;
	2317	tree = create_token_tree (dfa, NULL, NULL, token);
	2318	if (BE (tree == NULL, 0))
	2319	{
	2320	*err = REG_ESPACE;
	2321	return NULL;
	2322	}
	2323	++dfa->nbackref;
	2324	dfa->has_mb_node = 1;
	2325	break;
	2326	case OP_OPEN_DUP_NUM:
	2327	if (syntax & RE_CONTEXT_INVALID_DUP)
	2328	{
	2329	*err = REG_BADRPT;
	2330	return NULL;
	2331	}
	2332	/* FALLTHROUGH */
	2333	case OP_DUP_ASTERISK:
	2334	case OP_DUP_PLUS:
	2335	case OP_DUP_QUESTION:
	2336	if (syntax & RE_CONTEXT_INVALID_OPS)
	2337	{
	2338	*err = REG_BADRPT;
	2339	return NULL;
	2340	}
	2341	else if (syntax & RE_CONTEXT_INDEP_OPS)
	2342	{
	2343	fetch_token (token, regexp, syntax);
	2344	return parse_expression (regexp, preg, token, syntax, nest, err);
	2345	}
	2346	/* else fall through */
	2347	case OP_CLOSE_SUBEXP:
	2348	if ((token->type == OP_CLOSE_SUBEXP) &&
	2349	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
	2350	{
	2351	*err = REG_ERPAREN;
	2352	return NULL;
	2353	}
	2354	/* else fall through */
	2355	case OP_CLOSE_DUP_NUM:
	2356	/* We treat it as a normal character. */
	2357
	2358	/* Then we can these characters as normal characters. */
	2359	token->type = CHARACTER;
	2360	/* mb_partial and word_char bits should be initialized already
	2361	by peek_token. */
	2362	tree = create_token_tree (dfa, NULL, NULL, token);
	2363	if (BE (tree == NULL, 0))
	2364	{
	2365	*err = REG_ESPACE;
	2366	return NULL;
	2367	}
	2368	break;
	2369	case ANCHOR:
	2370	if ((token->opr.ctx_type
	2371	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
	2372	&& dfa->word_ops_used == 0)
	2373	init_word_char (dfa);
	2374	if (token->opr.ctx_type == WORD_DELIM
	2375	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
	2376	{
	2377	bin_tree_t tree_first, tree_last;
	2378	if (token->opr.ctx_type == WORD_DELIM)
	2379	{
	2380	token->opr.ctx_type = WORD_FIRST;
	2381	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2382	token->opr.ctx_type = WORD_LAST;
	2383	}
	2384	else
	2385	{
	2386	token->opr.ctx_type = INSIDE_WORD;
	2387	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2388	token->opr.ctx_type = INSIDE_NOTWORD;
	2389	}
	2390	tree_last = create_token_tree (dfa, NULL, NULL, token);
	2391	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
	2392	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
	2393	{
	2394	*err = REG_ESPACE;
	2395	return NULL;
	2396	}
	2397	}
	2398	else
	2399	{
	2400	tree = create_token_tree (dfa, NULL, NULL, token);
	2401	if (BE (tree == NULL, 0))
	2402	{
	2403	*err = REG_ESPACE;
	2404	return NULL;
	2405	}
	2406	}
	2407	/* We must return here, since ANCHORs can't be followed
	2408	by repetition operators.
	2409	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
	2410	it must not be "<ANCHOR(^)><REPEAT()>". /
	2411	fetch_token (token, regexp, syntax);
	2412	return tree;
	2413	case OP_PERIOD:
	2414	tree = create_token_tree (dfa, NULL, NULL, token);
	2415	if (BE (tree == NULL, 0))
	2416	{
	2417	*err = REG_ESPACE;
	2418	return NULL;
	2419	}
	2420	if (dfa->mb_cur_max > 1)
	2421	dfa->has_mb_node = 1;
	2422	break;
	2423	case OP_WORD:
	2424	case OP_NOTWORD:
	2425	tree = build_charclass_op (dfa, regexp->trans,
	2426	(const char *) "alnum",
	2427	(const char *) "_",
	2428	token->type == OP_NOTWORD, err);
	2429	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2430	return NULL;
	2431	break;
	2432	case OP_SPACE:
	2433	case OP_NOTSPACE:
	2434	tree = build_charclass_op (dfa, regexp->trans,
	2435	(const char *) "space",
	2436	(const char *) "",
	2437	token->type == OP_NOTSPACE, err);
	2438	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2439	return NULL;
	2440	break;
	2441	case OP_ALT:
	2442	case END_OF_RE:
	2443	return NULL;
	2444	case BACK_SLASH:
	2445	*err = REG_EESCAPE;
	2446	return NULL;
	2447	default:
	2448	/* Must not happen? */
	2449	#ifdef DEBUG
	2450	assert (0);
	2451	#endif
	2452	return NULL;
	2453	}
	2454	fetch_token (token, regexp, syntax);
	2455
	2456	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
	2457	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
	2458	{
	2459	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
	2460	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2461	return NULL;
	2462	/* In BRE consecutive duplications are not allowed. */
	2463	if ((syntax & RE_CONTEXT_INVALID_DUP)
	2464	&& (token->type == OP_DUP_ASTERISK
	2465	\|\| token->type == OP_OPEN_DUP_NUM))
	2466	{
	2467	*err = REG_BADRPT;
	2468	return NULL;
	2469	}
	2470	}
	2471
	2472	return tree;
	2473	}
	2474
	2475	/* This function build the following tree, from regular expression
	2476	(<reg_exp>):
	2477	SUBEXP
	2478	\|
	2479	<reg_exp>
	2480	*/
	2481
	2482	static bin_tree_t *
	2483	parse_sub_exp (regexp, preg, token, syntax, nest, err)
	2484	re_string_t *regexp;
	2485	regex_t *preg;
	2486	re_token_t *token;
	2487	reg_syntax_t syntax;
	2488	int nest;
	2489	reg_errcode_t *err;
	2490	{
	2491	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
	2492	bin_tree_t *tree;
	2493	size_t cur_nsub;
	2494	cur_nsub = preg->re_nsub++;
	2495
	2496	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2497
	2498	/* The subexpression may be a null string. */
	2499	if (token->type == OP_CLOSE_SUBEXP)
	2500	tree = NULL;
	2501	else
	2502	{
	2503	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
	2504	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
	2505	*err = REG_EPAREN;
	2506	if (BE (*err != REG_NOERROR, 0))
	2507	return NULL;
	2508	}
	2509	dfa->completed_bkref_map \|= 1 << cur_nsub;
	2510
	2511	tree = create_tree (dfa, tree, NULL, SUBEXP);
	2512	if (BE (tree == NULL, 0))
	2513	{
	2514	*err = REG_ESPACE;
	2515	return NULL;
	2516	}
	2517	tree->token.opr.idx = cur_nsub;
	2518	return tree;
	2519	}
	2520
	2521	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
	2522
	2523	static bin_tree_t *
	2524	parse_dup_op (elem, regexp, dfa, token, syntax, err)
	2525	bin_tree_t *elem;
	2526	re_string_t *regexp;
	2527	re_dfa_t *dfa;
	2528	re_token_t *token;
	2529	reg_syntax_t syntax;
	2530	reg_errcode_t *err;
	2531	{
	2532	bin_tree_t tree = NULL, old_tree = NULL;
	2533	int i, start, end, start_idx = re_string_cur_idx (regexp);
	2534	#ifndef RE_TOKEN_INIT_BUG
	2535	re_token_t start_token = *token;
	2536	#else
	2537	re_token_t start_token;
	2538
	2539	memcpy ((void ) &start_token, (void ) token, sizeof start_token);
	2540	#endif
	2541
	2542	if (token->type == OP_OPEN_DUP_NUM)
	2543	{
	2544	end = 0;
	2545	start = fetch_number (regexp, token, syntax);
	2546	if (start == -1)
	2547	{
	2548	if (token->type == CHARACTER && token->opr.c == ',')
	2549	start = 0; /* We treat "{,m}" as "{0,m}". */
	2550	else
	2551	{
	2552	err = REG_BADBR; / <re>{} is invalid. */
	2553	return NULL;
	2554	}
	2555	}
	2556	if (BE (start != -2, 1))
	2557	{
	2558	/* We treat "{n}" as "{n,n}". */
	2559	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
	2560	: ((token->type == CHARACTER && token->opr.c == ',')
	2561	? fetch_number (regexp, token, syntax) : -2));
	2562	}
	2563	if (BE (start == -2 \|\| end == -2, 0))
	2564	{
	2565	/* Invalid sequence. */
	2566	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
	2567	{
	2568	if (token->type == END_OF_RE)
	2569	*err = REG_EBRACE;
	2570	else
	2571	*err = REG_BADBR;
	2572
	2573	return NULL;
	2574	}
	2575
	2576	/* If the syntax bit is set, rollback. */
	2577	re_string_set_index (regexp, start_idx);
	2578	*token = start_token;
	2579	token->type = CHARACTER;
	2580	/* mb_partial and word_char bits should be already initialized by
	2581	peek_token. */
	2582	return elem;
	2583	}
	2584
	2585	if (BE (end != -1 && start > end, 0))
	2586	{
	2587	/* First number greater than second. */
	2588	*err = REG_BADBR;
	2589	return NULL;
	2590	}
	2591	}
	2592	else
	2593	{
	2594	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
	2595	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
	2596	}
	2597
	2598	fetch_token (token, regexp, syntax);
	2599
	2600	if (BE (elem == NULL, 0))
	2601	return NULL;
	2602	if (BE (start == 0 && end == 0, 0))
	2603	{
	2604	postorder (elem, free_tree, NULL);
	2605	return NULL;
	2606	}
	2607
	2608	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
	2609	if (BE (start > 0, 0))
	2610	{
	2611	tree = elem;
	2612	for (i = 2; i <= start; ++i)
	2613	{
	2614	elem = duplicate_tree (elem, dfa);
	2615	tree = create_tree (dfa, tree, elem, CONCAT);
	2616	if (BE (elem == NULL \|\| tree == NULL, 0))
	2617	goto parse_dup_op_espace;
	2618	}
	2619
	2620	if (start == end)
	2621	return tree;
	2622
	2623	/* Duplicate ELEM before it is marked optional. */
	2624	elem = duplicate_tree (elem, dfa);
	2625	old_tree = tree;
	2626	}
	2627	else
	2628	old_tree = NULL;
	2629
	2630	if (elem->token.type == SUBEXP)
	2631	postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
	2632
	2633	tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
	2634	if (BE (tree == NULL, 0))
	2635	goto parse_dup_op_espace;
	2636
	2637	/* This loop is actually executed only when end != -1,
	2638	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
	2639	already created the start+1-th copy. */
	2640	for (i = start + 2; i <= end; ++i)
	2641	{
	2642	elem = duplicate_tree (elem, dfa);
	2643	tree = create_tree (dfa, tree, elem, CONCAT);
	2644	if (BE (elem == NULL \|\| tree == NULL, 0))
	2645	goto parse_dup_op_espace;
	2646
	2647	tree = create_tree (dfa, tree, NULL, OP_ALT);
	2648	if (BE (tree == NULL, 0))
	2649	goto parse_dup_op_espace;
	2650	}
	2651
	2652	if (old_tree)
	2653	tree = create_tree (dfa, old_tree, tree, CONCAT);
	2654
	2655	return tree;
	2656
	2657	parse_dup_op_espace:
	2658	*err = REG_ESPACE;
	2659	return NULL;
	2660	}
	2661
	2662	/* Size of the names for collating symbol/equivalence_class/character_class.
	2663	I'm not sure, but maybe enough. */
	2664	#define BRACKET_NAME_BUF_SIZE 32
	2665
	2666	#ifndef _LIBC
	2667	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
	2668	Build the range expression which starts from START_ELEM, and ends
	2669	at END_ELEM. The result are written to MBCSET and SBCSET.
	2670	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2671	mbcset->range_ends, is a pointer argument sinse we may
	2672	update it. */
	2673
	2674	static reg_errcode_t
	2675	# ifdef RE_ENABLE_I18N
	2676	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
	2677	re_charset_t *mbcset;
	2678	int *range_alloc;
	2679	# else /* not RE_ENABLE_I18N */
	2680	build_range_exp (sbcset, start_elem, end_elem)
	2681	# endif /* not RE_ENABLE_I18N */
	2682	re_bitset_ptr_t sbcset;
	2683	bracket_elem_t start_elem, end_elem;
	2684	{
	2685	unsigned int start_ch, end_ch;
	2686	/* Equivalence Classes and Character Classes can't be a range start/end. */
	2687	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
	2688	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
	2689	0))
	2690	return REG_ERANGE;
	2691
	2692	/* We can handle no multi character collating elements without libc
	2693	support. */
	2694	if (BE ((start_elem->type == COLL_SYM
	2695	&& strlen ((char *) start_elem->opr.name) > 1)
	2696	\|\| (end_elem->type == COLL_SYM
	2697	&& strlen ((char *) end_elem->opr.name) > 1), 0))
	2698	return REG_ECOLLATE;
	2699
	2700	# ifdef RE_ENABLE_I18N
	2701	{
	2702	wchar_t wc, start_wc, end_wc;
	2703	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
	2704
	2705	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
	2706	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
	2707	: 0));
	2708	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
	2709	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2710	: 0));
	2711	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
	2712	? __btowc (start_ch) : start_elem->opr.wch);
	2713	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
	2714	? __btowc (end_ch) : end_elem->opr.wch);
	2715	if (start_wc == WEOF \|\| end_wc == WEOF)
	2716	return REG_ECOLLATE;
	2717	cmp_buf[0] = start_wc;
	2718	cmp_buf[4] = end_wc;
	2719	if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
	2720	return REG_ERANGE;
	2721
	2722	/* Got valid collation sequence values, add them as a new entry.
	2723	However, for !_LIBC we have no collation elements: if the
	2724	character set is single byte, the single byte character set
	2725	that we build below suffices. parse_bracket_exp passes
	2726	no MBCSET if dfa->mb_cur_max == 1. */
	2727	if (mbcset)
	2728	{
	2729	/* Check the space of the arrays. */
	2730	if (BE (*range_alloc == mbcset->nranges, 0))
	2731	{
	2732	/* There is not enough space, need realloc. */
	2733	wchar_t new_array_start, new_array_end;
	2734	int new_nranges;
	2735
	2736	/* +1 in case of mbcset->nranges is 0. */
	2737	new_nranges = 2 * mbcset->nranges + 1;
	2738	/* Use realloc since mbcset->range_starts and mbcset->range_ends
	2739	are NULL if range_alloc == 0. /
	2740	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
	2741	new_nranges);
	2742	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
	2743	new_nranges);
	2744
	2745	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
	2746	return REG_ESPACE;
	2747
	2748	mbcset->range_starts = new_array_start;
	2749	mbcset->range_ends = new_array_end;
	2750	*range_alloc = new_nranges;
	2751	}
	2752
	2753	mbcset->range_starts[mbcset->nranges] = start_wc;
	2754	mbcset->range_ends[mbcset->nranges++] = end_wc;
	2755	}
	2756
	2757	/* Build the table for single byte characters. */
	2758	for (wc = 0; wc < SBC_MAX; ++wc)
	2759	{
	2760	cmp_buf[2] = wc;
	2761	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
	2762	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
	2763	bitset_set (sbcset, wc);
	2764	}
	2765	}
	2766	# else /* not RE_ENABLE_I18N */
	2767	{
	2768	unsigned int ch;
	2769	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
	2770	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
	2771	: 0));
	2772	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
	2773	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2774	: 0));
	2775	if (start_ch > end_ch)
	2776	return REG_ERANGE;
	2777	/* Build the table for single byte characters. */
	2778	for (ch = 0; ch < SBC_MAX; ++ch)
	2779	if (start_ch <= ch && ch <= end_ch)
	2780	bitset_set (sbcset, ch);
	2781	}
	2782	# endif /* not RE_ENABLE_I18N */
	2783	return REG_NOERROR;
	2784	}
	2785	#endif /* not _LIBC */
	2786
	2787	#ifndef _LIBC
	2788	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
	2789	Build the collating element which is represented by NAME.
	2790	The result are written to MBCSET and SBCSET.
	2791	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	2792	pointer argument since we may update it. */
	2793
	2794	static reg_errcode_t
	2795	# ifdef RE_ENABLE_I18N
	2796	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
	2797	re_charset_t *mbcset;
	2798	int *coll_sym_alloc;
	2799	# else /* not RE_ENABLE_I18N */
	2800	build_collating_symbol (sbcset, name)
	2801	# endif /* not RE_ENABLE_I18N */
	2802	re_bitset_ptr_t sbcset;
	2803	const unsigned char *name;
	2804	{
	2805	size_t name_len = strlen ((const char *) name);
	2806	if (BE (name_len != 1, 0))
	2807	return REG_ECOLLATE;
	2808	else
	2809	{
	2810	bitset_set (sbcset, name[0]);
	2811	return REG_NOERROR;
	2812	}
	2813	}
	2814	#endif /* not _LIBC */
	2815
	2816	/* This function parse bracket expression like "[abc]", "[a-c]",
	2817	"[[.a-a.]]" etc. */
	2818
	2819	static bin_tree_t *
	2820	parse_bracket_exp (regexp, dfa, token, syntax, err)
	2821	re_string_t *regexp;
	2822	re_dfa_t *dfa;
	2823	re_token_t *token;
	2824	reg_syntax_t syntax;
	2825	reg_errcode_t *err;
	2826	{
	2827	#ifdef _LIBC
	2828	const unsigned char *collseqmb;
	2829	const char *collseqwc;
	2830	uint32_t nrules;
	2831	int32_t table_size;
	2832	const int32_t *symb_table;
	2833	const unsigned char *extra;
	2834
	2835	/* Local function for parse_bracket_exp used in _LIBC environement.
	2836	Seek the collating symbol entry correspondings to NAME.
	2837	Return the index of the symbol in the SYMB_TABLE. */
	2838
	2839	auto inline int32_t
	2840	__attribute ((always_inline))
	2841	seek_collating_symbol_entry (name, name_len)
	2842	const unsigned char *name;
	2843	size_t name_len;
	2844	{
	2845	int32_t hash = elem_hash ((const char *) name, name_len);
	2846	int32_t elem = hash % table_size;
	2847	int32_t second = hash % (table_size - 2);
	2848	while (symb_table[2 * elem] != 0)
	2849	{
	2850	/* First compare the hashing value. */
	2851	if (symb_table[2 * elem] == hash
	2852	/* Compare the length of the name. */
	2853	&& name_len == extra[symb_table[2 * elem + 1]]
	2854	/* Compare the name. */
	2855	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
	2856	name_len) == 0)
	2857	{
	2858	/* Yep, this is the entry. */
	2859	break;
	2860	}
	2861
	2862	/* Next entry. */
	2863	elem += second;
	2864	}
	2865	return elem;
	2866	}
	2867
	2868	/* Local function for parse_bracket_exp used in _LIBC environement.
	2869	Look up the collation sequence value of BR_ELEM.
	2870	Return the value if succeeded, UINT_MAX otherwise. */
	2871
	2872	auto inline unsigned int
	2873	__attribute ((always_inline))
	2874	lookup_collation_sequence_value (br_elem)
	2875	bracket_elem_t *br_elem;
	2876	{
	2877	if (br_elem->type == SB_CHAR)
	2878	{
	2879	/*
	2880	if (MB_CUR_MAX == 1)
	2881	*/
	2882	if (nrules == 0)
	2883	return collseqmb[br_elem->opr.ch];
	2884	else
	2885	{
	2886	wint_t wc = __btowc (br_elem->opr.ch);
	2887	return __collseq_table_lookup (collseqwc, wc);
	2888	}
	2889	}
	2890	else if (br_elem->type == MB_CHAR)
	2891	{
	2892	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
	2893	}
	2894	else if (br_elem->type == COLL_SYM)
	2895	{
	2896	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
	2897	if (nrules != 0)
	2898	{
	2899	int32_t elem, idx;
	2900	elem = seek_collating_symbol_entry (br_elem->opr.name,
	2901	sym_name_len);
	2902	if (symb_table[2 * elem] != 0)
	2903	{
	2904	/* We found the entry. */
	2905	idx = symb_table[2 * elem + 1];
	2906	/* Skip the name of collating element name. */
	2907	idx += 1 + extra[idx];
	2908	/* Skip the byte sequence of the collating element. */
	2909	idx += 1 + extra[idx];
	2910	/* Adjust for the alignment. */
	2911	idx = (idx + 3) & ~3;
	2912	/* Skip the multibyte collation sequence value. */
	2913	idx += sizeof (unsigned int);
	2914	/* Skip the wide char sequence of the collating element. */
	2915	idx += sizeof (unsigned int) *
	2916	(1 + (unsigned int ) (extra + idx));
	2917	/* Return the collation sequence value. */
	2918	return (unsigned int ) (extra + idx);
	2919	}
	2920	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
	2921	{
	2922	/* No valid character. Match it as a single byte
	2923	character. */
	2924	return collseqmb[br_elem->opr.name[0]];
	2925	}
	2926	}
	2927	else if (sym_name_len == 1)
	2928	return collseqmb[br_elem->opr.name[0]];
	2929	}
	2930	return UINT_MAX;
	2931	}
	2932
	2933	/* Local function for parse_bracket_exp used in _LIBC environement.
	2934	Build the range expression which starts from START_ELEM, and ends
	2935	at END_ELEM. The result are written to MBCSET and SBCSET.
	2936	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2937	mbcset->range_ends, is a pointer argument sinse we may
	2938	update it. */
	2939
	2940	auto inline reg_errcode_t
	2941	__attribute ((always_inline))
	2942	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
	2943	re_charset_t *mbcset;
	2944	int *range_alloc;
	2945	re_bitset_ptr_t sbcset;
	2946	bracket_elem_t start_elem, end_elem;
	2947	{
	2948	unsigned int ch;
	2949	uint32_t start_collseq;
	2950	uint32_t end_collseq;
	2951
	2952	/* Equivalence Classes and Character Classes can't be a range
	2953	start/end. */
	2954	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
	2955	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
	2956	0))
	2957	return REG_ERANGE;
	2958
	2959	start_collseq = lookup_collation_sequence_value (start_elem);
	2960	end_collseq = lookup_collation_sequence_value (end_elem);
	2961	/* Check start/end collation sequence values. */
	2962	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
	2963	return REG_ECOLLATE;
	2964	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
	2965	return REG_ERANGE;
	2966
	2967	/* Got valid collation sequence values, add them as a new entry.
	2968	However, if we have no collation elements, and the character set
	2969	is single byte, the single byte character set that we
	2970	build below suffices. */
	2971	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
	2972	{
	2973	/* Check the space of the arrays. */
	2974	if (BE (*range_alloc == mbcset->nranges, 0))
	2975	{
	2976	/* There is not enough space, need realloc. */
	2977	uint32_t *new_array_start;
	2978	uint32_t *new_array_end;
	2979	int new_nranges;
	2980
	2981	/* +1 in case of mbcset->nranges is 0. */
	2982	new_nranges = 2 * mbcset->nranges + 1;
	2983	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
	2984	new_nranges);
	2985	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
	2986	new_nranges);
	2987
	2988	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
	2989	return REG_ESPACE;
	2990
	2991	mbcset->range_starts = new_array_start;
	2992	mbcset->range_ends = new_array_end;
	2993	*range_alloc = new_nranges;
	2994	}
	2995
	2996	mbcset->range_starts[mbcset->nranges] = start_collseq;
	2997	mbcset->range_ends[mbcset->nranges++] = end_collseq;
	2998	}
	2999
	3000	/* Build the table for single byte characters. */
	3001	for (ch = 0; ch < SBC_MAX; ch++)
	3002	{
	3003	uint32_t ch_collseq;
	3004	/*
	3005	if (MB_CUR_MAX == 1)
	3006	*/
	3007	if (nrules == 0)
	3008	ch_collseq = collseqmb[ch];
	3009	else
	3010	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
	3011	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
	3012	bitset_set (sbcset, ch);
	3013	}
	3014	return REG_NOERROR;
	3015	}
	3016
	3017	/* Local function for parse_bracket_exp used in _LIBC environement.
	3018	Build the collating element which is represented by NAME.
	3019	The result are written to MBCSET and SBCSET.
	3020	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	3021	pointer argument sinse we may update it. */
	3022
	3023	auto inline reg_errcode_t
	3024	__attribute ((always_inline))
	3025	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
	3026	re_charset_t *mbcset;
	3027	int *coll_sym_alloc;
	3028	re_bitset_ptr_t sbcset;
	3029	const unsigned char *name;
	3030	{
	3031	int32_t elem, idx;
	3032	size_t name_len = strlen ((const char *) name);
	3033	if (nrules != 0)
	3034	{
	3035	elem = seek_collating_symbol_entry (name, name_len);
	3036	if (symb_table[2 * elem] != 0)
	3037	{
	3038	/* We found the entry. */
	3039	idx = symb_table[2 * elem + 1];
	3040	/* Skip the name of collating element name. */
	3041	idx += 1 + extra[idx];
	3042	}
	3043	else if (symb_table[2 * elem] == 0 && name_len == 1)
	3044	{
	3045	/* No valid character, treat it as a normal
	3046	character. */
	3047	bitset_set (sbcset, name[0]);
	3048	return REG_NOERROR;
	3049	}
	3050	else
	3051	return REG_ECOLLATE;
	3052
	3053	/* Got valid collation sequence, add it as a new entry. */
	3054	/* Check the space of the arrays. */
	3055	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
	3056	{
	3057	/* Not enough, realloc it. */
	3058	/* +1 in case of mbcset->ncoll_syms is 0. */
	3059	int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
	3060	/* Use realloc since mbcset->coll_syms is NULL
	3061	if alloc == 0. /
	3062	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
	3063	new_coll_sym_alloc);
	3064	if (BE (new_coll_syms == NULL, 0))
	3065	return REG_ESPACE;
	3066	mbcset->coll_syms = new_coll_syms;
	3067	*coll_sym_alloc = new_coll_sym_alloc;
	3068	}
	3069	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
	3070	return REG_NOERROR;
	3071	}
	3072	else
	3073	{
	3074	if (BE (name_len != 1, 0))
	3075	return REG_ECOLLATE;
	3076	else
	3077	{
	3078	bitset_set (sbcset, name[0]);
	3079	return REG_NOERROR;
	3080	}
	3081	}
	3082	}
	3083	#endif
	3084
	3085	re_token_t br_token;
	3086	re_bitset_ptr_t sbcset;
	3087	#ifdef RE_ENABLE_I18N
	3088	re_charset_t *mbcset;
	3089	int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
	3090	int equiv_class_alloc = 0, char_class_alloc = 0;
	3091	#endif /* not RE_ENABLE_I18N */
	3092	int non_match = 0;
	3093	bin_tree_t *work_tree;
	3094	int token_len;
	3095	int first_round = 1;
	3096	#ifdef _LIBC
	3097	collseqmb = (const unsigned char *)
	3098	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
	3099	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3100	if (nrules)
	3101	{
	3102	/*
	3103	if (MB_CUR_MAX > 1)
	3104	*/
	3105	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
	3106	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
	3107	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3108	_NL_COLLATE_SYMB_TABLEMB);
	3109	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3110	_NL_COLLATE_SYMB_EXTRAMB);
	3111	}
	3112	#endif
	3113	sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
	3114	#ifdef RE_ENABLE_I18N
	3115	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
	3116	#endif /* RE_ENABLE_I18N */
	3117	#ifdef RE_ENABLE_I18N
	3118	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
	3119	#else
	3120	if (BE (sbcset == NULL, 0))
	3121	#endif /* RE_ENABLE_I18N */
	3122	{
	3123	*err = REG_ESPACE;
	3124	return NULL;
	3125	}
	3126
	3127	token_len = peek_token_bracket (token, regexp, syntax);
	3128	if (BE (token->type == END_OF_RE, 0))
	3129	{
	3130	*err = REG_BADPAT;
	3131	goto parse_bracket_exp_free_return;
	3132	}
	3133	if (token->type == OP_NON_MATCH_LIST)
	3134	{
	3135	#ifdef RE_ENABLE_I18N
	3136	mbcset->non_match = 1;
	3137	#endif /* not RE_ENABLE_I18N */
	3138	non_match = 1;
	3139	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
	3140	bitset_set (sbcset, '\0');
	3141	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3142	token_len = peek_token_bracket (token, regexp, syntax);
	3143	if (BE (token->type == END_OF_RE, 0))
	3144	{
	3145	*err = REG_BADPAT;
	3146	goto parse_bracket_exp_free_return;
	3147	}
	3148	}
	3149
	3150	/* We treat the first ']' as a normal character. */
	3151	if (token->type == OP_CLOSE_BRACKET)
	3152	token->type = CHARACTER;
	3153
	3154	while (1)
	3155	{
	3156	bracket_elem_t start_elem, end_elem;
	3157	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
	3158	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
	3159	reg_errcode_t ret;
	3160	int token_len2 = 0, is_range_exp = 0;
	3161	re_token_t token2;
	3162
	3163	start_elem.opr.name = start_name_buf;
	3164	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
	3165	syntax, first_round);
	3166	if (BE (ret != REG_NOERROR, 0))
	3167	{
	3168	*err = ret;
	3169	goto parse_bracket_exp_free_return;
	3170	}
	3171	first_round = 0;
	3172
	3173	/* Get information about the next token. We need it in any case. */
	3174	token_len = peek_token_bracket (token, regexp, syntax);
	3175
	3176	/* Do not check for ranges if we know they are not allowed. */
	3177	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
	3178	{
	3179	if (BE (token->type == END_OF_RE, 0))
	3180	{
	3181	*err = REG_EBRACK;
	3182	goto parse_bracket_exp_free_return;
	3183	}
	3184	if (token->type == OP_CHARSET_RANGE)
	3185	{
	3186	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
	3187	token_len2 = peek_token_bracket (&token2, regexp, syntax);
	3188	if (BE (token2.type == END_OF_RE, 0))
	3189	{
	3190	*err = REG_EBRACK;
	3191	goto parse_bracket_exp_free_return;
	3192	}
	3193	if (token2.type == OP_CLOSE_BRACKET)
	3194	{
	3195	/* We treat the last '-' as a normal character. */
	3196	re_string_skip_bytes (regexp, -token_len);
	3197	token->type = CHARACTER;
	3198	}
	3199	else
	3200	is_range_exp = 1;
	3201	}
	3202	}
	3203
	3204	if (is_range_exp == 1)
	3205	{
	3206	end_elem.opr.name = end_name_buf;
	3207	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
	3208	dfa, syntax, 1);
	3209	if (BE (ret != REG_NOERROR, 0))
	3210	{
	3211	*err = ret;
	3212	goto parse_bracket_exp_free_return;
	3213	}
	3214
	3215	token_len = peek_token_bracket (token, regexp, syntax);
	3216
	3217	#ifdef _LIBC
	3218	*err = build_range_exp (sbcset, mbcset, &range_alloc,
	3219	&start_elem, &end_elem);
	3220	#else
	3221	# ifdef RE_ENABLE_I18N
	3222	*err = build_range_exp (sbcset,
	3223	dfa->mb_cur_max > 1 ? mbcset : NULL,
	3224	&range_alloc, &start_elem, &end_elem);
	3225	# else
	3226	*err = build_range_exp (sbcset, &start_elem, &end_elem);
	3227	# endif
	3228	#endif /* RE_ENABLE_I18N */
	3229	if (BE (*err != REG_NOERROR, 0))
	3230	goto parse_bracket_exp_free_return;
	3231	}
	3232	else
	3233	{
	3234	switch (start_elem.type)
	3235	{
	3236	case SB_CHAR:
	3237	bitset_set (sbcset, start_elem.opr.ch);
	3238	break;
	3239	#ifdef RE_ENABLE_I18N
	3240	case MB_CHAR:
	3241	/* Check whether the array has enough space. */
	3242	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
	3243	{
	3244	wchar_t *new_mbchars;
	3245	/* Not enough, realloc it. */
	3246	/* +1 in case of mbcset->nmbchars is 0. */
	3247	mbchar_alloc = 2 * mbcset->nmbchars + 1;
	3248	/* Use realloc since array is NULL if alloc == 0. /
	3249	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
	3250	mbchar_alloc);
	3251	if (BE (new_mbchars == NULL, 0))
	3252	goto parse_bracket_exp_espace;
	3253	mbcset->mbchars = new_mbchars;
	3254	}
	3255	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
	3256	break;
	3257	#endif /* RE_ENABLE_I18N */
	3258	case EQUIV_CLASS:
	3259	*err = build_equiv_class (sbcset,
	3260	#ifdef RE_ENABLE_I18N
	3261	mbcset, &equiv_class_alloc,
	3262	#endif /* RE_ENABLE_I18N */
	3263	start_elem.opr.name);
	3264	if (BE (*err != REG_NOERROR, 0))
	3265	goto parse_bracket_exp_free_return;
	3266	break;
	3267	case COLL_SYM:
	3268	*err = build_collating_symbol (sbcset,
	3269	#ifdef RE_ENABLE_I18N
	3270	mbcset, &coll_sym_alloc,
	3271	#endif /* RE_ENABLE_I18N */
	3272	start_elem.opr.name);
	3273	if (BE (*err != REG_NOERROR, 0))
	3274	goto parse_bracket_exp_free_return;
	3275	break;
	3276	case CHAR_CLASS:
	3277	*err = build_charclass (regexp->trans, sbcset,
	3278	#ifdef RE_ENABLE_I18N
	3279	mbcset, &char_class_alloc,
	3280	#endif /* RE_ENABLE_I18N */
	3281	(const char *) start_elem.opr.name, syntax);
	3282	if (BE (*err != REG_NOERROR, 0))
	3283	goto parse_bracket_exp_free_return;
	3284	break;
	3285	default:
	3286	assert (0);
	3287	break;
	3288	}
	3289	}
	3290	if (BE (token->type == END_OF_RE, 0))
	3291	{
	3292	*err = REG_EBRACK;
	3293	goto parse_bracket_exp_free_return;
	3294	}
	3295	if (token->type == OP_CLOSE_BRACKET)
	3296	break;
	3297	}
	3298
	3299	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3300
	3301	/* If it is non-matching list. */
	3302	if (non_match)
	3303	bitset_not (sbcset);
	3304
	3305	#ifdef RE_ENABLE_I18N
	3306	/* Ensure only single byte characters are set. */
	3307	if (dfa->mb_cur_max > 1)
	3308	bitset_mask (sbcset, dfa->sb_char);
	3309
	3310	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
	3311	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
	3312	\|\| mbcset->non_match)))
	3313	{
	3314	bin_tree_t *mbc_tree;
	3315	int sbc_idx;
	3316	/* Build a tree for complex bracket. */
	3317	dfa->has_mb_node = 1;
	3318	br_token.type = COMPLEX_BRACKET;
	3319	br_token.opr.mbcset = mbcset;
	3320	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3321	if (BE (mbc_tree == NULL, 0))
	3322	goto parse_bracket_exp_espace;
	3323	for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
	3324	if (sbcset[sbc_idx])
	3325	break;
	3326	/* If there are no bits set in sbcset, there is no point
	3327	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
	3328	if (sbc_idx < BITSET_UINTS)
	3329	{
	3330	/* Build a tree for simple bracket. */
	3331	br_token.type = SIMPLE_BRACKET;
	3332	br_token.opr.sbcset = sbcset;
	3333	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3334	if (BE (work_tree == NULL, 0))
	3335	goto parse_bracket_exp_espace;
	3336
	3337	/* Then join them by ALT node. */
	3338	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
	3339	if (BE (work_tree == NULL, 0))
	3340	goto parse_bracket_exp_espace;
	3341	}
	3342	else
	3343	{
	3344	re_free (sbcset);
	3345	work_tree = mbc_tree;
	3346	}
	3347	}
	3348	else
	3349	#endif /* not RE_ENABLE_I18N */
	3350	{
	3351	#ifdef RE_ENABLE_I18N
	3352	free_charset (mbcset);
	3353	#endif
	3354	/* Build a tree for simple bracket. */
	3355	br_token.type = SIMPLE_BRACKET;
	3356	br_token.opr.sbcset = sbcset;
	3357	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3358	if (BE (work_tree == NULL, 0))
	3359	goto parse_bracket_exp_espace;
	3360	}
	3361	return work_tree;
	3362
	3363	parse_bracket_exp_espace:
	3364	*err = REG_ESPACE;
	3365	parse_bracket_exp_free_return:
	3366	re_free (sbcset);
	3367	#ifdef RE_ENABLE_I18N
	3368	free_charset (mbcset);
	3369	#endif /* RE_ENABLE_I18N */
	3370	return NULL;
	3371	}
	3372
	3373	/* Parse an element in the bracket expression. */
	3374
	3375	static reg_errcode_t
	3376	parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
	3377	accept_hyphen)
	3378	bracket_elem_t *elem;
	3379	re_string_t *regexp;
	3380	re_token_t *token;
	3381	int token_len;
	3382	re_dfa_t *dfa;
	3383	reg_syntax_t syntax;
	3384	int accept_hyphen;
	3385	{
	3386	#ifdef RE_ENABLE_I18N
	3387	int cur_char_size;
	3388	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
	3389	if (cur_char_size > 1)
	3390	{
	3391	elem->type = MB_CHAR;
	3392	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
	3393	re_string_skip_bytes (regexp, cur_char_size);
	3394	return REG_NOERROR;
	3395	}
	3396	#endif /* RE_ENABLE_I18N */
	3397	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3398	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
	3399	\|\| token->type == OP_OPEN_EQUIV_CLASS)
	3400	return parse_bracket_symbol (elem, regexp, token);
	3401	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
	3402	{
	3403	/* A '-' must only appear as anything but a range indicator before
	3404	the closing bracket. Everything else is an error. */
	3405	re_token_t token2;
	3406	(void) peek_token_bracket (&token2, regexp, syntax);
	3407	if (token2.type != OP_CLOSE_BRACKET)
	3408	/* The actual error value is not standardized since this whole
	3409	case is undefined. But ERANGE makes good sense. */
	3410	return REG_ERANGE;
	3411	}
	3412	elem->type = SB_CHAR;
	3413	elem->opr.ch = token->opr.c;
	3414	return REG_NOERROR;
	3415	}
	3416
	3417	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
	3418	such as [:<character_class>:], [.<collating_element>.], and
	3419	[=<equivalent_class>=]. */
	3420
	3421	static reg_errcode_t
	3422	parse_bracket_symbol (elem, regexp, token)
	3423	bracket_elem_t *elem;
	3424	re_string_t *regexp;
	3425	re_token_t *token;
	3426	{
	3427	unsigned char ch, delim = token->opr.c;
	3428	int i = 0;
	3429	if (re_string_eoi(regexp))
	3430	return REG_EBRACK;
	3431	for (;; ++i)
	3432	{
	3433	if (i >= BRACKET_NAME_BUF_SIZE)
	3434	return REG_EBRACK;
	3435	if (token->type == OP_OPEN_CHAR_CLASS)
	3436	ch = re_string_fetch_byte_case (regexp);
	3437	else
	3438	ch = re_string_fetch_byte (regexp);
	3439	if (re_string_eoi(regexp))
	3440	return REG_EBRACK;
	3441	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
	3442	break;
	3443	elem->opr.name[i] = ch;
	3444	}
	3445	re_string_skip_bytes (regexp, 1);
	3446	elem->opr.name[i] = '\0';
	3447	switch (token->type)
	3448	{
	3449	case OP_OPEN_COLL_ELEM:
	3450	elem->type = COLL_SYM;
	3451	break;
	3452	case OP_OPEN_EQUIV_CLASS:
	3453	elem->type = EQUIV_CLASS;
	3454	break;
	3455	case OP_OPEN_CHAR_CLASS:
	3456	elem->type = CHAR_CLASS;
	3457	break;
	3458	default:
	3459	break;
	3460	}
	3461	return REG_NOERROR;
	3462	}
	3463
	3464	/* Helper function for parse_bracket_exp.
	3465	Build the equivalence class which is represented by NAME.
	3466	The result are written to MBCSET and SBCSET.
	3467	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
	3468	is a pointer argument sinse we may update it. */
	3469
	3470	static reg_errcode_t
	3471	#ifdef RE_ENABLE_I18N
	3472	build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
	3473	re_charset_t *mbcset;
	3474	int *equiv_class_alloc;
	3475	#else /* not RE_ENABLE_I18N */
	3476	build_equiv_class (sbcset, name)
	3477	#endif /* not RE_ENABLE_I18N */
	3478	re_bitset_ptr_t sbcset;
	3479	const unsigned char *name;
	3480	{
	3481	#if defined _LIBC
	3482	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3483	if (nrules != 0)
	3484	{
	3485	const int32_t table, indirect;
	3486	const unsigned char weights, extra, *cp;
	3487	unsigned char char_buf[2];
	3488	int32_t idx1, idx2;
	3489	unsigned int ch;
	3490	size_t len;
	3491	/* This #include defines a local function! */
	3492	# include <locale/weight.h>
	3493	/* Calculate the index for equivalence class. */
	3494	cp = name;
	3495	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	3496	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3497	_NL_COLLATE_WEIGHTMB);
	3498	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3499	_NL_COLLATE_EXTRAMB);
	3500	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3501	_NL_COLLATE_INDIRECTMB);
	3502	idx1 = findidx (&cp);
	3503	if (BE (idx1 == 0 \|\| cp < name + strlen ((const char *) name), 0))
	3504	/* This isn't a valid character. */
	3505	return REG_ECOLLATE;
	3506
	3507	/* Build single byte matcing table for this equivalence class. */
	3508	char_buf[1] = (unsigned char) '\0';
	3509	len = weights[idx1];
	3510	for (ch = 0; ch < SBC_MAX; ++ch)
	3511	{
	3512	char_buf[0] = ch;
	3513	cp = char_buf;
	3514	idx2 = findidx (&cp);
	3515	/*
	3516	idx2 = table[ch];
	3517	*/
	3518	if (idx2 == 0)
	3519	/* This isn't a valid character. */
	3520	continue;
	3521	if (len == weights[idx2])
	3522	{
	3523	int cnt = 0;
	3524	while (cnt <= len &&
	3525	weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
	3526	++cnt;
	3527
	3528	if (cnt > len)
	3529	bitset_set (sbcset, ch);
	3530	}
	3531	}
	3532	/* Check whether the array has enough space. */
	3533	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
	3534	{
	3535	/* Not enough, realloc it. */
	3536	/* +1 in case of mbcset->nequiv_classes is 0. */
	3537	int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
	3538	/* Use realloc since the array is NULL if alloc == 0. /
	3539	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
	3540	int32_t,
	3541	new_equiv_class_alloc);
	3542	if (BE (new_equiv_classes == NULL, 0))
	3543	return REG_ESPACE;
	3544	mbcset->equiv_classes = new_equiv_classes;
	3545	*equiv_class_alloc = new_equiv_class_alloc;
	3546	}
	3547	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
	3548	}
	3549	else
	3550	#endif /* _LIBC */
	3551	{
	3552	if (BE (strlen ((const char *) name) != 1, 0))
	3553	return REG_ECOLLATE;
	3554	bitset_set (sbcset, *name);
	3555	}
	3556	return REG_NOERROR;
	3557	}
	3558
	3559	/* Helper function for parse_bracket_exp.
	3560	Build the character class which is represented by NAME.
	3561	The result are written to MBCSET and SBCSET.
	3562	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
	3563	is a pointer argument sinse we may update it. */
	3564
	3565	static reg_errcode_t
	3566	#ifdef RE_ENABLE_I18N
	3567	build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
	3568	re_charset_t *mbcset;
	3569	int *char_class_alloc;
	3570	#else /* not RE_ENABLE_I18N */
	3571	build_charclass (trans, sbcset, class_name, syntax)
	3572	#endif /* not RE_ENABLE_I18N */
	3573	unsigned RE_TRANSLATE_TYPE trans;
	3574	re_bitset_ptr_t sbcset;
	3575	const char *class_name;
	3576	reg_syntax_t syntax;
	3577	{
	3578	int i;
	3579
	3580	/* In case of REG_ICASE "upper" and "lower" match the both of
	3581	upper and lower cases. */
	3582	if ((syntax & RE_ICASE)
	3583	&& (strcmp (class_name, "upper") == 0 \|\| strcmp (class_name, "lower") == 0))
	3584	class_name = "alpha";
	3585
	3586	#ifdef RE_ENABLE_I18N
	3587	/* Check the space of the arrays. */
	3588	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
	3589	{
	3590	/* Not enough, realloc it. */
	3591	/* +1 in case of mbcset->nchar_classes is 0. */
	3592	int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
	3593	/* Use realloc since array is NULL if alloc == 0. /
	3594	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
	3595	new_char_class_alloc);
	3596	if (BE (new_char_classes == NULL, 0))
	3597	return REG_ESPACE;
	3598	mbcset->char_classes = new_char_classes;
	3599	*char_class_alloc = new_char_class_alloc;
	3600	}
	3601	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
	3602	#endif /* RE_ENABLE_I18N */
	3603
	3604	#define BUILD_CHARCLASS_LOOP(ctype_func) \
	3605	for (i = 0; i < SBC_MAX; ++i) \
	3606	{ \
	3607	if (ctype_func (i)) \
	3608	{ \
	3609	int ch = trans ? trans[i] : i; \
	3610	bitset_set (sbcset, ch); \
	3611	} \
	3612	}
	3613
	3614	if (strcmp (class_name, "alnum") == 0)
	3615	BUILD_CHARCLASS_LOOP (isalnum)
	3616	else if (strcmp (class_name, "cntrl") == 0)
	3617	BUILD_CHARCLASS_LOOP (iscntrl)
	3618	else if (strcmp (class_name, "lower") == 0)
	3619	BUILD_CHARCLASS_LOOP (islower)
	3620	else if (strcmp (class_name, "space") == 0)
	3621	BUILD_CHARCLASS_LOOP (isspace)
	3622	else if (strcmp (class_name, "alpha") == 0)
	3623	BUILD_CHARCLASS_LOOP (isalpha)
	3624	else if (strcmp (class_name, "digit") == 0)
	3625	BUILD_CHARCLASS_LOOP (isdigit)
	3626	else if (strcmp (class_name, "print") == 0)
	3627	BUILD_CHARCLASS_LOOP (isprint)
	3628	else if (strcmp (class_name, "upper") == 0)
	3629	BUILD_CHARCLASS_LOOP (isupper)
	3630	#ifndef GAWK
	3631	else if (strcmp (class_name, "blank") == 0)
	3632	BUILD_CHARCLASS_LOOP (isblank)
	3633	#else
	3634	/* see comments above */
	3635	else if (strcmp (class_name, "blank") == 0)
	3636	BUILD_CHARCLASS_LOOP (is_blank)
	3637	#endif
	3638	else if (strcmp (class_name, "graph") == 0)
	3639	BUILD_CHARCLASS_LOOP (isgraph)
	3640	else if (strcmp (class_name, "punct") == 0)
	3641	BUILD_CHARCLASS_LOOP (ispunct)
	3642	else if (strcmp (class_name, "xdigit") == 0)
	3643	BUILD_CHARCLASS_LOOP (isxdigit)
	3644	else
	3645	return REG_ECTYPE;
	3646
	3647	return REG_NOERROR;
	3648	}
	3649
	3650	static bin_tree_t *
	3651	build_charclass_op (dfa, trans, class_name, extra, non_match, err)
	3652	re_dfa_t *dfa;
	3653	unsigned RE_TRANSLATE_TYPE trans;
	3654	const char *class_name;
	3655	const char *extra;
	3656	int non_match;
	3657	reg_errcode_t *err;
	3658	{
	3659	re_bitset_ptr_t sbcset;
	3660	#ifdef RE_ENABLE_I18N
	3661	re_charset_t *mbcset;
	3662	int alloc = 0;
	3663	#endif /* not RE_ENABLE_I18N */
	3664	reg_errcode_t ret;
	3665	re_token_t br_token;
	3666	bin_tree_t *tree;
	3667
	3668	sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
	3669	#ifdef RE_ENABLE_I18N
	3670	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
	3671	#endif /* RE_ENABLE_I18N */
	3672
	3673	#ifdef RE_ENABLE_I18N
	3674	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
	3675	#else /* not RE_ENABLE_I18N */
	3676	if (BE (sbcset == NULL, 0))
	3677	#endif /* not RE_ENABLE_I18N */
	3678	{
	3679	*err = REG_ESPACE;
	3680	return NULL;
	3681	}
	3682
	3683	if (non_match)
	3684	{
	3685	#ifdef RE_ENABLE_I18N
	3686	/*
	3687	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
	3688	bitset_set(cset->sbcset, '\0');
	3689	*/
	3690	mbcset->non_match = 1;
	3691	#endif /* not RE_ENABLE_I18N */
	3692	}
	3693
	3694	/* We don't care the syntax in this case. */
	3695	ret = build_charclass (trans, sbcset,
	3696	#ifdef RE_ENABLE_I18N
	3697	mbcset, &alloc,
	3698	#endif /* RE_ENABLE_I18N */
	3699	class_name, 0);
	3700
	3701	if (BE (ret != REG_NOERROR, 0))
	3702	{
	3703	re_free (sbcset);
	3704	#ifdef RE_ENABLE_I18N
	3705	free_charset (mbcset);
	3706	#endif /* RE_ENABLE_I18N */
	3707	*err = ret;
	3708	return NULL;
	3709	}
	3710	/* \w match '_' also. */
	3711	for (; *extra; extra++)
	3712	bitset_set (sbcset, *extra);
	3713
	3714	/* If it is non-matching list. */
	3715	if (non_match)
	3716	bitset_not (sbcset);
	3717
	3718	#ifdef RE_ENABLE_I18N
	3719	/* Ensure only single byte characters are set. */
	3720	if (dfa->mb_cur_max > 1)
	3721	bitset_mask (sbcset, dfa->sb_char);
	3722	#endif
	3723
	3724	/* Build a tree for simple bracket. */
	3725	br_token.type = SIMPLE_BRACKET;
	3726	br_token.opr.sbcset = sbcset;
	3727	tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3728	if (BE (tree == NULL, 0))
	3729	goto build_word_op_espace;
	3730
	3731	#ifdef RE_ENABLE_I18N
	3732	if (dfa->mb_cur_max > 1)
	3733	{
	3734	bin_tree_t *mbc_tree;
	3735	/* Build a tree for complex bracket. */
	3736	br_token.type = COMPLEX_BRACKET;
	3737	br_token.opr.mbcset = mbcset;
	3738	dfa->has_mb_node = 1;
	3739	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3740	if (BE (mbc_tree == NULL, 0))
	3741	goto build_word_op_espace;
	3742	/* Then join them by ALT node. */
	3743	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
	3744	if (BE (mbc_tree != NULL, 1))
	3745	return tree;
	3746	}
	3747	else
	3748	{
	3749	free_charset (mbcset);
	3750	return tree;
	3751	}
	3752	#else /* not RE_ENABLE_I18N */
	3753	return tree;
	3754	#endif /* not RE_ENABLE_I18N */
	3755
	3756	build_word_op_espace:
	3757	re_free (sbcset);
	3758	#ifdef RE_ENABLE_I18N
	3759	free_charset (mbcset);
	3760	#endif /* RE_ENABLE_I18N */
	3761	*err = REG_ESPACE;
	3762	return NULL;
	3763	}
	3764
	3765	/* This is intended for the expressions like "a{1,3}".
	3766	Fetch a number from `input', and return the number.
	3767	Return -1, if the number field is empty like "{,1}".
	3768	Return -2, If an error is occured. */
	3769
	3770	static int
	3771	fetch_number (input, token, syntax)
	3772	re_string_t *input;
	3773	re_token_t *token;
	3774	reg_syntax_t syntax;
	3775	{
	3776	int num = -1;
	3777	unsigned char c;
	3778	while (1)
	3779	{
	3780	fetch_token (token, input, syntax);
	3781	c = token->opr.c;
	3782	if (BE (token->type == END_OF_RE, 0))
	3783	return -2;
	3784	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
	3785	break;
	3786	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
	3787	? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
	3788	num = (num > RE_DUP_MAX) ? -2 : num;
	3789	}
	3790	return num;
	3791	}
	3792
	3793
	3794	#ifdef RE_ENABLE_I18N
	3795	static void
	3796	free_charset (re_charset_t *cset)
	3797	{
	3798	re_free (cset->mbchars);
	3799	# ifdef _LIBC
	3800	re_free (cset->coll_syms);
	3801	re_free (cset->equiv_classes);
	3802	re_free (cset->range_starts);
	3803	re_free (cset->range_ends);
	3804	# endif
	3805	re_free (cset->char_classes);
	3806	re_free (cset);
	3807	}
	3808	#endif /* RE_ENABLE_I18N */
	3809
	3810
	3811	/* Functions for binary tree operation. */
	3812
	3813	/* Create a tree node. */
	3814
	3815	static bin_tree_t *
	3816	create_tree (dfa, left, right, type)
	3817	re_dfa_t *dfa;
	3818	bin_tree_t *left;
	3819	bin_tree_t *right;
	3820	re_token_type_t type;
	3821	{
	3822	re_token_t t;
	3823	t.type = type;
	3824	return create_token_tree (dfa, left, right, &t);
	3825	}
	3826
	3827	static bin_tree_t *
	3828	create_token_tree (dfa, left, right, token)
	3829	re_dfa_t *dfa;
	3830	bin_tree_t *left;
	3831	bin_tree_t *right;
	3832	const re_token_t *token;
	3833	{
	3834	bin_tree_t *tree;
	3835	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
	3836	{
	3837	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
	3838
	3839	if (storage == NULL)
	3840	return NULL;
	3841	storage->next = dfa->str_tree_storage;
	3842	dfa->str_tree_storage = storage;
	3843	dfa->str_tree_storage_idx = 0;
	3844	}
	3845	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
	3846
	3847	tree->parent = NULL;
	3848	tree->left = left;
	3849	tree->right = right;
	3850	tree->token = *token;
	3851	tree->token.duplicated = 0;
	3852	tree->token.opt_subexp = 0;
	3853	tree->first = NULL;
	3854	tree->next = NULL;
	3855	tree->node_idx = -1;
	3856
	3857	if (left != NULL)
	3858	left->parent = tree;
	3859	if (right != NULL)
	3860	right->parent = tree;
	3861	return tree;
	3862	}
	3863
	3864	/* Mark the tree SRC as an optional subexpression.
	3865	To be called from preorder or postorder. */
	3866
	3867	static reg_errcode_t
	3868	mark_opt_subexp (extra, node)
	3869	void *extra;
	3870	bin_tree_t *node;
	3871	{
	3872	int idx = (int) (long) extra;
	3873	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
	3874	node->token.opt_subexp = 1;
	3875
	3876	return REG_NOERROR;
	3877	}
	3878
	3879	/* Free the allocated memory inside NODE. */
	3880
	3881	static void
	3882	free_token (re_token_t *node)
	3883	{
	3884	#ifdef RE_ENABLE_I18N
	3885	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
	3886	free_charset (node->opr.mbcset);
	3887	else
	3888	#endif /* RE_ENABLE_I18N */
	3889	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
	3890	re_free (node->opr.sbcset);
	3891	}
	3892
	3893	/* Worker function for tree walking. Free the allocated memory inside NODE
	3894	and its children. */
	3895
	3896	static reg_errcode_t
	3897	free_tree (void extra, bin_tree_t node)
	3898	{
	3899	free_token (&node->token);
	3900	return REG_NOERROR;
	3901	}
	3902
	3903
	3904	/* Duplicate the node SRC, and return new node. This is a preorder
	3905	visit similar to the one implemented by the generic visitor, but
	3906	we need more infrastructure to maintain two parallel trees --- so,
	3907	it's easier to duplicate. */
	3908
	3909	static bin_tree_t *
	3910	duplicate_tree (root, dfa)
	3911	const bin_tree_t *root;
	3912	re_dfa_t *dfa;
	3913	{
	3914	const bin_tree_t *node;
	3915	bin_tree_t *dup_root;
	3916	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
	3917
	3918	for (node = root; ; )
	3919	{
	3920	/* Create a new tree and link it back to the current parent. */
	3921	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
	3922	if (*p_new == NULL)
	3923	return NULL;
	3924	(*p_new)->parent = dup_node;
	3925	(*p_new)->token.duplicated = 1;
	3926	dup_node = *p_new;
	3927
	3928	/* Go to the left node, or up and to the right. */
	3929	if (node->left)
	3930	{
	3931	node = node->left;
	3932	p_new = &dup_node->left;
	3933	}
	3934	else
	3935	{
	3936	const bin_tree_t *prev = NULL;
	3937	while (node->right == prev \|\| node->right == NULL)
	3938	{
	3939	prev = node;
	3940	node = node->parent;
	3941	dup_node = dup_node->parent;
	3942	if (!node)
	3943	return dup_root;
	3944	}
	3945	node = node->right;
	3946	p_new = &dup_node->right;
	3947	}
	3948	}
	3949	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/regcomp.c

Download in other formats: