Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

field.c@ 3840

Visit:

Last change on this file since 3840 was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 27.5 KB

Line
/*
 * field.c - routines for dealing with fields and record parsing
 */

/*
 * Copyright (C) 1986, 1988, 1989, 1991-2005 the Free Software Foundation, Inc.
 *
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 *
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */
25
26	#include "awk.h"
27
/*
 * is_blank --- report whether c is a space or a horizontal tab.
 *
 * In case that the system doesn't have isblank().
 * Don't bother with autoconf ifdef junk, just force it.
 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
 *
 * Returns nonzero for ' ' and '\t', zero for every other value.
 */
static int
is_blank(int c)
{
	return c == ' ' || c == '\t';
}
38
39	typedef void (* Setfunc) P((long, char , long, NODE ));
40
41	static long (parse_field) P((long, char , int, NODE ,
42	Regexp , Setfunc, NODE ));
43	static void rebuild_record P((void));
44	static long re_parse_field P((long, char *, int, NODE ,
45	Regexp , Setfunc, NODE ));
46	static long def_parse_field P((long, char *, int, NODE ,
47	Regexp , Setfunc, NODE ));
48	static long posix_def_parse_field P((long, char *, int, NODE ,
49	Regexp , Setfunc, NODE ));
50	static long null_parse_field P((long, char *, int, NODE ,
51	Regexp , Setfunc, NODE ));
52	static long sc_parse_field P((long, char *, int, NODE ,
53	Regexp , Setfunc, NODE ));
54	static long fw_parse_field P((long, char *, int, NODE ,
55	Regexp , Setfunc, NODE ));
56	static void set_element P((long num, char * str, long len, NODE *arr));
57	static void grow_fields_arr P((long num));
58	static void set_field P((long num, char str, long len, NODE dummy));
59	static void update_PROCINFO P((char subscript, char str));
60
61
62	static char parse_extent; / marks where to restart parse of record */
63	static long parse_high_water = 0; /* field number that we have parsed so far */
64	static long nf_high_water = 0; /* size of fields_arr */
65	static int resave_fs;
66	static NODE save_FS; / save current value of FS when line is read,
67	* to be used in deferred parsing
68	*/
69	static int *FIELDWIDTHS = NULL;
70
71	NODE *fields_arr; / array of pointers to the field nodes */
72	int field0_valid; /* $(>0) has not been changed yet */
73	int default_FS; /* TRUE when FS == " " */
74	Regexp *FS_re_yes_case = NULL;
75	Regexp *FS_re_no_case = NULL;
76	Regexp *FS_regexp = NULL;
77	NODE *Null_field = NULL;
78
79	/* using_FIELDWIDTHS --- static function, macro to avoid overhead */
80	#define using_FIELDWIDTHS() (parse_field == fw_parse_field)
81
82	/* init_fields --- set up the fields array to start with */
83
84	void
85	init_fields()
86	{
87	emalloc(fields_arr, NODE *, sizeof(NODE ), "init_fields");
88	fields_arr[0] = Nnull_string;
89	parse_extent = fields_arr[0]->stptr;
90	save_FS = dupnode(FS_node->var_value);
91	getnode(Null_field);
92	Null_field = Nnull_string;
93	Null_field->flags \|= FIELD;
94	Null_field->flags &= ~(NUMCUR\|NUMBER\|MAYBE_NUM\|PERM);
95	field0_valid = TRUE;
96	}
97
98	/* grow_fields --- acquire new fields as needed */
99
100	static void
101	grow_fields_arr(long num)
102	{
103	register int t;
104	register NODE *n;
105
106	erealloc(fields_arr, NODE *, (num + 1) sizeof(NODE *), "grow_fields_arr");
107	for (t = nf_high_water + 1; t <= num; t++) {
108	getnode(n);
109	n = Null_field;
110	fields_arr[t] = n;
111	}
112	nf_high_water = num;
113	}
114
115	/* set_field --- set the value of a particular field */
116
117	/ARGSUSED/
118	static void
119	set_field(long num,
120	char *str,
121	long len,
122	NODE dummy ATTRIBUTE_UNUSED) / just to make interface same as set_element */
123	{
124	register NODE *n;
125
126	if (num > nf_high_water)
127	grow_fields_arr(num);
128	n = fields_arr[num];
129	n->stptr = str;
130	n->stlen = len;
131	n->flags = (STRCUR\|STRING\|MAYBE_NUM\|FIELD);
132	}
133
134	/* rebuild_record --- Someone assigned a value to $(something).
135	Fix up $0 to be right */
136
137	static void
138	rebuild_record()
139	{
140	/*
141	* use explicit unsigned longs for lengths, in case
142	* a size_t isn't big enough.
143	*/
144	register unsigned long tlen;
145	register unsigned long ofslen;
146	register NODE *tmp;
147	NODE *ofs;
148	char *ops;
149	register char *cops;
150	long i;
151
152	assert(NF != -1);
153
154	tlen = 0;
155	ofs = force_string(OFS_node->var_value);
156	ofslen = ofs->stlen;
157	for (i = NF; i > 0; i--) {
158	tmp = fields_arr[i];
159	tmp = force_string(tmp);
160	tlen += tmp->stlen;
161	}
162	tlen += (NF - 1) * ofslen;
163	if ((long) tlen < 0)
164	tlen = 0;
165	emalloc(ops, char *, tlen + 2, "rebuild_record");
166	cops = ops;
167	ops[0] = '\0';
168	for (i = 1; i <= NF; i++) {
169	tmp = fields_arr[i];
170	/* copy field */
171	if (tmp->stlen == 1)
172	*cops++ = tmp->stptr[0];
173	else if (tmp->stlen != 0) {
174	memcpy(cops, tmp->stptr, tmp->stlen);
175	cops += tmp->stlen;
176	}
177	/* copy OFS */
178	if (i != NF) {
179	if (ofslen == 1)
180	*cops++ = ofs->stptr[0];
181	else if (ofslen != 0) {
182	memcpy(cops, ofs->stptr, ofslen);
183	cops += ofslen;
184	}
185	}
186	}
187	tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
188
189	/*
190	* Since we are about to unref fields_arr[0], we want to find
191	* any fields that still point into it, and have them point
192	* into the new field zero. This has to be done intelligently,
193	* so that unrefing a field doesn't try to unref into the old $0.
194	*/
195	for (cops = ops, i = 1; i <= NF; i++) {
196	if (fields_arr[i]->stlen > 0) {
197	NODE *n;
198	getnode(n);
199
200	if ((fields_arr[i]->flags & FIELD) == 0) {
201	n = Null_field;
202	n->stlen = fields_arr[i]->stlen;
203	if ((fields_arr[i]->flags & (NUMCUR\|NUMBER)) != 0) {
204	n->flags \|= (fields_arr[i]->flags & (NUMCUR\|NUMBER));
205	n->numbr = fields_arr[i]->numbr;
206	}
207	} else {
208	n = (fields_arr[i]);
209	n->flags &= ~(MALLOC\|TEMP\|PERM\|STRING);
210	}
211
212	n->stptr = cops;
213	unref(fields_arr[i]);
214	fields_arr[i] = n;
215	}
216	cops += fields_arr[i]->stlen + ofslen;
217	}
218
219	unref(fields_arr[0]);
220
221	fields_arr[0] = tmp;
222	field0_valid = TRUE;
223	}
224
225	/*
226	* set_record:
227	* setup $0, but defer parsing rest of line until reference is made to $(>0)
228	* or to NF. At that point, parse only as much as necessary.
229	*
230	* Manage a private buffer for the contents of $0. Doing so keeps us safe
231	* if `getline var' decides to rearrange the contents of the IOBUF that
232	* $0 might have been pointing into. The cost is the copying of the buffer;
233	* but better correct than fast.
234	*/
235	void
236	set_record(const char *buf, int cnt)
237	{
238	NODE *n;
239	static char *databuf;
240	static unsigned long databuf_size;
241	#define INITIAL_SIZE 512
242	#define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
243
244	reset_record();
245
246	/* buffer management: */
247	if (databuf_size == 0) { /* first time */
248	emalloc(databuf, char *, INITIAL_SIZE, "set_record");
249	databuf_size = INITIAL_SIZE;
250	memset(databuf, '\0', INITIAL_SIZE);
251
252	}
253	/*
254	* Make sure there's enough room. Since we sometimes need
255	* to place a sentinel at the end, we make sure
256	* databuf_size is > cnt after allocation.
257	*/
258	if (cnt >= databuf_size) {
259	while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
260	databuf_size *= 2;
261	erealloc(databuf, char *, databuf_size, "set_record");
262	memset(databuf, '\0', databuf_size);
263	}
264	/* copy the data */
265	memcpy(databuf, buf, cnt);
266
267	/* manage field 0: */
268	unref(fields_arr[0]);
269	getnode(n);
270	n->stptr = databuf;
271	n->stlen = cnt;
272	n->stref = 1;
273	n->type = Node_val;
274	n->stfmt = -1;
275	n->flags = (STRING\|STRCUR\|MAYBE_NUM\|FIELD);
276	fields_arr[0] = n;
277
278	#undef INITIAL_SIZE
279	#undef MAX_SIZE
280	}
281
282	/* reset_record --- start over again with current $0 */
283
284	void
285	reset_record()
286	{
287	register int i;
288	NODE *n;
289
290	(void) force_string(fields_arr[0]);
291
292	NF = -1;
293	for (i = 1; i <= parse_high_water; i++) {
294	unref(fields_arr[i]);
295	getnode(n);
296	n = Null_field;
297	fields_arr[i] = n;
298	}
299
300	parse_high_water = 0;
301	/*
302	* $0 = $0 should resplit using the current value of FS.
303	*/
304	if (resave_fs) {
305	resave_fs = FALSE;
306	unref(save_FS);
307	save_FS = dupnode(FS_node->var_value);
308	}
309
310	field0_valid = TRUE;
311	}
312
313	/* set_NF --- handle what happens to $0 and fields when NF is changed */
314
315	void
316	set_NF()
317	{
318	register int i;
319	NODE *n;
320
321	assert(NF != -1);
322
323	NF = (long) force_number(NF_node->var_value);
324
325	if (NF < 0)
326	fatal(_("NF set to negative value"));
327
328	if (NF > nf_high_water)
329	grow_fields_arr(NF);
330	if (parse_high_water < NF) {
331	for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
332	unref(fields_arr[i]);
333	getnode(n);
334	n = Null_field;
335	fields_arr[i] = n;
336	}
337	} else if (parse_high_water > 0) {
338	for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
339	unref(fields_arr[i]);
340	getnode(n);
341	n = Null_field;
342	fields_arr[i] = n;
343	}
344	parse_high_water = NF;
345	}
346	field0_valid = FALSE;
347	}
348
349	/*
350	* re_parse_field --- parse fields using a regexp.
351	*
352	* This is called both from get_field() and from do_split()
353	* via (*parse_field)(). This variation is for when FS is a regular
354	* expression -- either user-defined or because RS=="" and FS==" "
355	*/
356	static long
357	re_parse_field(long up_to, /* parse only up to this field number */
358	char *buf, / on input: string to parse; on output: point to start next */
359	int len,
360	NODE *fs ATTRIBUTE_UNUSED,
361	Regexp *rp,
362	Setfunc set, /* routine to set the value of the parsed field */
363	NODE *n)
364	{
365	register char scan = buf;
366	register long nf = parse_high_water;
367	register char *field;
368	register char *end = scan + len;
369	#ifdef MBS_SUPPORT
370	size_t mbclen = 0;
371	mbstate_t mbs;
372	if (gawk_mb_cur_max > 1)
373	memset(&mbs, 0, sizeof(mbstate_t));
374	#endif
375
376	if (up_to == UNLIMITED)
377	nf = 0;
378	if (len == 0)
379	return nf;
380
381	if (RS_is_null && default_FS)
382	while (scan < end && (scan == ' ' \|\| scan == '\t' \|\| *scan == '\n'))
383	scan++;
384	field = scan;
385	while (scan < end
386	&& research(rp, scan, 0, (end - scan), RE_NEED_START) != -1
387	&& nf < up_to) {
388	if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */
389	#ifdef MBS_SUPPORT
390	if (gawk_mb_cur_max > 1) {
391	mbclen = mbrlen(scan, end-scan, &mbs);
392	if ((mbclen == 1) \|\| (mbclen == (size_t) -1)
393	\|\| (mbclen == (size_t) -2) \|\| (mbclen == 0)) {
394	/* We treat it as a singlebyte character. */
395	mbclen = 1;
396	}
397	scan += mbclen;
398	} else
399	#endif
400	scan++;
401	if (scan == end) {
402	(*set)(++nf, field, (long)(scan - field), n);
403	up_to = nf;
404	break;
405	}
406	continue;
407	}
408	(*set)(++nf, field,
409	(long)(scan + RESTART(rp, scan) - field), n);
410	scan += REEND(rp, scan);
411	field = scan;
412	if (scan == end) /* FS at end of record */
413	(*set)(++nf, field, 0L, n);
414	}
415	if (nf != up_to && scan < end) {
416	(*set)(++nf, scan, (long)(end - scan), n);
417	scan = end;
418	}
419	*buf = scan;
420	return nf;
421	}
422
423	/*
424	* def_parse_field --- default field parsing.
425	*
426	* This is called both from get_field() and from do_split()
427	* via (*parse_field)(). This variation is for when FS is a single space
428	* character.
429	*/
430
431	static long
432	def_parse_field(long up_to, /* parse only up to this field number */
433	char *buf, / on input: string to parse; on output: point to start next */
434	int len,
435	NODE *fs,
436	Regexp *rp ATTRIBUTE_UNUSED,
437	Setfunc set, /* routine to set the value of the parsed field */
438	NODE *n)
439	{
440	register char scan = buf;
441	register long nf = parse_high_water;
442	register char *field;
443	register char *end = scan + len;
444	char sav;
445
446	if (up_to == UNLIMITED)
447	nf = 0;
448	if (len == 0)
449	return nf;
450
451	/*
452	* Nasty special case. If FS set to "", return whole record
453	* as first field. This is not worth a separate function.
454	*/
455	if (fs->stlen == 0) {
456	(set)(++nf, buf, len, n);
457	*buf += len;
458	return nf;
459	}
460
461	/* before doing anything save the char at end /
462	sav = *end;
463	/* because it will be destroyed now: */
464
465	end = ' '; / sentinel character */
466	for (; nf < up_to; scan++) {
467	/*
468	* special case: fs is single space, strip leading whitespace
469	*/
470	while (scan < end && (scan == ' ' \|\| scan == '\t' \|\| *scan == '\n'))
471	scan++;
472	if (scan >= end)
473	break;
474	field = scan;
475	while (scan != ' ' && scan != '\t' && *scan != '\n')
476	scan++;
477	(*set)(++nf, field, (long)(scan - field), n);
478	if (scan == end)
479	break;
480	}
481
482	/* everything done, restore original char at end /
483	*end = sav;
484
485	*buf = scan;
486	return nf;
487	}
488
489	/*
490	* posix_def_parse_field --- default field parsing.
491	*
492	* This is called both from get_field() and from do_split()
493	* via (*parse_field)(). This variation is for when FS is a single space
494	* character. The only difference between this and def_parse_field()
495	* is that this one does not allow newlines to separate fields.
496	*/
497
498	static long
499	posix_def_parse_field(long up_to, /* parse only up to this field number */
500	char *buf, / on input: string to parse; on output: point to start next */
501	int len,
502	NODE *fs,
503	Regexp *rp ATTRIBUTE_UNUSED,
504	Setfunc set, /* routine to set the value of the parsed field */
505	NODE *n)
506	{
507	register char scan = buf;
508	register long nf = parse_high_water;
509	register char *field;
510	register char *end = scan + len;
511	char sav;
512
513	if (up_to == UNLIMITED)
514	nf = 0;
515	if (len == 0)
516	return nf;
517
518	/*
519	* Nasty special case. If FS set to "", return whole record
520	* as first field. This is not worth a separate function.
521	*/
522	if (fs->stlen == 0) {
523	(set)(++nf, buf, len, n);
524	*buf += len;
525	return nf;
526	}
527
528	/* before doing anything save the char at end /
529	sav = *end;
530	/* because it will be destroyed now: */
531
532	end = ' '; / sentinel character */
533	for (; nf < up_to; scan++) {
534	/*
535	* special case: fs is single space, strip leading whitespace
536	*/
537	while (scan < end && (scan == ' ' \|\| scan == '\t'))
538	scan++;
539	if (scan >= end)
540	break;
541	field = scan;
542	while (scan != ' ' && scan != '\t')
543	scan++;
544	(*set)(++nf, field, (long)(scan - field), n);
545	if (scan == end)
546	break;
547	}
548
549	/* everything done, restore original char at end /
550	*end = sav;
551
552	*buf = scan;
553	return nf;
554	}
555
556	/*
557	* null_parse_field --- each character is a separate field
558	*
559	* This is called both from get_field() and from do_split()
560	* via (*parse_field)(). This variation is for when FS is the null string.
561	*/
562	static long
563	null_parse_field(long up_to, /* parse only up to this field number */
564	char *buf, / on input: string to parse; on output: point to start next */
565	int len,
566	NODE *fs ATTRIBUTE_UNUSED,
567	Regexp *rp ATTRIBUTE_UNUSED,
568	Setfunc set, /* routine to set the value of the parsed field */
569	NODE *n)
570	{
571	register char scan = buf;
572	register long nf = parse_high_water;
573	register char *end = scan + len;
574
575	if (up_to == UNLIMITED)
576	nf = 0;
577	if (len == 0)
578	return nf;
579
580	#ifdef MBS_SUPPORT
581	if (gawk_mb_cur_max > 1) {
582	mbstate_t mbs;
583	memset(&mbs, 0, sizeof(mbstate_t));
584	for (; nf < up_to && scan < end;) {
585	size_t mbclen = mbrlen(scan, end-scan, &mbs);
586	if ((mbclen == 1) \|\| (mbclen == (size_t) -1)
587	\|\| (mbclen == (size_t) -2) \|\| (mbclen == 0)) {
588	/* We treat it as a singlebyte character. */
589	mbclen = 1;
590	}
591	(*set)(++nf, scan, mbclen, n);
592	scan += mbclen;
593	}
594	} else
595	#endif
596	for (; nf < up_to && scan < end; scan++)
597	(*set)(++nf, scan, 1L, n);
598
599	*buf = scan;
600	return nf;
601	}
602
603	/*
604	* sc_parse_field --- single character field separator
605	*
606	* This is called both from get_field() and from do_split()
607	* via (*parse_field)(). This variation is for when FS is a single character
608	* other than space.
609	*/
610	static long
611	sc_parse_field(long up_to, /* parse only up to this field number */
612	char *buf, / on input: string to parse; on output: point to start next */
613	int len,
614	NODE *fs,
615	Regexp *rp ATTRIBUTE_UNUSED,
616	Setfunc set, /* routine to set the value of the parsed field */
617	NODE *n)
618	{
619	register char scan = buf;
620	register char fschar;
621	register long nf = parse_high_water;
622	register char *field;
623	register char *end = scan + len;
624	char sav;
625	#ifdef MBS_SUPPORT
626	size_t mbclen = 0;
627	mbstate_t mbs;
628	if (gawk_mb_cur_max > 1)
629	memset(&mbs, 0, sizeof(mbstate_t));
630	#endif
631
632	if (up_to == UNLIMITED)
633	nf = 0;
634	if (len == 0)
635	return nf;
636
637	if (RS_is_null && fs->stlen == 0)
638	fschar = '\n';
639	else
640	fschar = fs->stptr[0];
641
642	/* before doing anything save the char at end /
643	sav = *end;
644	/* because it will be destroyed now: */
645	end = fschar; / sentinel character */
646
647	for (; nf < up_to;) {
648	field = scan;
649	#ifdef MBS_SUPPORT
650	if (gawk_mb_cur_max > 1) {
651	while (*scan != fschar) {
652	mbclen = mbrlen(scan, end-scan, &mbs);
653	if ((mbclen == 1) \|\| (mbclen == (size_t) -1)
654	\|\| (mbclen == (size_t) -2) \|\| (mbclen == 0)) {
655	/* We treat it as a singlebyte character. */
656	mbclen = 1;
657	}
658	scan += mbclen;
659	}
660	} else
661	#endif
662	while (*scan != fschar)
663	scan++;
664	(*set)(++nf, field, (long)(scan - field), n);
665	if (scan == end)
666	break;
667	scan++;
668	if (scan == end) { /* FS at end of record */
669	(*set)(++nf, field, 0L, n);
670	break;
671	}
672	}
673
674	/* everything done, restore original char at end /
675	*end = sav;
676
677	*buf = scan;
678	return nf;
679	}
680
681	/*
682	* fw_parse_field --- field parsing using FIELDWIDTHS spec
683	*
684	* This is called from get_field() via (*parse_field)().
685	* This variation is for fields are fixed widths.
686	*/
687	static long
688	fw_parse_field(long up_to, /* parse only up to this field number */
689	char *buf, / on input: string to parse; on output: point to start next */
690	int len,
691	NODE *fs ATTRIBUTE_UNUSED,
692	Regexp *rp ATTRIBUTE_UNUSED,
693	Setfunc set, /* routine to set the value of the parsed field */
694	NODE *n)
695	{
696	register char scan = buf;
697	register long nf = parse_high_water;
698	register char *end = scan + len;
699
700	if (up_to == UNLIMITED)
701	nf = 0;
702	if (len == 0)
703	return nf;
704	for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
705	if (len > end - scan)
706	len = end - scan;
707	(*set)(++nf, scan, (long) len, n);
708	scan += len;
709	}
710	if (len == -1)
711	*buf = end;
712	else
713	*buf = scan;
714	return nf;
715	}
716
717	/* get_field --- return a particular $n */
718
719	/* assign is not NULL if this field is on the LHS of an assign */
720
721	NODE **
722	get_field(register long requested, Func_ptr *assign)
723	{
724	/*
725	* if requesting whole line but some other field has been altered,
726	* then the whole line must be rebuilt
727	*/
728	if (requested == 0) {
729	if (! field0_valid) {
730	/* first, parse remainder of input record */
731	if (NF == -1) {
732	NF = (*parse_field)(UNLIMITED-1, &parse_extent,
733	fields_arr[0]->stlen -
734	(parse_extent - fields_arr[0]->stptr),
735	save_FS, FS_regexp, set_field,
736	(NODE *) NULL);
737	parse_high_water = NF;
738	}
739	rebuild_record();
740	}
741	if (assign != NULL)
742	*assign = reset_record;
743	return &fields_arr[0];
744	}
745
746	/* assert(requested > 0); */
747
748	if (assign != NULL)
749	field0_valid = FALSE; /* $0 needs reconstruction */
750
751	if (requested <= parse_high_water) /* already parsed this field */
752	return &fields_arr[requested];
753
754	if (NF == -1) { /* have not yet parsed to end of record */
755	/*
756	* parse up to requested fields, calling set_field() for each,
757	* saving in parse_extent the point where the parse left off
758	*/
759	if (parse_high_water == 0) /* starting at the beginning */
760	parse_extent = fields_arr[0]->stptr;
761	parse_high_water = (*parse_field)(requested, &parse_extent,
762	fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
763	save_FS, FS_regexp, set_field, (NODE *) NULL);
764
765	/*
766	* if we reached the end of the record, set NF to the number of
767	* fields so far. Note that requested might actually refer to
768	* a field that is beyond the end of the record, but we won't
769	* set NF to that value at this point, since this is only a
770	* reference to the field and NF only gets set if the field
771	* is assigned to -- this case is handled below
772	*/
773	if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
774	NF = parse_high_water;
775	if (requested == UNLIMITED-1) /* UNLIMITED-1 means set NF */
776	requested = parse_high_water;
777	}
778	if (parse_high_water < requested) { /* requested beyond end of record */
779	if (assign != NULL) { /* expand record */
780	if (requested > nf_high_water)
781	grow_fields_arr(requested);
782
783	NF = requested;
784	parse_high_water = requested;
785	} else
786	return &Null_field;
787	}
788
789	return &fields_arr[requested];
790	}
791
792	/* set_element --- set an array element, used by do_split() */
793
794	static void
795	set_element(long num, char s, long len, NODE n)
796	{
797	register NODE *it;
798
799	it = make_string(s, len);
800	it->flags \|= MAYBE_NUM;
801	*assoc_lookup(n, tmp_number((AWKNUM) (num)), FALSE) = it;
802	}
803
804	/* do_split --- implement split(), semantics are same as for field splitting */
805
806	NODE *
807	do_split(NODE *tree)
808	{
809	NODE src, arr, sep, fs, src2, fs2, *tmp;
810	char *s;
811	long (parseit) P((long, char , int, NODE ,
812	Regexp , Setfunc, NODE ));
813	Regexp *rp = NULL;
814
815	src = force_string(tree_eval(tree->lnode));
816
817	arr = get_param(tree->rnode->lnode);
818	if (arr->type != Node_var_array)
819	fatal(_("split: second argument is not an array"));
820
821	sep = tree->rnode->rnode->lnode;
822
823	if (src->stlen == 0) {
824	/*
825	* Skip the work if first arg is the null string.
826	*/
827	free_temp(src);
828	/*
829	* Evaluate sep if it may have side effects.
830	*/
831	if ((sep->re_flags & (FS_DFLT\|CONST)) == 0)
832	free_temp(tree_eval(sep->re_exp));
833	/*
834	* And now we can safely turn off the array.
835	*/
836	assoc_clear(arr);
837	return tmp_number((AWKNUM) 0);
838	}
839
840	if ((sep->re_flags & FS_DFLT) != 0 && ! using_FIELDWIDTHS() && ! RS_is_null) {
841	parseit = parse_field;
842	fs = force_string(FS_node->var_value);
843	rp = FS_regexp;
844	} else {
845	fs = force_string(tree_eval(sep->re_exp));
846	if (fs->stlen == 0) {
847	static short warned = FALSE;
848
849	parseit = null_parse_field;
850
851	if (do_lint && ! warned) {
852	warned = TRUE;
853	lintwarn(_("split: null string for third arg is a gawk extension"));
854	}
855	} else if (fs->stlen == 1 && (sep->re_flags & CONST) == 0) {
856	if (fs->stptr[0] == ' ') {
857	if (do_posix)
858	parseit = posix_def_parse_field;
859	else
860	parseit = def_parse_field;
861	} else
862	parseit = sc_parse_field;
863	} else {
864	parseit = re_parse_field;
865	rp = re_update(sep);
866	}
867	}
868
869	/*
870	* do dupnode(), to avoid problems like
871	* x = split(a["LINE"], a, a["FS"])
872	* since we assoc_clear the array. gack.
873	* this also gives us complete call by value semantics.
874	*/
875	src2 = dupnode(src);
876	free_temp(src);
877
878	fs2 = dupnode(fs);
879	free_temp(fs);
880
881	assoc_clear(arr);
882
883	s = src2->stptr;
884	tmp = tmp_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src2->stlen,
885	fs2, rp, set_element, arr));
886	unref(src2);
887	unref(fs2);
888	return tmp;
889	}
890
891	/* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
892
893	void
894	set_FIELDWIDTHS()
895	{
896	register char *scan;
897	char *end;
898	register int i;
899	static int fw_alloc = 4;
900	static int warned = FALSE;
901	extern unsigned long strtoul P((const char , char *endptr, int base));
902
903	if (do_lint && ! warned) {
904	warned = TRUE;
905	lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
906	}
907	if (do_traditional) /* quick and dirty, does the trick */
908	return;
909
910	/*
911	* If changing the way fields are split, obey least-suprise
912	* semantics, and force $0 to be split totally.
913	*/
914	if (fields_arr != NULL)
915	(void) get_field(UNLIMITED - 1, 0);
916
917	parse_field = fw_parse_field;
918	scan = force_string(FIELDWIDTHS_node->var_value)->stptr;
919	end = scan + 1;
920	if (FIELDWIDTHS == NULL)
921	emalloc(FIELDWIDTHS, int , fw_alloc sizeof(int), "set_FIELDWIDTHS");
922	FIELDWIDTHS[0] = 0;
923	for (i = 1; ; i++) {
924	unsigned long int tmp;
925	if (i >= fw_alloc) {
926	fw_alloc *= 2;
927	erealloc(FIELDWIDTHS, int , fw_alloc sizeof(int), "set_FIELDWIDTHS");
928	}
929	/* Ensure that there is no leading `-' sign. Otherwise,
930	strtoul would accept it and return a bogus result. */
931	while (is_blank(*scan)) {
932	++scan;
933	}
934	if (*scan == '-')
935	fatal(_("invalid FIELDWIDTHS value, near `%s'"),
936	scan);
937
938	/* Detect an invalid base-10 integer, a valid value that
939	is followed by something other than a blank or '\0',
940	or a value that is not in the range [1..INT_MAX]. */
941	errno = 0;
942	tmp = strtoul(scan, &end, 10);
943	if (errno != 0
944	\|\| !(end == '\0' \|\| is_blank(end))
945	\|\| !(0 < tmp && tmp <= INT_MAX))
946	fatal(_("invalid FIELDWIDTHS value, near `%s'"),
947	scan);
948	FIELDWIDTHS[i] = tmp;
949	scan = end;
950	/* Skip past any trailing blanks. */
951	while (is_blank(*scan)) {
952	++scan;
953	}
954	if (*scan == '\0')
955	break;
956	}
957	FIELDWIDTHS[i] = -1;
958
959	update_PROCINFO("FS", "FIELDWIDTHS");
960	}
961
962	/* set_FS --- handle things when FS is assigned to */
963
964	void
965	set_FS()
966	{
967	char buf[10];
968	NODE *fs;
969	static NODE *save_fs = NULL;
970	static NODE *save_rs = NULL;
971	int remake_re = TRUE;
972
973	/*
974	* If changing the way fields are split, obey least-suprise
975	* semantics, and force $0 to be split totally.
976	*/
977	if (fields_arr != NULL)
978	(void) get_field(UNLIMITED - 1, 0);
979
980	/* It's possible that only IGNORECASE changed, or FS = FS */
981	/*
982	* This comparison can't use cmp_nodes(), which pays attention
983	* to IGNORECASE, and that's not what we want.
984	*/
985	if (save_fs
986	&& FS_node->var_value->stlen == save_fs->stlen
987	&& memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
988	&& save_rs
989	&& RS_node->var_value->stlen == save_rs->stlen
990	&& memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
991	if (FS_regexp != NULL)
992	FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
993
994	/* FS = FS */
995	if (! using_FIELDWIDTHS()) {
996	return;
997	} else {
998	remake_re = FALSE;
999	goto choose_fs_function;
1000	}
1001	}
1002
1003	unref(save_fs);
1004	save_fs = dupnode(FS_node->var_value);
1005	unref(save_rs);
1006	save_rs = dupnode(RS_node->var_value);
1007	resave_fs = TRUE;
1008	if (FS_regexp != NULL) {
1009	refree(FS_re_yes_case);
1010	refree(FS_re_no_case);
1011	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1012	}
1013
1014
1015	choose_fs_function:
1016	buf[0] = '\0';
1017	default_FS = FALSE;
1018	fs = force_string(FS_node->var_value);
1019
1020	if (! do_traditional && fs->stlen == 0) {
1021	static short warned = FALSE;
1022
1023	parse_field = null_parse_field;
1024
1025	if (do_lint && ! warned) {
1026	warned = TRUE;
1027	lintwarn(_("null string for `FS' is a gawk extension"));
1028	}
1029	} else if (fs->stlen > 1) {
1030	parse_field = re_parse_field;
1031	} else if (RS_is_null) {
1032	/* we know that fs->stlen <= 1 */
1033	parse_field = sc_parse_field;
1034	if (fs->stlen == 1) {
1035	if (fs->stptr[0] == ' ') {
1036	default_FS = TRUE;
1037	strcpy(buf, "[ \t\n]+");
1038	} else if (fs->stptr[0] == '\\') {
1039	/* yet another special case */
1040	strcpy(buf, "[\\\\\n]");
1041	} else if (fs->stptr[0] != '\n')
1042	sprintf(buf, "[%c\n]", fs->stptr[0]);
1043	}
1044	} else {
1045	if (do_posix)
1046	parse_field = posix_def_parse_field;
1047	else
1048	parse_field = def_parse_field;
1049
1050	if (fs->stlen == 1) {
1051	if (fs->stptr[0] == ' ')
1052	default_FS = TRUE;
1053	else if (fs->stptr[0] == '\\')
1054	/* same special case */
1055	strcpy(buf, "[\\\\]");
1056	else
1057	parse_field = sc_parse_field;
1058	}
1059	}
1060	if (remake_re) {
1061	if (FS_regexp != NULL) {
1062	refree(FS_re_yes_case);
1063	refree(FS_re_no_case);
1064	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1065	}
1066
1067	if (buf[0] != '\0') {
1068	FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE, TRUE);
1069	FS_re_no_case = make_regexp(buf, strlen(buf), TRUE, TRUE);
1070	FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1071	parse_field = re_parse_field;
1072	} else if (parse_field == re_parse_field) {
1073	FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, FALSE, TRUE);
1074	FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE, TRUE);
1075	FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1076	} else
1077	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1078	}
1079
1080	/*
1081	* For FS = "c", we don't use IGNORECASE. But we must use
1082	* re_parse_field to get the character and the newline as
1083	* field separators.
1084	*/
1085	if (fs->stlen == 1 && parse_field == re_parse_field)
1086	FS_regexp = FS_re_yes_case;
1087
1088	update_PROCINFO("FS", "FS");
1089	}
1090
1091	/* using_fieldwidths --- is FS or FIELDWIDTHS in use? */
1092
1093	int
1094	using_fieldwidths()
1095	{
1096	return using_FIELDWIDTHS();
1097	}
1098
1099	/* update_PROCINFO --- update PROCINFO[sub] when FS or FIELDWIDTHS set */
1100
1101	static void
1102	update_PROCINFO(char subscript, char str)
1103	{
1104	NODE **aptr;
1105
1106	if (PROCINFO_node == NULL)
1107	return;
1108
1109	aptr = assoc_lookup(PROCINFO_node, tmp_string(subscript, strlen(subscript)), FALSE);
1110	assign_val(aptr, tmp_string(str, strlen(str)));
1111	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/field.c@ 3840

Download in other formats: