source: trunk/essentials/sys-apps/gawk/field.c@ 3871

Last change on this file since 3871 was 3076, checked in by bird, 19 years ago

gawk 3.1.5

File size: 27.5 KB
Line 
1/*
2 * field.c - routines for dealing with fields and record parsing
3 */
4
5/*
6 * Copyright (C) 1986, 1988, 1989, 1991-2005 the Free Software Foundation, Inc.
7 *
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
10 *
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 */
25
26#include "awk.h"
27
/*
 * is_blank --- local replacement for isblank(), which not every
 * system provides.  Rather than probe for it with autoconf, this
 * file always uses its own version.
 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
 *
 * Returns 1 when c is a space or a horizontal tab, 0 otherwise.
 */
static int
is_blank(int c)
{
	switch (c) {
	case ' ':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
38
/*
 * Setfunc --- callback through which a field parser stores one field.
 * Arguments: field number, start of the field text, its length, and a
 * NODE handed through unchanged from the parser's caller (set_field()
 * ignores it; set_element() uses it as the destination array).
 */
typedef void (* Setfunc) P((long, char *, long, NODE *));

/*
 * The field parser currently in effect.  It is assigned by set_FS()
 * and set_FIELDWIDTHS() and called from get_field() and do_split().
 * All parsers share the signature:
 *	(up_to, &buf, len, fs, regexp, set, passthrough) -> field count
 */
static long (*parse_field) P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static void rebuild_record P((void));
static long re_parse_field P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static long def_parse_field P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static long posix_def_parse_field P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static long null_parse_field P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static long sc_parse_field P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static long fw_parse_field P((long, char **, int, NODE *,
			      Regexp *, Setfunc, NODE *));
static void set_element P((long num, char * str, long len, NODE *arr));
static void grow_fields_arr P((long num));
static void set_field P((long num, char *str, long len, NODE *dummy));
static void update_PROCINFO P((char *subscript, char *str));


static char *parse_extent;	/* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
static long nf_high_water = 0;	/* highest valid index in fields_arr
				 * (the array holds nf_high_water + 1 slots)
				 */
static int resave_fs;		/* set by set_FS(); tells reset_record() to
				 * refresh save_FS from the current FS
				 */
static NODE *save_FS;		/* save current value of FS when line is read,
				 * to be used in deferred parsing
				 */
static int *FIELDWIDTHS = NULL;	/* widths parsed by set_FIELDWIDTHS(): slot 0
				 * is 0, widths start at slot 1, and a -1
				 * entry ends the list
				 */

NODE **fields_arr;		/* array of pointers to the field nodes */
int field0_valid;		/* $(>0) has not been changed yet */
int default_FS;			/* TRUE when FS == " " */
Regexp *FS_re_yes_case = NULL;	/* FS regexp built by set_FS() with FALSE as
				 * make_regexp()'s third argument (presumably
				 * the case-sensitive variant)
				 */
Regexp *FS_re_no_case = NULL;	/* same pattern built with TRUE (presumably
				 * the case-insensitive variant)
				 */
Regexp *FS_regexp = NULL;	/* whichever of the two is currently in use */
NODE *Null_field = NULL;	/* template copied into every empty field slot */

/* using_FIELDWIDTHS --- static function, macro to avoid overhead */
#define using_FIELDWIDTHS()	(parse_field == fw_parse_field)
81
82/* init_fields --- set up the fields array to start with */
83
84void
85init_fields()
86{
87 emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
88 fields_arr[0] = Nnull_string;
89 parse_extent = fields_arr[0]->stptr;
90 save_FS = dupnode(FS_node->var_value);
91 getnode(Null_field);
92 *Null_field = *Nnull_string;
93 Null_field->flags |= FIELD;
94 Null_field->flags &= ~(NUMCUR|NUMBER|MAYBE_NUM|PERM);
95 field0_valid = TRUE;
96}
97
98/* grow_fields --- acquire new fields as needed */
99
100static void
101grow_fields_arr(long num)
102{
103 register int t;
104 register NODE *n;
105
106 erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
107 for (t = nf_high_water + 1; t <= num; t++) {
108 getnode(n);
109 *n = *Null_field;
110 fields_arr[t] = n;
111 }
112 nf_high_water = num;
113}
114
115/* set_field --- set the value of a particular field */
116
117/*ARGSUSED*/
118static void
119set_field(long num,
120 char *str,
121 long len,
122 NODE *dummy ATTRIBUTE_UNUSED) /* just to make interface same as set_element */
123{
124 register NODE *n;
125
126 if (num > nf_high_water)
127 grow_fields_arr(num);
128 n = fields_arr[num];
129 n->stptr = str;
130 n->stlen = len;
131 n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
132}
133
134/* rebuild_record --- Someone assigned a value to $(something).
135 Fix up $0 to be right */
136
137static void
138rebuild_record()
139{
140 /*
141 * use explicit unsigned longs for lengths, in case
142 * a size_t isn't big enough.
143 */
144 register unsigned long tlen;
145 register unsigned long ofslen;
146 register NODE *tmp;
147 NODE *ofs;
148 char *ops;
149 register char *cops;
150 long i;
151
152 assert(NF != -1);
153
154 tlen = 0;
155 ofs = force_string(OFS_node->var_value);
156 ofslen = ofs->stlen;
157 for (i = NF; i > 0; i--) {
158 tmp = fields_arr[i];
159 tmp = force_string(tmp);
160 tlen += tmp->stlen;
161 }
162 tlen += (NF - 1) * ofslen;
163 if ((long) tlen < 0)
164 tlen = 0;
165 emalloc(ops, char *, tlen + 2, "rebuild_record");
166 cops = ops;
167 ops[0] = '\0';
168 for (i = 1; i <= NF; i++) {
169 tmp = fields_arr[i];
170 /* copy field */
171 if (tmp->stlen == 1)
172 *cops++ = tmp->stptr[0];
173 else if (tmp->stlen != 0) {
174 memcpy(cops, tmp->stptr, tmp->stlen);
175 cops += tmp->stlen;
176 }
177 /* copy OFS */
178 if (i != NF) {
179 if (ofslen == 1)
180 *cops++ = ofs->stptr[0];
181 else if (ofslen != 0) {
182 memcpy(cops, ofs->stptr, ofslen);
183 cops += ofslen;
184 }
185 }
186 }
187 tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
188
189 /*
190 * Since we are about to unref fields_arr[0], we want to find
191 * any fields that still point into it, and have them point
192 * into the new field zero. This has to be done intelligently,
193 * so that unrefing a field doesn't try to unref into the old $0.
194 */
195 for (cops = ops, i = 1; i <= NF; i++) {
196 if (fields_arr[i]->stlen > 0) {
197 NODE *n;
198 getnode(n);
199
200 if ((fields_arr[i]->flags & FIELD) == 0) {
201 *n = *Null_field;
202 n->stlen = fields_arr[i]->stlen;
203 if ((fields_arr[i]->flags & (NUMCUR|NUMBER)) != 0) {
204 n->flags |= (fields_arr[i]->flags & (NUMCUR|NUMBER));
205 n->numbr = fields_arr[i]->numbr;
206 }
207 } else {
208 *n = *(fields_arr[i]);
209 n->flags &= ~(MALLOC|TEMP|PERM|STRING);
210 }
211
212 n->stptr = cops;
213 unref(fields_arr[i]);
214 fields_arr[i] = n;
215 }
216 cops += fields_arr[i]->stlen + ofslen;
217 }
218
219 unref(fields_arr[0]);
220
221 fields_arr[0] = tmp;
222 field0_valid = TRUE;
223}
224
225/*
226 * set_record:
227 * setup $0, but defer parsing rest of line until reference is made to $(>0)
228 * or to NF. At that point, parse only as much as necessary.
229 *
230 * Manage a private buffer for the contents of $0. Doing so keeps us safe
231 * if `getline var' decides to rearrange the contents of the IOBUF that
232 * $0 might have been pointing into. The cost is the copying of the buffer;
233 * but better correct than fast.
234 */
235void
236set_record(const char *buf, int cnt)
237{
238 NODE *n;
239 static char *databuf;
240 static unsigned long databuf_size;
241#define INITIAL_SIZE 512
242#define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
243
244 reset_record();
245
246 /* buffer management: */
247 if (databuf_size == 0) { /* first time */
248 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
249 databuf_size = INITIAL_SIZE;
250 memset(databuf, '\0', INITIAL_SIZE);
251
252 }
253 /*
254 * Make sure there's enough room. Since we sometimes need
255 * to place a sentinel at the end, we make sure
256 * databuf_size is > cnt after allocation.
257 */
258 if (cnt >= databuf_size) {
259 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
260 databuf_size *= 2;
261 erealloc(databuf, char *, databuf_size, "set_record");
262 memset(databuf, '\0', databuf_size);
263 }
264 /* copy the data */
265 memcpy(databuf, buf, cnt);
266
267 /* manage field 0: */
268 unref(fields_arr[0]);
269 getnode(n);
270 n->stptr = databuf;
271 n->stlen = cnt;
272 n->stref = 1;
273 n->type = Node_val;
274 n->stfmt = -1;
275 n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
276 fields_arr[0] = n;
277
278#undef INITIAL_SIZE
279#undef MAX_SIZE
280}
281
282/* reset_record --- start over again with current $0 */
283
284void
285reset_record()
286{
287 register int i;
288 NODE *n;
289
290 (void) force_string(fields_arr[0]);
291
292 NF = -1;
293 for (i = 1; i <= parse_high_water; i++) {
294 unref(fields_arr[i]);
295 getnode(n);
296 *n = *Null_field;
297 fields_arr[i] = n;
298 }
299
300 parse_high_water = 0;
301 /*
302 * $0 = $0 should resplit using the current value of FS.
303 */
304 if (resave_fs) {
305 resave_fs = FALSE;
306 unref(save_FS);
307 save_FS = dupnode(FS_node->var_value);
308 }
309
310 field0_valid = TRUE;
311}
312
313/* set_NF --- handle what happens to $0 and fields when NF is changed */
314
315void
316set_NF()
317{
318 register int i;
319 NODE *n;
320
321 assert(NF != -1);
322
323 NF = (long) force_number(NF_node->var_value);
324
325 if (NF < 0)
326 fatal(_("NF set to negative value"));
327
328 if (NF > nf_high_water)
329 grow_fields_arr(NF);
330 if (parse_high_water < NF) {
331 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
332 unref(fields_arr[i]);
333 getnode(n);
334 *n = *Null_field;
335 fields_arr[i] = n;
336 }
337 } else if (parse_high_water > 0) {
338 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
339 unref(fields_arr[i]);
340 getnode(n);
341 *n = *Null_field;
342 fields_arr[i] = n;
343 }
344 parse_high_water = NF;
345 }
346 field0_valid = FALSE;
347}
348
/*
 * re_parse_field --- parse fields using a regexp.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)().  This variation is for when FS is a regular
 * expression -- either user-defined or because RS=="" and FS==" "
 *
 * Field numbering continues from parse_high_water, or restarts at 0
 * when up_to is UNLIMITED.  Each field found is handed to (*set)
 * together with n.  On return *buf points at the place where parsing
 * stopped, and the return value is the number of the last field set.
 */
static long
re_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs ATTRIBUTE_UNUSED,
	Regexp *rp,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n)
{
	register char *scan = *buf;
	register long nf = parse_high_water;
	register char *field;
	register char *end = scan + len;
#ifdef MBS_SUPPORT
	size_t mbclen = 0;
	mbstate_t mbs;
	if (gawk_mb_cur_max > 1)
		memset(&mbs, 0, sizeof(mbstate_t));
#endif

	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/* with RS == "" and the default FS, leading whitespace (newlines too) is skipped */
	if (RS_is_null && default_FS)
		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
			scan++;
	field = scan;
	/*
	 * Search for the next separator starting at scan.  RESTART and
	 * REEND are used below as offsets relative to scan.
	 * NOTE(review): research() is taken to return -1 when nothing
	 * matches -- confirm against re.c.
	 */
	while (scan < end
	       && research(rp, scan, 0, (end - scan), RE_NEED_START) != -1
	       && nf < up_to) {
		if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
			/*
			 * An empty match separates nothing: step over one
			 * character (multibyte-aware) and search again,
			 * leaving the start of the current field alone.
			 */
#ifdef MBS_SUPPORT
			if (gawk_mb_cur_max > 1)	{
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
					|| (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character.  */
					mbclen = 1;
				}
				scan += mbclen;
			} else
#endif
			scan++;
			if (scan == end) {
				/* ran off the end: what is left is the final field */
				(*set)(++nf, field, (long)(scan - field), n);
				/* makes the `nf != up_to' test after the loop fail */
				up_to = nf;
				break;
			}
			continue;
		}
		/* the field runs from its start up to where the separator begins */
		(*set)(++nf, field,
		       (long)(scan + RESTART(rp, scan) - field), n);
		scan += REEND(rp, scan);
		field = scan;
		if (scan == end)	/* FS at end of record */
			(*set)(++nf, field, 0L, n);
	}
	/* no further separator, but text remains: it is the last field */
	if (nf != up_to && scan < end) {
		(*set)(++nf, scan, (long)(end - scan), n);
		scan = end;
	}
	*buf = scan;
	return nf;
}
422
423/*
424 * def_parse_field --- default field parsing.
425 *
426 * This is called both from get_field() and from do_split()
427 * via (*parse_field)(). This variation is for when FS is a single space
428 * character.
429 */
430
431static long
432def_parse_field(long up_to, /* parse only up to this field number */
433 char **buf, /* on input: string to parse; on output: point to start next */
434 int len,
435 NODE *fs,
436 Regexp *rp ATTRIBUTE_UNUSED,
437 Setfunc set, /* routine to set the value of the parsed field */
438 NODE *n)
439{
440 register char *scan = *buf;
441 register long nf = parse_high_water;
442 register char *field;
443 register char *end = scan + len;
444 char sav;
445
446 if (up_to == UNLIMITED)
447 nf = 0;
448 if (len == 0)
449 return nf;
450
451 /*
452 * Nasty special case. If FS set to "", return whole record
453 * as first field. This is not worth a separate function.
454 */
455 if (fs->stlen == 0) {
456 (*set)(++nf, *buf, len, n);
457 *buf += len;
458 return nf;
459 }
460
461 /* before doing anything save the char at *end */
462 sav = *end;
463 /* because it will be destroyed now: */
464
465 *end = ' '; /* sentinel character */
466 for (; nf < up_to; scan++) {
467 /*
468 * special case: fs is single space, strip leading whitespace
469 */
470 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
471 scan++;
472 if (scan >= end)
473 break;
474 field = scan;
475 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
476 scan++;
477 (*set)(++nf, field, (long)(scan - field), n);
478 if (scan == end)
479 break;
480 }
481
482 /* everything done, restore original char at *end */
483 *end = sav;
484
485 *buf = scan;
486 return nf;
487}
488
489/*
490 * posix_def_parse_field --- default field parsing.
491 *
492 * This is called both from get_field() and from do_split()
493 * via (*parse_field)(). This variation is for when FS is a single space
494 * character. The only difference between this and def_parse_field()
495 * is that this one does not allow newlines to separate fields.
496 */
497
498static long
499posix_def_parse_field(long up_to, /* parse only up to this field number */
500 char **buf, /* on input: string to parse; on output: point to start next */
501 int len,
502 NODE *fs,
503 Regexp *rp ATTRIBUTE_UNUSED,
504 Setfunc set, /* routine to set the value of the parsed field */
505 NODE *n)
506{
507 register char *scan = *buf;
508 register long nf = parse_high_water;
509 register char *field;
510 register char *end = scan + len;
511 char sav;
512
513 if (up_to == UNLIMITED)
514 nf = 0;
515 if (len == 0)
516 return nf;
517
518 /*
519 * Nasty special case. If FS set to "", return whole record
520 * as first field. This is not worth a separate function.
521 */
522 if (fs->stlen == 0) {
523 (*set)(++nf, *buf, len, n);
524 *buf += len;
525 return nf;
526 }
527
528 /* before doing anything save the char at *end */
529 sav = *end;
530 /* because it will be destroyed now: */
531
532 *end = ' '; /* sentinel character */
533 for (; nf < up_to; scan++) {
534 /*
535 * special case: fs is single space, strip leading whitespace
536 */
537 while (scan < end && (*scan == ' ' || *scan == '\t'))
538 scan++;
539 if (scan >= end)
540 break;
541 field = scan;
542 while (*scan != ' ' && *scan != '\t')
543 scan++;
544 (*set)(++nf, field, (long)(scan - field), n);
545 if (scan == end)
546 break;
547 }
548
549 /* everything done, restore original char at *end */
550 *end = sav;
551
552 *buf = scan;
553 return nf;
554}
555
556/*
557 * null_parse_field --- each character is a separate field
558 *
559 * This is called both from get_field() and from do_split()
560 * via (*parse_field)(). This variation is for when FS is the null string.
561 */
562static long
563null_parse_field(long up_to, /* parse only up to this field number */
564 char **buf, /* on input: string to parse; on output: point to start next */
565 int len,
566 NODE *fs ATTRIBUTE_UNUSED,
567 Regexp *rp ATTRIBUTE_UNUSED,
568 Setfunc set, /* routine to set the value of the parsed field */
569 NODE *n)
570{
571 register char *scan = *buf;
572 register long nf = parse_high_water;
573 register char *end = scan + len;
574
575 if (up_to == UNLIMITED)
576 nf = 0;
577 if (len == 0)
578 return nf;
579
580#ifdef MBS_SUPPORT
581 if (gawk_mb_cur_max > 1) {
582 mbstate_t mbs;
583 memset(&mbs, 0, sizeof(mbstate_t));
584 for (; nf < up_to && scan < end;) {
585 size_t mbclen = mbrlen(scan, end-scan, &mbs);
586 if ((mbclen == 1) || (mbclen == (size_t) -1)
587 || (mbclen == (size_t) -2) || (mbclen == 0)) {
588 /* We treat it as a singlebyte character. */
589 mbclen = 1;
590 }
591 (*set)(++nf, scan, mbclen, n);
592 scan += mbclen;
593 }
594 } else
595#endif
596 for (; nf < up_to && scan < end; scan++)
597 (*set)(++nf, scan, 1L, n);
598
599 *buf = scan;
600 return nf;
601}
602
603/*
604 * sc_parse_field --- single character field separator
605 *
606 * This is called both from get_field() and from do_split()
607 * via (*parse_field)(). This variation is for when FS is a single character
608 * other than space.
609 */
610static long
611sc_parse_field(long up_to, /* parse only up to this field number */
612 char **buf, /* on input: string to parse; on output: point to start next */
613 int len,
614 NODE *fs,
615 Regexp *rp ATTRIBUTE_UNUSED,
616 Setfunc set, /* routine to set the value of the parsed field */
617 NODE *n)
618{
619 register char *scan = *buf;
620 register char fschar;
621 register long nf = parse_high_water;
622 register char *field;
623 register char *end = scan + len;
624 char sav;
625#ifdef MBS_SUPPORT
626 size_t mbclen = 0;
627 mbstate_t mbs;
628 if (gawk_mb_cur_max > 1)
629 memset(&mbs, 0, sizeof(mbstate_t));
630#endif
631
632 if (up_to == UNLIMITED)
633 nf = 0;
634 if (len == 0)
635 return nf;
636
637 if (RS_is_null && fs->stlen == 0)
638 fschar = '\n';
639 else
640 fschar = fs->stptr[0];
641
642 /* before doing anything save the char at *end */
643 sav = *end;
644 /* because it will be destroyed now: */
645 *end = fschar; /* sentinel character */
646
647 for (; nf < up_to;) {
648 field = scan;
649#ifdef MBS_SUPPORT
650 if (gawk_mb_cur_max > 1) {
651 while (*scan != fschar) {
652 mbclen = mbrlen(scan, end-scan, &mbs);
653 if ((mbclen == 1) || (mbclen == (size_t) -1)
654 || (mbclen == (size_t) -2) || (mbclen == 0)) {
655 /* We treat it as a singlebyte character. */
656 mbclen = 1;
657 }
658 scan += mbclen;
659 }
660 } else
661#endif
662 while (*scan != fschar)
663 scan++;
664 (*set)(++nf, field, (long)(scan - field), n);
665 if (scan == end)
666 break;
667 scan++;
668 if (scan == end) { /* FS at end of record */
669 (*set)(++nf, field, 0L, n);
670 break;
671 }
672 }
673
674 /* everything done, restore original char at *end */
675 *end = sav;
676
677 *buf = scan;
678 return nf;
679}
680
681/*
682 * fw_parse_field --- field parsing using FIELDWIDTHS spec
683 *
684 * This is called from get_field() via (*parse_field)().
685 * This variation is for fields are fixed widths.
686 */
687static long
688fw_parse_field(long up_to, /* parse only up to this field number */
689 char **buf, /* on input: string to parse; on output: point to start next */
690 int len,
691 NODE *fs ATTRIBUTE_UNUSED,
692 Regexp *rp ATTRIBUTE_UNUSED,
693 Setfunc set, /* routine to set the value of the parsed field */
694 NODE *n)
695{
696 register char *scan = *buf;
697 register long nf = parse_high_water;
698 register char *end = scan + len;
699
700 if (up_to == UNLIMITED)
701 nf = 0;
702 if (len == 0)
703 return nf;
704 for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
705 if (len > end - scan)
706 len = end - scan;
707 (*set)(++nf, scan, (long) len, n);
708 scan += len;
709 }
710 if (len == -1)
711 *buf = end;
712 else
713 *buf = scan;
714 return nf;
715}
716
/*
 * get_field --- return a particular $n
 *
 * Returns the address of the slot holding field `requested', parsing
 * more of $0 first if that field has not been reached yet.  A read of
 * a field beyond the end of the record yields &Null_field.
 */

/*
 * assign is not NULL if this field is on the LHS of an assign.
 * For $0, *assign is set to reset_record.
 */

NODE **
get_field(register long requested, Func_ptr *assign)
{
	/*
	 * if requesting whole line but some other field has been altered,
	 * then the whole line must be rebuilt
	 */
	if (requested == 0) {
		if (! field0_valid) {
			/* first, parse remainder of input record */
			if (NF == -1) {
				NF = (*parse_field)(UNLIMITED-1, &parse_extent,
		    			fields_arr[0]->stlen -
					(parse_extent - fields_arr[0]->stptr),
		    			save_FS, FS_regexp, set_field,
					(NODE *) NULL);
				parse_high_water = NF;
			}
			rebuild_record();
		}
		if (assign != NULL)
			*assign = reset_record;
		return &fields_arr[0];
	}

	/* assert(requested > 0); */

	if (assign != NULL)
		field0_valid = FALSE;		/* $0 needs reconstruction */

	if (requested <= parse_high_water)	/* already parsed this field */
		return &fields_arr[requested];

	if (NF == -1) {	/* have not yet parsed to end of record */
		/*
		 * parse up to requested fields, calling set_field() for each,
		 * saving in parse_extent the point where the parse left off
		 */
		if (parse_high_water == 0)	/* starting at the beginning */
			parse_extent = fields_arr[0]->stptr;
		/* the length passed is what remains of $0 beyond parse_extent */
		parse_high_water = (*parse_field)(requested, &parse_extent,
		     fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
		     save_FS, FS_regexp, set_field, (NODE *) NULL);

		/*
		 * if we reached the end of the record, set NF to the number of
		 * fields so far.  Note that requested might actually refer to
		 * a field that is beyond the end of the record, but we won't
		 * set NF to that value at this point, since this is only a
		 * reference to the field and NF only gets set if the field
		 * is assigned to -- this case is handled below
		 */
		if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
			NF = parse_high_water;
		if (requested == UNLIMITED-1)	/* UNLIMITED-1 means set NF */
			requested = parse_high_water;
	}
	if (parse_high_water < requested) { /* requested beyond end of record */
		if (assign != NULL) {	/* expand record */
			if (requested > nf_high_water)
				grow_fields_arr(requested);

			/* the assignment creates the field, so the record now has that many */
			NF = requested;
			parse_high_water = requested;
		} else
			return &Null_field;
	}

	return &fields_arr[requested];
}
791
792/* set_element --- set an array element, used by do_split() */
793
794static void
795set_element(long num, char *s, long len, NODE *n)
796{
797 register NODE *it;
798
799 it = make_string(s, len);
800 it->flags |= MAYBE_NUM;
801 *assoc_lookup(n, tmp_number((AWKNUM) (num)), FALSE) = it;
802}
803
804/* do_split --- implement split(), semantics are same as for field splitting */
805
806NODE *
807do_split(NODE *tree)
808{
809 NODE *src, *arr, *sep, *fs, *src2, *fs2, *tmp;
810 char *s;
811 long (*parseit) P((long, char **, int, NODE *,
812 Regexp *, Setfunc, NODE *));
813 Regexp *rp = NULL;
814
815 src = force_string(tree_eval(tree->lnode));
816
817 arr = get_param(tree->rnode->lnode);
818 if (arr->type != Node_var_array)
819 fatal(_("split: second argument is not an array"));
820
821 sep = tree->rnode->rnode->lnode;
822
823 if (src->stlen == 0) {
824 /*
825 * Skip the work if first arg is the null string.
826 */
827 free_temp(src);
828 /*
829 * Evaluate sep if it may have side effects.
830 */
831 if ((sep->re_flags & (FS_DFLT|CONST)) == 0)
832 free_temp(tree_eval(sep->re_exp));
833 /*
834 * And now we can safely turn off the array.
835 */
836 assoc_clear(arr);
837 return tmp_number((AWKNUM) 0);
838 }
839
840 if ((sep->re_flags & FS_DFLT) != 0 && ! using_FIELDWIDTHS() && ! RS_is_null) {
841 parseit = parse_field;
842 fs = force_string(FS_node->var_value);
843 rp = FS_regexp;
844 } else {
845 fs = force_string(tree_eval(sep->re_exp));
846 if (fs->stlen == 0) {
847 static short warned = FALSE;
848
849 parseit = null_parse_field;
850
851 if (do_lint && ! warned) {
852 warned = TRUE;
853 lintwarn(_("split: null string for third arg is a gawk extension"));
854 }
855 } else if (fs->stlen == 1 && (sep->re_flags & CONST) == 0) {
856 if (fs->stptr[0] == ' ') {
857 if (do_posix)
858 parseit = posix_def_parse_field;
859 else
860 parseit = def_parse_field;
861 } else
862 parseit = sc_parse_field;
863 } else {
864 parseit = re_parse_field;
865 rp = re_update(sep);
866 }
867 }
868
869 /*
870 * do dupnode(), to avoid problems like
871 * x = split(a["LINE"], a, a["FS"])
872 * since we assoc_clear the array. gack.
873 * this also gives us complete call by value semantics.
874 */
875 src2 = dupnode(src);
876 free_temp(src);
877
878 fs2 = dupnode(fs);
879 free_temp(fs);
880
881 assoc_clear(arr);
882
883 s = src2->stptr;
884 tmp = tmp_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src2->stlen,
885 fs2, rp, set_element, arr));
886 unref(src2);
887 unref(fs2);
888 return tmp;
889}
890
891/* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
892
893void
894set_FIELDWIDTHS()
895{
896 register char *scan;
897 char *end;
898 register int i;
899 static int fw_alloc = 4;
900 static int warned = FALSE;
901 extern unsigned long strtoul P((const char *, char **endptr, int base));
902
903 if (do_lint && ! warned) {
904 warned = TRUE;
905 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
906 }
907 if (do_traditional) /* quick and dirty, does the trick */
908 return;
909
910 /*
911 * If changing the way fields are split, obey least-suprise
912 * semantics, and force $0 to be split totally.
913 */
914 if (fields_arr != NULL)
915 (void) get_field(UNLIMITED - 1, 0);
916
917 parse_field = fw_parse_field;
918 scan = force_string(FIELDWIDTHS_node->var_value)->stptr;
919 end = scan + 1;
920 if (FIELDWIDTHS == NULL)
921 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
922 FIELDWIDTHS[0] = 0;
923 for (i = 1; ; i++) {
924 unsigned long int tmp;
925 if (i >= fw_alloc) {
926 fw_alloc *= 2;
927 erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
928 }
929 /* Ensure that there is no leading `-' sign. Otherwise,
930 strtoul would accept it and return a bogus result. */
931 while (is_blank(*scan)) {
932 ++scan;
933 }
934 if (*scan == '-')
935 fatal(_("invalid FIELDWIDTHS value, near `%s'"),
936 scan);
937
938 /* Detect an invalid base-10 integer, a valid value that
939 is followed by something other than a blank or '\0',
940 or a value that is not in the range [1..INT_MAX]. */
941 errno = 0;
942 tmp = strtoul(scan, &end, 10);
943 if (errno != 0
944 || !(*end == '\0' || is_blank(*end))
945 || !(0 < tmp && tmp <= INT_MAX))
946 fatal(_("invalid FIELDWIDTHS value, near `%s'"),
947 scan);
948 FIELDWIDTHS[i] = tmp;
949 scan = end;
950 /* Skip past any trailing blanks. */
951 while (is_blank(*scan)) {
952 ++scan;
953 }
954 if (*scan == '\0')
955 break;
956 }
957 FIELDWIDTHS[i] = -1;
958
959 update_PROCINFO("FS", "FIELDWIDTHS");
960}
961
/*
 * set_FS --- handle things when FS is assigned to
 *
 * Chooses the field parser that matches the current FS (and RS), and
 * builds the FS regexps when the chosen parser needs them.  save_fs
 * and save_rs remember the values seen by the previous call, so that a
 * call in which neither changed can return early or merely re-select
 * the parser.
 */

void
set_FS()
{
	char buf[10];	/* regexp text synthesized for one-character separators */
	NODE *fs;
	static NODE *save_fs = NULL;
	static NODE *save_rs = NULL;
	int remake_re = TRUE;	/* FALSE: keep the regexps that already exist */

	/*
	 * If changing the way fields are split, obey least-suprise
	 * semantics, and force $0 to be split totally.
	 */
	if (fields_arr != NULL)
		(void) get_field(UNLIMITED - 1, 0);

	/* It's possible that only IGNORECASE changed, or FS = FS */
	/*
	 * This comparison can't use cmp_nodes(), which pays attention
	 * to IGNORECASE, and that's not what we want.
	 */
	if (save_fs
		&& FS_node->var_value->stlen == save_fs->stlen
		&& memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
		&& save_rs
		&& RS_node->var_value->stlen == save_rs->stlen
		&& memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
		if (FS_regexp != NULL)
			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);

		/* FS = FS */
		if (! using_FIELDWIDTHS()) {
			return;
		} else {
			/*
			 * FIELDWIDTHS was in effect: the parser has to be
			 * chosen again, but the regexps can stay as they are.
			 */
			remake_re = FALSE;
			goto choose_fs_function;
		}
	}

	/* FS or RS really changed: remember the new values, drop the old regexps */
	unref(save_fs);
	save_fs = dupnode(FS_node->var_value);
	unref(save_rs);
	save_rs = dupnode(RS_node->var_value);
	resave_fs = TRUE;
	if (FS_regexp != NULL) {
		refree(FS_re_yes_case);
		refree(FS_re_no_case);
		FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
	}


choose_fs_function:
	buf[0] = '\0';
	default_FS = FALSE;
	fs = force_string(FS_node->var_value);

	if (! do_traditional && fs->stlen == 0) {
		static short warned = FALSE;

		parse_field = null_parse_field;

		if (do_lint && ! warned) {
			warned = TRUE;
			lintwarn(_("null string for `FS' is a gawk extension"));
		}
	} else if (fs->stlen > 1) {
		parse_field = re_parse_field;
	} else if (RS_is_null) {
		/* we know that fs->stlen <= 1 */
		parse_field = sc_parse_field;
		if (fs->stlen == 1) {
			/*
			 * With RS null, newline separates fields as well,
			 * so a regexp matching both is put in buf.
			 */
			if (fs->stptr[0] == ' ') {
				default_FS = TRUE;
				strcpy(buf, "[ \t\n]+");
			} else if (fs->stptr[0] == '\\') {
				/* yet another special case */
				strcpy(buf, "[\\\\\n]");
			} else if (fs->stptr[0] != '\n')
				sprintf(buf, "[%c\n]", fs->stptr[0]);
		}
	} else {
		if (do_posix)
			parse_field = posix_def_parse_field;
		else
			parse_field = def_parse_field;

		if (fs->stlen == 1) {
			if (fs->stptr[0] == ' ')
				default_FS = TRUE;
			else if (fs->stptr[0] == '\\')
				/* same special case */
				strcpy(buf, "[\\\\]");
			else
				parse_field = sc_parse_field;
		}
	}
	if (remake_re) {
		if (FS_regexp != NULL) {
			refree(FS_re_yes_case);
			refree(FS_re_no_case);
			FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
		}

		if (buf[0] != '\0') {
			/* a regexp was synthesized above; it overrides the parser choice */
			FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE, TRUE);
			FS_re_no_case = make_regexp(buf, strlen(buf), TRUE, TRUE);
			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
			parse_field = re_parse_field;
		} else if (parse_field == re_parse_field) {
			/* FS itself is the regexp */
			FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, FALSE, TRUE);
			FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE, TRUE);
			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
		} else
			FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
	}

	/*
	 * For FS = "c", we don't use IGNORECASE. But we must use
	 * re_parse_field to get the character and the newline as
	 * field separators.
	 */
	if (fs->stlen == 1 && parse_field == re_parse_field)
		FS_regexp = FS_re_yes_case;

	update_PROCINFO("FS", "FS");
}
1090
/*
 * using_fieldwidths --- report whether FIELDWIDTHS (rather than FS)
 * currently controls field splitting
 */

int
using_fieldwidths()
{
	int fw_active = using_FIELDWIDTHS();

	return fw_active;
}
1098
1099/* update_PROCINFO --- update PROCINFO[sub] when FS or FIELDWIDTHS set */
1100
1101static void
1102update_PROCINFO(char *subscript, char *str)
1103{
1104 NODE **aptr;
1105
1106 if (PROCINFO_node == NULL)
1107 return;
1108
1109 aptr = assoc_lookup(PROCINFO_node, tmp_string(subscript, strlen(subscript)), FALSE);
1110 assign_val(aptr, tmp_string(str, strlen(str)));
1111}
Note: See TracBrowser for help on using the repository browser.