Context Navigation

fmt.c

Last change on this file was 599, checked in by bird, 19 years ago
GNU sed 4.1.5.
File size: 14.9 KB

Line
1	/* `L' command implementation for GNU sed, based on GNU fmt 1.22.
2	Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; either version 2, or (at your option)
7	any later version.
8
9	This program is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License
15	along with this program; if not, write to the Free Software Foundation,
16	Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18	/* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>. */
19
20	#include "sed.h"
21
22	#include <stdio.h>
23	#include <ctype.h>
24	#include <sys/types.h>
25
26	#if HAVE_LIMITS_H
27	# include <limits.h>
28	#endif
29
30	#ifndef UINT_MAX
31	# define UINT_MAX ((unsigned int) ~(unsigned int) 0)
32	#endif
33
34	#ifndef INT_MAX
35	# define INT_MAX ((int) (UINT_MAX >> 1))
36	#endif
37
38	/* The following parameters represent the program's idea of what is
39	"best". Adjust to taste, subject to the caveats given. */
40
41	/* Prefer lines to be LEEWAY % shorter than the maximum width, giving
42	room for optimization. */
43	#define LEEWAY 7
44
/* Costs and bonuses are expressed as the equivalent departure from the
   optimal line length, multiplied by 10.  e.g. assigning something a
   cost of 50 means that it is as bad as a line 5 characters too short
   or too long.  The definition of SHORT_COST(n) should not be changed.
   However, EQUIV(n) may need tuning.  */

typedef long COST;

/* Largest value representable in COST.  The bit pattern is built in
   unsigned arithmetic (to avoid shifting into the sign bit) and then
   converted to COST, so that comparisons such as `cost < MAXCOST' are
   carried out as signed comparisons rather than being silently
   promoted to unsigned long.  */
#define MAXCOST ((COST) (~(((unsigned long) 1) << (8 * sizeof (COST) - 1))))

#define SQR(n) ((n) * (n))
#define EQUIV(n) SQR ((COST) (n))

/* Cost of a filled line n chars longer or shorter than best_width.  */
#define SHORT_COST(n) EQUIV ((n) * 10)

/* Cost of the difference between adjacent filled lines.  */
#define RAGGED_COST(n) (SHORT_COST (n) / 2)

/* Basic cost per line.  */
#define LINE_COST EQUIV (70)

/* Cost of breaking a line after the first word of a sentence, where
   the length of the word is N.  */
#define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))

/* Cost of breaking a line before the last word of a sentence, where
   the length of the word is N.  */
#define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))

/* Bonus for breaking a line at the end of a sentence.  */
#define SENTENCE_BONUS EQUIV (50)

/* Cost of breaking a line after a period not marking end of a sentence.
   With the definition of sentence we are using (borrowed from emacs, see
   get_line()) such a break would then look like a sentence break.  Hence
   we assign a very high cost -- it should be avoided unless things are
   really bad.  */
#define NOBREAK_COST EQUIV (600)

/* Bonus for breaking a line before open parenthesis.  */
#define PAREN_BONUS EQUIV (40)

/* Bonus for breaking a line after other punctuation.  */
#define PUNCT_BONUS EQUIV (40)

/* Credit for breaking a long paragraph one line later.  */
#define LINE_CREDIT EQUIV (3)
93
94	/* Size of paragraph buffer in words. Longer paragraphs are handled
95	neatly (cf. flush_paragraph()), so there's little to gain by making
96	these larger. */
97	#define MAXWORDS 1000
98
99	#define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++)
100
101	/* Extra ctype(3)-style macros. */
102
103	#define isopen(c) (strchr ("([`'\"", (c)) != NULL)
104	#define isclose(c) (strchr (")]'\"", (c)) != NULL)
105	#define isperiod(c) (strchr (".?!", (c)) != NULL)
106
107	/* Size of a tab stop, for expansion on input and re-introduction on
108	output. */
109	#define TABWIDTH 8
110
111	/* Word descriptor structure. */
112
113	typedef struct Word WORD;
114
115	struct Word
116	{
117
118	/* Static attributes determined during input. */
119
120	const char text; / the text of the word */
121	short length; /* length of this word */
122	short space; /* the size of the following space */
123	unsigned paren:1; /* starts with open paren */
124	unsigned period:1; /* ends in [.?!])* */
125	unsigned punct:1; /* ends in punctuation */
126	unsigned final:1; /* end of sentence */
127
128	/* The remaining fields are computed during the optimization. */
129
130	short line_length; /* length of the best line starting here */
131	COST best_cost; /* cost of best paragraph starting here */
132	WORD next_break; / break which achieves best_cost */
133	};
134
135	/* Forward declarations. */
136
137	static bool get_paragraph P_ ((void));
138	static int get_line P_ ((int c));
139	static int get_space P_ ((int c));
140	static int copy_rest P_ ((int c));
141	static bool same_para P_ ((int c));
142	static void flush_paragraph P_ ((void));
143	static void fmt_paragraph P_ ((void));
144	static void check_punctuation P_ ((WORD *w));
145	static COST base_cost P_ ((WORD *this));
146	static COST line_cost P_ ((WORD *next, int len));
147	static void put_paragraph P_ ((WORD *finish));
148	static void put_line P_ ((WORD *w, int indent));
149	static void put_word P_ ((WORD *w));
150	static void put_space P_ ((int space));
151
152	/* Option values. */
153
154	/* User-supplied maximum line width (default WIDTH). The only output
155	lines
156	longer than this will each comprise a single word. */
157	static int max_width;
158
159	/* Space for the paragraph text. */
160	static const char *parabuf;
161
162	/* End of space for the paragraph text. */
163	static const char *end_of_parabuf;
164
165	/* The file on which we output */
166	static FILE *outfile;
167
168	/* Values derived from the option values. */
169
170	/* The preferred width of text lines, set to LEEWAY % less than max_width. */
171	static int best_width;
172
173	/* Dynamic variables. */
174
175	/* Start column of the character most recently read from the input file. */
176	static int in_column;
177
178	/* Start column of the next character to be written to stdout. */
179	static int out_column;
180
181	/* The words of a paragraph -- longer paragraphs are handled neatly
182	(cf. flush_paragraph()). */
183	static WORD words[MAXWORDS];
184
185	/* A pointer into the above word array, indicating the first position
186	after the last complete word. Sometimes it will point at an incomplete
187	word. */
188	static WORD *word_limit;
189
190	/* Indentation of the first line of the current paragraph. */
191	static int first_indent;
192
193	/* Indentation of other lines of the current paragraph */
194	static int other_indent;
195
196	/* The last character read from the input file. */
197	static int next_char;
198
199	/* If nonzero, the length of the last line output in the current
200	paragraph, used to charge for raggedness at the split point for long
201	paragraphs chosen by fmt_paragraph(). */
202	static int last_line_length;
203
204	/* read file F and send formatted output to stdout. */
205
206	void
207	fmt (const char line, const char line_end, int max_length, FILE *output_file)
208	{
209	parabuf = line;
210	end_of_parabuf = line_end;
211	outfile = output_file;
212
213	max_width = max_length;
214	best_width = max_width * (201 - 2 * LEEWAY) / 200;
215
216	in_column = 0;
217	other_indent = 0;
218	next_char = GETC();
219	while (get_paragraph ())
220	{
221	fmt_paragraph ();
222	put_paragraph (word_limit);
223	}
224	}
225
226	/* Read a paragraph from input file F. A paragraph consists of a
227	maximal number of non-blank (excluding any prefix) lines
228	with the same indent.
229
230	Return false if end-of-file was encountered before the start of a
231	paragraph, else true. */
232
233	static bool
234	get_paragraph ()
235	{
236	register int c;
237
238	last_line_length = 0;
239	c = next_char;
240
241	/* Scan (and copy) blank lines, and lines not introduced by the prefix. */
242
243	while (c == '\n' \|\| c == EOF)
244	{
245	c = copy_rest (c);
246	if (c == EOF)
247	{
248	next_char = EOF;
249	return false;
250	}
251	putc ('\n', outfile);
252	c = GETC();
253	}
254
255	/* Got a suitable first line for a paragraph. */
256
257	first_indent = in_column;
258	word_limit = words;
259	c = get_line (c);
260
261	/* Read rest of paragraph. */
262
263	other_indent = in_column;
264	while (same_para (c) && in_column == other_indent)
265	c = get_line (c);
266
267	(word_limit - 1)->period = (word_limit - 1)->final = true;
268	next_char = c;
269	return true;
270	}
271
272	/* Copy to the output a blank line. In the latter, C is \n or EOF.
273	Return the character (\n or EOF) ending the line. */
274
275	static int
276	copy_rest (register int c)
277	{
278	out_column = 0;
279	while (c != '\n' && c != EOF)
280	{
281	putc (c, outfile);
282	c = GETC();
283	}
284	return c;
285	}
286
/* Decide whether a line whose first non-blank character is C could
   belong to the current paragraph.  Only a blank line (C is a newline)
   or the end of the input (C is EOF) rules that out.

   Return true if the line may continue the paragraph, otherwise
   false.  */

static bool
same_para (register int c)
{
  if (c == '\n')
    return false;

  return c != EOF;
}
296
297	/* Read a line from the input data given first non-blank character C
298	after the prefix, and the following indent, and break it into words.
299	A word is a maximal non-empty string of non-white characters. A word
300	ending in [.?!]["')\]]* and followed by end-of-line or at least two
301	spaces ends a sentence, as in emacs.
302
303	Return the first non-blank character of the next line. */
304
305	static int
306	get_line (register int c)
307	{
308	int start;
309	register WORD *end_of_word;
310
311	end_of_word = &words[MAXWORDS - 2];
312
313	do
314	{ /* for each word in a line */
315
316	/* Scan word. */
317
318	word_limit->text = parabuf - 1;
319	do
320	c = GETC();
321	while (c != EOF && !ISSPACE (c));
322	word_limit->length = parabuf - word_limit->text - (c != EOF);
323	in_column += word_limit->length;
324
325	check_punctuation (word_limit);
326
327	/* Scan inter-word space. */
328
329	start = in_column;
330	c = get_space (c);
331	word_limit->space = in_column - start;
332	word_limit->final = (c == EOF
333	\|\| (word_limit->period
334	&& (c == '\n' \|\| word_limit->space > 1)));
335	if (c == '\n' \|\| c == EOF)
336	word_limit->space = word_limit->final ? 2 : 1;
337	if (word_limit == end_of_word)
338	flush_paragraph ();
339	word_limit++;
340	if (c == EOF)
341	{
342	in_column = first_indent;
343	return EOF;
344	}
345	}
346	while (c != '\n');
347
348	in_column = 0;
349	c = GETC();
350	return get_space (c);
351	}
352
353	/* Read blank characters from the input data, starting with C, and keeping
354	in_column up-to-date. Return first non-blank character. */
355
356	static int
357	get_space (register int c)
358	{
359	for (;;)
360	{
361	if (c == ' ')
362	in_column++;
363	else if (c == '\t')
364	in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
365	else
366	return c;
367	c = GETC();
368	}
369	}
370
371	/* Set extra fields in word W describing any attached punctuation. */
372
373	static void
374	check_punctuation (register WORD *w)
375	{
376	register const char start, finish;
377
378	start = w->text;
379	finish = start + (w->length - 1);
380	w->paren = isopen (*start);
381	w->punct = ISPUNCT (*finish);
382	while (isclose (*finish) && finish > start)
383	finish--;
384	w->period = isperiod (*finish);
385	}
386
387	/* Flush part of the paragraph to make room. This function is called on
388	hitting the limit on the number of words or characters. */
389
390	static void
391	flush_paragraph (void)
392	{
393	WORD *split_point;
394	register WORD *w;
395	COST best_break;
396
397	/* - format what you have so far as a paragraph,
398	- find a low-cost line break near the end,
399	- output to there,
400	- make that the start of the paragraph. */
401
402	fmt_paragraph ();
403
404	/* Choose a good split point. */
405
406	split_point = word_limit;
407	best_break = MAXCOST;
408	for (w = words->next_break; w != word_limit; w = w->next_break)
409	{
410	if (w->best_cost - w->next_break->best_cost < best_break)
411	{
412	split_point = w;
413	best_break = w->best_cost - w->next_break->best_cost;
414	}
415	if (best_break <= MAXCOST - LINE_CREDIT)
416	best_break += LINE_CREDIT;
417	}
418	put_paragraph (split_point);
419
420	/* Copy words from split_point down to word -- we use memmove because
421	the source and target may overlap. */
422
423	memmove ((char ) words, (char ) split_point,
424	(word_limit - split_point + 1) * sizeof (WORD));
425	word_limit -= split_point - words;
426	}
427
428	/* Compute the optimal formatting for the whole paragraph by computing
429	and remembering the optimal formatting for each suffix from the empty
430	one to the whole paragraph. */
431
432	static void
433	fmt_paragraph (void)
434	{
435	register WORD start, w;
436	register int len;
437	register COST wcost, best;
438	int saved_length;
439
440	word_limit->best_cost = 0;
441	saved_length = word_limit->length;
442	word_limit->length = max_width; /* sentinel */
443
444	for (start = word_limit - 1; start >= words; start--)
445	{
446	best = MAXCOST;
447	len = start == words ? first_indent : other_indent;
448
449	/* At least one word, however long, in the line. */
450
451	w = start;
452	len += w->length;
453	do
454	{
455	w++;
456
457	/* Consider breaking before w. */
458
459	wcost = line_cost (w, len) + w->best_cost;
460	if (start == words && last_line_length > 0)
461	wcost += RAGGED_COST (len - last_line_length);
462	if (wcost < best)
463	{
464	best = wcost;
465	start->next_break = w;
466	start->line_length = len;
467	}
468	len += (w - 1)->space + w->length; /* w > start >= words */
469	}
470	while (len < max_width);
471	start->best_cost = best + base_cost (start);
472	}
473
474	word_limit->length = saved_length;
475	}
476
477	/* Return the constant component of the cost of breaking before the
478	word THIS. */
479
480	static COST
481	base_cost (register WORD *this)
482	{
483	register COST cost;
484
485	cost = LINE_COST;
486
487	if (this > words)
488	{
489	if ((this - 1)->period)
490	{
491	if ((this - 1)->final)
492	cost -= SENTENCE_BONUS;
493	else
494	cost += NOBREAK_COST;
495	}
496	else if ((this - 1)->punct)
497	cost -= PUNCT_BONUS;
498	else if (this > words + 1 && (this - 2)->final)
499	cost += WIDOW_COST ((this - 1)->length);
500	}
501
502	if (this->paren)
503	cost -= PAREN_BONUS;
504	else if (this->final)
505	cost += ORPHAN_COST (this->length);
506
507	return cost;
508	}
509
510	/* Return the component of the cost of breaking before word NEXT that
511	depends on LEN, the length of the line beginning there. */
512
513	static COST
514	line_cost (register WORD *next, register int len)
515	{
516	register int n;
517	register COST cost;
518
519	if (next == word_limit)
520	return 0;
521	n = best_width - len;
522	cost = SHORT_COST (n);
523	if (next->next_break != word_limit)
524	{
525	n = len - next->line_length;
526	cost += RAGGED_COST (n);
527	}
528	return cost;
529	}
530
531	/* Output to stdout a paragraph from word up to (but not including)
532	FINISH, which must be in the next_break chain from word. */
533
534	static void
535	put_paragraph (register WORD *finish)
536	{
537	register WORD *w;
538
539	put_line (words, first_indent);
540	for (w = words->next_break; w != finish; w = w->next_break)
541	put_line (w, other_indent);
542	}
543
544	/* Output to stdout the line beginning with word W, beginning in column
545	INDENT, including the prefix (if any). */
546
547	static void
548	put_line (register WORD *w, int indent)
549	{
550	register WORD *endline;
551	out_column = 0;
552	put_space (indent);
553
554	endline = w->next_break - 1;
555	for (; w != endline; w++)
556	{
557	put_word (w);
558	put_space (w->space);
559	}
560	put_word (w);
561	last_line_length = out_column;
562	putc ('\n', outfile);
563	}
564
565	/* Output to stdout the word W. */
566
567	static void
568	put_word (register WORD *w)
569	{
570	register const char *s;
571	register int n;
572
573	s = w->text;
574	for (n = w->length; n != 0; n--)
575	putc (*s++, outfile);
576	out_column += w->length;
577	}
578
579	/* Output to stdout SPACE spaces, or equivalent tabs. */
580
581	static void
582	put_space (int space)
583	{
584	out_column += space;
585	while (space--)
586	putc (' ', outfile);
587	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/oldsed/sed/fmt.c

Download in other formats: