#! /usr/bin/env perl
# html2texi.pl -- Convert HTML documentation to Texinfo format
# Michael Ernst <mernst@cs.washington.edu>
# Time-stamp: <1999-01-12 21:34:27 mernst>

# This program converts HTML documentation trees into Texinfo format.
# Given the name of a main (or contents) HTML file, it processes that file,
# and other files (transitively) referenced by it, into a Texinfo file
# (whose name is chosen from the file or directory name of the argument).
# For instance:
#   html2texi.pl api/index.html
# produces file "api.texi".

# Texinfo format can be easily converted to Info format (for browsing in
# Emacs or the standalone Info browser), to a printed manual, or to HTML.
# Thus, html2texi.pl permits conversion of HTML files to Info format, and
# secondarily enables producing printed versions of Web page hierarchies.

# Unlike HTML, Info format is searchable.  Since Info is integrated into
# Emacs, one can read documentation without starting a separate Web
# browser.  Additionally, Info browsers (including Emacs) contain
# convenient features missing from Web browsers, such as easy index lookup
# and mouse-free browsing.

# Limitations:
# html2texi.pl is currently tuned to latex2html output (and it corrects
# several latex2html bugs), but should be extensible to arbitrary HTML
# documents.  It will be most useful for HTML with a hierarchical structure
# and an index, and it recognizes those features as created by latex2html
# (and possibly by some other tools).  The HTML tree to be traversed must
# be on local disk, rather than being accessed via HTTP.
# This script requires the use of "checkargs.pm".  To eliminate that
# dependence, replace calls to check_args* by @_ (which is always the last
# argument to those functions).
# Also see the "to do" section, below.
# Comments, suggestions, bug fixes, and enhancements are welcome.

# Troubleshooting:
# Malformed HTML can cause this program to abort, so
# you should check your HTML files to make sure they are legal.


###
### Typical usage for the Python documentation:
###

# (Actually, most of this is in a Makefile instead.)
# The resulting Info format Python documentation is currently available at
# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz

# Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DT><DL COMPACT><DD>.

# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html

# Edit the generated .texi files:
#   * change @setfilename to prefix "python-"
#   * fix up any sectioning, such as for Abstract
#   * make Texinfo menus
#   * perhaps remove the @detailmenu ... @end detailmenu
# In Emacs, to do all this:
# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))

# makeinfo api.texi
# makeinfo ext.texi
# makeinfo lib.texi
# makeinfo mac.texi
# makeinfo ref.texi
# makeinfo tut.texi


###
### Structure of the code
###

# To be written...


###
### Design decisions
###

# Source and destination languages
# --------------------------------
#
# The goal is Info files; I create Texinfo, so I don't have to worry about
# the finer details of Info file creation.  (I'm not even sure of its exact
# format.)
#
# Why not start from LaTeX rather than HTML?
# I could hack latex2html itself to produce Texinfo instead, or fix up
# partparse.py (which already translates LaTeX to Texinfo).
#  Pros:
#   * has high-level information such as index entries, original formatting
#  Cons:
#   * those programs are complicated to read and understand
#   * those programs try to handle arbitrary LaTeX input, track catcodes,
#     and more:  I don't want to go to that effort.  HTML isn't as powerful
#     as LaTeX, so there are fewer subtleties.
#   * the result wouldn't work for arbitrary HTML documents; it would be
#     nice to eventually extend this program to HTML produced from Docbook,
#     Frame, and more.

# Parsing
# -------
#
# I don't want to view the text as a linear stream; I'd rather parse the
# whole thing and then do pattern matching over the parsed representation (to
# find idioms such as indices, lists of child nodes, etc.).
#  * Perl provides HTML::TreeBuilder, which does just what I want.
#     * libwww-perl: http://www.linpro.no/lwp/
#     * TreeBuilder: HTML-Tree-0.51.tar.gz
#  * Python Parsers, Formatters, and Writers don't really provide the right
#    interface (and the version in Grail doesn't correspond to another
#    distributed version, so I'm confused about which to be using).  I could
#    write something in Python that creates a parse tree, but why bother?

# Other implementation language issues:
#  * Python lacks variable declarations, reasonable scoping, and static
#    checking tools.  I've written some of the latter for myself that make
#    my Perl programming a lot safer than my Python programming will be until
#    I have a similar suite for that language.


###########################################################################
### To do
###

# Section names:
#   Fix the problem with multiple sections in a single file (eg, Abstract in
#     Front Matter section).
#   Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
# Index:
#   Perhaps double-check that every tag mentioned in the index is found
#     in the text.
# Python:  email to docs@python.org, to get their feedback.
#   Compare to existing lib/ Info manual
#   Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
#   Postpass to remove extra quotation marks around typography already in
#     a different font (to avoid double delimiters as in "`code'"); or
#     perhaps consider using only font-based markup so that we don't get
#     the extra *bold* and `code' markup in Info.

## Perhaps don't rely on automatic means for adding up, next, prev; I have
## all that info available to me already, so it's not so much trouble to
## add it.  (Right?)  But it is *so* easy to use Emacs instead...


###########################################################################
### Strictures
###

# Documentation for the HTML parsing modules loaded below:
#   man HTML::TreeBuilder
#   man HTML::Parser
#   man HTML::Element

# require HTML::ParserWComment;
require HTML::Parser;
require HTML::TreeBuilder;
require HTML::Element;

use File::Basename;

use strict;
# use Carp;

# Argument-count checking helpers (the check_args* calls used by every sub);
# see the note on "checkargs.pm" in the file header.
use checkargs;


###########################################################################
### Variables
###

# Elements are chapter/section/subsec nodetitles (I think).
my @section_stack;

# Entry for the file currently being processed; used in error messages.
my $current_ref_tdf;

my $html_directory;
my %footnotes;

# First element should not be used.
my @sectionmarker = qw(manual chapter section subsection subsubsection);

# Map from an inline HTML tag to the Texinfo command that html_to_texi
# wraps its content in.
my %inline_markup = (
    b      => "strong",
    code   => "code",
    i      => "emph",
    kbd    => "kbd",
    samp   => "samp",
    strong => "strong",
    tt     => "code",
    var    => "var",
);

my @deferred_index_entries;

# Titles of the index pages recognized so far (process_child_links pushes
# each title that is a key of %index_info).
my @index_titles;

# Map from index page title to [indexing command, @defindex suffix].
my %index_info = (
    "Index"         => [ '@blindex', "bl" ],
    "Concept Index" => [ '@cindex',  "cp" ],
    "Module Index"  => [ '@mdindex', "md" ],
);


###########################################################################
### Main/contents page
###

# Process first-level page on its own, or just a contents page?  Well, I do
# want the title, author, etc., and the front matter...  For now, just add
# that by hand at the end.


# Data structure possibilities:
#  * tree-like (need some kind of stack when processing (or parent pointers))
#  * list of name and depth; remember old and new depths.

# Each element is a reference to a list of (nodetitle, depth, filename).
my @contents_list;

# The problem with doing fixups on the fly is that some sections may have
# already been processed (and no longer available) by the time we notice
# others with the same name.  It's probably better to fully construct the
# contents list (reading in all files of interest) upfront; that will also
# let me do a better job with cross-references, because again, all files
# will already be read in.
my %contents_hash;      # titles that have been added to @contents_list
my %contents_fixups;    # titles that were added more than once

# Entries collected from the page being processed, until
# merge_contents_lists folds them into @contents_list.
my @current_contents_list;

# Merge @current_contents_list into @contents_list,
# and set @current_contents_list to be empty.
#
# The two lists are compared position by position:
#  * beyond the end of @contents_list, the current entry is appended;
#  * if the entry differs from the main entry at the same index in both
#    title and file, and the main entry is at a shallower depth, the current
#    entry is inserted in front of that main entry;
#  * otherwise the two entries must agree in title, depth and file,
#    or the program dies.
# Every title added is recorded in %contents_hash; a title added again is
# flagged in %contents_fixups.  Dies if @current_contents_list is empty.
sub merge_contents_lists ( )
{
    check_args(0, @_);

    die "empty current_contents_list" unless scalar(@current_contents_list);

    # Record a title that was just added to @contents_list.
    my $note_title = sub {
        my ($added_title) = @_;
        if (defined $contents_hash{$added_title}) {
            $contents_fixups{$added_title} = 1;
        }
        else {
            $contents_hash{$added_title} = 1;
        }
    };

    for my $i (0 .. $#current_contents_list) {
        my $ref_c_tdf = $current_contents_list[$i];
        my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};

        # Ran off the end of the main list: append.
        if ($i > $#contents_list) {
            push @contents_list, $ref_c_tdf;
            $note_title->($c_title);
            next;
        }

        my ($title, $depth, $file) = @{ $contents_list[$i] };
        my $same_title = ($title eq $c_title);
        my $same_file  = ($file eq $c_file);

        # A new, deeper entry: insert it before the main entry.
        if (!$same_title && !$same_file && ($depth < $c_depth)) {
            splice @contents_list, $i, 0, $ref_c_tdf;
            $note_title->($c_title);
            next;
        }

        next if $same_title && $same_file && ($depth == $c_depth);

        die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
             "\n main: <<<$title>>> $depth $file",
             "\n curr: <<<$c_title>>> $c_depth $c_file");
    }

    @current_contents_list = ();
}



# Set @current_contents_list to a list of (title, depth, href) entries for
# the child links found under $he; then merge that list into @contents_list.
#
# Between collecting and merging:
#  * depths are renumbered so that they follow on from the current section
#    depth (the distinct depths seen, in increasing order, become
#    current+1, current+2, ...);
#  * a trailing "About this document ..." entry and a "Contents" entry in
#    second position are dropped (hard-coded hack);
#  * entries whose title is a key of %index_info are passed to
#    process_index_file, announced with @defindex on TEXI, recorded in
#    @index_titles, and removed from the list.
# Dies if @current_contents_list is not empty on entry, or if the depth
# implied by @section_stack disagrees with $current_ref_tdf.
#
# Maybe this function should also produce a map
# from title (or href) to sectionlevel (eg "chapter"?).
sub process_child_links ( $ )
{
    my ($he) = check_args(1, @_);

    # $he->dump();
    if (scalar(@current_contents_list) != 0) {
        die "current_contents_list nonempty: @current_contents_list";
    }
    $he->traverse(\&increment_current_contents_list, 'ignore text');

    # Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
    my %depths = ();
    for my $ref_tdf (@current_contents_list) {
        $depths{ $ref_tdf->[1] } = 1;
    }
    # Depths are numbers, so compare them numerically: the default string
    # sort would put 11 before 3 and scramble the renumbering.
    my @sorted_depths = sort { $a <=> $b } keys %depths;
    my $current_depth = scalar(@section_stack) - 1;
    my $current_depth_2 = $current_ref_tdf->[1];
    if ($current_depth != $current_depth_2) {
        die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack);
    }
    for (my $i = 0; $i < scalar(@sorted_depths); $i++) {
        $depths{ $sorted_depths[$i] } = $i + $current_depth + 1;
    }
    for my $ref_tdf (@current_contents_list) {
        $ref_tdf->[1] = $depths{ $ref_tdf->[1] };
    }

    # Eliminate uninteresting sections.  Hard-coded hack for now.
    if ((scalar(@current_contents_list) > 0)
        && ($current_contents_list[-1]->[0] eq "About this document ...")) {
        pop @current_contents_list;
    }
    if ((scalar(@current_contents_list) > 1)
        && ($current_contents_list[1]->[0] eq "Contents")) {
        # Overwrite the "Contents" entry with the entry that preceded it.
        my $ref_first_tdf = shift @current_contents_list;
        $current_contents_list[0] = $ref_first_tdf;
    }

    # Index pages are processed here and removed from the contents list.
    # The list shrinks while we walk it, hence the explicit index.
    for (my $i = 0; $i < scalar(@current_contents_list); $i++) {
        my $ref_tdf = $current_contents_list[$i];
        my $title = $ref_tdf->[0];
        if (exists $index_info{$title}) {
            my $index_file = $ref_tdf->[2];
            my ($indexing_command, $suffix) = @{ $index_info{$title} };
            process_index_file($index_file, $indexing_command);
            print TEXI "\n\@defindex $suffix\n";
            push @index_titles, $title;
            splice @current_contents_list, $i, 1;
            $i--;    # re-examine the element that slid into this slot
        }
        elsif ($title =~ /\bIndex$/) {
            print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n";
        }
    }

    merge_contents_lists();

    # print_contents_list();
    # print_index_info();
}


# Traversal callback (element, start flag, depth) used by
# process_child_links.  For each <LI> start tag, appends a
# [title, depth, href] entry to @current_contents_list, built from the
# anchor that must be the first child of the <LI>.
# Returns nothing for end tags and 1 for start tags.
# Dies if the <LI> is empty or its first child is not an <A> element.
sub increment_current_contents_list ( $$$ )
{
    my ($he, $startflag, $depth) = check_args(3, @_);
    if (!$startflag) {
        return;
    }

    if ($he->tag eq "li") {
        # An empty <LI>, or one that starts with plain text, has no element
        # to call ->tag on; report it with the intended message instead of
        # an internal Perl error.
        my $first = ($he->content || [])->[0];
        if (!ref($first) || ($first->tag ne "a")) {
            die "first element of <LI> should be <A>";
        }
        # Only the HREF is needed from the anchor.
        my (undef, $href) = anchor_info($first);
        my $title = join("", collect_texts($first));
        $title = texi_remove_punctuation($title);
        # The problem with these is that they are formatted differently in
        # @menu and @node!
        $title =~ s/``/\"/g;
        $title =~ s/''/\"/g;
        $title =~ s/ -- / /g;
        push @current_contents_list, [ $title, $depth, $href ];
    }
    return 1;
}

# Simple version for section titles.
# Converts $he to Texinfo text: a plain string is returned unchanged; an
# element whose tag is a key of %inline_markup becomes "@command{...}" with
# its children converted recursively.  Any other element is dumped and the
# program dies.
sub html_to_texi ( $ )
{
    my ($he) = check_args(1, @_);

    # Text nodes are plain strings, not references.
    return $he unless ref $he;

    my $tag = $he->tag;
    unless (exists $inline_markup{$tag}) {
        $he->dump();
        die "html_to_texi confused by <$tag>";
    }

    my $texi = "\@$inline_markup{$tag}\{";
    for my $child (@{$he->content}) {
        $texi .= html_to_texi($child);
    }
    $texi .= "\}";
    return $texi;
}



# Debugging aid: print every (title, depth, file) entry of @contents_list
# to STDERR, one per line.
sub print_contents_list ()
{
    check_args(0, @_);
    print STDERR "Contents list:\n";
    for my $entry (@contents_list) {
        my ($title, $depth, $file) = @{$entry};
        print STDERR join(" ", $title, $depth, $file) . "\n";
    }
}



###########################################################################
### Index
###

# Anchor name carried by index links whose target lost its number
# (as in HREF="types.html#l2h-"); entries pointing at it are kept in the
# "broken" tables below.
my $l2h_broken_link_name = "l2h-";


# Map from file to (map from anchor name to (list of index texts)).
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
my %file_index_entries;
my %this_index_entries;    # map from anchor name to (list of index texts)

my %file_index_entries_broken;    # map from file to (list of index texts)
my @this_index_entries_broken;

# Prefix prepended to index texts while inside nested index lists, and the
# stack of outer prefixes saved while a nested list is processed.
my $index_prefix = "";
my @index_prefixes;

# Indexing command for the index file being read by process_index_file.
my $this_indexing_command;

# Debugging aid: dump %file_index_entries (per file, per anchor name) and
# then %file_index_entries_broken (per file) to STDERR.
sub print_index_info ()
{
    check_args(0, @_);

    for my $file (sort keys %file_index_entries) {
        my $by_anchor = $file_index_entries{$file};
        print STDERR "file: $file\n";
        for my $aname (sort keys %{$by_anchor}) {
            my @entries = @{ $by_anchor->{$aname} };
            # Continuation lines are padded by the length of the anchor name.
            my $separator = "\n " . (" " x length($aname));
            print STDERR " $aname : " . join($separator, @entries) . "\n";
        }
    }

    for my $file (sort keys %file_index_entries_broken) {
        my @entries = @{ $file_index_entries_broken{$file} };
        print STDERR "file: $file\n";
        print STDERR " $_\n" for @entries;
    }
}


# Parse the index page $file (relative to $html_directory) and process every
# <DL COMPACT> list in it.  $indexing_command is published in
# $this_indexing_command for the duration of the traversal and cleared
# afterwards.
sub process_index_file ( $$ )
{
    my ($file, $indexing_command) = check_args(2, @_);
    # print "process_index_file $file $indexing_command\n";

    my $tree = file_to_tree($html_directory . $file);
    # $tree->dump();

    $this_indexing_command = $indexing_command;
    $tree->traverse(\&process_if_index_dl_compact, 'ignore text');
    undef $this_indexing_command;
    # print "process_index_file done\n";
}


# Traversal callback (element, start flag, depth; the depth is ignored).
# A <DL> start tag that has a COMPACT attribute is handed to
# process_index_dl_compact and 0 is returned; any other start tag returns 1.
# End tags return nothing.
sub process_if_index_dl_compact ( $$$ )
{
    my ($he, $startflag) = check_args(3, @_);
    return unless $startflag;

    my $is_compact_dl = ($he->tag() eq "dl") && (defined $he->attr('compact'));
    return 1 unless $is_compact_dl;

    process_index_dl_compact($he);
    return 0;
}


# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
#  * a single space: text to be ignored
#  * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
#  * Followed by a <DD> element:  the <DT> contains a single
#    string, and the <DD> contains a whitespace string to be ignored, a
#    <DL COMPACT> to be recursively processed (with the <DT> string as a
#    prefix), and a whitespace string to be ignored.
#  * Not followed by a <DD> element:  contains a list of anchors
#    and texts (ignore the texts, which are only whitespace and commas).
#    Optionally contains a <DL COMPACT> to be recursively processed (with
#    the <DT> string as a prefix)
#
# Argument is the <DL> element.  Each <DT> is dispatched to
# process_index_dt_and_dd (when a <DD> follows it) or process_index_lone_dt.
# Dies if a child is anything other than ignorable whitespace, <DT>, or a
# <DD> directly after a <DT>.
sub process_index_dl_compact ( $ )
{
    my ($h) = check_args(1, @_);

    # Whitespace-only text children are to be ignored (see above); drop them
    # up front so that ->tag is only ever called on elements.
    my @content = grep { ref($_) || ($_ !~ /^\s*$/) } @{ $h->content() || [] };

    for (my $i = 0; $i < scalar(@content); $i++) {
        my $this_he = $content[$i];
        if (!ref $this_he) {
            die "Expected <DT> tag but saw text: $this_he";
        }
        if ($this_he->tag ne "dt") {
            $this_he->dump();
            die "Expected <DT> tag: " . $this_he->tag;
        }

        my $next_he = ($i < scalar(@content) - 1) ? $content[$i+1] : undef;
        if (ref($next_he) && ($next_he->tag eq "dd")) {
            process_index_dt_and_dd($this_he, $next_he);
            $i++;    # the <DD> has been consumed together with its <DT>
        }
        else {
            process_index_lone_dt($this_he);
        }
    }
}



# Argument is a <DT> element.  If it contains more than one anchor, then
# the texts of all subsequent ones are "[Link]".  Example:
#   <DT>
#     <A HREF="embedding.html#l2h-201">
#       "$PATH"
#     ", "
#     <A HREF="embedding.html#l2h-205">
#       "[Link]"
# Optionally contains a <DL COMPACT> as well.  Example:
#   <DT>
#     <A HREF="types.html#l2h-616">
#       "attribute"
#     <DL COMPACT>
#       <DT>
#         <A HREF="assignment.html#l2h-3074">
#           "assignment"
#         ", "
#         <A HREF="assignment.html#l2h-3099">
#           "[Link]"
#       <DT>
#         <A HREF="types.html#l2h-">
#           "assignment, class"
#
# For every anchor, "$this_indexing_command <index text>" is recorded under
# the anchor's target: in %file_index_entries{file}{anchor name}, or in
# %file_index_entries_broken{file} when the anchor name is
# $l2h_broken_link_name.  The index text is $index_prefix followed by the
# text of the first anchor.  A nested <DL> is processed recursively with
# the first anchor's text appended to the prefix, and ends processing of
# this <DT>.  Dies on any content that does not fit this shape.
sub process_index_lone_dt ( $ )
{
    my ($dt) = check_args(1, @_);
    my @dtcontent = @{$dt->content()};
    my $acontent;           # index text of the first anchor, prefix included
    my $acontent_suffix;    # text of the first anchor, without the prefix

    # The loop variable is deliberately not called $a, which is reserved
    # for sort comparisons.
    for my $elt (@dtcontent) {
        if ($elt eq ", ") {
            next;
        }
        if (!ref $elt) {
            $dt->dump;
            die "Unexpected <DT> string element: $elt";
        }

        if ($elt->tag eq "dl") {
            push @index_prefixes, $index_prefix;
            if (!defined $acontent_suffix) {
                die "acontent_suffix not yet defined";
            }
            $index_prefix .= $acontent_suffix . ", ";
            process_index_dl_compact($elt);
            $index_prefix = pop(@index_prefixes);
            return;
        }

        if ($elt->tag ne "a") {
            $dt->dump;
            $elt->dump;
            die "Expected anchor in lone <DT>";
        }

        # The anchor's NAME is not needed here.
        my (undef, $ahref, @acontent) = anchor_info($elt);
        if (scalar(@acontent) != 1) {
            die "Expected just one content of <A> in <DT>: @acontent";
        }
        if (ref $acontent[0]) {
            $acontent[0]->dump;
            die "Expected string content of <A> in <DT>: $acontent[0]";
        }
        if (!defined($acontent)) {
            $acontent = $index_prefix . $acontent[0];
            $acontent_suffix = $acontent[0];
        }
        elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0]))) {
            die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>";
        }

        if (!defined $ahref) {
            $dt->dump;
            die "no HREF in anchor in <DT>";
        }
        my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
        if (!defined $ahref_name) {
            # Reference to entire file
            $ahref_name = "";
        }

        if ($ahref_name eq $l2h_broken_link_name) {
            push @{ $file_index_entries_broken{$ahref_file} }, "$this_indexing_command $acontent";
            next;
        }

        # Push through the nested reference directly (missing levels are
        # autovivified).  Don't copy the inner hash into a local
        # %index_entries first: that would update the copy, not the table.
        push @{ $file_index_entries{$ahref_file}{$ahref_name} }, "$this_indexing_command $acontent";
        # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
    }
}

# Process a <DT> that is followed by a <DD>.  The <DT> must contain exactly
# one string and the <DD> exactly one <DL> element; that <DL> is processed
# recursively with the <DT> string (trailing spaces removed) and ", "
# appended to $index_prefix.  The previous prefix is restored afterwards.
# Dies if either element has a different shape.
sub process_index_dt_and_dd ( $$ )
{
    my ($dt, $dd) = check_args(2, @_);

    my @dt_children = @{$dt->content()};
    if ((scalar(@dt_children) != 1) || (ref $dt_children[0])) {
        $dd->dump;
        $dt->dump;
        die "Expected single string (actual size = " . scalar(@dt_children) . ") in content of <DT>: @dt_children";
    }
    (my $dtcontent = $dt_children[0]) =~ s/ +$//;

    my @dd_children = @{$dd->content()};
    if (scalar(@dd_children) != 1) {
        die "Expected single <DD> content, got ", scalar(@dd_children), " elements:\n", join("\n", @dd_children), "\n ";
    }
    my $nested_dl = $dd_children[0];
    if ($nested_dl->tag ne "dl") {
        die "Expected <DL> as content of <DD>, but saw: $nested_dl";
    }

    # Extend the prefix for the nested list only.
    push @index_prefixes, $index_prefix;
    $index_prefix .= $dtcontent . ", ";
    process_index_dl_compact($nested_dl);
    $index_prefix = pop(@index_prefixes);
}


###########################################################################
### Ordinary sections
###

# Process the HTML file for one section.  $file is used as given when it
# starts with "/", otherwise it is taken relative to $html_directory;
# $depth is the section's depth and $nodetitle its node title.
#
# Steps, in order:
#  * cut @section_stack back to its first $depth entries and push the
#    (possibly disambiguated) node title;
#  * traverse the tree with process_if_child_links, reset %footnotes, and
#    traverse again with process_if_footnotes;
#  * load this file's index entries into %this_index_entries and
#    @this_index_entries_broken (empty when the file has none);
#  * check that the tree is <HTML> with <HEAD> then <BODY>, and traverse
#    the <BODY> with output_body.
sub process_section_file ( $$$ )
{
    my ($file, $depth, $nodetitle) = check_args(3, @_);

    my $path = ($file =~ /^\//) ? $file : $html_directory . $file;
    my $he = file_to_tree($path);

    # print STDERR "process_section_file: $file $depth $nodetitle\n";

    # Keep only the entries for the enclosing sections (indices 0..$depth-1).
    @section_stack = @section_stack[0..$depth-1];

    # Not a great nodename fixup scheme; need a more global view.
    # A title that occurs more than once is prefixed with the first word of
    # the enclosing section's title.
    my $needs_fixup = (defined $contents_fixups{$nodetitle})
                      && (scalar(@section_stack) > 0);
    if ($needs_fixup) {
        my $up_title = $section_stack[-1];
        # hack for Python Standard Library
        $up_title =~ s/^(Built-in|Standard) Module //g;
        my ($up_first_word) = split(/ /, $up_title);
        $nodetitle = "$up_first_word $nodetitle";
    }

    push @section_stack, $nodetitle;
    # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

    $he->traverse(\&process_if_child_links, 'ignore text');
    %footnotes = ();
    # $he->dump;
    $he->traverse(\&process_if_footnotes, 'ignore text');

    # $he->dump;

    # Index entries recorded for this file, if any.
    %this_index_entries = (exists $file_index_entries{$file})
        ? %{ $file_index_entries{$file} }
        : ();
    @this_index_entries_broken = (exists $file_index_entries_broken{$file})
        ? @{ $file_index_entries_broken{$file} }
        : ();

    if ($he->tag() ne "html") {
        die "Expected <HTML> at top level";
    }
    my ($head, $body) = @{$he->content()};
    if ((!ref $head) or ($head->tag ne "head")) {
        $he->dump;
        die "<HEAD> not first element of <HTML>";
    }
    if ((!ref $body) or ($body->tag ne "body")) {
        $he->dump;
        die "<BODY> not second element of <HTML>";
    }

    $body->traverse(\&output_body);
}

# Stack of things we're inside that are preventing indexing from occurring now.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @index_deferrers;

# Track entry to and exit from an element that defers index entries.
# On a start tag, $tag is pushed on @index_deferrers.  On an end tag, the
# top of the stack is popped and must equal $tag (otherwise die), and
# do_deferred_index_entries is then called.
sub push_or_pop_index_deferrers ( $$ )
{
    my ($tag, $startflag) = check_args(2, @_);

    return push(@index_deferrers, $tag) if $startflag;

    my $old_deferrer = pop @index_deferrers;
    if ($tag ne $old_deferrer) {
        die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers);
    }
    do_deferred_index_entries();
}


# Queue the index entries that belong at anchor $label onto
# @deferred_index_entries.  $he is the anchor element (optional argument;
# it is used only when $label is the broken link name).
#
# If %this_index_entries has entries for $label, they are queued and we are
# done.  Otherwise, if $label is $l2h_broken_link_name, guess: every entry of
# @this_index_entries_broken that contains one of the candidate texts (the
# anchor's own texts plus up to four of the nearest preceding texts) is
# queued.
sub label_add_index_entries ( $;$ )
{
    my ($label, $he) = check_args_range(1, 2, @_);
    # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";

    if (exists $this_index_entries{$label}) {
        push @deferred_index_entries, @{ $this_index_entries{$label} };
        return;
    }

    return unless $label eq $l2h_broken_link_name;

    # Try to find some text to use in guessing which links should point here.
    # I should probably only look at the previous element, or if that is
    # all punctuation, the one before it; collecting all the previous texts
    # is a bit of overkill.
    my @anchor_texts = collect_texts($he);
    my @previous_texts = collect_texts($he->parent, $he);
    # 4 elements is arbitrary; ought to filter out punctuation and small words
    # first, then perhaps keep fewer.  Perhaps also filter out formatting so
    # that we can see a larger chunk of text?  (Probably not.)
    # Also perhaps should do further chunking into words, in case the
    # index term isn't a chunk of its own (eg, was in <tt>...</tt>).
    my @nearest_first = reverse(@previous_texts);
    my @candidate_texts = (@anchor_texts,
                           @nearest_first[0..min(3,$#previous_texts)]);

    my $guessed = 0;
    for my $text (@candidate_texts) {
        # Skip texts that are only punctuation/spaces, or very short.
        next if ($text =~ /^[\"\`\'().?! ]*$/) || (length($text) <= 2);

        # hack for Python manual; maybe defer until failure first time around?
        $text =~ s/^sys\.//g;

        for my $iterm (@this_index_entries_broken) {
            # I could test for zero:  LaTeX2HTML's failures in the Python
            # documentation are only for items of the form "... (built-in...)"
            next if index($iterm, $text) == -1;
            push @deferred_index_entries, $iterm;
            # print STDERR "Guessing index term `$iterm' for text `$text'\n";
            $guessed = 1;
        }
    }

    if (!$guessed) {
        # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
    }
}


736 | # Need to add calls to this at various places.
|
---|
737 | # Perhaps add HTML::Element argument and do the check for appropriateness
|
---|
738 | # here (ie, no action if inside <H1>, etc.).
|
---|
# Write all queued index entries to TEXI, one per line, and empty the queue.
# Does nothing while the queue is empty or while any element recorded in
# @index_deferrers is still open.
sub do_deferred_index_entries ()
{
    check_args(0, @_);

    my $pending  = scalar(@deferred_index_entries);
    my $blockers = scalar(@index_deferrers);
    if (($pending > 0) && ($blockers == 0)) {
        my $entry_lines = join("\n", @deferred_index_entries);
        print TEXI "\n", $entry_lines, "\n";
        @deferred_index_entries = ();
    }
}
746 |
|
---|
# State shared by the table-related branches of output_body.
my $table_columns;         # column count of the open table; undefined if not in a table
my $table_first_column;    # boolean: the next <td>/<th> starts a new row

# Traversal callback (passed to the element's traverse method) that writes
# the Texinfo rendering of one node of the HTML tree to the TEXI filehandle.
#   $he        - an element object, or a plain string for a text segment
#   $startflag - true when entering the element, false when leaving it
#   (the third argument, the depth, is ignored)
# Returns 1 when the children should be traversed as usual, and 0 when this
# call already handled (or deliberately dropped) the whole element.  Text
# segments return nothing.
sub output_body ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument

    # Text segment: print it, flushing pending index entries after the
    # first space so that they land on a word boundary.
    if (!ref $he) {
        my $space_index = index($he, " ");
        if ($space_index != -1) {
            # The explicit "&" is deliberate: the original author notes that
            # "print TEXI texi_quote(...)" failed with 'Can't locate object
            # method "TEXI" via package "texi_quote"' because texi_quote is
            # defined later in the file.
            print TEXI &texi_quote(substr($he, 0, $space_index+1));
            do_deferred_index_entries();
            print TEXI &texi_quote(substr($he, $space_index+1));
        }
        else {
            print TEXI &texi_quote($he);
        }
        return;
    }

    my $tag = $he->tag();

    # Ordinary text markup first
    if (exists $inline_markup{$tag}) {
        if ($startflag) {
            print TEXI "\@$inline_markup{$tag}\{";
        }
        else {
            print TEXI "\}";
        }
    }
    elsif ($tag eq "a") {
        my ($name, $href) = anchor_info($he);
        if (!$href) {
            # This anchor is only here for indexing/cross referencing purposes.
            if ($startflag) {
                label_add_index_entries($name, $he);
            }
        }
        elsif ($href =~ /^(?:ftp|https?|news):/) {
            # External link (https is accepted as well as http).
            if ($startflag) {
                # Should avoid second argument if it's identical to the URL.
                print TEXI "\@uref\{$href, ";
            }
            else {
                print TEXI "\}";
            }
        }
        elsif ($href =~ /^\#(foot[0-9]+)$/) {
            # Footnote: inline the recorded footnote body at the reference.
            if ($startflag) {
                my $footnote_label = $1;
                my $footnote = $footnotes{$footnote_label};
                if (!defined $footnote) {
                    $he->dump;
                    die "No footnote recorded for label $footnote_label";
                }
                # Could double-check name and content, but that information
                # is not currently stored.
                print TEXI "\@footnote\{";
                $footnote->traverse(\&output_body);
                print TEXI "\}";
                return 0;
            }
        }
        else {
            if ($startflag) {
                # cross-references are not active Info links, but no text is lost
                print STDERR "Can't deal with internal HREF anchors yet:\n";
                $he->dump;
            }
        }
    }
    elsif ($tag eq "br") {
        print TEXI "\@\n";
    }
    elsif ($tag eq "body") {
        # Nothing to emit; just descend.
    }
    elsif ($tag eq "center") {
        # Drop a <center> that holds nothing but spaces.
        if (has_single_content_string($he)
            && (${$he->content}[0] =~ /^ *$/)) {
            return 0;
        }
        if ($startflag) {
            print TEXI "\n\@center\n";
        }
        else {
            print TEXI "\n\@end center\n";
        }
    }
    elsif ($tag eq "div") {
        my $align = $he->attr('align');
        if (defined($align) && ($align eq "center")) {
            if (has_single_content_string($he)
                && (${$he->content}[0] =~ /^ *$/)) {
                return 0;
            }
            if ($startflag) {
                print TEXI "\n\@center\n";
            }
            else {
                print TEXI "\n\@end center\n";
            }
        }
    }
    elsif ($tag eq "dl") {
        # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
        if (has_single_content_with_tag($he, "dd")) {
            my $he_dd = ${$he->content}[0];
            if (has_single_content_with_tag($he_dd, "pre")) {
                my $he_pre = ${$he_dd->content}[0];
                print_pre($he_pre);
                return 0;
            }
        }
        if ($startflag) {
            # Could examine the elements, to be cleverer about formatting.
            # (Also to use ftable, vtable...)
            print TEXI "\n\@table \@asis\n";
        }
        else {
            print TEXI "\n\@end table\n";
        }
    }
    elsif ($tag eq "dt") {
        # Index entries must not be printed inside the @item line.
        push_or_pop_index_deferrers($tag, $startflag);
        if ($startflag) {
            print TEXI "\n\@item ";
        }
    }
    elsif ($tag eq "dd") {
        if ($startflag) {
            print TEXI "\n";
        }
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
    }
    elsif ($tag =~ /^(font|big|small)$/) {
        # Do nothing for now.
    }
    elsif ($tag =~ /^h[1-6]$/) {
        # No push_or_pop_index_deferrers here: the heading content is never
        # entered recursively (this branch returns 0).
        my $secname = "";
        my @seclabels = ();
        for my $elt (@{$he->content}) {
            if (!ref $elt) {
                $secname .= $elt;
            }
            elsif ($elt->tag eq "br") {
                # Line breaks inside a heading are dropped.
            }
            elsif ($elt->tag eq "a") {
                my ($name, $href, @acontent) = anchor_info($elt);
                if ($href) {
                    $he->dump;
                    $elt->dump;
                    die "Nonsimple anchor in <$tag>";
                }
                if (!defined $name) {
                    die "No NAME for anchor in $tag";
                }
                push @seclabels, $name;
                for my $subelt (@acontent) {
                    $secname .= html_to_texi($subelt);
                }
            }
            else {
                $secname .= html_to_texi($elt);
            }
        }
        if ($secname eq "") {
            die "No section name in <$tag>";
        }
        if (scalar(@section_stack) == 1) {
            # Only the Top node is open: this heading is the document title.
            if ($section_stack[-1] ne "Top") {
                die "Not top? $section_stack[-1]";
            }
            print TEXI "\@settitle $secname\n";
            print TEXI "\@c %**end of header\n";
            print TEXI "\n";
            print TEXI "\@node Top\n";
            print TEXI "\n";
        }
        else {
            print TEXI "\n\@node $section_stack[-1]\n";
            print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n";
        }
        for my $seclabel (@seclabels) {
            label_add_index_entries($seclabel);
        }
        # This should only happen once per file.
        label_add_index_entries("");
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
        return 0;
    }
    elsif ($tag eq "hr") {
        # Horizontal rules are dropped.
    }
    elsif ($tag eq "ignore") {
        # Hack for ignored elements
        return 0;
    }
    elsif ($tag eq "li") {
        if ($startflag) {
            print TEXI "\n\n\@item\n";
            do_deferred_index_entries();
        }
    }
    elsif ($tag eq "ol") {
        if ($startflag) {
            print TEXI "\n\@enumerate \@bullet\n";
        }
        else {
            print TEXI "\n\@end enumerate\n";
        }
    }
    elsif ($tag eq "p") {
        if ($startflag) {
            print TEXI "\n\n";
        }
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
    }
    elsif ($tag eq "pre") {
        print_pre($he);
        return 0;
    }
    elsif ($tag eq "table") {
        # Could also indicate common formatting for first column, or
        # determine relative widths for columns (or determine a prototype row)
        if ($startflag) {
            if (defined $table_columns) {
                $he->dump;
                die "Can't deal with table nested inside $table_columns-column table";
            }
            $table_columns = table_columns($he);
            if ($table_columns < 2) {
                $he->dump;
                die "Column with $table_columns columns?";
            }
            elsif ($table_columns == 2) {
                print TEXI "\n\@table \@asis\n";
            }
            else {
                # Give every column the same fraction of the line width.
                print TEXI "\n\@multitable \@columnfractions";
                for (my $i=0; $i<$table_columns; $i++) {
                    print TEXI " ", 1.0/$table_columns;
                }
                print TEXI "\n";
            }
        }
        else {
            if ($table_columns == 2) {
                print TEXI "\n\@end table\n";
            }
            else {
                print TEXI "\n\@end multitable\n";
            }
            undef $table_columns;
        }
    }
    elsif (($tag eq "td") || ($tag eq "th")) {
        if ($startflag) {
            if ($table_first_column) {
                print TEXI "\n\@item ";
                $table_first_column = 0;
            }
            elsif ($table_columns > 2) {
                print TEXI "\n\@tab ";
            }
        }
        else {
            print TEXI "\n";
        }
    }
    elsif ($tag eq "tr") {
        if ($startflag) {
            $table_first_column = 1;
        }
    }
    elsif ($tag eq "ul") {
        if ($startflag) {
            print TEXI "\n\@itemize \@bullet\n";
        }
        else {
            print TEXI "\n\@end itemize\n";
        }
    }
    else {
        # Unknown tag: report it and skip its content.
        print STDERR "output_body: ignoring <$tag> tag\n";
        $he->dump;
        return 0;
    }

    return 1;
}
983 |
|
---|
# Write a <pre> element to TEXI as an @example environment.
#   $he_pre - the <pre> element; its content must be exactly one string
# Dies if the element has no content, several children, or a non-string child.
sub print_pre ( $ )
{
    my ($he_pre) = check_args(1, @_);
    if (!has_single_content_string($he_pre)) {
        # The content list may be undefined for an empty <pre>.
        die "Multiple or non-string content for <PRE>: ", @{$he_pre->content || []};
    }
    my $pre_content = ${$he_pre->content}[0];

    # "@example" and "@end example" must each stand on a line of their own,
    # so add a leading/trailing newline when the text does not supply one.
    my $quoted = &texi_quote($pre_content);
    $quoted = "\n" . $quoted if $quoted !~ /^\n/;
    $quoted .= "\n" if $quoted !~ /\n\z/;

    print TEXI "\n\@example";
    print TEXI $quoted;
    print TEXI "\@end example\n";
}
993 |
|
---|
# Return the number of columns of a <table> element, taken as the largest
# number of children found in any of its rows (0 for a table without rows).
#   $table - the <table> element
# Dies (after dumping the table) if a child of the table is not a <tr>
# element, including the case of a stray text child.
sub table_columns ( $ )
{
    my ($table) = check_args(1, @_);
    my $result = 0;
    # The content list may be undefined for an empty element.
    for my $row (@{$table->content || []}) {
        if (!ref($row) || ($row->tag ne "tr")) {
            $table->dump;
            $row->dump if ref $row;
            die "Expected <TR> as table row.";
        }
        $result = max($result, scalar(@{$row->content || []}));
    }
    return $result;
}
1005 |
|
---|
1006 |
|
---|
1007 | ###########################################################################
|
---|
1008 | ### Utilities
|
---|
1009 | ###
|
---|
1010 |
|
---|
# Return the numerically smaller of two values ($y when they are equal).
sub min ( $$ )
{
    my ($x, $y) = check_args(2, @_);
    if ($x < $y) {
        return $x;
    }
    return $y;
}
1015 |
|
---|
# Return the numerically larger of two values ($y when they are equal).
sub max ( $$ )
{
    my ($x, $y) = check_args(2, @_);
    if ($x > $y) {
        return $x;
    }
    return $y;
}
1020 |
|
---|
# Parse the HTML file $file and return its cleaned-up parse tree.
#   $file - name of the HTML file to read
# The tree is built with HTML::TreeBuilder and then passed through
# cleanup_parse_tree.  Dies if the file cannot be parsed (for instance
# because it cannot be opened).
sub file_to_tree ( $ )
{
    my ($file) = check_args(1, @_);

    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(1);
    # $tree->warn(1);
    # parse_file reports failure to open the file by returning undef.
    defined($tree->parse_file($file))
        or die "Cannot parse HTML file $file: $!";
    cleanup_parse_tree($tree);
    return $tree;
}
1031 |
|
---|
1032 |
|
---|
# Return 1 if element $he has exactly one child (of any kind), else 0.
# Dies when $he is not a reference.
sub has_single_content ( $ )
{
    my ($he) = check_args(1, @_);
    die "Non-reference argument: $he" unless ref $he;

    my $children = $he->content;
    return 0 unless defined $children;
    return (scalar(@{$children}) == 1) ? 1 : 0;
}
1046 |
|
---|
1047 |
|
---|
1048 | # Return true if the content of the element contains only one element itself,
|
---|
1049 | # and that inner element has the specified tag.
|
---|
# True if element $he has exactly one child and that child is an element
# whose tag equals $tag.
sub has_single_content_with_tag ( $$ )
{
    my ($he, $tag) = check_args(2, @_);
    return 0 unless has_single_content($he);

    my $only_child = $he->content->[0];
    return 0 unless ref $only_child;        # a text child has no tag

    my $child_tag = $only_child->tag;
    return 0 unless defined $child_tag;
    return $child_tag eq $tag;
}
1062 |
|
---|
# Return 1 if element $he has exactly one child and that child is a plain
# string (not an element), else 0.
sub has_single_content_string ( $ )
{
    my ($he) = check_args(1, @_);
    return 0 unless has_single_content($he);

    my $only_child = $he->content->[0];
    return (ref $only_child) ? 0 : 1;
}
1072 |
|
---|
1073 |
|
---|
1074 | # Return name, href, content. First two may be undefined; third is an array.
|
---|
1075 | # I don't see how to determine if there are more attributes.
|
---|
# Return (name, href, content...) for the anchor element $he.  The NAME and
# HREF attribute values may be undefined; the content is returned as a
# (possibly empty) list.  Dies, after dumping $he, if it is not an <a>.
sub anchor_info ( $ )
{
    my ($he) = check_args(1, @_);
    unless ($he->tag eq "a") {
        $he->dump;
        die "passed non-anchor to anchor_info";
    }

    my $name = $he->attr('name');
    my $href = $he->attr('href');

    my $children = $he->content;
    my @content = defined($children) ? @{$children} : ();

    return ($name, $href, @content);
}
1089 |
|
---|
1090 |
|
---|
# Return a copy of $text made safe for Texinfo: "@", "{" and "}" are
# prefixed with "@", and " -- " is turned into " --- ".
sub texi_quote ( $ )
{
    my ($text) = check_args(1, @_);
    for ($text) {
        s/([\@\{\}])/\@$1/g;
        s/ -- / --- /g;
    }
    return $text;
}
1097 |
|
---|
1098 | # Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
|
---|
# Return a copy of $text cleaned up for use as a section title: leading
# spaces, trailing spaces and colons, a leading section number, and every
# comma and colon are removed, in that order.
sub texi_remove_punctuation ( $ )
{
    my ($text) = check_args(1, @_);

    for ($text) {
        s/^ +//g;
        s/[ :]+$//g;
        s/^[1-9][0-9.]* +//g;
        s/,//g;
        # Both embedded colons and " -- " confuse makeinfo, so colons are
        # deleted outright rather than rewritten as " -- ".
        s/://g;
    }
    return $text;
}
1113 |
|
---|
1114 |
|
---|
## Do not use this inside `traverse':  it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Remove element $he from the content list of $parent (default: $he's own
# parent) and clear $he's parent link.
# Returns 1 on success; dies if $he is not among the parent's children.
sub html_remove ( $;$ )
{
    my ($he, $parent) = check_args_range(1, 2, @_);
    $parent = $he->parent unless defined $parent;

    my $siblings = $parent->content;
    my @snapshot = @{$siblings};    # search a copy, splice the real list
    for my $position (0 .. $#snapshot) {
        next unless $snapshot[$position] eq $he;
        splice @{$siblings}, $position, 1;
        $he->parent(undef);
        return 1;
    }
    die "Didn't find $he in $parent";
}
1131 |
|
---|
1132 |
|
---|
# Put element $new in the place of $orig in the content list of $parent
# (default: $orig's own parent).  $new's parent link is set to $parent and
# $orig's is cleared.
# Returns 1 on success; dies if $orig is not among the parent's children.
sub html_replace ( $$;$ )
{
    my ($orig, $new, $parent) = check_args_range(2, 3, @_);
    $parent = $orig->parent unless defined $parent;

    my $siblings = $parent->content;
    my @snapshot = @{$siblings};    # search a copy, modify the real list
    for my $position (0 .. $#snapshot) {
        next unless $snapshot[$position] eq $orig;
        $siblings->[$position] = $new;
        $new->parent($parent);
        $orig->parent(undef);
        return 1;
    }
    die "Didn't find $orig in $parent";
}
1147 |
|
---|
# Replace $orig, in $parent (default: $orig's own parent), by a freshly
# created empty <meta> element.  Returns the result of html_replace.
sub html_replace_by_meta ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("meta");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
1155 |
|
---|
# Replace $orig, in $parent (default: $orig's own parent), by a freshly
# created empty <ignore> element.  Returns the result of html_replace.
sub html_replace_by_ignore ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("ignore");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
1163 |
|
---|
1164 |
|
---|
1165 |
|
---|
1166 | ###
|
---|
1167 | ### Collect text elements
|
---|
1168 | ###
|
---|
1169 |
|
---|
# Scratch state shared by collect_texts and its callback collect_if_text.
my @collected_texts;            # text segments gathered so far
my $collect_texts_stoppoint;    # element at which to stop (may be undefined)
my $done_collecting;            # true once the stop point has been reached

# Return the list of text segments found while traversing the tree rooted
# at $root, in traversal order.  If $stop is given, collection ends when
# that element is reached.
sub collect_texts ( $;$ )
{
    my ($root, $stop) = check_args_range(1, 2, @_);

    # Reset the shared state before every run.
    @collected_texts = ();
    $done_collecting = 0;
    $collect_texts_stoppoint = $stop;

    $root->traverse(\&collect_if_text);    # process texts
    return @collected_texts;
}
1184 |
|
---|
# Traversal callback for collect_texts (depth and startflag are ignored).
# A text node is appended to @collected_texts.  Reaching the stop element
# sets $done_collecting, after which every call is a no-op.
# Returns 1 only for elements whose children should still be visited.
sub collect_if_text ( $$$ )
{
    my $he = (check_args(3, @_))[0];

    return 0 if $done_collecting || !defined $he;

    unless (ref $he) {
        push @collected_texts, $he;
        return 0;
    }

    my $at_stop_point = (defined $collect_texts_stoppoint)
                        && ($he eq $collect_texts_stoppoint);
    if ($at_stop_point) {
        $done_collecting = 1;
        return 0;
    }
    return 1;
}
1199 |
|
---|
1200 |
|
---|
1201 | ###########################################################################
|
---|
1202 | ### Clean up parse tree
|
---|
1203 | ###
|
---|
1204 |
|
---|
# Run the clean-up passes over the parse tree rooted at $he, in a fixed
# order, each as its own traversal that skips text nodes.  Returns $he.
sub cleanup_parse_tree ( $ )
{
    my ($he) = check_args(1, @_);

    my @passes = (\&delete_if_navigation,
                  \&delete_extra_spaces,
                  \&merge_dl,
                  \&reorder_dt_and_dl);
    for my $pass (@passes) {
        $he->traverse($pass, 'ignore text');
    }
    return $he;
}
1213 |
|
---|
1214 |
|
---|
1215 | ## Simpler version that deletes contents but not the element itself.
|
---|
1216 | # sub delete_if_navigation ( $$$ )
|
---|
1217 | # { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
|
---|
1218 | # if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
|
---|
1219 | # { $he->delete();
|
---|
1220 | # return 0; }
|
---|
1221 | # else
|
---|
1222 | # { return 1; }
|
---|
1223 | # }
|
---|
1224 |
|
---|
# Traversal callback: remove every <div class="navigation"> from the tree.
# Only the entering call acts; the leaving call returns nothing.
# Returns 0 after deleting the element (so its children are not visited),
# 1 otherwise.
sub delete_if_navigation ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument
    return unless $startflag;

    return 1 unless $he->tag() eq "div";
    my $class = $he->attr('class');
    return 1 unless (defined $class) && ($class eq 'navigation');

    # Work on the parent's own content list (not a copy) so that the
    # removal is visible in the tree.
    my $siblings = $he->parent()->content();
    my $position = 0;
    while ($position < scalar(@{$siblings})) {
        if ($siblings->[$position] eq $he) {
            splice(@{$siblings}, $position, 1);
            last;
        }
        $position++;
    }
    $he->delete();
    return 0;
}
1243 |
|
---|
# Traversal callback: strip superfluous blank text children from $he.
# For <head>, <html>, <table>, <tr> and <ul>, delete_child_spaces is
# applied; for every element, delete_trailing_spaces is applied.
# Only the entering call acts (returning 1); the leaving call returns nothing.
sub delete_extra_spaces ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument
    return unless $startflag;

    my $tag = $he->tag;
    delete_child_spaces($he) if $tag =~ /^(head|html|table|tr|ul)$/;
    delete_trailing_spaces($he);
    return 1;
}
1255 |
|
---|
1256 |
|
---|
# Remove, in place, every text child of $he that consists only of spaces
# (or is empty).  Element children are always kept.  An element without a
# content list is left alone, as in delete_trailing_spaces.
sub delete_child_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    if (! defined $ref_content) {
        return;
    }
    # Assign through the reference so the element's own list is modified.
    @{$ref_content} = grep { ref($_) || ($_ !~ /^ *$/) } @{$ref_content};
    return;
}
1265 |
|
---|
# Remove blank text children of $he that carry no information:
#   - a blank text directly followed by a <br>, <dd>, <dl>, <dt>, <hr>,
#     <p> or <ul> element;
#   - for <dd>, <dt>, <h1>..<h6>, <li> and <p>, a blank text in last position.
# "Blank" means empty or consisting only of spaces.  The content list is
# modified in place; an element without a content list is left alone.
sub delete_trailing_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    if (! defined $ref_content) {
        return;
    }

    # Could also check for previous element = /^h[1-6]$/.
    for (my $i = 0; $i < scalar(@{$ref_content})-1; $i++) {
        my $elt = ${$ref_content}[$i];
        next if ref($elt) || ($elt !~ /^ *$/);
        my $next_elt = ${$ref_content}[$i+1];
        if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
            splice(@{$ref_content}, $i, 1);
            $i--;    # re-examine the element that moved into slot $i
        }
    }

    if ($he->tag =~ /^(dd|dt|h[1-6]|li|p)$/) {
        my $last_elt = ${$ref_content}[$#{$ref_content}];
        if ((defined $last_elt) && !ref($last_elt) && ($last_elt =~ /^ *$/)) {
            pop @{$ref_content};
        }
    }
}
1283 |
|
---|
1284 |
|
---|
1285 | # LaTeX2HTML sometimes creates
|
---|
1286 | # <DT>text
|
---|
1287 | # <DL COMPACT><DD>text
|
---|
1288 | # which should actually be:
|
---|
1289 | # <DL COMPACT>
|
---|
1290 | # <DT>text
|
---|
1291 | # <DD>text
|
---|
1292 | # Since a <DL> gets added, this ends up looking like
|
---|
1293 | # <P>
|
---|
1294 | # <DL>
|
---|
1295 | # <DT>
|
---|
1296 | # text1...
|
---|
1297 | # <DL COMPACT>
|
---|
1298 | # <DD>
|
---|
1299 | # text2...
|
---|
1300 | # dt_or_dd1...
|
---|
1301 | # dt_or_dd2...
|
---|
1302 | # which should become
|
---|
1303 | # <P>
|
---|
1304 | # <DL COMPACT>
|
---|
1305 | # <DT>
|
---|
1306 | # text1...
|
---|
1307 | # <DD>
|
---|
1308 | # text2...
|
---|
1309 | # dt_or_dd1...
|
---|
1310 | # dt_or_dd2...
|
---|
1311 |
|
---|
# Traversal callback that repairs the LaTeX2HTML pattern
#     <p> <dl (implicit)> <dt> text... <dl> <dd> ... </dl> </dt> ...
# by replacing the inner <dl> with an <ignore> placeholder and inserting its
# children into the outer <dl> directly after the <dt>.
# Only the entering call acts; the leaving call returns nothing.
# Returns 0 when the tree was rewritten (children are then not traversed),
# 1 otherwise.
sub reorder_dt_and_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument
    if (!$startflag) {
        return;
    }

    # Each test below must hold for the pattern to match; otherwise leave
    # the tree alone and keep traversing.
    return 1 if $he->tag() ne "p";
    my $ref_pcontent = $he->content();
    return 1 if !defined $ref_pcontent;

    # The <p> must start with an implicit <dl> ...
    my $outer_dl = $ref_pcontent->[0];
    return 1 unless (ref $outer_dl) && ($outer_dl->tag() eq "dl")
                    && $outer_dl->implicit();
    my $ref_dlcontent = $outer_dl->content();
    return 1 if !defined $ref_dlcontent;

    # ... whose first child is a <dt> ...
    my $dt = $ref_dlcontent->[0];
    return 1 unless (ref $dt) && ($dt->tag() eq "dt");
    my $ref_dtcontent = $dt->content();
    return 1 if !defined($ref_dtcontent) || (scalar(@{$ref_dtcontent}) == 0);

    # ... whose last child is a <dl> ...
    my $inner_dl = $ref_dtcontent->[-1];
    return 1 unless (ref $inner_dl) && ($inner_dl->tag() eq "dl");
    my $ref_dl2content = $inner_dl->content();
    return 1 if !defined $ref_dl2content;

    # ... whose first child is a <dd>.
    my @dl2content = @{$ref_dl2content};
    return 1 unless (scalar(@dl2content) > 0) && (ref $dl2content[0])
                    && ($dl2content[0]->tag() eq "dd");

    html_replace_by_ignore($inner_dl);
    splice(@{$ref_dlcontent}, 1, 0, @dl2content);
    # The moved elements now live in the outer <dl>; keep their parent
    # links consistent with that.
    for my $moved (@dl2content) {
        $moved->parent($outer_dl) if ref $moved;
    }
    return 0;    # don't traverse children
}
1355 |
|
---|
1356 |
|
---|
1357 | # If we find a paragraph that looks like
|
---|
1358 | # <P>
|
---|
1359 | # <HR>
|
---|
1360 | # <UL>
|
---|
1361 | # then accumulate its links into a contents_list and delete the paragraph.
|
---|
# Traversal callback: when $he is a <p> whose content is exactly an <hr>
# followed by a <ul>, hand it to process_child_links and delete it.
# Only the entering call acts; the leaving call returns nothing.
# Returns 0 when the paragraph was processed and deleted, 1 otherwise.
sub process_if_child_links ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument
    return unless $startflag;

    return 1 unless $he->tag() eq "p";
    my $children = $he->content();
    return 1 unless defined $children;

    my @kids = @{$children};
    my $is_hr_then_ul = (scalar(@kids) == 2)
                        && (ref $kids[0]) && ($kids[0]->tag() eq "hr")
                        && (ref $kids[1]) && ($kids[1]->tag() eq "ul");
    return 1 unless $is_hr_then_ul;

    process_child_links($he);
    $he->delete();
    return 0;
}
1379 |
|
---|
1380 |
|
---|
1381 | # If we find
|
---|
1382 | # <H4>
|
---|
1383 | # "Footnotes"
|
---|
1384 | # <DL>
|
---|
1385 | # <DT>
|
---|
1386 | # <A NAME="foot560">
|
---|
1387 | # "...borrow"
|
---|
1388 | # <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
|
---|
1389 | # "1.2"
|
---|
1390 | # <DD>
|
---|
1391 | # "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
|
---|
1392 | # ...
|
---|
1393 | # then record the footnote information and delete the section and list.
|
---|
1394 |
|
---|
# True between seeing the <h4>Footnotes</h4> heading and the <dl> that must
# follow it.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback that harvests the footnote section.
# An <h4> whose sole content is the string "Footnotes" is replaced by an
# <ignore> element, and the next element visited must then be a <dl>.  In
# that <dl> each <dt>/<dd> pair is expected to hold two anchors with the
# same NAME in the <dt>; the <dd> is stored in %footnotes under that name,
# and both the <dd> and the whole <dl> are replaced by <ignore> elements.
# Only the entering call acts; the leaving call returns nothing.
# Returns 0 when the element was consumed, 1 otherwise.  Dies on any
# deviation from the expected structure.
sub process_if_footnotes ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument
    if (!$startflag) {
        return;
    }

    if (($he->tag() eq "h4")
        && has_single_content_string($he)
        && (${$he->content}[0] eq "Footnotes")) {
        html_replace_by_ignore($he);
        $process_if_footnotes_expect_dl_next = 1;
        return 0;
    }

    if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl")) {
        my $ref_content = $he->content();
        if (defined $ref_content) {
            $process_if_footnotes_expect_dl_next = 0;
            my @content = @{$ref_content};
            # Walk the list two children at a time: <dt> then <dd>.
            for (my $i=0; $i<$#content; $i+=2) {
                my $he_dt = $content[$i];
                my $he_dd = $content[$i+1];
                if (!ref($he_dt) || !ref($he_dd)
                    || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd")) {
                    $he->dump;
                    die "expected <DT> and <DD> at positions $i and ", $i+1;
                }
                # The content list may be undefined for an empty <dt>.
                my @dt_content = @{$he_dt->content() || []};
                if ((scalar(@dt_content) != 2)
                    || !ref($dt_content[0]) || !ref($dt_content[1])
                    || ($dt_content[0]->tag ne "a")
                    || ($dt_content[1]->tag ne "a")) {
                    $he_dt->dump;
                    die "Expected 2 anchors as content of <DT>";
                }
                # Only the names are needed; compare the FIRST anchor's name
                # with the SECOND anchor's.
                my ($dt1_name) = anchor_info($dt_content[0]);
                my ($dt2_name) = anchor_info($dt_content[1]);
                if ($dt1_name ne $dt2_name) {
                    $he_dt->dump;
                    die "Expected identical names for anchors";
                }
                html_replace_by_ignore($he_dd);
                $he_dd->tag("div");    # original author's note: has no effect
                $footnotes{$dt1_name} = $he_dd;
            }
            html_replace_by_ignore($he);
            return 0;
        }
    }

    if ($process_if_footnotes_expect_dl_next) {
        $he->dump;
        die "Expected <DL> for footnotes next";
    }

    return 1;
}
1444 |
|
---|
1445 |
|
---|
1446 |
|
---|
1447 | ## Merge two adjacent paragraphs containing <DL> items, such as:
|
---|
1448 | # <P>
|
---|
1449 | # <DL>
|
---|
1450 | # <DT>
|
---|
1451 | # ...
|
---|
1452 | # <DD>
|
---|
1453 | # ...
|
---|
1454 | # <P>
|
---|
1455 | # <DL>
|
---|
1456 | # <DT>
|
---|
1457 | # ...
|
---|
1458 | # <DD>
|
---|
1459 | # ...
|
---|
1460 |
|
---|
# Traversal callback: among the children of $he, merge each run of adjacent
# <p> elements whose only child is a <dl> into the first one.  The later
# paragraphs are removed from $he and the children of their <dl> are
# appended to the first paragraph's <dl>.
# Only the entering call acts.  Returns 1, or nothing when called on exit
# or when $he has no content list.
sub merge_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];    # ignore depth argument
    return unless $startflag;

    my $siblings = $he->content;
    return unless defined $siblings;

    # True for a <p> element whose single child is a <dl>.
    my $is_dl_paragraph = sub {
        my ($node) = @_;
        return (ref $node) && ($node->tag eq "p")
               && has_single_content_with_tag($node, "dl");
    };

    my $index = 0;
    while ($index < $#{$siblings}) {
        my $first = $siblings->[$index];
        if (!$is_dl_paragraph->($first)) {
            $index++;
            next;
        }

        my $target_dl = $first->content->[0];
        # The list shrinks as followers are absorbed, so the bound is
        # re-evaluated on every iteration.
        while (($index < $#{$siblings})
               && $is_dl_paragraph->($siblings->[$index+1])) {
            my ($absorbed) = splice(@{$siblings}, $index+1, 1);
            my $absorbed_dl = $absorbed->content->[0];
            $target_dl->push_content(@{$absorbed_dl->content});
        }
        # Skip the merged paragraph and its successor, which is known not
        # to start a new run.
        $index += 2;
    }
    return 1;
}
1492 |
|
---|
1493 |
|
---|
1494 |
|
---|
1495 | ###########################################################################
|
---|
1496 | ### Testing
|
---|
1497 | ###
|
---|
1498 |
|
---|
# test(ACTION, FILE): developer-only entry point, reached through the
# "-test" command-line switch.  ACTION selects what to do with FILE:
#   "view" or ""  parse FILE with file_to_tree and dump the tree
#   "raw"         parse FILE with a bare HTML::TreeBuilder and dump the tree
#   "section"     run process_section_file on FILE
#   "index"       run process_index_file on FILE, then print_index_info
# Any other ACTION is a fatal error.
sub test ( $$ )
{ my ($action, $file) = check_args(2, @_);

  # General testing
  if (($action eq "view") || ($action eq ""))
    { my $tree = file_to_tree($file);

      ## Other things that are handy to look at while debugging:
      # print STDERR $tree->as_HTML;
      $tree->dump();

      # print STDERR $tree->tag(), "\n";
      # print STDERR @{$tree->content()}, "\n";
      #
      # for (@{ $tree->extract_links(qw(a img)) }) {
      #   my ($link, $linkelem) = @$_;
      #   print STDERR "$link ", $linkelem->as_HTML;
      # }
      #
      # print STDERR @{$tree->extract_links()}, "\n";
      return;
    }

  elsif ($action eq "raw")
    { my $tree = HTML::TreeBuilder->new;
      $tree->ignore_unknown(1);
      # $tree->warn(1);
      # parse_file yields undef (with $! set) when FILE cannot be opened;
      # without this check an unreadable file silently dumps an empty tree.
      defined($tree->parse_file($file))
	or die "Cannot parse `$file': $!";

      $tree->dump();

      # cleanup_parse_tree($tree);
      # $tree->dump();
      return;
    }

  # Test dealing with a section.
  elsif ($action eq "section")
    { process_section_file($file, 0, "Title");
    }

  # Test dealing with many sections.  Deliberately disabled (the condition
  # is constant false); kept as a ready-made batch run over a local copy of
  # the Python API documentation.
  elsif (0)
    { my $api_dir = "/homes/fish/mernst/tmp/python-doc/html/api";
      # contents.html and genindex.html are intentionally left out.
      my @files = map { "$api_dir/$_.html" }
	qw(about abstract api cObjects complexObjects concrete
	   countingRefs debugging dictObjects embedding exceptionHandling
	   exceptions fileObjects floatObjects front fundamental
	   importing includes index initialization intObjects intro
	   listObjects longObjects mapObjects mapping newTypes node24
	   noneObject number numericObjects object objects os
	   otherObjects processControl refcountDetails refcounts sequence
	   sequenceObjects standardExceptions stringObjects threads
	   tupleObjects typeObjects types utilities veryhigh);
      for my $file (@files)
	{ print STDERR "\n", "=" x 75, "\n", "$file:\n";
	  process_section_file($file, 0, "Title");
	}
    }

  # Test dealing with index.
  elsif ($action eq "index")
    { process_index_file($file, "\@cindex");
      print_index_info();
    }

  else
    { die "Unrecognized action `$action'"; }
}
1624 |
|
---|
1625 |
|
---|
1626 | ###########################################################################
|
---|
1627 | ### Main loop
|
---|
1628 | ###
|
---|
1629 |
|
---|
# process_contents_file(FILE): top-level conversion driver.
# Derives the output name from FILE (for "api/index.html" the Texinfo file
# is "api.texi"), writes the Texinfo header, processes FILE as the "Top"
# section, then processes every entry taken from @contents_list, and
# finally emits one index node per entry of @index_titles followed by
# @contents and @bye.
# Sets the globals $html_directory and $current_ref_tdf, and writes through
# the global filehandle TEXI (presumably also used by the section-processing
# subs -- confirm there).
# Dies if the output file cannot be opened or cannot be closed cleanly.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # could also use File::Basename
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  # A bare "index.html" leaves nothing, so name the output after the
  # current directory instead.
  if ($info_file eq "")
    { chomp($info_file = `pwd`); }
  $info_file =~ s/^.*\///;	# not the most efficient way to remove dirs

  # Directory part of FILE, with its trailing slash (empty if none).
  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  my $texi_file = "$info_file.texi";
  # Three-argument open keeps odd characters in the name from being read
  # as part of the mode; fail loudly rather than print to a closed handle.
  open(TEXI, ">", $texi_file)
    or die "Cannot open `$texi_file' for writing: $!";

  print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # The next three segments of a standard Texinfo file are not generated
  # yet; the notes below (from the Texinfo manual) describe what belongs
  # in each.

  # 2. Summary Description and Copyright
  #      The "Summary Description and Copyright" segment describes the
  #      document and contains the copyright notice and copying permissions
  #      for the Info file.  The segment must be enclosed between `@ifinfo'
  #      and `@end ifinfo' commands so that the formatters place it only in
  #      the Info file.
  #
  # The summary description and copyright segment does not appear in the
  # printed document.
  #
  #      @ifinfo
  #      This is a short example of a complete Texinfo file.
  #
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end ifinfo


  # 3. Title and Copyright
  #      The "Title and Copyright" segment contains the title and copyright
  #      pages and copying permissions for the printed manual.  The segment
  #      must be enclosed between `@titlepage' and `@end titlepage'
  #      commands.  The title and copyright page appear only in the printed
  #      manual.
  #
  # The titlepage segment does not appear in the Info file.
  #
  #      @titlepage
  #      @sp 10
  #      @comment The title is printed in a large font.
  #      @center @titlefont{Sample Title}
  #
  #      @c The following two commands start the copyright page.
  #      @page
  #      @vskip 0pt plus 1filll
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end titlepage


  # 4. `Top' Node and Master Menu
  #      The "Master Menu" contains a complete menu of all the nodes in the
  #      whole Info file.  It appears only in the Info file, in the `Top'
  #      node.
  #
  # The `Top' node contains the master menu for the Info file.  Since a
  # printed manual uses a table of contents rather than a menu, the master
  # menu appears only in the Info file.
  #
  #      @node    Top,       First Chapter, ,         (dir)
  #      @comment node-name, next,          previous, up
  #
  #      @menu
  #      * First Chapter::    The first chapter is the
  #                           only chapter in this sample.
  #      * Concept Index::    This index has two entries.
  #      @end menu


  # Each entry is a [ title, depth, file ] triple.  Seed with the contents
  # page itself (taken from the argument, not from @ARGV), then drain the
  # queue in @contents_list.
  $current_ref_tdf = [ "Top", 0, $file ];
  process_section_file($file, 0, "Top");
  while (scalar(@contents_list))
    { $current_ref_tdf = shift @contents_list;
      my ($title, $depth, $section_file) = @{$current_ref_tdf};
      process_section_file($section_file, $depth, $title);
    }

  print TEXI "\n";
  for my $indextitle (@index_titles)
    { # Element 1 of the index's info record is the name given to @printindex.
      my $index_name = $index_info{$indextitle}[1];
      print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $index_name\n";
      print TEXI "\n"; }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";
  # Buffered write errors (e.g. a full disk) only surface at close.
  close(TEXI)
    or die "Error closing `$texi_file': $!";
}
1726 |
|
---|
# Command-line driver.  It must stay at the very end of the file so that
# every global variable initialization above has already executed.
#
#   html2texi.pl FILE                  convert, starting from contents page FILE
#   html2texi.pl -test [[ACTION] FILE] run test(); FILE defaults to
#                                      "index.html" and ACTION to ""

unless (@ARGV)
  { die "No arguments supplied to html2texi.pl"; }

if ($ARGV[0] eq "-test")
  { my @test_args = @ARGV;
    shift @test_args;		# drop the "-test" switch itself
    my $num_test_args = @test_args;
    if ($num_test_args > 2)
      { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
    # The file is always the last test argument; an action, when given,
    # comes before it.
    my $test_action = ($num_test_args == 2) ? $test_args[0] : "";
    my $test_file = ($num_test_args == 0) ? "index.html" : $test_args[-1];
    test($test_action, $test_file);
    exit();
  }

unless (@ARGV == 1)
  { die "Pass one argument, the main/contents page"; }

process_contents_file($ARGV[0]);
1749 |
|
---|
1750 | # end of html2texi.pl
|
---|