: # Use perl
eval 'exec perl -S $0 "$@"'
if 0;
# It's faster to use something like #!/usr/bin/perl but you have to
# know where perl is on your system. I didn't want to have to tell
# people how to do this, so I don't distribute it that way. You
# really ought to change it though. On my machine, it saves about a
# half a second per invokation.
#
# txt2html.pl
# Convert raw text to something with a little HTML formatting
#
# Written by Seth Golub //;
$line_action |= $HRULE;
} elsif ($line =~ /\014/)
{
$line_action |= $HRULE;
$line =~ s/\014/\n
# [-p
# <= 0 : Preformat entire document
# 1 : one line triggers
# >= 2 : two lines trigger
# [-pe
# <= 0 : Never preformat within document
# 1 : one line triggers
# >= 2 : two lines trigger
# NOTE for --prebegin and --preend:
# A zero takes precedence. If one is zero, the other is ignored.
# If both are zero, entire document is preformatted.
# [-r
\n";
$prev =~ s/<\/P>\\n\\n
\n/g; # Linefeeds become horizontal rules
}
}
sub shortline
{
# Short lines should be broken even on list item lines iff the
# following line is more text. I haven't figured out how to do
# that yet. For now, I'll just not break on short lines in lists.
# (sorry)
if (!($mode & ($PRE | $LIST))
&& !&is_blank($line)
&& !&is_blank($prev)
&& ($prev_line_length < $short_line_length)
&& !($line_action & ($END | $HEADER | $HRULE | $LIST | $PAR))
&& !($prev_action & ($HEADER | $HRULE | $BREAK)))
{
$prev .= "
" . chop($prev);
$prev_action |= $BREAK;
}
}
sub mailstuff
{
if ((($line =~ /^\w*>/) # Handle "FF> Werewolves."
|| ($line =~ /^\w*\|/)) # Handle "Igor| There wolves."
&& !&is_blank($nextline))
{
$line =~ s/$/
/;
$line_action |= ($BREAK | $MAILQUOTE);
if(!($prev_action & ($BREAK | $PAR)))
{
$prev .= "
\n"; $line_action |= $PAR; } } elsif (($line =~ /^(From:?)|(Newsgroups:) /) && &is_blank($prev)) { &anchor_mail if !($prev_action & $MAILHEADER); chop $line; $line = "\n
\n" . $line . "
\n";
$line_action |= ($BREAK | $MAILHEADER | $PAR);
} elsif (($line =~ /^[\w\-]*:/) # Handle "Some-Header: blah"
&& ($prev_action & $MAILHEADER)
&& !&is_blank($nextline))
{
$line =~ s/$/
/;
$line_action |= ($BREAK | $MAILHEADER);
} elsif (($line =~ /^\s+\S/) && # Handle multi-line mail headers
($prev_action & $MAILHEADER) &&
!&is_blank($nextline))
{
$line =~ s/$/
/;
$line_action |= ($BREAK | $MAILHEADER);
}
}
# Subtracts modes listed in $mask from $vector.
sub subtract_modes
{
local($vector, $mask) = @_;
($vector | $mask) - $mask;
}
sub paragraph
{
if(!&is_blank($line)
&& !($mode & $PRE)
&& !&subtract_modes($line_action, $END | $MAILQUOTE | $CAPS | $BREAK)
&& (&is_blank($prev)
|| ($line_action & $END)
|| ($line_indent > $prev_indent + $par_indent)))
{
$prev .= "
\n"; $line_action |= $PAR; } } # If the line is blank, return the second argument. Otherwise, # return the number of spaces before any nonspaces on the line. sub count_indent { local($line, $prev_length) = @_; if(&is_blank($line)) { return $prev_length; } local($ws) = $line =~ /^( *)[^ ]/; length($ws); } sub listprefix { local($line) = @_; local($prefix, $number, $rawprefix); return (0,0,0) if (!($line =~ /^\s*[-=\*o]+\s+\S/ ) && !($line =~ /^\s*(\d+|[^\W\d_])[\.\)\]:]\s+\S/ )); ($number) = $line =~ /^\s*(\d+|[^\W\d_])/; $number = 0 unless defined( $number ); # That slippery exception of "o" as a bullet # (This ought to be determined using the context of what lists # we have in progress, but this will probably work well enough.) if($line =~ /^\s*o\s/) { $number = 0; } if ($number) { ($rawprefix) = $line =~ /^(\s*(\d+|[^\W\d_]).)/; $prefix = $rawprefix; $prefix =~ s/(\d+|[^\W\d_])//; # Take the number out } else { ($rawprefix) = $line =~ /^(\s*[-=o\*]+.)/; $prefix = $rawprefix; } ($prefix, $number, $rawprefix); } sub startlist { local($prefix, $number, $rawprefix) = @_; $listprefix[$listnum] = $prefix; if($number) { # It doesn't start with 1,a,A. Let's not screw with it. if (($number ne "1") && ($number ne "a") && ($number ne "A")) { return 0; } $prev .= "$list_indent
\n/; $prev =~ s/<\/P>\\n\\n\n" if ($mode & $PRE); if ($append_file) { if(-r $append_file) { open(APPEND, $append_file); print while//; $mode |= $PRE; $line_action |= $PRE; } } sub make_new_anchor { local( $heading_level ) = @_; local($anchor, $i); return sprintf("%d", $non_header_anchor++) if(!$heading_level); $anchor = "section-"; $heading_count[$heading_level-1]++; # Reset lower order counters for($i=$#heading_count + 1; $i > $heading_level; $i--) { $heading_count[$i-1] = 0; } for($i=0; $i < $heading_level; $i++) { $heading_count[$i] = 1 if !$heading_count[$i]; # In case they skip any $anchor .= sprintf("%d.", $heading_count[$i]); } chop($anchor); $anchor; } sub anchor_mail { local($anchor) = &make_new_anchor(0); $line =~ s/([^ ]*)/$1<\/A>/; } sub anchor_heading { local($level) = @_; local($anchor) = &make_new_anchor( $level ); $line =~ s/(
)(.*)(<\/H.>)/$1$2<\/A>$3/; } sub heading_level { local($style) = @_; $heading_styles{$style} = ++$num_heading_styles if !$heading_styles{$style}; $heading_styles{$style}; } sub heading { local($hoffset, $heading) = $line =~ /^(\s*)(.+)$/; $hoffset = "" unless defined( $hoffset ); $heading = "" unless defined( $heading ); local($uoffset, $underline) = $nextline =~ /^(\s*)(\S+)\s*$/; $uoffset = "" unless defined( $uoffset ); $underline = "" unless defined( $underline ); local($lendiff, $offsetdiff); $lendiff = length($heading) - length($underline); $lendiff *= -1 if $lendiff < 0; $offsetdiff = length($hoffset) - length($uoffset); $offsetdiff *= -1 if $offsetdiff < 0; if(&is_blank($line) ||($lendiff > $underline_length_tolerance) ||($offsetdiff > $underline_offset_tolerance)) { return; } $underline = substr($underline,0,1); $underline .= "C" if &iscaps($line); # Call it a different style if the # heading is in all caps. $nextline = &getline; # Eat the underline $heading_level = &heading_level($underline); &tagline("H" . $heading_level); &anchor_heading( $heading_level ); $line_action |= $HEADER; } sub custom_heading { local($i, $level); for($i=0; $i <= $#custom_heading_regexp; $i++) { if ($line =~ /$custom_heading_regexp[$i]/) { if ( $explicit_headings ) { $level = $i + 1; } else { $level = &heading_level("Cust" . $i); } &tagline("H" . $level); &anchor_heading( $level ); $line_action |= $HEADER; last; } } } sub unhyphenate { local($second); # This looks hairy because of all the quoted characters. # All I'm doing is pulling out the word that begins the next line. # Along with it, I pull out any punctuation that follows. # Preceding whitespace is preserved. We don't want to screw up # our own guessing systems that rely on indentation. ($second) = $nextline =~ /^\s*([^\W\d_]+[\)\}\]\.,:;\'\"\>]*\s*)/; # " $nextline =~ s/^(\s*)[^\W\d_]+[\)\}\]\.,:;\'\"\>]*\s*/$1/; # " # (The silly comments are for my less-than-perfect code hilighter) $nextline = &getline if $nextline eq ""; $line =~ s/\-\s*$/$second/; $line .= "\n"; } sub untabify { local($line) = @_; while($line =~ /\011/) { $line =~ s/\011/" " x ($tab_width - (length($`) % $tab_width))/e; } $line; } sub tagline { local($tag) = @_; chop $line; # Drop newline $line =~ s/^\s*(.*)$/<$tag>$1<\/$tag>\n/; } sub iscaps { local($_) = @_; # This is ugly, but I don't know a better way to do it. # (And, yes, I could use the literal characters instead of the # numeric codes, but this keeps the script 8-bit clean, which will # save someone a big headache when they transfer via ASCII ftp. /^[^a-z\341\343\344\352\353\354\363\370\337\373\375\342\345\347\350\355\357\364\365\376\371\377\340\346\351\360\356\361\362\366\372\374<]*[A-Z\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\330\331\332\333\334\335\336]{$min_caps_length,}[^a-z\341\343\344\352\353\354\363\370\337\373\375\342\345\347\350\355\357\364\365\376\371\377\340\346\351\360\356\361\362\366\372\374<]*$/; } sub caps { if(&iscaps($line)) { &tagline($caps_tag); $line_action |= $CAPS; } } # Convert very simple globs to regexps sub glob2regexp { local($glob) = @_; # Escape funky chars $glob =~ s/[^\w\[\]\*\?\|\\]/\\$&/g; local($regexp,$i,$len,$escaped) = ("",0,length($glob),0); for(;$i < $len; $i++) { $char = substr($glob,$i,1); if($escaped) { $escaped = 0; $regexp .= $char; next; } if ($char eq "\\") { $escaped = 1; next; $regexp .= $char; } if ($char eq "?") { $regexp .= "."; next; } if ($char eq "*") { $regexp .= ".*"; next; } $regexp .= $char; # Normal character } "\\b" . $regexp . "\\b"; } sub add_regexp_to_links_table { local($key,$URL,$switches) = @_; # No sense adding a second one if it's already in there. # It would never get used. if(!$links_table{$key}) { # Keep track of the order they were added so we can # look for matches in the same order push(@links_table_order, ($key)); $links_table{$key} = $URL; # Put it in The Table $links_switch_table{$key} = $switches; print STDERR " ($#links_table_order)\tKEY: $key\n\tVALUE: $URL\n\tSWITCHES: $switches\n\n" if ($dict_debug & 1); } else { if($dict_debug & 1) { print STDERR " Skipping entry. Key already in table.\n"; print STDERR "\tKEY: $key\n\tVALUE: $URL\n\n"; } } } sub add_literal_to_links_table { local($key,$URL,$switches) = @_; $key =~ s/(\W)/\\$1/g; # Escape non-alphanumeric chars $key = "\\b$key\\b"; # Make a regexp out of it &add_regexp_to_links_table($key,$URL,$switches); } sub add_glob_to_links_table { local($key,$URL,$switches) = @_; &add_regexp_to_links_table(&glob2regexp($key),$URL,$switches); } # This is the only function you should need to change if you want to # use a different dictionary file format. sub parse_dict { local($dictfile, $dict) = @_; print STDERR "Parsing dictionary file $dictfile\n" if ($dict_debug & 1); $dict =~ s/^\#.*$//g; # Strip lines that start with '#' $dict =~ s/^.*[^\\]:\s*$//g; # Strip lines that end with unescaped ':' if($dict =~ /->\s*->/) { $message = "Two consecutive '->'s found in $dictfile\n"; # Print out any useful context so they can find it. ($near) = $dict =~ /([\S ]*\s*->\s*->\s*\S*)/; $message .= "\n$near\n" if $near =~ /\S/; die $message; } while($dict =~ /\s*(.+)\s+\-+([ieho]+\-+)?\>\s*(.*\S+)\s*\n/ig) { local($key, $URL,$switches,$options); $key = $1; $options = $2; $options = "" unless defined($options); $URL = $3; $switches = 0; $switches += 1 if $options =~ /i/i; # Case insensitivity $switches += 2 if $options =~ /e/i; # Evaluate as Perl code $switches += 4 if $options =~ /h/i; # provides HTML, not just URL $switches += 8 if $options =~ /o/i; # Only do this link once $key =~ s/\s*$//; # Chop trailing whitespace if($key =~ m|^/|) # Regexp { $key = substr($key,1); $key =~ s|/$||; # Allow them to forget the closing / &add_regexp_to_links_table($key,$URL,$switches); } elsif($key =~ /^\|/) # alternate regexp format { $key = substr($key,1); $key =~ s/\|$//; # Allow them to forget the closing | $key =~ s|/|\\/|g; # Escape all slashes &add_regexp_to_links_table($key,$URL,$switches); } elsif ($key =~ /\"/) { $key = substr($key,1); $key =~ s/\"$//; # Allow them to forget the closing " &add_literal_to_links_table($key,$URL,$switches); } else { &add_glob_to_links_table($key,$URL,$switches); } } } sub in_link_context { local($match, $before) = @_; return 1 if $match =~ m@?A>@i; # No links allowed inside match local($final_open, $final_close); $final_open = rindex($before, "") - $[; return 1 if ($final_open >= 0) # Link opened && (($final_close < 0) # and not closed or || ($final_open > $final_close)); # one opened after last close # Now check to see if we're inside a tag, matching a tag name, # or attribute name or value $final_open = rindex($before, "<") - $[; $final_close = rindex($before, ">") - $[; ($final_open >= 0) # Tag opened && (($final_close < 0) # and not closed or || ($final_open > $final_close)); # one opened after last close } # This subroutine looks a little odd. Rather than build up some code # and keep "eval"ing later, I'm building a new subroutine. This way I # can declare local vars and not worry about the namespace in the # calling context. I don't know how much it really gains me, but I # don't know of any real costs and it seems like it could be # friendlier to optimization. (And it's fun to define new # subroutines at runtime. :-) # I once thought that storing the finished dynamic_make_dictionary_links # in a file and using it for subsequent invokations (when the # dictionaries were the same) would save time. I tried it, and the # speed gain is insignificant. (Using the standard links dictionary, # it speeds up by 0.1 seconds per invokation on a 386/33 with a slow # old hard drive. I couldn't measure a difference on my fast machine.) sub make_dictionary_links_code { local($i,$pattern,$switches,$options,$code,$href); $code = < $&<\\/A>' if !($switches & 4); $code .= " \$line_with_links = \"\";"; if($switches & 8) # Do link only once { $code .= " while(!\$done_with_link[$i] && \$line =~ /$pattern/$s_sw) { \$done_with_link[$i] = 1; "; } else { $code .= "\n while(\$line =~ /$pattern/$s_sw)\n {"; } $code .= < ); close(DICT); &parse_dict($dict, $contents); } &make_dictionary_links_code; } sub make_dictionary_links { eval "&dynamic_make_dictionary_links;"; warn $@ if $@; } sub getline { local($line); $line = <>; $line = "" unless defined ($line); $line =~ s/[ \011]*\015$//; # Chop trailing whitespace and DOS CRs $line = &untabify($line); # Change all tabs to spaces $line; } sub main { $* = 1; # Turn on multiline searches push(@links_dictionaries,($default_link_dict)) if ($make_links && (-f $default_link_dict)); &deal_with_options; if($make_links) { push(@links_dictionaries,($system_link_dict)) if -f $system_link_dict; &load_dictionary_links; } $non_header_anchor = 0; # Moved this way up here so we can grab the first line and use it # as the title (if --titlefirst is set) $mode = 0; $listnum = 0; $list_indent = ""; $line_action = $NONE; $prev_action = $NONE; $prev_line_length = 0; $prev_indent = 0; $prev = ""; $line = &getline; $nextline = 0; $nextline = &getline if $line; # Skip leading blank lines while( &is_blank($line) && $line ) { $prev = $line; $line = $nextline; $nextline = &getline if $nextline; } if(!$extract) { print '\n" unless !$doctype; print "\n"; print "\n"; # if --titlefirst is set and --title isn't, use the first line # as the title. if ($titlefirst && !$title) { ($title) = $line =~ /^ *(.*)/; # grab first line $title =~ s/ *$//; # strip trailing whitespace } $title = "" if !$title; print " $title \n" if $title; if ($append_head) { open(APPEND, $append_head) || die "Failed to open $append_head\n"; print while; close(APPEND); } print "\n"; print "\n"; print "\n"; } if ($prepend_file) { if(-r $prepend_file) { open( PREPEND, $prepend_file ); print while ; close( PREPEND ); } else { print STDERR "Can't find or read file $prepend_file to prepend.\n"; } } do { if ( !$link_only ) { $line_length = length($line); # Do this before tags go in $line_indent = &count_indent($line, $prev_indent); &escape if $escape_HTML_chars; &endpreformat if (($mode & $PRE) && ($preformat_trigger_lines != 0)); &hrule if !($mode & $PRE); &custom_heading if (($#custom_heading_regexp > -1) && !($mode & $PRE)); &liststuff if (!($mode & $PRE) && !&is_blank($line)); &heading if (!$explicit_headings && !($mode & ($PRE | $HEADER)) && $nextline =~ /^\s*[=\-\*\.~\+]+\s*$/); # &custom_tag if (($#custom_tags > -1) # && !($mode & $PRE) # && !($line_action & $HEADER)); &mailstuff if ($mailmode && !($mode & $PRE) && !($line_action & $HEADER)); &preformat if (!($line_action & ($HEADER | $LIST | $MAILHEADER)) && !($mode & ($LIST | $PRE)) && ($endpreformat_trigger_lines != 0)); ¶graph; &shortline; &unhyphenate if ($unhyphenation && ($line =~ /[^\W\d_]\-$/) && # ends in hyphen # next line starts w/letters ($nextline =~ /^\s*[^\W\d_]/) && !($mode & ($PRE | $HEADER | $MAILHEADER | $BREAK))); &caps if !($mode & $PRE); } &make_dictionary_links if ($make_links && !&is_blank($line) && $#links_table_order); # All the matching and formatting is done. Now we can # replace non-ASCII characters with character entities. @chars = split(//,$line); foreach $_ (@chars) { $_ = $char_entities{$_} if defined( $char_entities{$_} ); } $line = join( "", @chars ); # Print it out and move on. print $prev; if (!&is_blank($nextline)) { $prev_action = $line_action; $line_action = $NONE; $prev_line_length = $line_length; $prev_indent = $line_indent; } $prev = $line; $line = $nextline; $nextline = &getline if $nextline; } until (!$nextline && !$line && !$prev); $prev = ""; &endlist($listnum) if ($mode & $LIST); # End all lists print $prev; print "\n"; print "