#!/usr/bin/perl -p0777
#
#########################################################
# striphtml ("striff tummel")
# tchrist@perl.com 
# version 1.0: Thu 01 Feb 1996 1:53:31pm MST 
# version 1.1: Sat Feb  3 06:23:50 MST 1996
# 		(fix up comments in annoying places)
#########################################################
#
# how to strip out html comments and tags and transform
# entities in just three -- count 'em three -- substitutions;
# sed and awk eat your heart out.  :-)
#
# as always, translations from this nacr� rendition into 
# more characteristically marine, herpetoid, titillative, 
# or indonesian idioms are welcome for the furthering of 
# comparitive cyberlinguistic studies.
#
#########################################################

require 5.002;      # for nifty embedded regexp comments

#########################################################
# first we'll shoot all the <!-- comments -->
#########################################################

s{ <!                   # comments begin with a `<!'
                        # followed by 0 or more comments;

    (.*?)		# this is actually to eat up comments in non 
			# random places

     (                  # not suppose to have any white space here

                        # just a quick start; 
      --                # each comment starts with a `--'
        .*?             # and includes all text up to and including
      --                # the *next* occurrence of `--'
        \s*             # and may have trailing while space
                        #   (albeit not leading white space XXX)
     )+                 # repetire ad libitum  XXX should be * not +
    (.*?)		# trailing non comment text
   >                    # up to a `>'
}{
    if ($1 || $3) {	# this silliness for embedded comments in tags
	"<!$1 $3>";
    } 
}gesx;                 # mutate into nada, nothing, and niente

#########################################################
# next we'll remove all the <tags>
#########################################################

s{ <                    # opening angle bracket

    (?:                 # Non-backreffing grouping paren
         [^>'"] *       # 0 or more things that are neither > nor ' nor "
            |           #    or else
         ".*?"          # a section between double quotes (stingy match)
            |           #    or else
         '.*?'          # a section between single quotes (stingy match)
    ) +                 # repetire ad libitum
                        #  hm.... are null tags <> legal? XXX
   >                    # closing angle bracket
}{}gsx;                 # mutate into nada, nothing, and niente

#########################################################
# finally we'll translate all &valid; HTML 2.0 entities
#########################################################

s{ (
        &              # an entity starts with a semicolon
        ( 
	    \x23\d+    # and is either a pound (#) and numbers
	     |	       #   or else
	    \w+        # has alphanumunders up to a semi
	)         
        ;?             # a semi terminates AS DOES ANYTHING ELSE (XXX)
    )
} {

    $entity{$2}        # if it's a known entity use that
        ||             #   but otherwise
        $1             # leave what we'd found; NO WARNINGS (XXX)

}gex;                  # execute replacement -- that's code not a string

#########################################################
# but wait! load up the %entity mappings enwrapped in 
# a BEGIN that the last might be first, and only execute
# once, since we're in a -p "loop"; awk is kinda nice after all.
#########################################################

BEGIN {

    %entity = (

        lt     => '<',     #a less-than
        gt     => '>',     #a greater-than
        amp    => '&',     #a nampersand
        quot   => '"',     #a (verticle) double-quote

        nbsp   => chr 160, #no-break space
        iexcl  => chr 161, #inverted exclamation mark
        cent   => chr 162, #cent sign
        pound  => chr 163, #pound sterling sign CURRENCY NOT WEIGHT
        curren => chr 164, #general currency sign
        yen    => chr 165, #yen sign
        brvbar => chr 166, #broken (vertical) bar
        sect   => chr 167, #section sign
        uml    => chr 168, #umlaut (dieresis)
        copy   => chr 169, #copyright sign
        ordf   => chr 170, #ordinal indicator, feminine
        laquo  => chr 171, #angle quotation mark, left
        not    => chr 172, #not sign
        shy    => chr 173, #soft hyphen
        reg    => chr 174, #registered sign
        macr   => chr 175, #macron
        deg    => chr 176, #degree sign
        plusmn => chr 177, #plus-or-minus sign
        sup2   => chr 178, #superscript two
        sup3   => chr 179, #superscript three
        acute  => chr 180, #acute accent
        micro  => chr 181, #micro sign
        para   => chr 182, #pilcrow (paragraph sign)
        middot => chr 183, #middle dot
        cedil  => chr 184, #cedilla
        sup1   => chr 185, #superscript one
        ordm   => chr 186, #ordinal indicator, masculine
        raquo  => chr 187, #angle quotation mark, right
        frac14 => chr 188, #fraction one-quarter
        frac12 => chr 189, #fraction one-half
        frac34 => chr 190, #fraction three-quarters
        iquest => chr 191, #inverted question mark
        Agrave => chr 192, #capital A, grave accent
        Aacute => chr 193, #capital A, acute accent
        Acirc  => chr 194, #capital A, circumflex accent
        Atilde => chr 195, #capital A, tilde
        Auml   => chr 196, #capital A, dieresis or umlaut mark
        Aring  => chr 197, #capital A, ring
        AElig  => chr 198, #capital AE diphthong (ligature)
        Ccedil => chr 199, #capital C, cedilla
        Egrave => chr 200, #capital E, grave accent
        Eacute => chr 201, #capital E, acute accent
        Ecirc  => chr 202, #capital E, circumflex accent
        Euml   => chr 203, #capital E, dieresis or umlaut mark
        Igrave => chr 204, #capital I, grave accent
        Iacute => chr 205, #capital I, acute accent
        Icirc  => chr 206, #capital I, circumflex accent
        Iuml   => chr 207, #capital I, dieresis or umlaut mark
        ETH    => chr 208, #capital Eth, Icelandic
        Ntilde => chr 209, #capital N, tilde
        Ograve => chr 210, #capital O, grave accent
        Oacute => chr 211, #capital O, acute accent
        Ocirc  => chr 212, #capital O, circumflex accent
        Otilde => chr 213, #capital O, tilde
        Ouml   => chr 214, #capital O, dieresis or umlaut mark
        times  => chr 215, #multiply sign
        Oslash => chr 216, #capital O, slash
        Ugrave => chr 217, #capital U, grave accent
        Uacute => chr 218, #capital U, acute accent
        Ucirc  => chr 219, #capital U, circumflex accent
        Uuml   => chr 220, #capital U, dieresis or umlaut mark
        Yacute => chr 221, #capital Y, acute accent
        THORN  => chr 222, #capital THORN, Icelandic
        szlig  => chr 223, #small sharp s, German (sz ligature)
        agrave => chr 224, #small a, grave accent
        aacute => chr 225, #small a, acute accent
        acirc  => chr 226, #small a, circumflex accent
        atilde => chr 227, #small a, tilde
        auml   => chr 228, #small a, dieresis or umlaut mark
        aring  => chr 229, #small a, ring
        aelig  => chr 230, #small ae diphthong (ligature)
        ccedil => chr 231, #small c, cedilla
        egrave => chr 232, #small e, grave accent
        eacute => chr 233, #small e, acute accent
        ecirc  => chr 234, #small e, circumflex accent
        euml   => chr 235, #small e, dieresis or umlaut mark
        igrave => chr 236, #small i, grave accent
        iacute => chr 237, #small i, acute accent
        icirc  => chr 238, #small i, circumflex accent
        iuml   => chr 239, #small i, dieresis or umlaut mark
        eth    => chr 240, #small eth, Icelandic
        ntilde => chr 241, #small n, tilde
        ograve => chr 242, #small o, grave accent
        oacute => chr 243, #small o, acute accent
        ocirc  => chr 244, #small o, circumflex accent
        otilde => chr 245, #small o, tilde
        ouml   => chr 246, #small o, dieresis or umlaut mark
        divide => chr 247, #divide sign
        oslash => chr 248, #small o, slash
        ugrave => chr 249, #small u, grave accent
        uacute => chr 250, #small u, acute accent
        ucirc  => chr 251, #small u, circumflex accent
        uuml   => chr 252, #small u, dieresis or umlaut mark
        yacute => chr 253, #small y, acute accent
        thorn  => chr 254, #small thorn, Icelandic
        yuml   => chr 255, #small y, dieresis or umlaut mark
    );

    ####################################################
    # now fill in all the numbers to match themselves
    ####################################################
    for $chr ( 0 .. 255 ) { 
        $entity{ '#' . $chr } = chr $chr;
    }
}

#########################################################
# premature finish lest someone clip my signature
#########################################################

__END__

<title>Tom Christiansen's Mox.Perl.COM Home Page</title>
<BODY BGCOLOR=#ffffff TEXT=#000000>
<!--

<BODY BGCOLOR="#000000" TEXT="#FFFFFF"
      LINK="#FFFF00" VLINK="#22AA22" ALINK="#0077FF">

!-->



<A NAME=TOP>

<CENTER>
<h3>
<A HREF="#PERL">perl</a> /
<A HREF="#MAGIC">magic</a> /
<A HREF="#USENIX">usenix</a> /
<A HREF="#BOULDER">boulder</a>
</h3>
<BR>
The word of the day is <i>nidificate</i>.
</CENTER>

Testing: &#69; &#202; &Auml;
</a>

<HR NOSHADE SIZE=3>

<A NAME=PERL>
<CENTER>
<h1>
<IMG SRC="/deckmaster/gifs/camel.gif" ALT="">
<font size=7>
Perl
</font>
<IMG SRC="/deckmaster/gifs/camel.gif" ALT="">
</h1>
</a>

DOCTYPE START1
<!DOCTYPE  HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"
  -- This is an annoying comment > --
>
END1

DOCTYPE START2
<!DOCTYPE  HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"
  -- This is an annoying comment  --
>
END2



<I>
<BLOCKQUOTE>
<DL><DT>A ship then new they built for him
<DD>of mithril and of elven glass...
</DL>
</I>
</BLOCKQUOTE>
</CENTER>

<HR size=3 noshade>

<UL>
<LI>
<A HREF="/perl/CPAN/README.html">CPAN
(Comprehensive Perl Archive Network)</a> sites are replicated around the world; please choose
from <A HREF="/perl/CPAN/CPAN.html">one near you</a>.
The <A HREF="ftp://mox.perl.com/pub/perl/CPAN/modules/01modules.index.html">CPAN index</a>
to the <A HREF="ftp://mox.perl.com/pub/perl/CPAN/modules/00modlist.long.html">full modules file</a>
are also good places to look.

<LI><IMG SRC="/deckmaster/gifs/new.gif" WIDTH=26 HEIGHT=13 ALT="NEW">
Here's a table of perl and CGI-related books and publications, in either 
<A HREF="/perl/info/books.html"><SMALL>HTML</SMALL> 3.0 table format</a>
or else in 
<A HREF="/perl/info/books.txt">pre-formatted</a> for old browsers.