package HTMLParser;

# $Id: HTMLParser.pm,v 1.0 1998/08/11 09:32:14 meg Exp $

=head1 NAME

HTMLParser - SGML parser class

=head1 SYNOPSIS

 require HTMLParser;
 $p = HTMLParser->new;
 $p->parse($chunk1);
 $p->parse($chunk2);
 #...
 $p->eof;                 # signal end of document

 # Parse directly from file
 $p->parse_file("foo.html");
 # or
 open(F, "foo.html") || die;
 $p->parse_file(\*F);

 # dump internal HTML representation to stdout (for debug)
 $p->dump;

 # parse tag attributes
 $t = 'FONT SIZE=+1';
 %atts = $p->attList( $t ); # return hash of tag attributes

=head1 DESCRIPTION

HTMLParser is a heavily modified version HTML::Parser class by Gisle
Aas' <aas@sn.no> from his excellent libwww-perl library.  I have
pealed the HTML::Parser class down to its bare essentials.
Additionally HTMLParser stores the parsed HTML as an array in an
instance variable.  Any bugs or problems introduced should be blamed
on me meg@dexa.com.

The C<HTMLParser> will tokenize a HTML document when the $p->parse()
method is called.  The document to parse can be supplied in arbitrary
chunks.  Call $p->eof() the end of the document to flush any remaining
text.  The return value from parse() is a reference to the parser
object.

The $p->parse_file() method can be called to parse text from a file.
The argument can be a filename or an already opened file handle. The
return value from parse_file() is a reference to the parser object.
=head1 BUGS

You can instruct the parser to parse comments the way Netscape does it
by calling the netscape_buggy_comment() method with a TRUE argument.
This means that comments will always be terminated by the first
occurence of "-->".

=head1 COPYRIGHT

Copyright 1996 Gisle Aas. All rights reserved.

The HTMLParser.pm library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 AUTHOR

Gisle Aas <aas@sn.no>
Marshall Giguere <meg@dexa.com>

=cut

use strict;
use vars qw($VERSION);

$VERSION = '1.0';

sub new {
    my $class = shift;

    my $self = bless {
	'text' => shift(),
	'tags' => [],
    }, $class;

    $self->parse if length $self->{'text'};
    return $self;
}

sub text {
    my $self = shift;
    if (@_) {
	$self->{'text'} = shift;
    }
    return $self->{'text'};
}

sub getTags { return \ $_[0]->{'tags'}; }

sub tags { @{$_[0]->{'tags'}} }

sub eof
{
    shift->parse(undef);
}

sub parse {
    # This HTML parser is a streamlined version of the libwww-perl
    # HTML::Parser class.
    # The parser builds an in memory tree representing an HTML document
    
    my $self = shift;
    my $text = \ $self->{'text'};

    if ( defined $_[0] ) {
	$$text .= $_[0];
    }

    # Parse html text in $$text.  The strategy is to remove complete
    # tokens from the beginning of $$text until we can't deside whether
    # it is a token or not, or the $$text is empty.

    while (1) {
	my ($content, $eaten, $type);

	# First we try to pull off any plain text (anything before a "<" char)
	if ($$text =~ s/\G^([^<]+)//xm ) {
	    $content = $1; $type = 'text';
	    # Then, markup declarations (usually either <!DOCTYPE...> or a comment)
	} elsif (  $$text =~ m/^<!--/ ) {
	    if ( $$text =~ s/^<!--(.*?)-->//s ) {
		$type = 'comment';
		$content = $1;
	    }
        } elsif ($$text =~ s/\G<!// ) {
	    my $eaten = $1;
	    my $buf = '';
	    my $cmt = "";  # keeps comments until we have seen the end
	    # Eat text and beginning of comment
	    while ($$text =~ s/^(([^>]*?)--)//) {
		$eaten .= $1;
		$buf .= $2;
		# Look for end of comment
		if ($$text =~ s/^((.*?)--)//s) {
		    $eaten .= $1;
		    $cmt .= $2;
		} else {
		    # Need more data to get all comment text.
		    $$text = $eaten . $$buf;
		    last;
		}
	    }
	    # Can we finish the tag
	    if ($$text =~ s/^([^>]*)>//) {
		$buf .= $1;
		if ( $buf =~ /\S/ ) {
		    $type = 'markup';
		    $content = $buf;
		}
		else {
		    $type = 'comment';
		    $content = $cmt;
		}
	    } else {
		$$text = $eaten . $$text;  # must start with it all next time
		last;
	    }

	    # Then, look for an end tag
	} elsif ($$text =~ s/\G^<(\/[a-zA-Z][a-zA-Z0-9_\.\-]*\s*)>//xm) {
	    $content = $1; $type = 'endtag';
	    # Then, finally we look for a start tag
	    # We know the first char is <, make sure there's a >
	} elsif ($$text =~ s/\G^<(([a-zA-Z][a-zA-Z0-9_\.\-]*)\s*)// ) {
	    $eaten = $1; $type = 'starttag';
	    while ( $$text =~ s/(^[^>])// ) {
		$eaten .= $1;
	    }
	    if ( $$text =~ s/(^>)// ) { # we have a start tag
		$content = $eaten;
	    }
	    else { # just plain text
		$content = $eaten;
		$type = 'text';
	    }

	} else {
	    # the string is exhausted, or there's no > in it.
	    $content = substr( $$text, pos( $$text ));
	    $type = 'text';
	    if ( pos( $$text ) < length( $$text ) ) {
		$self->addTag( $type, $content );
	    }
	    last;
	}
	$self->addTag( $type, $content );
    }

    return $self;
}

# add a new HTML tag node to the tree
sub addTag  {

    my $self = shift;
    my $type = shift;
    my $content = shift;
    my $tags = $self->{'tags'};

    push @$tags, {
	'type'   => $type,
	'content'=> $content,
    };
}


# dump the contents of the HTML doc. tree
sub dump {
    my $self = shift;
    my $itm;

    foreach $itm ( $self->tags ) {
	print "TagType: $itm->{'type'}, Content: $itm->{'content'}\n";
    }
}

sub attList {
    my $self = shift;
    my $tag = shift;
    my %atts = ();
    $tag =~ s/^([A-Za-z][0-9A-Za-z_]*[ \t]*)//gx;   # strip off tagname
    while ($tag =~ m/\G
	   ([^\=]*)=                                # the key
	   (?:
	    "([^\"\\]*  (?: \\.[^\"\\]* )* )"\s*    # quoted string, with possible whitespace inside
	    |
	    ([^\s>]*)\s*                            # anything else, without whitespace or >
	    )/gx) {
	
	$atts{ lc($1) } = $+;
    }
    return %atts;
}


sub parse_file
{
    my($self, $file) = @_;
    no strict 'refs';  # so that a symbol ref as $file works
    local(*F);
    unless (ref($file) || $file =~ /^\*[\w:]+$/) {
	# Assume $file is a filename
	open(F, $file) || die "Can't open $file: $!";
	$file = \*F;
    }
    my $chunk = '';
    while(read($file, $chunk, 2048)) {
	$self->parse($chunk);
    }
    close($file);
    $self->eof;

}

1;
