HTML::Parser



examples/html-parser/html_parser.pl
use strict;
use warnings;
use HTML::Parser;
use Data::Dumper qw(Dumper);

# Build an event-driven parser using the version 3 API. Each *_h option takes
# [ handler, argspec ]; the argspec names the values passed to the handler,
# in the order listed.
# All handlers are given as code references: a plain string would be treated
# as the name of a method to call on the parser object, not as the name of a
# sub defined in this file.
my $p = HTML::Parser->new(
    api_version => 3,
    start_h => [ \&start, "event, self, tagname, attr" ],
    text_h  => [ \&text,  "event, self, dtext" ],
    end_h   => [ \&end,   "event, self, tagname" ], # no point in attr for end tags
);

# Sample document fed to the parser. It deliberately contains a few awkward
# constructs: an <li> that is never closed, an unquoted attribute value
# (id=42), an HTML comment wrapping an <li>, and a self-closing <img ... /> tag.
# The heredoc is single-quoted, so nothing inside it is interpolated.
my $html = <<'END_HTML';

<body>
<ul>
 <li>first elem</li>
  <li>second <a href="http://url" id=42>link</a> elem 
 <li>third <b>bold</b> elem</li>
 <!-- <li>commented out elem -->
 <li>5th elem</li>
</ul>
<img src="/path/to/img.png" />

</body>
END_HTML


# A callback can be either a reference to a subroutine (named or anonymous)
# or the name of a method, which is then called on the parser object.

# "end" is not called when a closing </li> is missing, even if a new <li> starts.
# An element that is both an opening and a closing tag (e.g. <img ... />) gets
# a '/' key with a '/' value in the attributes of the opening tag, but no call
# to "end". Passing
#    empty_element_tags => 1,
# to the constructor will remove that entry and generate an end call after
# the start call.


# Handler for start-tag events: dumps every argument it receives to the
# currently selected output handle.
sub start {
    my @event_args = @_;
    print Dumper(\@event_args);
}
# Handler for end-tag events: dumps every argument it receives to the
# currently selected output handle.
sub end {
    print Dumper([@_]);
}
# Handler for text events: dumps every argument it receives to the
# currently selected output handle.
sub text {
    my $dump = Dumper(\@_);
    print $dump;
}

# Feed the whole document to the parser, then signal end-of-input so that any
# text the parser is still buffering is flushed to the text handler.
$p->parse($html);
$p->eof;