print "1..11\n";
use strict;
use HTML::TokeParser;
# Build a throw-away HTML fixture on disk for the file-based parser tests.
# The process id in the name keeps concurrent runs from colliding.
#
# NOTE(review): the fixture text below carries almost no markup (there is no
# <a> element and no processing instruction), although tests further down
# this file look for an <a> tag and expect several start/end tags. The tags
# may have been lost from this copy -- confirm against the upstream test.
my $file = "ttest$$.htm";
die "$file already exists" if -e $file;

# Becomes true only once this script has created the fixture itself, so the
# END block below never deletes a file that the guard above refused to
# overwrite.
my $fixture_created = 0;

# Three-argument open with a lexical handle: the mode can never be derived
# from the file name, and the handle does not leak into package scope.
open(my $fixture_fh, '>', $file) or die "Can't create $file: $!";
$fixture_created = 1;
print {$fixture_fh} <<'EOT' or die "Can't write to $file: $!";
This is the <title>
This is the title again
And this is a link to the Institute
process instruction >
EOT

# Buffered write errors only surface at close, so check it as well.
close($fixture_fh) or die "Can't close $file: $!";

# Remove the fixture when the script exits, including when a later step dies.
END {
    if ($fixture_created) {
        unlink($file) || warn "Can't unlink $file: $!";
    }
}
# Test 1: open the fixture by file name, skip to the first <foo> or <title>
# start tag, and compare the trimmed text that follows it.
my $p;
$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
{
    # Always emit a result line. Previously nothing at all was printed when
    # neither tag was found, so test 1 silently vanished from the output
    # instead of being reported as a failure.
    my $title = $p->get_tag("foo", "title") ? $p->get_trimmed_text : undef;
    #print "Title: $title\n";

    # NOTE(review): the expected string ends in a space, but the
    # HTML::TokeParser documentation says get_trimmed_text removes trailing
    # white space, so this comparison looks unsatisfiable as written.
    # Presumably part of the expected value was lost -- confirm upstream.
    print "not " unless defined($title) && $title eq "This is the ";
    print "ok 1\n";
}
undef($p);
# Test 2: count tokens while feeding the parser from three kinds of source:
# a reference to a glob, a bare glob, and a plain file name.
#
# The bareword handle F is deliberate: passing \*F and *F to the constructor
# is exactly what is being exercised. Only the open() calls use the
# three-argument form, so the mode is never derived from the file name.

# Pass 1 (reference to glob): walk every token and tally start tags, end
# tags and processing instructions.
open(F, '<', $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(\*F);
my $scount = 0;
my $ecount = 0;
my $tcount = 0;
my $pcount = 0;
while (my $token = $p->get_token) {
    my $type = $token->[0];
    if    ($type eq "S")  { $scount++ }
    elsif ($type eq "E")  { $ecount++ }
    elsif ($type eq "PI") { $pcount++ }
}
undef($p);
close F;

# Pass 2 (bare glob): count every tag that get_tag returns.
open(F, '<', $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(*F);
$tcount++ while $p->get_tag;
undef($p);
close F;

# Pass 3 (plain file name): count the tags once more, so $tcount ends up
# holding the total of two complete passes over the document.
$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
$tcount++ while $p->get_tag;
undef($p);

# Diagnostics first, then the verdict: two passes of tags must total 32,
# with 9 start tags, 7 end tags and 1 processing instruction.
print "Number of tokens found: $tcount/2 = $scount + $ecount\n";
print "Number of process instruction found: $pcount\n";
print "not " unless $tcount == 32 &&
                    $scount == 9 && $ecount == 7 &&
                    $pcount == 1;
print "ok 2\n";
# Test 3: constructing a parser for a path that should not exist must not
# yield a true value.
if (HTML::TokeParser->new("/noT/thEre/$$")) {
    print "not ok 3\n";
}
else {
    print "ok 3\n";
}

# Test 4: re-open the fixture, advance to the first <a> start tag, and compare
# the text reported at that point with "Perl\240Institute" (\240 is the octal
# escape for byte 0xA0).
$p = HTML::TokeParser->new($file) || die;
$p->get_tag("a");
my $atext = $p->get_text;
undef($p);
#print "ATEXT: $atext\n";
if ($atext eq "Perl\240Institute") {
    print "ok 4\n";
}
else {
    print "not ok 4\n";
}
# Test 5: parse a document embedded in the script (passed to the constructor
# as a reference to a string) and check the text inside its <h1> element.
#
# NOTE(review): the here-document introducer and the markup were rebuilt so
# that this statement parses and matches the assertion below (an <h1> whose
# trimmed text is "Heading"). The <title> wrapper around "Title" is a best
# guess -- confirm against the upstream test.
$p = HTML::TokeParser->new(\<<HTML);
<title>Title</title>
<h1>
Heading
</h1>
HTML
print "not " unless $p->get_tag("h1") && $p->get_trimmed_text eq "Heading";
print "ok 5\n";
undef($p);
# Test 6: parse a large embedded document (one fragment repeated 2022 times)
# and count the start tags seen in it.
#
# NOTE(review): as written the repeated fragment contains no start tag, yet
# the assertion below expects exactly 2022 of them (one per repetition).
# Presumably the fragment lost its markup -- confirm against the upstream
# test before trusting this result.
my $doc = "foo is bar\n\n\n" x 2022;
#use Time::HiRes qw(time);
my $start = time;
$p = HTML::TokeParser->new(\$doc);
print "Construction time: ", time - $start, "\n";

# Start from 0 so the numeric comparison below never sees an undefined value
# when no start tag is found.
my $count = 0;
while (my $token = $p->get_token) {
    $count++ if $token->[0] eq "S";
}

# Elapsed time is measured from $start, so it includes construction too.
print "Parse time: ", time - $start, "\n";
print "not " unless $count == 2022;
print "ok 6\n";
# Tests 7 and 8: get_trimmed_text called with the tag names "br" and "p" on
# an embedded document.
# NOTE(review): the document below shows no tags, although the code looks for
# "/h1" and passes "br"/"p" -- the markup may be missing from this copy.
$p = HTML::TokeParser->new(\<<'EOT');
This is a heading
This is some
text.
This is some more text.
This is even some more.
EOT

# Test 7: advance to the closing h1 tag, then read the text that follows.
$p->get_tag("/h1");
my $t = $p->get_trimmed_text("br", "p");
if ($t eq "This is some text.") {
    print "ok 7\n";
}
else {
    print "not ok 7\n";
}

# Test 8: step over the next tag, whatever it is, and read the text again.
$p->get_tag;
$t = $p->get_trimmed_text("br", "p");
if ($t eq "This is some more text.") {
    print "ok 8\n";
}
else {
    print "not ok 8\n";
}

undef($p);
# Tests 9 to 11: get_phrase on an embedded document.
# NOTE(review): the document below shows no tags, although the code searches
# for "h1" -- the markup may be missing from this copy.
$p = HTML::TokeParser->new(\<<'EOT');
This is a bold heading
This is some italic text.
This is some more text.
This is even some more.
EOT

# Test 9: the phrase read right after the h1 start tag.
$p->get_tag("h1");
$t = $p->get_phrase;
print "not " if $t ne "This is a bold heading";
print "ok 9\n";

# Test 10: asking again without moving on is expected to give an empty string.
$t = $p->get_phrase;
print "not " if $t ne "";
print "ok 10\n";

# Test 11: step over the next tag, then read the following phrase.
$p->get_tag;
$t = $p->get_phrase;
print "not " if $t ne "This is some italic text. This is some more text.";
print "ok 11\n";

undef($p);