# Exercises XML::LibXML's HTML parsing entry points -- parse_html_file,
# parse_html_string and parse_html_fh -- first on a plain fixture, then on
# documents whose character encoding is given explicitly via the
# { encoding => ... } option and/or implicitly via a <meta> charset.
#
# The file contains UTF-8 string literals and must be saved as UTF-8.

use strict;
use warnings;

use Test;
BEGIN { plan tests => 38 }
use XML::LibXML;
use IO::File;

ok(1);

my $html   = "example/test.html";
my $parser = XML::LibXML->new();

# --- the same fixture through all three entry points ------------------------

{
    my $doc = $parser->parse_html_file($html);
    ok($doc);
}

my $fh = IO::File->new($html)
    or die "Can't open $html: $!";

# Slurp the whole fixture for the string parser ...
my $string = do { local $/; <$fh> };

# ... then rewind so the very same handle can feed parse_html_fh below.
seek($fh, 0, 0)
    or die "Can't rewind $html: $!";

ok($string);

my $doc = $parser->parse_html_string($string);
ok($doc);

undef $doc;

$doc = $parser->parse_html_fh($fh);
ok($doc);

$fh->close();

# --- parsing HTML's CGI calling links ---------------------------------------

# NOTE(review): the markup of this heredoc was lost from the copy under
# review (only the text "foo" and "test" survived); it is reconstructed here
# as a link whose query string contains a bare '&' -- confirm against the
# upstream test file.
my $strhref = <<EOHTML;

<html>
    <body>
        <a href="http:/foo.bar/foobar.pl?foo=bar&bar=foo">
            foo
        </a>
        <p>test
    </body>
</html>
EOHTML

my $htmldoc;

# Parse in recovery mode and silence warnings while doing so; only the
# existence of a resulting document is asserted.
$parser->recover(1);
eval {
    local $SIG{'__WARN__'} = sub { };
    $htmldoc = $parser->parse_html_string($strhref);
};

# ok( not $@ );
ok($htmldoc);

# --- parse_html_string with explicit / implicit encodings -------------------

print "parse_html_string with encoding...\n";

# encodings
if (eval { require Encode; }) {
    use utf8;

    my $utf_str = "ěščř";

    # w/o 'meta' charset
    # NOTE(review): markup reconstructed around the surviving $utf_str
    # (the later XPath '//p/text()' requires it to sit in a <p>) -- confirm.
    $strhref = <<EOHTML;
<html>
  <body>
    <p>$utf_str</p>
  </body>
</html>
EOHTML

    ok( Encode::is_utf8($strhref) );

    # Character string, no encoding option.
    $htmldoc = $parser->parse_html_string($strhref);
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    # Character string, encoding named explicitly.
    $htmldoc = $parser->parse_html_string( $strhref, { encoding => 'UTF-8' } );
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    # Byte string in iso-8859-2, encoding named explicitly.
    my $iso_str = Encode::encode( 'iso-8859-2', $strhref );
    $htmldoc = $parser->parse_html_string( $iso_str,
        { encoding => 'iso-8859-2' } );
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    # w/ 'meta' charset
    # NOTE(review): markup reconstructed; the <meta> is assumed to declare
    # iso-8859-2, which is what the option-less parse of $iso_str below
    # depends on -- confirm.
    $strhref = <<EOHTML;
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-2">
  </head>
  <body>
    <p>$utf_str</p>
  </body>
</html>
EOHTML

    # Character string with an explicit encoding option.
    $htmldoc = $parser->parse_html_string( $strhref, { encoding => 'UTF-8' } );
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    # Byte string with no option: the charset has to come from the document.
    $iso_str = Encode::encode( 'iso-8859-2', $strhref );
    $htmldoc = $parser->parse_html_string($iso_str);
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    # Byte string with both an encoding and a base URI.
    $htmldoc = $parser->parse_html_string( $iso_str,
        { encoding => 'iso-8859-2', URI => 'foo' } );
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );
    ok( $htmldoc->URI, 'foo' );
}
else {
    skip("Encoding related tests require Encode") for 1..14;
}

# --- fixture file WITH a 'meta' charset -------------------------------------

print "parse example/enc_latin2.html...\n";

# w/ 'meta' charset
{
    use utf8;

    my $utf_str   = "ěščř";
    my $test_file = 'example/enc_latin2.html';
    my $fh;

    $htmldoc = $parser->parse_html_file($test_file);
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    $htmldoc = $parser->parse_html_file( $test_file,
        { encoding => 'iso-8859-2', URI => 'foo' } );
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );
    ok( $htmldoc->URI, 'foo' );

    # Fail loudly if the fixture is missing instead of handing the parser
    # a handle that was never opened.
    open $fh, '<', $test_file
        or die "Can't open $test_file: $!";
    $htmldoc = $parser->parse_html_fh($fh);
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    open $fh, '<', $test_file
        or die "Can't open $test_file: $!";
    $htmldoc = $parser->parse_html_fh( $fh,
        { encoding => 'iso-8859-2', URI => 'foo' } );
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->URI, 'foo' );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    if ( 1000 * $] < 5008 ) {
        skip("skipping for Perl < 5.8") for 1..2;
    }
    elsif ( 20627 > XML::LibXML::LIBXML_VERSION() ) {
        skip("skipping for libxml2 < 2.6.27") for 1..2;
    }
    else {
        # translate to UTF8 on perl-side
        open $fh, '<:encoding(iso-8859-2)', $test_file
            or die "Can't open $test_file: $!";
        $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'UTF-8' } );
        close $fh;
        ok( $htmldoc && $htmldoc->getDocumentElement );
        ok( $htmldoc->findvalue('//p/text()'), $utf_str );
    }
}

# --- fixture file WITHOUT a 'meta' charset ----------------------------------

print "parse example/enc2_latin2.html...\n";

# w/o 'meta' charset
{
    use utf8;

    my $utf_str   = "ěščř";
    my $test_file = 'example/enc2_latin2.html';
    my $fh;

    $htmldoc = $parser->parse_html_file( $test_file,
        { encoding => 'iso-8859-2' } );
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    open $fh, '<', $test_file
        or die "Can't open $test_file: $!";
    $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'iso-8859-2' } );
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok( $htmldoc->findvalue('//p/text()'), $utf_str );

    if ( 1000 * $] < 5008 ) {
        skip("skipping for Perl < 5.8") for 1..2;
    }
    else {
        # translate to UTF8 on perl-side
        open $fh, '<:encoding(iso-8859-2)', $test_file
            or die "Can't open $test_file: $!";
        $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'UTF-8' } );
        close $fh;
        ok( $htmldoc && $htmldoc->getDocumentElement );
        ok( $htmldoc->findvalue('//p/text()'), $utf_str );
    }
}