animate.tvのscrape (2)
# Scrape a playlist page and print one tab-separated line per playable entry.
#
# Usage:   perl SCRIPT <playlist-page-url>
#
# Output (STDOUT, UTF-8), one line per entry that has a play link:
#   <Ref href> TAB <title> TAB <subtitle> TAB <Title> TAB <Author>
# where title/subtitle come from the HTML playlist table and the other
# fields come from the XML document behind the entry's play link.
# Progress messages go to STDERR.
use strict;
use warnings;
use utf8;
use Encode;
use URI;    # used explicitly below; do not rely on other modules loading it
use WWW::Mechanize;
use Web::Scraper;
use XML::LibXML::Simple;
#use Path::Class;

binmode STDOUT, ":utf8";

my $uri = shift or die "usage: $0 <playlist-page-url>\n";
print STDERR "get $uri ... \n";

# Each <table class="playlist"> yields one hash with title, subtitle and play.
# The rules are separate statements (terminated by ';'): chaining them with
# commas would pass each later process() call as an argument to the earlier one.
my $entry_scraper = scraper {
    process '//tr[1]',                         'title'    => 'TEXT';
    process '//tr[3]/td[1]',                   'subtitle' => 'TEXT';
    process '//tr[3]/td[@class="play_btn"]/a', 'play'     => '@href';
};

my $info = scraper {
    process '//table[@class="playlist"]', 'content[]' => $entry_scraper;
    result 'content';
}->scrape( URI->new($uri) );
# Debug hook; inert as a plain comment (presumably meant for Smart::Comments).
### $info

# autocheck => 1 makes every failed request die instead of returning silently.
my $mech   = WWW::Mechanize->new( autocheck => 1 );
my $parser = XML::LibXML::Simple->new();

foreach my $e (@$info) {
    # Tables without a play button carry nothing to fetch.
    next unless $e->{play};
    print STDERR "get $e->{play} ... \n";

    # Revisit the playlist page before following the play link.
    # NOTE(review): presumably so the play request carries the playlist page
    # as its referrer/session -- confirm against the site's behavior.
    $mech->get($uri);
    $mech->get( $e->{play} );
    # Debug hook; inert as a plain comment (presumably meant for Smart::Comments).
    ### content : $mech->content

    # The response body is treated as Shift_JIS and decoded before XML parsing.
    my $content_utf8 = Encode::decode( "sjis", $mech->content );
    my $tree         = $parser->XMLin($content_utf8)->{Entry};

    # Missing fields are printed as empty columns instead of triggering
    # "uninitialized value" warnings.
    my @fields = map { defined $_ ? $_ : '' } (
        $tree->{Ref}->{href},
        $e->{title},
        $e->{subtitle},
        $tree->{Title},
        $tree->{Author},
    );
    print join( "\t", @fields ), "\n";

    # Pause between entries to avoid hammering the server.
    sleep 1;
}
こんなもんか。XML::Simpleの日本語の取り扱いがヘンではまった。
ていうか、なにげにメチャ便利だなこれ。