1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228 |
- =head1 NAME
- XML::TreePP -- Pure Perl implementation for parsing/writing xml files
- =head1 SYNOPSIS
- parse xml file into hash tree
- use XML::TreePP;
- my $tpp = XML::TreePP->new();
- my $tree = $tpp->parsefile( "index.rdf" );
- print "Title: ", $tree->{"rdf:RDF"}->{item}->[0]->{title}, "\n";
- print "URL: ", $tree->{"rdf:RDF"}->{item}->[0]->{link}, "\n";
- write xml as string from hash tree
- use XML::TreePP;
- my $tpp = XML::TreePP->new();
- my $tree = { rss => { channel => { item => [ {
- title => "The Perl Directory",
- link => "http://www.perl.org/",
- }, {
- title => "The Comprehensive Perl Archive Network",
- link => "http://cpan.perl.org/",
- } ] } } };
- my $xml = $tpp->write( $tree );
- print $xml;
- get remote xml file with HTTP-GET and parse it into hash tree
- use XML::TreePP;
- my $tpp = XML::TreePP->new();
- my $tree = $tpp->parsehttp( GET => "http://use.perl.org/index.rss" );
- print "Title: ", $tree->{"rdf:RDF"}->{channel}->{title}, "\n";
- print "URL: ", $tree->{"rdf:RDF"}->{channel}->{link}, "\n";
- get remote xml file with HTTP-POST and parse it into hash tree
- use XML::TreePP;
- my $tpp = XML::TreePP->new( force_array => [qw( item )] );
- my $cgiurl = "http://search.hatena.ne.jp/keyword";
- my $keyword = "ajax";
- my $cgiquery = "mode=rss2&word=".$keyword;
- my $tree = $tpp->parsehttp( POST => $cgiurl, $cgiquery );
- print "Link: ", $tree->{rss}->{channel}->{item}->[0]->{link}, "\n";
- print "Desc: ", $tree->{rss}->{channel}->{item}->[0]->{description}, "\n";
- =head1 DESCRIPTION
- XML::TreePP module parses XML file and expands it for a hash tree.
- And also generate XML file from a hash tree.
- This is a pure Perl implementation.
- You can also download XML from remote web server
- like XMLHttpRequest object at JavaScript language.
- =head1 EXAMPLES
- =head2 Parse XML file
- Sample XML source:
- <?xml version="1.0" encoding="UTF-8"?>
- <family name="Kawasaki">
- <father>Yasuhisa</father>
- <mother>Chizuko</mother>
- <children>
- <girl>Shiori</girl>
- <boy>Yusuke</boy>
- <boy>Kairi</boy>
- </children>
- </family>
- Sample program to read a xml file and dump it:
- use XML::TreePP;
- use Data::Dumper;
- my $tpp = XML::TreePP->new();
- my $tree = $tpp->parsefile( "family.xml" );
- my $text = Dumper( $tree );
- print $text;
- Result dumped:
- $VAR1 = {
- 'family' => {
- '-name' => 'Kawasaki',
- 'father' => 'Yasuhisa',
- 'mother' => 'Chizuko',
- 'children' => {
- 'girl' => 'Shiori'
- 'boy' => [
- 'Yusuke',
- 'Kairi'
- ],
- }
- }
- };
- Details:
- print $tree->{family}->{father}; # the father's given name.
- The prefix '-' is added on every attribute's name.
- print $tree->{family}->{"-name"}; # the family name of the family
- The array is used because the family has two boys.
- print $tree->{family}->{children}->{boy}->[1]; # The second boy's name
- print $tree->{family}->{children}->{girl}; # The girl's name
- =head2 Text node and attributes:
- If a element has both of a text node and attributes
- or both of a text node and other child nodes,
- value of a text node is moved to C<#text> like child nodes.
- use XML::TreePP;
- use Data::Dumper;
- my $tpp = XML::TreePP->new();
- my $source = '<span class="author">Kawasaki Yusuke</span>';
- my $tree = $tpp->parse( $source );
- my $text = Dumper( $tree );
- print $text;
- The result dumped is following:
- $VAR1 = {
- 'span' => {
- '-class' => 'author',
- '#text' => 'Kawasaki Yusuke'
- }
- };
- The special node name of C<#text> is used because this elements
- has attribute(s) in addition to the text node.
- See also L</text_node_key> option.
- =head1 METHODS
- =head2 new
- This constructor method returns a new XML::TreePP object with C<%options>.
- $tpp = XML::TreePP->new( %options );
- =head2 set
- This method sets a option value for C<option_name>.
- If C<$option_value> is not defined, its option is deleted.
- $tpp->set( option_name => $option_value );
- See OPTIONS section below for details.
- =head2 get
- This method returns a current option value for C<option_name>.
- $tpp->get( 'option_name' );
- =head2 parse
- This method reads XML source and returns a hash tree converted.
- The first argument is a scalar or a reference to a scalar.
- $tree = $tpp->parse( $source );
- =head2 parsefile
- This method reads a XML file and returns a hash tree converted.
- The first argument is a filename.
- $tree = $tpp->parsefile( $file );
- =head2 parsehttp
- This method receives a XML file from a remote server via HTTP and
- returns a hash tree converted.
- $tree = $tpp->parsehttp( $method, $url, $body, $head );
- C<$method> is a method of HTTP connection: GET/POST/PUT/DELETE
- C<$url> is an URI of a XML file.
- C<$body> is a request body when you use POST method.
- C<$head> is a request headers as a hash ref.
- L<LWP::UserAgent> module or L<HTTP::Lite> module is required to fetch a file.
- ( $tree, $xml, $code ) = $tpp->parsehttp( $method, $url, $body, $head );
- In array context, This method returns also raw XML source received
- and HTTP response's status code.
- =head2 write
- This method parses a hash tree and returns a XML source generated.
- $source = $tpp->write( $tree, $encode );
- C<$tree> is a reference to a hash tree.
- =head2 writefile
- This method parses a hash tree and writes a XML source into a file.
- $tpp->writefile( $file, $tree, $encode );
- C<$file> is a filename to create.
- C<$tree> is a reference to a hash tree.
- =head1 OPTIONS FOR PARSING XML
- This module accepts option parameters following:
- =head2 force_array
- This option allows you to specify a list of element names which
- should always be forced into an array representation.
- $tpp->set( force_array => [ 'rdf:li', 'item', '-xmlns' ] );
- The default value is null, it means that context of the elements
- will determine to make array or to keep it scalar or hash.
- Note that the special wildcard name C<'*'> means all elements.
- =head2 force_hash
- This option allows you to specify a list of element names which
- should always be forced into an hash representation.
- $tpp->set( force_hash => [ 'item', 'image' ] );
- The default value is null, it means that context of the elements
- will determine to make hash or to keep it scalar as a text node.
- See also L</text_node_key> option below.
- Note that the special wildcard name C<'*'> means all elements.
- =head2 cdata_scalar_ref
- This option allows you to convert a cdata section into a reference
- for scalar on parsing XML source.
- $tpp->set( cdata_scalar_ref => 1 );
- The default value is false, it means that each cdata section is converted into a scalar.
- =head2 user_agent
- This option allows you to specify a HTTP_USER_AGENT string which
- is used by parsehttp() method.
- $tpp->set( user_agent => 'Mozilla/4.0 (compatible; ...)' );
- The default string is C<'XML-TreePP/#.##'>, where C<'#.##'> is
- substituted with the version number of this library.
- =head2 http_lite
- This option forces pasrsehttp() method to use a L<HTTP::Lite> instance.
- my $http = HTTP::Lite->new();
- $tpp->set( http_lite => $http );
- =head2 lwp_useragent
- This option forces pasrsehttp() method to use a L<LWP::UserAgent> instance.
- my $ua = LWP::UserAgent->new();
- $ua->timeout( 60 );
- $ua->env_proxy;
- $tpp->set( lwp_useragent => $ua );
- You may use this with L<LWP::UserAgent::WithCache>.
- =head2 base_class
- This blesses class name for each element's hashref.
- Each class is named straight as a child class of it parent class.
- $tpp->set( base_class => 'MyElement' );
- my $xml = '<root><parent><child key="val">text</child></parent></root>';
- my $tree = $tpp->parse( $xml );
- print ref $tree->{root}->{parent}->{child}, "\n";
- A hash for <child> element above is blessed to C<MyElement::root::parent::child>
- class. You may use this with L<Class::Accessor>.
- =head2 elem_class
- This blesses class name for each element's hashref.
- Each class is named horizontally under the direct child of C<MyElement>.
- $tpp->set( base_class => 'MyElement' );
- my $xml = '<root><parent><child key="val">text</child></parent></root>';
- my $tree = $tpp->parse( $xml );
- print ref $tree->{root}->{parent}->{child}, "\n";
- A hash for <child> element above is blessed to C<MyElement::child> class.
- =head1 OPTIONS FOR WRITING XML
- =head2 first_out
- This option allows you to specify a list of element/attribute
- names which should always appears at first on output XML code.
- $tpp->set( first_out => [ 'link', 'title', '-type' ] );
- The default value is null, it means alphabetical order is used.
- =head2 last_out
- This option allows you to specify a list of element/attribute
- names which should always appears at last on output XML code.
- $tpp->set( last_out => [ 'items', 'item', 'entry' ] );
- =head2 indent
- This makes the output more human readable by indenting appropriately.
- $tpp->set( indent => 2 );
- This doesn't strictly follow the XML Document Spec but does looks nice.
- =head2 xml_decl
- This module generates an XML declaration on writing an XML code per default.
- This option forces to change or leave it.
- $tpp->set( xml_decl => '' );
- =head2 output_encoding
- This option allows you to specify a encoding of xml file generated
- by write/writefile methods.
- $tpp->set( output_encoding => 'UTF-8' );
- On Perl 5.8.0 and later, you can select it from every
- encodings supported by Encode.pm. On Perl 5.6.x and before with
- Jcode.pm, you can use C<Shift_JIS>, C<EUC-JP>, C<ISO-2022-JP> and
- C<UTF-8>. The default value is C<UTF-8> which is recommended encoding.
- =head1 OPTIONS FOR BOTH
- =head2 utf8_flag
- This makes utf8 flag on for every element's value parsed
- and makes it on for an XML code generated as well.
- $tpp->set( utf8_flag => 1 );
- Perl 5.8.1 or later is required to use this.
- =head2 attr_prefix
- This option allows you to specify a prefix character(s) which
- is inserted before each attribute names.
- $tpp->set( attr_prefix => '@' );
- The default character is C<'-'>.
- Or set C<'@'> to access attribute values like E4X, ECMAScript for XML.
- Zero-length prefix C<''> is available as well, it means no prefix is added.
- =head2 text_node_key
- This option allows you to specify a hash key for text nodes.
- $tpp->set( text_node_key => '#text' );
- The default key is C<#text>.
- =head2 ignore_error
- This module calls Carp::croak function on an error per default.
- This option makes all errors ignored and just return.
- $tpp->set( ignore_error => 1 );
- =head2 use_ixhash
- This option keeps the order for each element appeared in XML.
- L<Tie::IxHash> module is required.
- $tpp->set( use_ixhash => 1 );
- This makes parsing performance slow.
- (about 100% slower than default)
- =head1 AUTHOR
- Yusuke Kawasaki, http://www.kawa.net/
- =head1 COPYRIGHT AND LICENSE
- Copyright (c) 2006-2007 Yusuke Kawasaki. All rights reserved.
- This program is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
- =cut
- package XML::TreePP;
- use strict;
- use Carp;
- use Symbol;
- use vars qw( $VERSION );
- $VERSION = '0.32';
- my $XML_ENCODING = 'UTF-8';
- my $INTERNAL_ENCODING = 'UTF-8';
- my $USER_AGENT = 'XML-TreePP/'.$VERSION.' ';
- my $ATTR_PREFIX = '-';
- my $TEXT_NODE_KEY = '#text';
- sub new {
- my $package = shift;
- my $self = {@_};
- bless $self, $package;
- $self;
- }
- sub die {
- my $self = shift;
- my $mess = shift;
- return if $self->{ignore_error};
- Carp::croak $mess;
- }
- sub warn {
- my $self = shift;
- my $mess = shift;
- return if $self->{ignore_error};
- Carp::carp $mess;
- }
- sub set {
- my $self = shift;
- my $key = shift;
- my $val = shift;
- if ( defined $val ) {
- $self->{$key} = $val;
- }
- else {
- delete $self->{$key};
- }
- }
- sub get {
- my $self = shift;
- my $key = shift;
- $self->{$key} if exists $self->{$key};
- }
- sub writefile {
- my $self = shift;
- my $file = shift;
- my $tree = shift or return $self->die( 'Invalid tree' );
- my $encode = shift;
- return $self->die( 'Invalid filename' ) unless defined $file;
- my $text = $self->write( $tree, $encode );
- if ( $] >= 5.008001 && utf8::is_utf8( $text ) ) {
- utf8::encode( $text );
- }
- $self->write_raw_xml( $file, $text );
- }
- sub write {
- my $self = shift;
- my $tree = shift or return $self->die( 'Invalid tree' );
- my $from = $self->{internal_encoding} || $INTERNAL_ENCODING;
- my $to = shift || $self->{output_encoding} || $XML_ENCODING;
- my $decl = $self->{xml_decl};
- $decl = '<?xml version="1.0" encoding="' . $to . '" ?>' unless defined $decl;
- local $self->{__first_out};
- if ( exists $self->{first_out} ) {
- my $keys = $self->{first_out};
- $keys = [$keys] unless ref $keys;
- $self->{__first_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
- }
- local $self->{__last_out};
- if ( exists $self->{last_out} ) {
- my $keys = $self->{last_out};
- $keys = [$keys] unless ref $keys;
- $self->{__last_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
- }
- my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
- $tnk = $TEXT_NODE_KEY unless defined $tnk;
- local $self->{text_node_key} = $tnk;
- my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
- $apre = $ATTR_PREFIX unless defined $apre;
- local $self->{__attr_prefix_len} = length($apre);
- local $self->{__attr_prefix_rex} = defined $apre ? qr/^\Q$apre\E/s : undef;
- local $self->{__indent};
- if ( exists $self->{indent} && $self->{indent} ) {
- $self->{__indent} = ' ' x $self->{indent};
- }
- my $text = $self->hash_to_xml( undef, $tree );
- if ( $from && $to ) {
- my $stat = $self->encode_from_to( \$text, $from, $to );
- return $self->die( "Unsupported encoding: $to" ) unless $stat;
- }
- return $text if ( $decl eq '' );
- join( "\n", $decl, $text );
- }
- sub parsehttp {
- my $self = shift;
- local $self->{__user_agent};
- if ( exists $self->{user_agent} ) {
- my $agent = $self->{user_agent};
- $agent .= $USER_AGENT if ( $agent =~ /\s$/s );
- $self->{__user_agent} = $agent if ( $agent ne '' );
- } else {
- $self->{__user_agent} = $USER_AGENT;
- }
- my $http = $self->{__http_module};
- unless ( $http ) {
- $http = $self->find_http_module(@_);
- $self->{__http_module} = $http;
- }
- if ( $http eq 'LWP::UserAgent' ) {
- return $self->parsehttp_lwp(@_);
- }
- elsif ( $http eq 'HTTP::Lite' ) {
- return $self->parsehttp_lite(@_);
- }
- else {
- return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
- }
- }
- sub find_http_module {
- my $self = shift || {};
- if ( exists $self->{lwp_useragent} && ref $self->{lwp_useragent} ) {
- return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
- return 'LWP::UserAgent' if &load_lwp_useragent();
- return $self->die( "LWP::UserAgent is required: $_[1]" );
- }
- if ( exists $self->{http_lite} && ref $self->{http_lite} ) {
- return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
- return 'HTTP::Lite' if &load_http_lite();
- return $self->die( "HTTP::Lite is required: $_[1]" );
- }
- return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
- return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
- return 'LWP::UserAgent' if &load_lwp_useragent();
- return 'HTTP::Lite' if &load_http_lite();
- return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
- }
- sub load_lwp_useragent {
- return $LWP::UserAgent::VERSION if defined $LWP::UserAgent::VERSION;
- local $@;
- eval { require LWP::UserAgent; };
- $LWP::UserAgent::VERSION;
- }
- sub load_http_lite {
- return $HTTP::Lite::VERSION if defined $HTTP::Lite::VERSION;
- local $@;
- eval { require HTTP::Lite; };
- $HTTP::Lite::VERSION;
- }
- sub load_tie_ixhash {
- return $Tie::IxHash::VERSION if defined $Tie::IxHash::VERSION;
- local $@;
- eval { require Tie::IxHash; };
- $Tie::IxHash::VERSION;
- }
- sub parsehttp_lwp {
- my $self = shift;
- my $method = shift or return $self->die( 'Invalid HTTP method' );
- my $url = shift or return $self->die( 'Invalid URL' );
- my $body = shift;
- my $header = shift;
- my $ua = $self->{lwp_useragent} if exists $self->{lwp_useragent};
- if ( ! ref $ua ) {
- $ua = LWP::UserAgent->new();
- $ua->timeout(10);
- $ua->env_proxy();
- $ua->agent( $self->{__user_agent} ) if defined $self->{__user_agent};
- } else {
- $ua->agent( $self->{__user_agent} ) if exists $self->{user_agent};
- }
- my $req = HTTP::Request->new( $method, $url );
- my $ct = 0;
- if ( ref $header ) {
- foreach my $field ( sort keys %$header ) {
- my $value = $header->{$field};
- $req->header( $field => $value );
- $ct ++ if ( $field =~ /^Content-Type$/i );
- }
- }
- if ( defined $body && ! $ct ) {
- $req->header( 'Content-Type' => 'application/x-www-form-urlencoded' );
- }
- $req->content($body) if defined $body;
- my $res = $ua->request($req);
- my $code = $res->code();
- my $text = $res->content();
- my $tree = $self->parse( \$text ) if $res->is_success();
- wantarray ? ( $tree, $text, $code ) : $tree;
- }
- sub parsehttp_lite {
- my $self = shift;
- my $method = shift or return $self->die( 'Invalid HTTP method' );
- my $url = shift or return $self->die( 'Invalid URL' );
- my $body = shift;
- my $header = shift;
- my $http = HTTP::Lite->new();
- $http->method($method);
- my $ua = 0;
- if ( ref $header ) {
- foreach my $field ( sort keys %$header ) {
- my $value = $header->{$field};
- $http->add_req_header( $field, $value );
- $ua ++ if ( $field =~ /^User-Agent$/i );
- }
- }
- if ( defined $self->{__user_agent} && ! $ua ) {
- $http->add_req_header( 'User-Agent', $self->{__user_agent} );
- }
- $http->{content} = $body if defined $body;
- my $code = $http->request($url) or return;
- my $text = $http->body();
- my $tree = $self->parse( \$text );
- wantarray ? ( $tree, $text, $code ) : $tree;
- }
- sub parsefile {
- my $self = shift;
- my $file = shift;
- return $self->die( 'Invalid filename' ) unless defined $file;
- my $text = $self->read_raw_xml($file);
- $self->parse( \$text );
- }
- sub parse {
- my $self = shift;
- my $text = ref $_[0] ? ${$_[0]} : $_[0];
- return $self->die( 'Null XML source' ) unless defined $text;
- my $from = &xml_decl_encoding(\$text) || $XML_ENCODING;
- my $to = $self->{internal_encoding} || $INTERNAL_ENCODING;
- if ( $from && $to ) {
- my $stat = $self->encode_from_to( \$text, $from, $to );
- return $self->die( "Unsupported encoding: $from" ) unless $stat;
- }
- local $self->{__force_array};
- local $self->{__force_array_all};
- if ( exists $self->{force_array} ) {
- my $force = $self->{force_array};
- $force = [$force] unless ref $force;
- $self->{__force_array} = { map { $_ => 1 } @$force };
- $self->{__force_array_all} = $self->{__force_array}->{'*'};
- }
- local $self->{__force_hash};
- local $self->{__force_hash_all};
- if ( exists $self->{force_hash} ) {
- my $force = $self->{force_hash};
- $force = [$force] unless ref $force;
- $self->{__force_hash} = { map { $_ => 1 } @$force };
- $self->{__force_hash_all} = $self->{__force_hash}->{'*'};
- }
- my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
- $tnk = $TEXT_NODE_KEY unless defined $tnk;
- local $self->{text_node_key} = $tnk;
- my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
- $apre = $ATTR_PREFIX unless defined $apre;
- local $self->{attr_prefix} = $apre;
- if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
- return $self->die( "Tie::IxHash is required." ) unless &load_tie_ixhash();
- }
- my $flat = $self->xml_to_flat(\$text);
- my $class = $self->{base_class} if exists $self->{base_class};
- my $tree = $self->flat_to_tree( $flat, '', $class );
- if ( ref $tree ) {
- if ( defined $class ) {
- bless( $tree, $class );
- }
- elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
- bless( $tree, $self->{elem_class} );
- }
- }
- wantarray ? ( $tree, $text ) : $tree;
- }
- sub xml_to_flat {
- my $self = shift;
- my $textref = shift; # reference
- my $flat = [];
- my $prefix = $self->{attr_prefix};
- my $ixhash = ( exists $self->{use_ixhash} && $self->{use_ixhash} );
- while ( $$textref =~ m{
- ([^<]*) <
- ((
- \? ([^<>]*) \?
- )|(
- \!\[CDATA\[(.*?)\]\]
- )|(
- \!DOCTYPE\s+([^\[\]<>]*(?:\[.*?\]\s*)?)
- )|(
- \!--(.*?)--
- )|(
- ([^\!\?\s<>](?:"[^"]*"|'[^']*'|[^"'<>])*)
- ))
- > ([^<]*)
- }sxg ) {
- my (
- $ahead, $match, $typePI, $contPI, $typeCDATA,
- $contCDATA, $typeDocT, $contDocT, $typeCmnt, $contCmnt,
- $typeElem, $contElem, $follow
- )
- = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 );
- if ( defined $ahead && $ahead =~ /\S/ ) {
- $self->warn( "Invalid string: [$ahead] before <$match>" );
- }
- if ($typeElem) { # Element
- my $node = {};
- if ( $contElem =~ s#^/## ) {
- $node->{endTag}++;
- }
- elsif ( $contElem =~ s#/$## ) {
- $node->{emptyTag}++;
- }
- else {
- $node->{startTag}++;
- }
- $node->{tagName} = $1 if ( $contElem =~ s#^(\S+)\s*## );
- unless ( $node->{endTag} ) {
- my $attr;
- while ( $contElem =~ m{
- ([^\s\=\"\']+)=(?:(")(.*?)"|'(.*?)')
- }sxg ) {
- my $key = $1;
- my $val = &xml_unescape( $2 ? $3 : $4 );
- if ( ! ref $attr ) {
- $attr = {};
- tie( %$attr, 'Tie::IxHash' ) if $ixhash;
- }
- $attr->{$prefix.$key} = $val;
- }
- $node->{attributes} = $attr if ref $attr;
- }
- push( @$flat, $node );
- }
- elsif ($typeCDATA) { ## CDATASection
- if ( exists $self->{cdata_scalar_ref} && $self->{cdata_scalar_ref} ) {
- push( @$flat, \$contCDATA ); # as reference for scalar
- }
- else {
- push( @$flat, $contCDATA ); # as scalar like text node
- }
- }
- elsif ($typeCmnt) { # Comment (ignore)
- }
- elsif ($typeDocT) { # DocumentType (ignore)
- }
- elsif ($typePI) { # ProcessingInstruction (ignore)
- }
- else {
- $self->warn( "Invalid Tag: <$match>" );
- }
- if ( $follow =~ /\S/ ) { # text node
- my $val = &xml_unescape($follow);
- push( @$flat, $val );
- }
- }
- $flat;
- }
- sub flat_to_tree {
- my $self = shift;
- my $source = shift;
- my $parent = shift;
- my $class = shift;
- my $tree = {};
- my $text = [];
- if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
- tie( %$tree, 'Tie::IxHash' );
- }
- while ( scalar @$source ) {
- my $node = shift @$source;
- if ( !ref $node || UNIVERSAL::isa( $node, "SCALAR" ) ) {
- push( @$text, $node ); # cdata or text node
- next;
- }
- my $name = $node->{tagName};
- if ( $node->{endTag} ) {
- last if ( $parent eq $name );
- return $self->die( "Invalid tag sequence: <$parent></$name>" );
- }
- my $elem = $node->{attributes};
- my $forcehash = $self->{__force_hash_all} || $self->{__force_hash}->{$name};
- my $subclass;
- if ( defined $class ) {
- my $escname = $name;
- $escname =~ s/\W/_/sg;
- $subclass = $class.'::'.$escname;
- }
- if ( $node->{startTag} ) { # recursive call
- my $child = $self->flat_to_tree( $source, $name, $subclass );
- next unless defined $child;
- my $hasattr = scalar keys %$elem if ref $elem;
- if ( UNIVERSAL::isa( $child, "HASH" ) ) {
- if ( $hasattr ) {
- # some attributes and some child nodes
- %$elem = ( %$elem, %$child );
- }
- else {
- # some child nodes without attributes
- $elem = $child;
- }
- }
- else {
- if ( $hasattr ) {
- # some attributes and text node
- $elem->{$self->{text_node_key}} = $child;
- }
- elsif ( $forcehash ) {
- # only text node without attributes
- $elem = { $self->{text_node_key} => $child };
- }
- else {
- # text node without attributes
- $elem = $child;
- }
- }
- }
- elsif ( $forcehash && ! ref $elem ) {
- $elem = {};
- }
- # bless to a class by base_class or elem_class
- if ( ref $elem && UNIVERSAL::isa( $elem, "HASH" ) ) {
- if ( defined $subclass ) {
- bless( $elem, $subclass );
- } elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
- my $escname = $name;
- $escname =~ s/\W/_/sg;
- my $elmclass = $self->{elem_class}.'::'.$escname;
- bless( $elem, $elmclass );
- }
- }
- # next unless defined $elem;
- $tree->{$name} ||= [];
- push( @{ $tree->{$name} }, $elem );
- }
- if ( ! $self->{__force_array_all} ) {
- foreach my $key ( keys %$tree ) {
- next if $self->{__force_array}->{$key};
- next if ( 1 < scalar @{ $tree->{$key} } );
- $tree->{$key} = shift @{ $tree->{$key} };
- }
- }
- my $haschild = scalar keys %$tree;
- if ( scalar @$text ) {
- if ( scalar @$text == 1 ) {
- # one text node (normal)
- $text = shift @$text;
- }
- elsif ( ! scalar grep {ref $_} @$text ) {
- # some text node splitted
- $text = join( '', @$text );
- }
- else {
- # some cdata node
- my $join = join( '', map {ref $_ ? $$_ : $_} @$text );
- $text = \$join;
- }
- if ( $haschild ) {
- # some child nodes and also text node
- $tree->{$self->{text_node_key}} = $text;
- }
- else {
- # only text node without child nodes
- $tree = $text;
- }
- }
- elsif ( ! $haschild ) {
- # no child and no text
- $tree = "";
- }
- $tree;
- }
- sub hash_to_xml {
- my $self = shift;
- my $name = shift;
- my $hash = shift;
- my $out = [];
- my $attr = [];
- my $allkeys = [ keys %$hash ];
- my $fo = $self->{__first_out} if ref $self->{__first_out};
- my $lo = $self->{__last_out} if ref $self->{__last_out};
- my $firstkeys = [ sort { $fo->{$a} <=> $fo->{$b} } grep { exists $fo->{$_} } @$allkeys ] if ref $fo;
- my $lastkeys = [ sort { $lo->{$a} <=> $lo->{$b} } grep { exists $lo->{$_} } @$allkeys ] if ref $lo;
- $allkeys = [ grep { ! exists $fo->{$_} } @$allkeys ] if ref $fo;
- $allkeys = [ grep { ! exists $lo->{$_} } @$allkeys ] if ref $lo;
- unless ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
- $allkeys = [ sort @$allkeys ];
- }
- my $prelen = $self->{__attr_prefix_len};
- my $pregex = $self->{__attr_prefix_rex};
- foreach my $keys ( $firstkeys, $allkeys, $lastkeys ) {
- next unless ref $keys;
- my $elemkey = $prelen ? [ grep { $_ !~ $pregex } @$keys ] : $keys;
- my $attrkey = $prelen ? [ grep { $_ =~ $pregex } @$keys ] : [];
- foreach my $key ( @$elemkey ) {
- my $val = $hash->{$key};
- if ( !defined $val ) {
- push( @$out, "<$key />" );
- }
- elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
- my $child = $self->array_to_xml( $key, $val );
- push( @$out, $child );
- }
- elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
- my $child = $self->scalaref_to_cdata( $key, $val );
- push( @$out, $child );
- }
- elsif ( ref $val ) {
- my $child = $self->hash_to_xml( $key, $val );
- push( @$out, $child );
- }
- else {
- my $child = $self->scalar_to_xml( $key, $val );
- push( @$out, $child );
- }
- }
- foreach my $key ( @$attrkey ) {
- my $name = substr( $key, $prelen );
- my $val = &xml_escape( $hash->{$key} );
- push( @$attr, ' ' . $name . '="' . $val . '"' );
- }
- }
- my $jattr = join( '', @$attr );
- if ( defined $name && scalar @$out && ! grep { ! /^</s } @$out ) {
- # Use human-friendly white spacing
- if ( defined $self->{__indent} ) {
- s/^(\s*<)/$self->{__indent}$1/mg foreach @$out;
- }
- unshift( @$out, "\n" );
- }
- my $text = join( '', @$out );
- if ( defined $name ) {
- if ( scalar @$out ) {
- $text = "<$name$jattr>$text</$name>\n";
- }
- else {
- $text = "<$name$jattr />\n";
- }
- }
- $text;
- }
- sub array_to_xml {
- my $self = shift;
- my $name = shift;
- my $array = shift;
- my $out = [];
- foreach my $val (@$array) {
- if ( !defined $val ) {
- push( @$out, "<$name />\n" );
- }
- elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
- my $child = $self->array_to_xml( $name, $val );
- push( @$out, $child );
- }
- elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
- my $child = $self->scalaref_to_cdata( $name, $val );
- push( @$out, $child );
- }
- elsif ( ref $val ) {
- my $child = $self->hash_to_xml( $name, $val );
- push( @$out, $child );
- }
- else {
- my $child = $self->scalar_to_xml( $name, $val );
- push( @$out, $child );
- }
- }
- my $text = join( '', @$out );
- $text;
- }
- sub scalaref_to_cdata {
- my $self = shift;
- my $name = shift;
- my $ref = shift;
- my $data = defined $$ref ? $$ref : '';
- $data =~ s#(]])(>)#$1]]><![CDATA[$2#g;
- #my $text = '<![CDATA[' . $data . ']]>';
- my $text = $data;
- $text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
- $text;
- }
- sub scalar_to_xml {
- my $self = shift;
- my $name = shift;
- my $scalar = shift;
- my $copy = $scalar;
- my $text = &xml_escape($copy);
- $text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
- $text;
- }
- sub write_raw_xml {
- my $self = shift;
- my $file = shift;
- my $fh = Symbol::gensym();
- open( $fh, ">$file" ) or return $self->die( "$! - $file" );
- print $fh @_;
- close($fh);
- }
- sub read_raw_xml {
- my $self = shift;
- my $file = shift;
- my $fh = Symbol::gensym();
- open( $fh, $file ) or return $self->die( "$! - $file" );
- local $/ = undef;
- my $text = <$fh>;
- close($fh);
- $text;
- }
- sub xml_decl_encoding {
- my $textref = shift;
- return unless defined $$textref;
- my $args = ( $$textref =~ /^\s*<\?xml(\s+\S.*)\?>/s )[0] or return;
- my $getcode = ( $args =~ /\s+encoding=(".*?"|'.*?')/ )[0] or return;
- $getcode =~ s/^['"]//;
- $getcode =~ s/['"]$//;
- $getcode;
- }
- sub encode_from_to {
- my $self = shift;
- my $txtref = shift or return;
- my $from = shift or return;
- my $to = shift or return;
- unless ( defined $Encode::EUCJPMS::VERSION ) {
- $from = 'EUC-JP' if ( $from =~ /\beuc-?jp-?(win|ms)$/i );
- $to = 'EUC-JP' if ( $to =~ /\beuc-?jp-?(win|ms)$/i );
- }
- my $setflag = $self->{utf8_flag} if exists $self->{utf8_flag};
- if ( $] < 5.008001 && $setflag ) {
- return $self->die( "Perl 5.8.1 is required for utf8_flag: $]" );
- }
- if ( $] >= 5.008 ) {
- &load_encode();
- my $check = ( $Encode::VERSION < 2.13 ) ? 0x400 : Encode::FB_XMLCREF();
- if ( $] >= 5.008001 && utf8::is_utf8( $$txtref ) ) {
- if ( $to =~ /^utf-?8$/i ) {
- # skip
- } else {
- $$txtref = Encode::encode( $to, $$txtref, $check );
- }
- } else {
- $$txtref = Encode::decode( $from, $$txtref );
- if ( $to =~ /^utf-?8$/i && $setflag ) {
- # skip
- } else {
- $$txtref = Encode::encode( $to, $$txtref, $check );
- }
- }
- }
- elsif ( ( uc($from) eq 'ISO-8859-1'
- || uc($from) eq 'US-ASCII'
- || uc($from) eq 'LATIN-1' ) && uc($to) eq 'UTF-8' ) {
- &latin1_to_utf8($txtref);
- }
- else {
- my $jfrom = &get_jcode_name($from);
- my $jto = &get_jcode_name($to);
- return $to if ( uc($jfrom) eq uc($jto) );
- if ( $jfrom && $jto ) {
- &load_jcode();
- if ( defined $Jcode::VERSION ) {
- Jcode::convert( $txtref, $jto, $jfrom );
- }
- else {
- return $self->die( "Jcode.pm is required: $from to $to" );
- }
- }
- else {
- return $self->die( "Encode.pm is required: $from to $to" );
- }
- }
- $to;
- }
- sub load_jcode {
- return if defined $Jcode::VERSION;
- local $@;
- eval { require Jcode; };
- }
- sub load_encode {
- return if defined $Encode::VERSION;
- local $@;
- eval { require Encode; };
- }
- sub latin1_to_utf8 {
- my $strref = shift;
- $$strref =~ s{
- ([\x80-\xFF])
- }{
- pack( 'C2' => 0xC0|(ord($1)>>6),0x80|(ord($1)&0x3F) )
- }exg;
- }
- sub get_jcode_name {
- my $src = shift;
- my $dst;
- if ( $src =~ /^utf-?8$/i ) {
- $dst = 'utf8';
- }
- elsif ( $src =~ /^euc.*jp(-?(win|ms))?$/i ) {
- $dst = 'euc';
- }
- elsif ( $src =~ /^(shift.*jis|cp932|windows-31j)$/i ) {
- $dst = 'sjis';
- }
- elsif ( $src =~ /^iso-2022-jp/ ) {
- $dst = 'jis';
- }
- $dst;
- }
- sub xml_escape {
- my $str = shift;
- return '' unless defined $str;
- # except for TAB(\x09),CR(\x0D),LF(\x0A)
- $str =~ s{
- ([\x00-\x08\x0B\x0C\x0E-\x1F\x7F])
- }{
- sprintf( '&#%d;', ord($1) );
- }gex;
- $str =~ s/&(?!#(\d+;|x[\dA-Fa-f]+;))/&/g;
- $str =~ s/</</g;
- $str =~ s/>/>/g;
- $str =~ s/'/'/g;
- $str =~ s/"/"/g;
- $str;
- }
- sub xml_unescape {
- my $str = shift;
- my $map = {qw( quot " lt < gt > apos ' amp & )};
- $str =~ s{
- (&(?:\#(\d+)|\#x([0-9a-fA-F]+)|(quot|lt|gt|apos|amp));)
- }{
- $4 ? $map->{$4} : &char_deref($1,$2,$3);
- }gex;
- $str;
- }
- sub char_deref {
- my( $str, $dec, $hex ) = @_;
- if ( defined $dec ) {
- return &code_to_utf8( $dec ) if ( $dec < 256 );
- }
- elsif ( defined $hex ) {
- my $num = hex($hex);
- return &code_to_utf8( $num ) if ( $num < 256 );
- }
- return $str;
- }
- sub code_to_utf8 {
- my $code = shift;
- if ( $code < 128 ) {
- return pack( C => $code );
- }
- elsif ( $code < 256 ) {
- return pack( C2 => 0xC0|($code>>6), 0x80|($code&0x3F));
- }
- elsif ( $code < 65536 ) {
- return pack( C3 => 0xC0|($code>>12), 0x80|(($code>>6)&0x3F), 0x80|($code&0x3F));
- }
- return shift if scalar @_; # default value
- sprintf( '&#x%04X;', $code );
- }
- 1;
|