TreePP.pm 35 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228
  1. =head1 NAME
  2. XML::TreePP -- Pure Perl implementation for parsing/writing xml files
  3. =head1 SYNOPSIS
  4. parse xml file into hash tree
  5. use XML::TreePP;
  6. my $tpp = XML::TreePP->new();
  7. my $tree = $tpp->parsefile( "index.rdf" );
  8. print "Title: ", $tree->{"rdf:RDF"}->{item}->[0]->{title}, "\n";
  9. print "URL: ", $tree->{"rdf:RDF"}->{item}->[0]->{link}, "\n";
  10. write xml as string from hash tree
  11. use XML::TreePP;
  12. my $tpp = XML::TreePP->new();
  13. my $tree = { rss => { channel => { item => [ {
  14. title => "The Perl Directory",
  15. link => "http://www.perl.org/",
  16. }, {
  17. title => "The Comprehensive Perl Archive Network",
  18. link => "http://cpan.perl.org/",
  19. } ] } } };
  20. my $xml = $tpp->write( $tree );
  21. print $xml;
  22. get remote xml file with HTTP-GET and parse it into hash tree
  23. use XML::TreePP;
  24. my $tpp = XML::TreePP->new();
  25. my $tree = $tpp->parsehttp( GET => "http://use.perl.org/index.rss" );
  26. print "Title: ", $tree->{"rdf:RDF"}->{channel}->{title}, "\n";
  27. print "URL: ", $tree->{"rdf:RDF"}->{channel}->{link}, "\n";
  28. get remote xml file with HTTP-POST and parse it into hash tree
  29. use XML::TreePP;
  30. my $tpp = XML::TreePP->new( force_array => [qw( item )] );
  31. my $cgiurl = "http://search.hatena.ne.jp/keyword";
  32. my $keyword = "ajax";
  33. my $cgiquery = "mode=rss2&word=".$keyword;
  34. my $tree = $tpp->parsehttp( POST => $cgiurl, $cgiquery );
  35. print "Link: ", $tree->{rss}->{channel}->{item}->[0]->{link}, "\n";
  36. print "Desc: ", $tree->{rss}->{channel}->{item}->[0]->{description}, "\n";
  37. =head1 DESCRIPTION
  38. XML::TreePP module parses XML file and expands it for a hash tree.
  39. And also generate XML file from a hash tree.
  40. This is a pure Perl implementation.
  41. You can also download XML from remote web server
  42. like XMLHttpRequest object at JavaScript language.
  43. =head1 EXAMPLES
  44. =head2 Parse XML file
  45. Sample XML source:
  46. <?xml version="1.0" encoding="UTF-8"?>
  47. <family name="Kawasaki">
  48. <father>Yasuhisa</father>
  49. <mother>Chizuko</mother>
  50. <children>
  51. <girl>Shiori</girl>
  52. <boy>Yusuke</boy>
  53. <boy>Kairi</boy>
  54. </children>
  55. </family>
  56. Sample program to read a xml file and dump it:
  57. use XML::TreePP;
  58. use Data::Dumper;
  59. my $tpp = XML::TreePP->new();
  60. my $tree = $tpp->parsefile( "family.xml" );
  61. my $text = Dumper( $tree );
  62. print $text;
  63. Result dumped:
  64. $VAR1 = {
  65. 'family' => {
  66. '-name' => 'Kawasaki',
  67. 'father' => 'Yasuhisa',
  68. 'mother' => 'Chizuko',
  69. 'children' => {
  70. 'girl' => 'Shiori'
  71. 'boy' => [
  72. 'Yusuke',
  73. 'Kairi'
  74. ],
  75. }
  76. }
  77. };
  78. Details:
  79. print $tree->{family}->{father}; # the father's given name.
  80. The prefix '-' is added on every attribute's name.
  81. print $tree->{family}->{"-name"}; # the family name of the family
  82. The array is used because the family has two boys.
  83. print $tree->{family}->{children}->{boy}->[1]; # The second boy's name
  84. print $tree->{family}->{children}->{girl}; # The girl's name
  85. =head2 Text node and attributes:
  86. If a element has both of a text node and attributes
  87. or both of a text node and other child nodes,
  88. value of a text node is moved to C<#text> like child nodes.
  89. use XML::TreePP;
  90. use Data::Dumper;
  91. my $tpp = XML::TreePP->new();
  92. my $source = '<span class="author">Kawasaki Yusuke</span>';
  93. my $tree = $tpp->parse( $source );
  94. my $text = Dumper( $tree );
  95. print $text;
  96. The result dumped is following:
  97. $VAR1 = {
  98. 'span' => {
  99. '-class' => 'author',
  100. '#text' => 'Kawasaki Yusuke'
  101. }
  102. };
  103. The special node name of C<#text> is used because this elements
  104. has attribute(s) in addition to the text node.
  105. See also L</text_node_key> option.
  106. =head1 METHODS
  107. =head2 new
  108. This constructor method returns a new XML::TreePP object with C<%options>.
  109. $tpp = XML::TreePP->new( %options );
  110. =head2 set
  111. This method sets a option value for C<option_name>.
  112. If C<$option_value> is not defined, its option is deleted.
  113. $tpp->set( option_name => $option_value );
  114. See OPTIONS section below for details.
  115. =head2 get
  116. This method returns a current option value for C<option_name>.
  117. $tpp->get( 'option_name' );
  118. =head2 parse
  119. This method reads XML source and returns a hash tree converted.
  120. The first argument is a scalar or a reference to a scalar.
  121. $tree = $tpp->parse( $source );
  122. =head2 parsefile
  123. This method reads a XML file and returns a hash tree converted.
  124. The first argument is a filename.
  125. $tree = $tpp->parsefile( $file );
  126. =head2 parsehttp
  127. This method receives a XML file from a remote server via HTTP and
  128. returns a hash tree converted.
  129. $tree = $tpp->parsehttp( $method, $url, $body, $head );
  130. C<$method> is a method of HTTP connection: GET/POST/PUT/DELETE
  131. C<$url> is an URI of a XML file.
  132. C<$body> is a request body when you use POST method.
  133. C<$head> is a request headers as a hash ref.
  134. L<LWP::UserAgent> module or L<HTTP::Lite> module is required to fetch a file.
  135. ( $tree, $xml, $code ) = $tpp->parsehttp( $method, $url, $body, $head );
  136. In array context, This method returns also raw XML source received
  137. and HTTP response's status code.
  138. =head2 write
  139. This method parses a hash tree and returns a XML source generated.
  140. $source = $tpp->write( $tree, $encode );
  141. C<$tree> is a reference to a hash tree.
  142. =head2 writefile
  143. This method parses a hash tree and writes a XML source into a file.
  144. $tpp->writefile( $file, $tree, $encode );
  145. C<$file> is a filename to create.
  146. C<$tree> is a reference to a hash tree.
  147. =head1 OPTIONS FOR PARSING XML
  148. This module accepts option parameters following:
  149. =head2 force_array
  150. This option allows you to specify a list of element names which
  151. should always be forced into an array representation.
  152. $tpp->set( force_array => [ 'rdf:li', 'item', '-xmlns' ] );
  153. The default value is null, it means that context of the elements
  154. will determine to make array or to keep it scalar or hash.
  155. Note that the special wildcard name C<'*'> means all elements.
  156. =head2 force_hash
  157. This option allows you to specify a list of element names which
  158. should always be forced into an hash representation.
  159. $tpp->set( force_hash => [ 'item', 'image' ] );
  160. The default value is null, it means that context of the elements
  161. will determine to make hash or to keep it scalar as a text node.
  162. See also L</text_node_key> option below.
  163. Note that the special wildcard name C<'*'> means all elements.
  164. =head2 cdata_scalar_ref
  165. This option allows you to convert a cdata section into a reference
  166. for scalar on parsing XML source.
  167. $tpp->set( cdata_scalar_ref => 1 );
  168. The default value is false, it means that each cdata section is converted into a scalar.
  169. =head2 user_agent
  170. This option allows you to specify a HTTP_USER_AGENT string which
  171. is used by parsehttp() method.
  172. $tpp->set( user_agent => 'Mozilla/4.0 (compatible; ...)' );
  173. The default string is C<'XML-TreePP/#.##'>, where C<'#.##'> is
  174. substituted with the version number of this library.
  175. =head2 http_lite
  176. This option forces pasrsehttp() method to use a L<HTTP::Lite> instance.
  177. my $http = HTTP::Lite->new();
  178. $tpp->set( http_lite => $http );
  179. =head2 lwp_useragent
  180. This option forces pasrsehttp() method to use a L<LWP::UserAgent> instance.
  181. my $ua = LWP::UserAgent->new();
  182. $ua->timeout( 60 );
  183. $ua->env_proxy;
  184. $tpp->set( lwp_useragent => $ua );
  185. You may use this with L<LWP::UserAgent::WithCache>.
  186. =head2 base_class
  187. This blesses class name for each element's hashref.
  188. Each class is named straight as a child class of it parent class.
  189. $tpp->set( base_class => 'MyElement' );
  190. my $xml = '<root><parent><child key="val">text</child></parent></root>';
  191. my $tree = $tpp->parse( $xml );
  192. print ref $tree->{root}->{parent}->{child}, "\n";
  193. A hash for <child> element above is blessed to C<MyElement::root::parent::child>
  194. class. You may use this with L<Class::Accessor>.
  195. =head2 elem_class
  196. This blesses class name for each element's hashref.
  197. Each class is named horizontally under the direct child of C<MyElement>.
  198. $tpp->set( base_class => 'MyElement' );
  199. my $xml = '<root><parent><child key="val">text</child></parent></root>';
  200. my $tree = $tpp->parse( $xml );
  201. print ref $tree->{root}->{parent}->{child}, "\n";
  202. A hash for <child> element above is blessed to C<MyElement::child> class.
  203. =head1 OPTIONS FOR WRITING XML
  204. =head2 first_out
  205. This option allows you to specify a list of element/attribute
  206. names which should always appears at first on output XML code.
  207. $tpp->set( first_out => [ 'link', 'title', '-type' ] );
  208. The default value is null, it means alphabetical order is used.
  209. =head2 last_out
  210. This option allows you to specify a list of element/attribute
  211. names which should always appears at last on output XML code.
  212. $tpp->set( last_out => [ 'items', 'item', 'entry' ] );
  213. =head2 indent
  214. This makes the output more human readable by indenting appropriately.
  215. $tpp->set( indent => 2 );
  216. This doesn't strictly follow the XML Document Spec but does looks nice.
  217. =head2 xml_decl
  218. This module generates an XML declaration on writing an XML code per default.
  219. This option forces to change or leave it.
  220. $tpp->set( xml_decl => '' );
  221. =head2 output_encoding
  222. This option allows you to specify a encoding of xml file generated
  223. by write/writefile methods.
  224. $tpp->set( output_encoding => 'UTF-8' );
  225. On Perl 5.8.0 and later, you can select it from every
  226. encodings supported by Encode.pm. On Perl 5.6.x and before with
  227. Jcode.pm, you can use C<Shift_JIS>, C<EUC-JP>, C<ISO-2022-JP> and
  228. C<UTF-8>. The default value is C<UTF-8> which is recommended encoding.
  229. =head1 OPTIONS FOR BOTH
  230. =head2 utf8_flag
  231. This makes utf8 flag on for every element's value parsed
  232. and makes it on for an XML code generated as well.
  233. $tpp->set( utf8_flag => 1 );
  234. Perl 5.8.1 or later is required to use this.
  235. =head2 attr_prefix
  236. This option allows you to specify a prefix character(s) which
  237. is inserted before each attribute names.
  238. $tpp->set( attr_prefix => '@' );
  239. The default character is C<'-'>.
  240. Or set C<'@'> to access attribute values like E4X, ECMAScript for XML.
  241. Zero-length prefix C<''> is available as well, it means no prefix is added.
  242. =head2 text_node_key
  243. This option allows you to specify a hash key for text nodes.
  244. $tpp->set( text_node_key => '#text' );
  245. The default key is C<#text>.
  246. =head2 ignore_error
  247. This module calls Carp::croak function on an error per default.
  248. This option makes all errors ignored and just return.
  249. $tpp->set( ignore_error => 1 );
  250. =head2 use_ixhash
  251. This option keeps the order for each element appeared in XML.
  252. L<Tie::IxHash> module is required.
  253. $tpp->set( use_ixhash => 1 );
  254. This makes parsing performance slow.
  255. (about 100% slower than default)
  256. =head1 AUTHOR
  257. Yusuke Kawasaki, http://www.kawa.net/
  258. =head1 COPYRIGHT AND LICENSE
  259. Copyright (c) 2006-2007 Yusuke Kawasaki. All rights reserved.
  260. This program is free software; you can redistribute it and/or
  261. modify it under the same terms as Perl itself.
  262. =cut
  263. package XML::TreePP;
  264. use strict;
  265. use Carp;
  266. use Symbol;
  267. use vars qw( $VERSION );
  268. $VERSION = '0.32';
  269. my $XML_ENCODING = 'UTF-8';
  270. my $INTERNAL_ENCODING = 'UTF-8';
  271. my $USER_AGENT = 'XML-TreePP/'.$VERSION.' ';
  272. my $ATTR_PREFIX = '-';
  273. my $TEXT_NODE_KEY = '#text';
  274. sub new {
  275. my $package = shift;
  276. my $self = {@_};
  277. bless $self, $package;
  278. $self;
  279. }
  280. sub die {
  281. my $self = shift;
  282. my $mess = shift;
  283. return if $self->{ignore_error};
  284. Carp::croak $mess;
  285. }
  286. sub warn {
  287. my $self = shift;
  288. my $mess = shift;
  289. return if $self->{ignore_error};
  290. Carp::carp $mess;
  291. }
  292. sub set {
  293. my $self = shift;
  294. my $key = shift;
  295. my $val = shift;
  296. if ( defined $val ) {
  297. $self->{$key} = $val;
  298. }
  299. else {
  300. delete $self->{$key};
  301. }
  302. }
  303. sub get {
  304. my $self = shift;
  305. my $key = shift;
  306. $self->{$key} if exists $self->{$key};
  307. }
  308. sub writefile {
  309. my $self = shift;
  310. my $file = shift;
  311. my $tree = shift or return $self->die( 'Invalid tree' );
  312. my $encode = shift;
  313. return $self->die( 'Invalid filename' ) unless defined $file;
  314. my $text = $self->write( $tree, $encode );
  315. if ( $] >= 5.008001 && utf8::is_utf8( $text ) ) {
  316. utf8::encode( $text );
  317. }
  318. $self->write_raw_xml( $file, $text );
  319. }
  320. sub write {
  321. my $self = shift;
  322. my $tree = shift or return $self->die( 'Invalid tree' );
  323. my $from = $self->{internal_encoding} || $INTERNAL_ENCODING;
  324. my $to = shift || $self->{output_encoding} || $XML_ENCODING;
  325. my $decl = $self->{xml_decl};
  326. $decl = '<?xml version="1.0" encoding="' . $to . '" ?>' unless defined $decl;
  327. local $self->{__first_out};
  328. if ( exists $self->{first_out} ) {
  329. my $keys = $self->{first_out};
  330. $keys = [$keys] unless ref $keys;
  331. $self->{__first_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
  332. }
  333. local $self->{__last_out};
  334. if ( exists $self->{last_out} ) {
  335. my $keys = $self->{last_out};
  336. $keys = [$keys] unless ref $keys;
  337. $self->{__last_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
  338. }
  339. my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
  340. $tnk = $TEXT_NODE_KEY unless defined $tnk;
  341. local $self->{text_node_key} = $tnk;
  342. my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
  343. $apre = $ATTR_PREFIX unless defined $apre;
  344. local $self->{__attr_prefix_len} = length($apre);
  345. local $self->{__attr_prefix_rex} = defined $apre ? qr/^\Q$apre\E/s : undef;
  346. local $self->{__indent};
  347. if ( exists $self->{indent} && $self->{indent} ) {
  348. $self->{__indent} = ' ' x $self->{indent};
  349. }
  350. my $text = $self->hash_to_xml( undef, $tree );
  351. if ( $from && $to ) {
  352. my $stat = $self->encode_from_to( \$text, $from, $to );
  353. return $self->die( "Unsupported encoding: $to" ) unless $stat;
  354. }
  355. return $text if ( $decl eq '' );
  356. join( "\n", $decl, $text );
  357. }
  358. sub parsehttp {
  359. my $self = shift;
  360. local $self->{__user_agent};
  361. if ( exists $self->{user_agent} ) {
  362. my $agent = $self->{user_agent};
  363. $agent .= $USER_AGENT if ( $agent =~ /\s$/s );
  364. $self->{__user_agent} = $agent if ( $agent ne '' );
  365. } else {
  366. $self->{__user_agent} = $USER_AGENT;
  367. }
  368. my $http = $self->{__http_module};
  369. unless ( $http ) {
  370. $http = $self->find_http_module(@_);
  371. $self->{__http_module} = $http;
  372. }
  373. if ( $http eq 'LWP::UserAgent' ) {
  374. return $self->parsehttp_lwp(@_);
  375. }
  376. elsif ( $http eq 'HTTP::Lite' ) {
  377. return $self->parsehttp_lite(@_);
  378. }
  379. else {
  380. return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
  381. }
  382. }
  383. sub find_http_module {
  384. my $self = shift || {};
  385. if ( exists $self->{lwp_useragent} && ref $self->{lwp_useragent} ) {
  386. return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
  387. return 'LWP::UserAgent' if &load_lwp_useragent();
  388. return $self->die( "LWP::UserAgent is required: $_[1]" );
  389. }
  390. if ( exists $self->{http_lite} && ref $self->{http_lite} ) {
  391. return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
  392. return 'HTTP::Lite' if &load_http_lite();
  393. return $self->die( "HTTP::Lite is required: $_[1]" );
  394. }
  395. return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
  396. return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
  397. return 'LWP::UserAgent' if &load_lwp_useragent();
  398. return 'HTTP::Lite' if &load_http_lite();
  399. return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
  400. }
  401. sub load_lwp_useragent {
  402. return $LWP::UserAgent::VERSION if defined $LWP::UserAgent::VERSION;
  403. local $@;
  404. eval { require LWP::UserAgent; };
  405. $LWP::UserAgent::VERSION;
  406. }
  407. sub load_http_lite {
  408. return $HTTP::Lite::VERSION if defined $HTTP::Lite::VERSION;
  409. local $@;
  410. eval { require HTTP::Lite; };
  411. $HTTP::Lite::VERSION;
  412. }
  413. sub load_tie_ixhash {
  414. return $Tie::IxHash::VERSION if defined $Tie::IxHash::VERSION;
  415. local $@;
  416. eval { require Tie::IxHash; };
  417. $Tie::IxHash::VERSION;
  418. }
  419. sub parsehttp_lwp {
  420. my $self = shift;
  421. my $method = shift or return $self->die( 'Invalid HTTP method' );
  422. my $url = shift or return $self->die( 'Invalid URL' );
  423. my $body = shift;
  424. my $header = shift;
  425. my $ua = $self->{lwp_useragent} if exists $self->{lwp_useragent};
  426. if ( ! ref $ua ) {
  427. $ua = LWP::UserAgent->new();
  428. $ua->timeout(10);
  429. $ua->env_proxy();
  430. $ua->agent( $self->{__user_agent} ) if defined $self->{__user_agent};
  431. } else {
  432. $ua->agent( $self->{__user_agent} ) if exists $self->{user_agent};
  433. }
  434. my $req = HTTP::Request->new( $method, $url );
  435. my $ct = 0;
  436. if ( ref $header ) {
  437. foreach my $field ( sort keys %$header ) {
  438. my $value = $header->{$field};
  439. $req->header( $field => $value );
  440. $ct ++ if ( $field =~ /^Content-Type$/i );
  441. }
  442. }
  443. if ( defined $body && ! $ct ) {
  444. $req->header( 'Content-Type' => 'application/x-www-form-urlencoded' );
  445. }
  446. $req->content($body) if defined $body;
  447. my $res = $ua->request($req);
  448. my $code = $res->code();
  449. my $text = $res->content();
  450. my $tree = $self->parse( \$text ) if $res->is_success();
  451. wantarray ? ( $tree, $text, $code ) : $tree;
  452. }
  453. sub parsehttp_lite {
  454. my $self = shift;
  455. my $method = shift or return $self->die( 'Invalid HTTP method' );
  456. my $url = shift or return $self->die( 'Invalid URL' );
  457. my $body = shift;
  458. my $header = shift;
  459. my $http = HTTP::Lite->new();
  460. $http->method($method);
  461. my $ua = 0;
  462. if ( ref $header ) {
  463. foreach my $field ( sort keys %$header ) {
  464. my $value = $header->{$field};
  465. $http->add_req_header( $field, $value );
  466. $ua ++ if ( $field =~ /^User-Agent$/i );
  467. }
  468. }
  469. if ( defined $self->{__user_agent} && ! $ua ) {
  470. $http->add_req_header( 'User-Agent', $self->{__user_agent} );
  471. }
  472. $http->{content} = $body if defined $body;
  473. my $code = $http->request($url) or return;
  474. my $text = $http->body();
  475. my $tree = $self->parse( \$text );
  476. wantarray ? ( $tree, $text, $code ) : $tree;
  477. }
  478. sub parsefile {
  479. my $self = shift;
  480. my $file = shift;
  481. return $self->die( 'Invalid filename' ) unless defined $file;
  482. my $text = $self->read_raw_xml($file);
  483. $self->parse( \$text );
  484. }
  485. sub parse {
  486. my $self = shift;
  487. my $text = ref $_[0] ? ${$_[0]} : $_[0];
  488. return $self->die( 'Null XML source' ) unless defined $text;
  489. my $from = &xml_decl_encoding(\$text) || $XML_ENCODING;
  490. my $to = $self->{internal_encoding} || $INTERNAL_ENCODING;
  491. if ( $from && $to ) {
  492. my $stat = $self->encode_from_to( \$text, $from, $to );
  493. return $self->die( "Unsupported encoding: $from" ) unless $stat;
  494. }
  495. local $self->{__force_array};
  496. local $self->{__force_array_all};
  497. if ( exists $self->{force_array} ) {
  498. my $force = $self->{force_array};
  499. $force = [$force] unless ref $force;
  500. $self->{__force_array} = { map { $_ => 1 } @$force };
  501. $self->{__force_array_all} = $self->{__force_array}->{'*'};
  502. }
  503. local $self->{__force_hash};
  504. local $self->{__force_hash_all};
  505. if ( exists $self->{force_hash} ) {
  506. my $force = $self->{force_hash};
  507. $force = [$force] unless ref $force;
  508. $self->{__force_hash} = { map { $_ => 1 } @$force };
  509. $self->{__force_hash_all} = $self->{__force_hash}->{'*'};
  510. }
  511. my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
  512. $tnk = $TEXT_NODE_KEY unless defined $tnk;
  513. local $self->{text_node_key} = $tnk;
  514. my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
  515. $apre = $ATTR_PREFIX unless defined $apre;
  516. local $self->{attr_prefix} = $apre;
  517. if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
  518. return $self->die( "Tie::IxHash is required." ) unless &load_tie_ixhash();
  519. }
  520. my $flat = $self->xml_to_flat(\$text);
  521. my $class = $self->{base_class} if exists $self->{base_class};
  522. my $tree = $self->flat_to_tree( $flat, '', $class );
  523. if ( ref $tree ) {
  524. if ( defined $class ) {
  525. bless( $tree, $class );
  526. }
  527. elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
  528. bless( $tree, $self->{elem_class} );
  529. }
  530. }
  531. wantarray ? ( $tree, $text ) : $tree;
  532. }
  533. sub xml_to_flat {
  534. my $self = shift;
  535. my $textref = shift; # reference
  536. my $flat = [];
  537. my $prefix = $self->{attr_prefix};
  538. my $ixhash = ( exists $self->{use_ixhash} && $self->{use_ixhash} );
  539. while ( $$textref =~ m{
  540. ([^<]*) <
  541. ((
  542. \? ([^<>]*) \?
  543. )|(
  544. \!\[CDATA\[(.*?)\]\]
  545. )|(
  546. \!DOCTYPE\s+([^\[\]<>]*(?:\[.*?\]\s*)?)
  547. )|(
  548. \!--(.*?)--
  549. )|(
  550. ([^\!\?\s<>](?:"[^"]*"|'[^']*'|[^"'<>])*)
  551. ))
  552. > ([^<]*)
  553. }sxg ) {
  554. my (
  555. $ahead, $match, $typePI, $contPI, $typeCDATA,
  556. $contCDATA, $typeDocT, $contDocT, $typeCmnt, $contCmnt,
  557. $typeElem, $contElem, $follow
  558. )
  559. = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 );
  560. if ( defined $ahead && $ahead =~ /\S/ ) {
  561. $self->warn( "Invalid string: [$ahead] before <$match>" );
  562. }
  563. if ($typeElem) { # Element
  564. my $node = {};
  565. if ( $contElem =~ s#^/## ) {
  566. $node->{endTag}++;
  567. }
  568. elsif ( $contElem =~ s#/$## ) {
  569. $node->{emptyTag}++;
  570. }
  571. else {
  572. $node->{startTag}++;
  573. }
  574. $node->{tagName} = $1 if ( $contElem =~ s#^(\S+)\s*## );
  575. unless ( $node->{endTag} ) {
  576. my $attr;
  577. while ( $contElem =~ m{
  578. ([^\s\=\"\']+)=(?:(")(.*?)"|'(.*?)')
  579. }sxg ) {
  580. my $key = $1;
  581. my $val = &xml_unescape( $2 ? $3 : $4 );
  582. if ( ! ref $attr ) {
  583. $attr = {};
  584. tie( %$attr, 'Tie::IxHash' ) if $ixhash;
  585. }
  586. $attr->{$prefix.$key} = $val;
  587. }
  588. $node->{attributes} = $attr if ref $attr;
  589. }
  590. push( @$flat, $node );
  591. }
  592. elsif ($typeCDATA) { ## CDATASection
  593. if ( exists $self->{cdata_scalar_ref} && $self->{cdata_scalar_ref} ) {
  594. push( @$flat, \$contCDATA ); # as reference for scalar
  595. }
  596. else {
  597. push( @$flat, $contCDATA ); # as scalar like text node
  598. }
  599. }
  600. elsif ($typeCmnt) { # Comment (ignore)
  601. }
  602. elsif ($typeDocT) { # DocumentType (ignore)
  603. }
  604. elsif ($typePI) { # ProcessingInstruction (ignore)
  605. }
  606. else {
  607. $self->warn( "Invalid Tag: <$match>" );
  608. }
  609. if ( $follow =~ /\S/ ) { # text node
  610. my $val = &xml_unescape($follow);
  611. push( @$flat, $val );
  612. }
  613. }
  614. $flat;
  615. }
  616. sub flat_to_tree {
  617. my $self = shift;
  618. my $source = shift;
  619. my $parent = shift;
  620. my $class = shift;
  621. my $tree = {};
  622. my $text = [];
  623. if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
  624. tie( %$tree, 'Tie::IxHash' );
  625. }
  626. while ( scalar @$source ) {
  627. my $node = shift @$source;
  628. if ( !ref $node || UNIVERSAL::isa( $node, "SCALAR" ) ) {
  629. push( @$text, $node ); # cdata or text node
  630. next;
  631. }
  632. my $name = $node->{tagName};
  633. if ( $node->{endTag} ) {
  634. last if ( $parent eq $name );
  635. return $self->die( "Invalid tag sequence: <$parent></$name>" );
  636. }
  637. my $elem = $node->{attributes};
  638. my $forcehash = $self->{__force_hash_all} || $self->{__force_hash}->{$name};
  639. my $subclass;
  640. if ( defined $class ) {
  641. my $escname = $name;
  642. $escname =~ s/\W/_/sg;
  643. $subclass = $class.'::'.$escname;
  644. }
  645. if ( $node->{startTag} ) { # recursive call
  646. my $child = $self->flat_to_tree( $source, $name, $subclass );
  647. next unless defined $child;
  648. my $hasattr = scalar keys %$elem if ref $elem;
  649. if ( UNIVERSAL::isa( $child, "HASH" ) ) {
  650. if ( $hasattr ) {
  651. # some attributes and some child nodes
  652. %$elem = ( %$elem, %$child );
  653. }
  654. else {
  655. # some child nodes without attributes
  656. $elem = $child;
  657. }
  658. }
  659. else {
  660. if ( $hasattr ) {
  661. # some attributes and text node
  662. $elem->{$self->{text_node_key}} = $child;
  663. }
  664. elsif ( $forcehash ) {
  665. # only text node without attributes
  666. $elem = { $self->{text_node_key} => $child };
  667. }
  668. else {
  669. # text node without attributes
  670. $elem = $child;
  671. }
  672. }
  673. }
  674. elsif ( $forcehash && ! ref $elem ) {
  675. $elem = {};
  676. }
  677. # bless to a class by base_class or elem_class
  678. if ( ref $elem && UNIVERSAL::isa( $elem, "HASH" ) ) {
  679. if ( defined $subclass ) {
  680. bless( $elem, $subclass );
  681. } elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
  682. my $escname = $name;
  683. $escname =~ s/\W/_/sg;
  684. my $elmclass = $self->{elem_class}.'::'.$escname;
  685. bless( $elem, $elmclass );
  686. }
  687. }
  688. # next unless defined $elem;
  689. $tree->{$name} ||= [];
  690. push( @{ $tree->{$name} }, $elem );
  691. }
  692. if ( ! $self->{__force_array_all} ) {
  693. foreach my $key ( keys %$tree ) {
  694. next if $self->{__force_array}->{$key};
  695. next if ( 1 < scalar @{ $tree->{$key} } );
  696. $tree->{$key} = shift @{ $tree->{$key} };
  697. }
  698. }
  699. my $haschild = scalar keys %$tree;
  700. if ( scalar @$text ) {
  701. if ( scalar @$text == 1 ) {
  702. # one text node (normal)
  703. $text = shift @$text;
  704. }
  705. elsif ( ! scalar grep {ref $_} @$text ) {
  706. # some text node splitted
  707. $text = join( '', @$text );
  708. }
  709. else {
  710. # some cdata node
  711. my $join = join( '', map {ref $_ ? $$_ : $_} @$text );
  712. $text = \$join;
  713. }
  714. if ( $haschild ) {
  715. # some child nodes and also text node
  716. $tree->{$self->{text_node_key}} = $text;
  717. }
  718. else {
  719. # only text node without child nodes
  720. $tree = $text;
  721. }
  722. }
  723. elsif ( ! $haschild ) {
  724. # no child and no text
  725. $tree = "";
  726. }
  727. $tree;
  728. }
  729. sub hash_to_xml {
  730. my $self = shift;
  731. my $name = shift;
  732. my $hash = shift;
  733. my $out = [];
  734. my $attr = [];
  735. my $allkeys = [ keys %$hash ];
  736. my $fo = $self->{__first_out} if ref $self->{__first_out};
  737. my $lo = $self->{__last_out} if ref $self->{__last_out};
  738. my $firstkeys = [ sort { $fo->{$a} <=> $fo->{$b} } grep { exists $fo->{$_} } @$allkeys ] if ref $fo;
  739. my $lastkeys = [ sort { $lo->{$a} <=> $lo->{$b} } grep { exists $lo->{$_} } @$allkeys ] if ref $lo;
  740. $allkeys = [ grep { ! exists $fo->{$_} } @$allkeys ] if ref $fo;
  741. $allkeys = [ grep { ! exists $lo->{$_} } @$allkeys ] if ref $lo;
  742. unless ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
  743. $allkeys = [ sort @$allkeys ];
  744. }
  745. my $prelen = $self->{__attr_prefix_len};
  746. my $pregex = $self->{__attr_prefix_rex};
  747. foreach my $keys ( $firstkeys, $allkeys, $lastkeys ) {
  748. next unless ref $keys;
  749. my $elemkey = $prelen ? [ grep { $_ !~ $pregex } @$keys ] : $keys;
  750. my $attrkey = $prelen ? [ grep { $_ =~ $pregex } @$keys ] : [];
  751. foreach my $key ( @$elemkey ) {
  752. my $val = $hash->{$key};
  753. if ( !defined $val ) {
  754. push( @$out, "<$key />" );
  755. }
  756. elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
  757. my $child = $self->array_to_xml( $key, $val );
  758. push( @$out, $child );
  759. }
  760. elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
  761. my $child = $self->scalaref_to_cdata( $key, $val );
  762. push( @$out, $child );
  763. }
  764. elsif ( ref $val ) {
  765. my $child = $self->hash_to_xml( $key, $val );
  766. push( @$out, $child );
  767. }
  768. else {
  769. my $child = $self->scalar_to_xml( $key, $val );
  770. push( @$out, $child );
  771. }
  772. }
  773. foreach my $key ( @$attrkey ) {
  774. my $name = substr( $key, $prelen );
  775. my $val = &xml_escape( $hash->{$key} );
  776. push( @$attr, ' ' . $name . '="' . $val . '"' );
  777. }
  778. }
  779. my $jattr = join( '', @$attr );
  780. if ( defined $name && scalar @$out && ! grep { ! /^</s } @$out ) {
  781. # Use human-friendly white spacing
  782. if ( defined $self->{__indent} ) {
  783. s/^(\s*<)/$self->{__indent}$1/mg foreach @$out;
  784. }
  785. unshift( @$out, "\n" );
  786. }
  787. my $text = join( '', @$out );
  788. if ( defined $name ) {
  789. if ( scalar @$out ) {
  790. $text = "<$name$jattr>$text</$name>\n";
  791. }
  792. else {
  793. $text = "<$name$jattr />\n";
  794. }
  795. }
  796. $text;
  797. }
  798. sub array_to_xml {
  799. my $self = shift;
  800. my $name = shift;
  801. my $array = shift;
  802. my $out = [];
  803. foreach my $val (@$array) {
  804. if ( !defined $val ) {
  805. push( @$out, "<$name />\n" );
  806. }
  807. elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
  808. my $child = $self->array_to_xml( $name, $val );
  809. push( @$out, $child );
  810. }
  811. elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
  812. my $child = $self->scalaref_to_cdata( $name, $val );
  813. push( @$out, $child );
  814. }
  815. elsif ( ref $val ) {
  816. my $child = $self->hash_to_xml( $name, $val );
  817. push( @$out, $child );
  818. }
  819. else {
  820. my $child = $self->scalar_to_xml( $name, $val );
  821. push( @$out, $child );
  822. }
  823. }
  824. my $text = join( '', @$out );
  825. $text;
  826. }
  827. sub scalaref_to_cdata {
  828. my $self = shift;
  829. my $name = shift;
  830. my $ref = shift;
  831. my $data = defined $$ref ? $$ref : '';
  832. $data =~ s#(]])(>)#$1]]><![CDATA[$2#g;
  833. #my $text = '<![CDATA[' . $data . ']]>';
  834. my $text = $data;
  835. $text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
  836. $text;
  837. }
  838. sub scalar_to_xml {
  839. my $self = shift;
  840. my $name = shift;
  841. my $scalar = shift;
  842. my $copy = $scalar;
  843. my $text = &xml_escape($copy);
  844. $text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
  845. $text;
  846. }
  847. sub write_raw_xml {
  848. my $self = shift;
  849. my $file = shift;
  850. my $fh = Symbol::gensym();
  851. open( $fh, ">$file" ) or return $self->die( "$! - $file" );
  852. print $fh @_;
  853. close($fh);
  854. }
  855. sub read_raw_xml {
  856. my $self = shift;
  857. my $file = shift;
  858. my $fh = Symbol::gensym();
  859. open( $fh, $file ) or return $self->die( "$! - $file" );
  860. local $/ = undef;
  861. my $text = <$fh>;
  862. close($fh);
  863. $text;
  864. }
  865. sub xml_decl_encoding {
  866. my $textref = shift;
  867. return unless defined $$textref;
  868. my $args = ( $$textref =~ /^\s*<\?xml(\s+\S.*)\?>/s )[0] or return;
  869. my $getcode = ( $args =~ /\s+encoding=(".*?"|'.*?')/ )[0] or return;
  870. $getcode =~ s/^['"]//;
  871. $getcode =~ s/['"]$//;
  872. $getcode;
  873. }
  874. sub encode_from_to {
  875. my $self = shift;
  876. my $txtref = shift or return;
  877. my $from = shift or return;
  878. my $to = shift or return;
  879. unless ( defined $Encode::EUCJPMS::VERSION ) {
  880. $from = 'EUC-JP' if ( $from =~ /\beuc-?jp-?(win|ms)$/i );
  881. $to = 'EUC-JP' if ( $to =~ /\beuc-?jp-?(win|ms)$/i );
  882. }
  883. my $setflag = $self->{utf8_flag} if exists $self->{utf8_flag};
  884. if ( $] < 5.008001 && $setflag ) {
  885. return $self->die( "Perl 5.8.1 is required for utf8_flag: $]" );
  886. }
  887. if ( $] >= 5.008 ) {
  888. &load_encode();
  889. my $check = ( $Encode::VERSION < 2.13 ) ? 0x400 : Encode::FB_XMLCREF();
  890. if ( $] >= 5.008001 && utf8::is_utf8( $$txtref ) ) {
  891. if ( $to =~ /^utf-?8$/i ) {
  892. # skip
  893. } else {
  894. $$txtref = Encode::encode( $to, $$txtref, $check );
  895. }
  896. } else {
  897. $$txtref = Encode::decode( $from, $$txtref );
  898. if ( $to =~ /^utf-?8$/i && $setflag ) {
  899. # skip
  900. } else {
  901. $$txtref = Encode::encode( $to, $$txtref, $check );
  902. }
  903. }
  904. }
  905. elsif ( ( uc($from) eq 'ISO-8859-1'
  906. || uc($from) eq 'US-ASCII'
  907. || uc($from) eq 'LATIN-1' ) && uc($to) eq 'UTF-8' ) {
  908. &latin1_to_utf8($txtref);
  909. }
  910. else {
  911. my $jfrom = &get_jcode_name($from);
  912. my $jto = &get_jcode_name($to);
  913. return $to if ( uc($jfrom) eq uc($jto) );
  914. if ( $jfrom && $jto ) {
  915. &load_jcode();
  916. if ( defined $Jcode::VERSION ) {
  917. Jcode::convert( $txtref, $jto, $jfrom );
  918. }
  919. else {
  920. return $self->die( "Jcode.pm is required: $from to $to" );
  921. }
  922. }
  923. else {
  924. return $self->die( "Encode.pm is required: $from to $to" );
  925. }
  926. }
  927. $to;
  928. }
  929. sub load_jcode {
  930. return if defined $Jcode::VERSION;
  931. local $@;
  932. eval { require Jcode; };
  933. }
  934. sub load_encode {
  935. return if defined $Encode::VERSION;
  936. local $@;
  937. eval { require Encode; };
  938. }
  939. sub latin1_to_utf8 {
  940. my $strref = shift;
  941. $$strref =~ s{
  942. ([\x80-\xFF])
  943. }{
  944. pack( 'C2' => 0xC0|(ord($1)>>6),0x80|(ord($1)&0x3F) )
  945. }exg;
  946. }
  947. sub get_jcode_name {
  948. my $src = shift;
  949. my $dst;
  950. if ( $src =~ /^utf-?8$/i ) {
  951. $dst = 'utf8';
  952. }
  953. elsif ( $src =~ /^euc.*jp(-?(win|ms))?$/i ) {
  954. $dst = 'euc';
  955. }
  956. elsif ( $src =~ /^(shift.*jis|cp932|windows-31j)$/i ) {
  957. $dst = 'sjis';
  958. }
  959. elsif ( $src =~ /^iso-2022-jp/ ) {
  960. $dst = 'jis';
  961. }
  962. $dst;
  963. }
  964. sub xml_escape {
  965. my $str = shift;
  966. return '' unless defined $str;
  967. # except for TAB(\x09),CR(\x0D),LF(\x0A)
  968. $str =~ s{
  969. ([\x00-\x08\x0B\x0C\x0E-\x1F\x7F])
  970. }{
  971. sprintf( '&#%d;', ord($1) );
  972. }gex;
  973. $str =~ s/&(?!#(\d+;|x[\dA-Fa-f]+;))/&amp;/g;
  974. $str =~ s/</&lt;/g;
  975. $str =~ s/>/&gt;/g;
  976. $str =~ s/'/&apos;/g;
  977. $str =~ s/"/&quot;/g;
  978. $str;
  979. }
  980. sub xml_unescape {
  981. my $str = shift;
  982. my $map = {qw( quot " lt < gt > apos ' amp & )};
  983. $str =~ s{
  984. (&(?:\#(\d+)|\#x([0-9a-fA-F]+)|(quot|lt|gt|apos|amp));)
  985. }{
  986. $4 ? $map->{$4} : &char_deref($1,$2,$3);
  987. }gex;
  988. $str;
  989. }
  990. sub char_deref {
  991. my( $str, $dec, $hex ) = @_;
  992. if ( defined $dec ) {
  993. return &code_to_utf8( $dec ) if ( $dec < 256 );
  994. }
  995. elsif ( defined $hex ) {
  996. my $num = hex($hex);
  997. return &code_to_utf8( $num ) if ( $num < 256 );
  998. }
  999. return $str;
  1000. }
  1001. sub code_to_utf8 {
  1002. my $code = shift;
  1003. if ( $code < 128 ) {
  1004. return pack( C => $code );
  1005. }
  1006. elsif ( $code < 256 ) {
  1007. return pack( C2 => 0xC0|($code>>6), 0x80|($code&0x3F));
  1008. }
  1009. elsif ( $code < 65536 ) {
  1010. return pack( C3 => 0xC0|($code>>12), 0x80|(($code>>6)&0x3F), 0x80|($code&0x3F));
  1011. }
  1012. return shift if scalar @_; # default value
  1013. sprintf( '&#x%04X;', $code );
  1014. }
  1015. 1;