From 6da20677ef062b295c5cae6b44df4a50a51aeea1 Mon Sep 17 00:00:00 2001 From: Willem van Bergen Date: Fri, 1 Jan 2010 10:03:34 +0100 Subject: [PATCH] Bugfixes, speed improvements and code cleanup for Nokogiri's and LibXML's XmlMini backend --- .../lib/active_support/xml_mini/libxml.rb | 106 +++-------- .../lib/active_support/xml_mini/nokogiri.rb | 47 +++-- activesupport/test/xml_mini/libxml_engine_test.rb | 204 ++++++++++++++++++++ .../test/xml_mini/nokogiri_engine_test.rb | 42 ++++- 4 files changed, 291 insertions(+), 108 deletions(-) create mode 100644 activesupport/test/xml_mini/libxml_engine_test.rb diff --git a/activesupport/lib/active_support/xml_mini/libxml.rb b/activesupport/lib/active_support/xml_mini/libxml.rb index 3586b24..837a1df 100644 --- a/activesupport/lib/active_support/xml_mini/libxml.rb +++ b/activesupport/lib/active_support/xml_mini/libxml.rb @@ -9,15 +9,12 @@ module ActiveSupport # string:: # XML Document string to parse def parse(string) - LibXML::XML.default_keep_blanks = false - if string.blank? {} else LibXML::XML::Parser.string(string.strip).parse.to_hash end end - end end @@ -30,101 +27,44 @@ module LibXML end module Node - CONTENT_ROOT = '__content__' - LIB_XML_LIMIT = 30000000 # Hardcoded LibXML limit + CONTENT_ROOT = '__content__'.freeze # Convert XML document to hash # # hash:: # Hash to merge the converted element into. def to_hash(hash={}) - if text? - raise LibXML::XML::Error if content.length >= LIB_XML_LIMIT - hash[CONTENT_ROOT] = content - else - sub_hash = insert_name_into_hash(hash, name) - attributes_to_hash(sub_hash) - if array? - children_array_to_hash(sub_hash) - elsif yaml? - children_yaml_to_hash(sub_hash) - else - children_to_hash(sub_hash) - end + node_hash = {} + + # Insert node hash into parent hash correctly. + case hash[name] + when Array then hash[name] << node_hash + when Hash then hash[name] = [hash[name], node_hash] + when nil then hash[name] = node_hash + else raise "Unexpected error during hash insertion!" end - hash - end - protected - - # Insert name into hash - # - # hash:: - # Hash to merge the converted element into. - # name:: - # name to to merge into hash - def insert_name_into_hash(hash, name) - sub_hash = {} - if hash[name] - if !hash[name].kind_of? Array - hash[name] = [hash[name]] - end - hash[name] << sub_hash - else - hash[name] = sub_hash + # Handle child elements + each_child do |c| + if c.element? + c.to_hash(node_hash) + elsif c.text? || c.cdata? + node_hash[CONTENT_ROOT] ||= '' + node_hash[CONTENT_ROOT] << c.content end - sub_hash end - # Insert children into hash - # - # hash:: - # Hash to merge the children into. - def children_to_hash(hash={}) - each { |child| child.to_hash(hash) } - attributes_to_hash(hash) - hash - end - # Convert xml attributes to hash - # - # hash:: - # Hash to merge the attributes into - def attributes_to_hash(hash={}) - each_attr { |attr| hash[attr.name] = attr.value } - hash + # Remove content node if it is blank + if node_hash.length > 1 && node_hash[CONTENT_ROOT].blank? + node_hash.delete(CONTENT_ROOT) end - # Convert array into hash - # - # hash:: - # Hash to merge the array into - def children_array_to_hash(hash={}) - hash[child.name] = map do |child| - returning({}) { |sub_hash| child.children_to_hash(sub_hash) } - end - hash - end - - # Convert yaml into hash - # - # hash:: - # Hash to merge the yaml into - def children_yaml_to_hash(hash = {}) - hash[CONTENT_ROOT] = content unless content.blank? - hash - end - - # Check if child is of type array - def array? - child? && child.next? && child.name == child.next.name - end - - # Check if child is of type yaml - def yaml? - attributes.collect{|x| x.value}.include?('yaml') - end + # Handle attributes + each_attr { |a| node_hash[a.name] = a.value } + hash + end end end end diff --git a/activesupport/lib/active_support/xml_mini/nokogiri.rb b/activesupport/lib/active_support/xml_mini/nokogiri.rb index f3c64c6..096ae0f 100644 --- a/activesupport/lib/active_support/xml_mini/nokogiri.rb +++ b/activesupport/lib/active_support/xml_mini/nokogiri.rb @@ -12,7 +12,7 @@ module ActiveSupport if string.blank? {} else - doc = Nokogiri::XML(string) { |cfg| cfg.noblanks } + doc = Nokogiri::XML(string) raise doc.errors.first if doc.errors.length > 0 doc.to_hash end @@ -26,40 +26,43 @@ module ActiveSupport end module Node - CONTENT_ROOT = '__content__' + CONTENT_ROOT = '__content__'.freeze # Convert XML document to hash # # hash:: # Hash to merge the converted element into. def to_hash(hash = {}) - attributes = attributes_as_hash - if hash[name] - hash[name] = [hash[name]].flatten - hash[name] << attributes - else - hash[name] ||= attributes + node_hash = {} + + # Insert node hash into parent hash correctly. + case hash[name] + when Array then hash[name] << node_hash + when Hash then hash[name] = [hash[name], node_hash] + when nil then hash[name] = node_hash + else raise "Unexpected error during hash insertion!" end - children.each { |child| - next if child.blank? && 'file' != self['type'] + # Handle child elements + children.each do |c| + if c.element? + c.to_hash(node_hash) + elsif c.text? || c.cdata? + node_hash[CONTENT_ROOT] ||= '' + node_hash[CONTENT_ROOT] << c.content + end + end - if child.text? || child.cdata? - (attributes[CONTENT_ROOT] ||= '') << child.content - next - end + # Remove content node if it is blank and there are child tags + if node_hash.length > 1 && node_hash[CONTENT_ROOT].blank? + node_hash.delete(CONTENT_ROOT) + end - child.to_hash attributes - } + # Handle attributes + attribute_nodes.each { |a| node_hash[a.node_name] = a.value } hash end - - def attributes_as_hash - Hash[*(attribute_nodes.map { |node| - [node.node_name, node.value] - }.flatten)] - end end end diff --git a/activesupport/test/xml_mini/libxml_engine_test.rb b/activesupport/test/xml_mini/libxml_engine_test.rb new file mode 100644 index 0000000..dcff636 --- /dev/null +++ b/activesupport/test/xml_mini/libxml_engine_test.rb @@ -0,0 +1,204 @@ +require 'abstract_unit' +require 'active_support/xml_mini' + +begin + gem 'libxml-ruby' +rescue Gem::LoadError + # Skip nokogiri tests +else + +require 'libxml' + +class NokogiriEngineTest < Test::Unit::TestCase + include ActiveSupport + + def setup + @default_backend = XmlMini.backend + XmlMini.backend = 'LibXML' + end + + def teardown + XmlMini.backend = @default_backend + end + + def test_file_from_xml + hash = Hash.from_xml(<<-eoxml) + + + + + eoxml + assert hash.has_key?('blog') + assert hash['blog'].has_key?('logo') + + file = hash['blog']['logo'] + assert_equal 'logo.png', file.original_filename + assert_equal 'image/png', file.content_type + end + + def test_exception_thrown_on_expansion_attack + assert_raise LibXML::XML::Error do + attack_xml = <<-EOT + + + + + + + + + ]> + + &a; + + EOT + Hash.from_xml(attack_xml) + end + end + + def test_setting_nokogiri_as_backend + XmlMini.backend = 'LibXML' + assert_equal XmlMini_LibXML, XmlMini.backend + end + + def test_blank_returns_empty_hash + assert_equal({}, XmlMini.parse(nil)) + assert_equal({}, XmlMini.parse('')) + end + + def test_array_type_makes_an_array + assert_equal_rexml(<<-eoxml) + + + a post + another post + + + eoxml + end + + def test_one_node_document_as_hash + assert_equal_rexml(<<-eoxml) + + eoxml + end + + def test_one_node_with_attributes_document_as_hash + assert_equal_rexml(<<-eoxml) + + eoxml + end + + def test_products_node_with_book_node_as_hash + assert_equal_rexml(<<-eoxml) + + + + eoxml + end + + def test_products_node_with_two_book_nodes_as_hash + assert_equal_rexml(<<-eoxml) + + + + + eoxml + end + + def test_single_node_with_content_as_hash + assert_equal_rexml(<<-eoxml) + + hello world + + eoxml + end + + def test_children_with_children + assert_equal_rexml(<<-eoxml) + + + + + + eoxml + end + + def test_children_with_text + assert_equal_rexml(<<-eoxml) + + + hello everyone + + + eoxml + end + + def test_children_with_non_adjacent_text + assert_equal_rexml(<<-eoxml) + + good + + hello everyone + + morning + + eoxml + end + + def test_children_with_simple_cdata + assert_equal_rexml(<<-eoxml) + + + + + + eoxml + end + + def test_children_with_multiple_cdata + assert_equal_rexml(<<-eoxml) + + + + + + eoxml + end + + def test_children_with_text_and_cdata + assert_equal_rexml(<<-eoxml) + + + hello + morning + + + eoxml + end + + def test_children_with_blank_text + assert_equal_rexml(<<-eoxml) + + + + eoxml + end + + def test_children_with_blank_text_and_attribute + assert_equal_rexml(<<-eoxml) + + + + eoxml + end + + private + def assert_equal_rexml(xml) + hash = XmlMini.with_backend('REXML') { XmlMini.parse(xml) } + assert_equal(hash, XmlMini.parse(xml)) + end +end + +end diff --git a/activesupport/test/xml_mini/nokogiri_engine_test.rb b/activesupport/test/xml_mini/nokogiri_engine_test.rb index 09989f2..08e2147 100644 --- a/activesupport/test/xml_mini/nokogiri_engine_test.rb +++ b/activesupport/test/xml_mini/nokogiri_engine_test.rb @@ -147,17 +147,53 @@ class NokogiriEngineTest < Test::Unit::TestCase eoxml end - def test_children_with_cdata + def test_children_with_simple_cdata + assert_equal_rexml(<<-eoxml) + + + + + + eoxml + end + + def test_children_with_multiple_cdata assert_equal_rexml(<<-eoxml) - hello - morning + eoxml end + def test_children_with_text_and_cdata + assert_equal_rexml(<<-eoxml) + + + hello + morning + + + eoxml + end + + def test_children_with_blank_text + assert_equal_rexml(<<-eoxml) + + + + eoxml + end + + def test_children_with_blank_text_and_attribute + assert_equal_rexml(<<-eoxml) + + + + eoxml + end + private def assert_equal_rexml(xml) hash = XmlMini.with_backend('REXML') { XmlMini.parse(xml) } -- 1.6.5.1