2004.10.24 (q1) - 多くの場合に要素が正しい入れ子関係になるように修正。 - 余分な終了タグが生成される点を修正。 - テストケースを追加。 2004.10.29 (q2) - テストケースを追加。 diff -Naurw htmlrepair-1.0.1.orig/htmlrepair.rb htmlrepair-1.0.1/htmlrepair.rb --- htmlrepair-1.0.1.orig/htmlrepair.rb 2004-10-24 15:57:46.000000000 +0900 +++ htmlrepair-1.0.1/htmlrepair.rb 2004-10-24 20:36:22.000000000 +0900 @@ -22,9 +22,17 @@ require "htmlsplit" class HTMLSplit + CONTENT_FLOW = %w(div center blockquote ins del dd li form button th td iframe noscript) + NOT_FLOW = %w(area param dt dd li optgroup option legend + caption col colgroup thead tfoot tbody tr th td) + CONTENT_INLINE = %w(tt i b u s strike big small + em strong dfn code samp kbd var cite abbr acronym + sub sup span bdo font a p h1 h2 h3 h4 h5 h6 pre q dt label + legend caption) + BLOCK = %w(p h1 h2 h3 h4 h5 h6 ul ol dir menu pre dl div center noscript noframes + blockquote form isindex hr table fieldset address) PARENTTAG = { - 'p' => %w(body table), 'a' => %w(body), 'thead' => %w(table), 'tfoot' => %w(table), @@ -44,9 +52,8 @@ 'select' => %w(form), 'keygen' => %w(form), 'label' => %w(form), - 'fieldset' => %w(form), - 'legend' => %w(fieldset), - 'option' => %w(select), + 'option' => %w(select optgroup), + 'optgroup' => %w(select), } def repair @@ -54,9 +61,7 @@ doc = [] @document.each {|e| case e - when EmptyElementTag - doc.push e - when StartTag + when StartTag, EmptyElementTag if PARENTTAG[e.name] && (a = tag.rindex(e.name)) #ネストか終了タグの省略かチェック flag = true @@ -70,17 +75,26 @@ if flag #省略された終了タグを出力 while t=tag.pop - c = EndTag.new(t) - doc.push c + doc.push EndTag.new(t) if t==e.name break end end end - else end + + if NOT_FLOW.include?(e.name) + while CONTENT_FLOW.include?(tag.last) || CONTENT_INLINE.include?(tag.last) + doc.push EndTag.new(tag.pop) + end + elsif BLOCK.include?(e.name) + while CONTENT_INLINE.include?(tag.last) + doc.push EndTag.new(tag.pop) + end + end + # - tag.push e.name + tag.push e.name if e.is_a?(StartTag) doc.push e when EndTag if tag.include?(e.name) @@ -88,19 +102,11 @@ if t==e.name break else - c = EndTag.new(t) - doc.push c - end + doc.push EndTag.new(t) end - else end doc.push e - when CharacterData - doc.push e - when Declaration - doc.push e - when Comment - doc.push e + end else doc.push e end diff -Naurw htmlrepair-1.0.1.orig/repairtest.rb htmlrepair-1.0.1/repairtest.rb --- htmlrepair-1.0.1.orig/repairtest.rb 2004-10-24 15:57:46.000000000 +0900 +++ htmlrepair-1.0.1/repairtest.rb 1970-01-01 09:00:00.000000000 +0900 @@ -1,15 +0,0 @@ -#/usr/bin/ruby - -require "htmlsplit" -require "htmlrepair" - -#テスト処理 -html = open(ARGV[0]).read -out = open(ARGV[1],"w") -obj = HTMLSplit.new(html) -obj.repair - -obj.document.each {|e| - out.write e.to_s -} -out.close diff -Naurw htmlrepair-1.0.1.orig/test/repair-test.rb htmlrepair-1.0.1/test/repair-test.rb --- htmlrepair-1.0.1.orig/test/repair-test.rb 1970-01-01 09:00:00.000000000 +0900 +++ htmlrepair-1.0.1/test/repair-test.rb 2004-10-29 20:58:26.646416544 +0900 @@ -0,0 +1,137 @@ + +# htmlrepairのテスト +# htmlrepair -- http://www.moonwolf.com/ruby/ + +require "test/unit/testcase" +require "test/unit/ui/console/testrunner" + +require "htmlrepair.rb" + +class HtmlRepairTest < Test::Unit::TestCase + # 終了タグの省略 + def test11 + html = "" + r = HTMLSplit.new(html); r.repair + assert_equal("", r.document.to_s) + end + + def test12 + html = "

a

b" + r = HTMLSplit.new(html); r.repair + assert_equal("

a

b

", r.document.to_s) + end + + # pタグ(内容が%inline)はulの前で終了 + # v1.0.1は失敗。 =>

hoge

+ def test21 + html = "

hoge

" + r = HTMLSplit.new(html); r.repair + assert_equal("

hoge

", r.document.to_s) + end + + # v1.0.1は失敗 =>

hoge

+ def test23 + html = "

hoge
" + r = HTMLSplit.new(html); r.repair + assert_equal("

hoge

", r.document.to_s) + end + + # liの子は%flowだが、%flowにliは含まれない。 + def test31 + html = "" + r = HTMLSplit.new(html); r.repair + assert_equal("", r.document.to_s) + end + + # ulは%blockで、liの子になれる + def test32 + html = "