2004.10.29 (q1) - HTMLデータの文字コードが$KCODE依存になっていたが、明示的に指定できるようにした。 HTMLSplit.new()の第2引数で指定する。UTF-8は、$KCODEに合わせて、"UTF8" - HTMLデータの文字コードがUTF-8のとき、コードポイント0x100以上の文字参照を取り込む ようにした。 - XMLスタイルの空要素タグを解釈するようにした。ただし、StartTagと見なす。 - 不正なタグに対処した。'<'のみ、'<>'、'<0'など。 - 閉じない開始タグ、閉じない終了タグに対処した。 - テストケースを追加した。 diff -Naurw --minimal htmlsplit-1.0.2.orig/htmlsplit.rb htmlsplit-1.0.2/htmlsplit.rb --- htmlsplit-1.0.2.orig/htmlsplit.rb 2000-09-26 22:48:30.000000000 +0900 +++ htmlsplit-1.0.2/htmlsplit.rb 2004-10-29 22:01:55.336407656 +0900 @@ -382,9 +382,55 @@ =end class HTMLSplit + def HTMLSplit.make_char(encoding, code, unmatch) + if code == 0x9 || code == 0xa || code == 0xd || code >= 0x20 && code <= 0xff + code.chr + else + if (code <= 0xd7ff || + code >= 0xe000 && code <= 0x10ffff && code != 0xfffe && code != 0xffff) && + encoding == "UTF8" + [code].pack("U") + else + unmatch + end + end + end + + def HTMLSplit.unescapeHTML(string, encoding) + string.gsub(/&(.*?);/n) { + match = $1.dup + case match + when /\Aamp\z/ni then '&' + when /\Aquot\z/ni then '"' + when /\Aapos\z/ni then "'" + when /\Agt\z/ni then '>' + when /\Alt\z/ni then '<' + when /\A#0*(\d+)\z/n + HTMLSplit.make_char(encoding, Integer($1), "&##{$1};") + when /\A#x([0-9a-f]+)\z/ni + HTMLSplit.make_char(encoding, $1.hex, "&#x#{$1};") + else + "&#{match};" + end + } + end + + def make_tag(name, attr) + name.downcase! + if EMPTY.include?(name) + @document << EmptyElementTag.new(name, attr) + else + if name[0, 1] == '/' + @document << EndTag.new(name[1..-1]) + else + @document << StartTag.new(name, attr) + end + end + end + EMPTY = %w(area base basefont bgsound br col frame hr img input isindex keygen link meta nextid param spacer wbr) - def initialize(html) + def initialize(html, encoding = $KCODE) @document = [] #パースしたHTMLのリスト name = '' text = '' @@ -407,20 +453,8 @@ text << char end when :TAGNAME + if name.length == 0 case char - when '>' - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,nil) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,nil) - end - end - text = '' - state = :TEXT when '!' text = '' state = :DECLARE @@ -430,27 +464,40 @@ when '?' text = '' state = :PHP + when /[a-zA-Z_:\/]/ + name << char + else + text = '<' + char + state = :TEXT + end + else + case char + when '>' + make_tag(name, nil) + text = '' + state = :TEXT when /\s/ text='' state = :SPACE - else + when /[a-zA-Z0-9\._:-]/ name << char + else + text = '' + state = :SPACE + redo + end end when :SPACE #属性間の空白 case char when '>' - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) text = '' state = :TEXT + when '<' # 閉じない開始タグ + make_tag(name, attr) + name = '' + attr = {} + state = :TAGNAME when /\s/ else attrname=char @@ -464,16 +511,7 @@ state = :AFTEREQUAL when '>' attr[attrname.downcase]=true - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) text = '' state = :TEXT else @@ -485,16 +523,7 @@ state = :AFTEREQUAL when '>' attr[attrname.downcase]=true - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) text = '' state = :TEXT when /\s/ @@ -513,16 +542,7 @@ state = :DQVALUE when '>' attr[attrname.downcase]=true - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) text = '' state = :TEXT when /\s/ @@ -533,20 +553,11 @@ when :VALUE #値 case char when /\s/ - attr[attrname.downcase]=CGI::unescapeHTML(text) + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) state = :SPACE when '>' - attr[attrname.downcase]=CGI::unescapeHTML(text) - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) + make_tag(name, attr) text = '' state = :TEXT else @@ -554,14 +565,14 @@ end when :SQVALUE #'値' if c==39 - attr[attrname.downcase]=CGI::unescapeHTML(text) + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) state = :SPACE else text << char end when :DQVALUE #"値" if c==34 - attr[attrname.downcase]=CGI::unescapeHTML(text) + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) state = :SPACE else text << char @@ -634,68 +645,23 @@ when :TAGNAME @document << CharacterData.new('<'+text) when :SPACE #属性間の空白 - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) when :ATTRNAME #属性名 attr[attrname.downcase]=true - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) when :BEFOREEQUAL #= attr[attrname.downcase]=true - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) when :AFTEREQUAL #= attr[attrname.downcase]=true - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + make_tag(name, attr) when :VALUE #値 - attr[attrname.downcase]=CGI::unescapeHTML(text) - name.downcase! - if EMPTY.include?(name) - @document << EmptyElementTag.new(name,attr) - else - if name[0,1]=='/' - @document << EndTag.new(name[1..-1]) - else - @document << StartTag.new(name,attr) - end - end + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) + make_tag(name, attr) when :SQVALUE #'値' - attr[attrname.downcase]=CGI::unescapeHTML(text) + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) when :DQVALUE #"値" - attr[attrname.downcase]=CGI::unescapeHTML(text) + attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding) when :COMMENT if text=~/^#[a-zA-Z]+/ #SSI @document << SSI.new(text) diff -Naurw --minimal htmlsplit-1.0.2.orig/splittest.rb htmlsplit-1.0.2/splittest.rb --- htmlsplit-1.0.2.orig/splittest.rb 2000-08-24 21:17:24.000000000 +0900 +++ htmlsplit-1.0.2/splittest.rb 1970-01-01 09:00:00.000000000 +0900 @@ -1,7 +0,0 @@ -#!/usr/bin/ruby -require "htmlsplit" - -obj = HTMLSplit.new(ARGF.read) -obj.document.each {|e| - print e.to_s -} diff -Naurw --minimal htmlsplit-1.0.2.orig/test/ref-test.rb htmlsplit-1.0.2/test/ref-test.rb --- htmlsplit-1.0.2.orig/test/ref-test.rb 1970-01-01 09:00:00.000000000 +0900 +++ htmlsplit-1.0.2/test/ref-test.rb 2004-10-28 20:45:35.000000000 +0900 @@ -0,0 +1,59 @@ + +require "test/unit/testcase" +require "test/unit/ui/console/testrunner" + +require "htmlsplit" + +# JIS XML 目次 +# http://www.y-adagio.com/public/standards/jis_xml/toc.html + +# @IT:やさしく読む「XML 1.0勧告」 第22回 物理構造における「文字参照」と「実体参照」 +# http://www.atmarkit.co.jp/fxml/rensai/w3cread22/w3cread22_1.html + +class RefTest < Test::Unit::TestCase + # 内容 + def test11 + html = "hoge<hoge" + o = HTMLSplit.new(html) + assert_equal("hoge<hoge", o.document[1].to_s) # 取り込まない + end + + def test12 + html = "hoge)hoge" + o = HTMLSplit.new(html) + assert_equal("hoge)hoge", o.document[1].to_s) # 取り込まない + end + + # 属性値 + def test13 + html = "" + o = HTMLSplit.new(html) + assert_equal("hoge & ", o.document[0].text) + end + + # 不正なコメント。Webブラウザでは、"-->"までコメントになる。 + def test12 + html = "" + o = HTMLSplit.new(html) + assert_equal(" foo -- bar ", o.document[0].text) + end + + # 処理命令 (PI) + def test21 + html = " & ?>" + o = HTMLSplit.new(html) + assert_instance_of(PHP, o.document[0]) + assert_equal("hoge & ", o.document[0].text) # PITargetは認識しない + end + +=begin + # CDATAセクション + def test31 + end +=end + + # 文書型宣言 + def test41 + html = '' + o = HTMLSplit.new(html) + assert_instance_of(Declaration, o.document[0]) + assert_equal('DOCTYPE greeting SYSTEM "hello.dtd"', o.document[0].text) + end + +=begin + # 内部サブセット + def test42 + html = ' ]>' + o = HTMLSplit.new(html) + assert_instance_of(Declaration, o.document[0]) + assert_equal('DOCTYPE greeting SYSTEM "hello.dtd"', o.document[0].text) + end +=end + + # 開始タグ + def test51 + html = "" # '...'で囲んでもよい + o = HTMLSplit.new(html) + tag = o.document[0] + assert_instance_of(StartTag, tag) + assert_equal({"bar" => "hoge"}, tag.attr) + end + + # 空要素タグ (XMLスタイル) + def test52 + html = "" + o = HTMLSplit.new(html) + tag = o.document[0] + assert_instance_of(StartTag, tag) # やむなしか? + assert_equal({"/" => true}, tag.attr) + end + + # 不正なタグ + def test53 + html = "<0> < <>" + o = HTMLSplit.new(html) + assert_instance_of(CharacterData, o.document[0]) + assert_equal("<0> ", o.document[0].text) + assert_instance_of(CharacterData, o.document[1]) + assert_equal("< ", o.document[1].text) + assert_instance_of(CharacterData, o.document[2]) + assert_equal("<>", o.document[2].text) + end + + # 終了タグ + def test54 + html = "" + o = HTMLSplit.new(html) + tag = o.document[0] + assert_instance_of(EndTag, tag) + assert_equal("foo", tag.name) + end + + # 閉じない開始タグ SGMLで規定。Webブラウザでも解釈している + def test61 + html = "" + o = HTMLSplit.new(html) + tag = o.document[0] + assert_instance_of(StartTag, tag) + assert_equal("foo", tag.name) + tag = o.document[1] + assert_instance_of(StartTag, tag) + assert_equal("bar", tag.name) + end + + # 閉じない終了タグ SGMLで規定。Webブラウザでも解釈している + def test62 + html = "" + o = HTMLSplit.new(html) + tag = o.document[0] + assert_instance_of(EndTag, tag) + assert_equal("foo", tag.name) + tag = o.document[1] + assert_instance_of(EndTag, tag) + assert_equal("bar", tag.name) + end +end + +Test::Unit::UI::Console::TestRunner.new(StructureTest).start()