2004.10.29 (q1)
- HTMLデータの文字コードが$KCODE依存になっていたが、明示的に指定できるようにした。
HTMLSplit.new()の第2引数で指定する。UTF-8は、$KCODEに合わせて、"UTF8"と指定する。
- HTMLデータの文字コードがUTF-8のとき、コードポイント0x100以上の文字参照を取り込む
ようにした。
- XMLスタイルの空要素タグを解釈するようにした。ただし、StartTagと見なす。
- 不正なタグに対処した。'<'のみ、'<>'、'<0'など。
- 閉じない開始タグ、閉じない終了タグに対処した。
- テストケースを追加した。
diff -Naurw --minimal htmlsplit-1.0.2.orig/htmlsplit.rb htmlsplit-1.0.2/htmlsplit.rb
--- htmlsplit-1.0.2.orig/htmlsplit.rb 2000-09-26 22:48:30.000000000 +0900
+++ htmlsplit-1.0.2/htmlsplit.rb 2004-10-29 22:01:55.336407656 +0900
@@ -382,9 +382,55 @@
=end
class HTMLSplit
+ def HTMLSplit.make_char(encoding, code, unmatch)
+ if code == 0x9 || code == 0xa || code == 0xd || code >= 0x20 && code <= 0xff
+ code.chr
+ else
+ if (code <= 0xd7ff ||
+ code >= 0xe000 && code <= 0x10ffff && code != 0xfffe && code != 0xffff) &&
+ encoding == "UTF8"
+ [code].pack("U")
+ else
+ unmatch
+ end
+ end
+ end
+
+ def HTMLSplit.unescapeHTML(string, encoding)
+ string.gsub(/&(.*?);/n) {
+ match = $1.dup
+ case match
+ when /\Aamp\z/ni then '&'
+ when /\Aquot\z/ni then '"'
+ when /\Aapos\z/ni then "'"
+ when /\Agt\z/ni then '>'
+ when /\Alt\z/ni then '<'
+ when /\A#0*(\d+)\z/n
+ HTMLSplit.make_char(encoding, Integer($1), "#{$1};")
+ when /\A#x([0-9a-f]+)\z/ni
+ HTMLSplit.make_char(encoding, $1.hex, "#{$1};")
+ else
+ "{match};"
+ end
+ }
+ end
+
+ def make_tag(name, attr) # Appends the tag object built from `name` and `attr` to @document.
+ name.downcase! # lowercases the caller's string in place
+ if EMPTY.include?(name) # names listed in EMPTY become EmptyElementTag
+ @document << EmptyElementTag.new(name, attr)
+ else
+ if name[0, 1] == '/' # a leading "/" marks an end tag; attr is not passed to EndTag
+ @document << EndTag.new(name[1..-1])
+ else
+ @document << StartTag.new(name, attr)
+ end
+ end
+ end
+
EMPTY = %w(area base basefont bgsound br col frame hr img input isindex
keygen link meta nextid param spacer wbr)
- def initialize(html)
+ def initialize(html, encoding = $KCODE)
@document = [] #パースしたHTMLのリスト
name = ''
text = ''
@@ -407,20 +453,8 @@
text << char
end
when :TAGNAME
+ if name.length == 0
case char
- when '>'
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,nil)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,nil)
- end
- end
- text = ''
- state = :TEXT
when '!'
text = ''
state = :DECLARE
@@ -430,27 +464,40 @@
when '?'
text = ''
state = :PHP
+ when /[a-zA-Z_:\/]/
+ name << char
+ else
+ text = '<' + char
+ state = :TEXT
+ end
+ else
+ case char
+ when '>'
+ make_tag(name, nil)
+ text = ''
+ state = :TEXT
when /\s/
text=''
state = :SPACE
- else
+ when /[a-zA-Z0-9\._:-]/
name << char
+ else
+ text = ''
+ state = :SPACE
+ redo
+ end
end
when :SPACE #属性間の空白
case char
when '>'
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
text = ''
state = :TEXT
+ when '<' # 閉じない開始タグ
+ make_tag(name, attr)
+ name = ''
+ attr = {}
+ state = :TAGNAME
when /\s/
else
attrname=char
@@ -464,16 +511,7 @@
state = :AFTEREQUAL
when '>'
attr[attrname.downcase]=true
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
text = ''
state = :TEXT
else
@@ -485,16 +523,7 @@
state = :AFTEREQUAL
when '>'
attr[attrname.downcase]=true
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
text = ''
state = :TEXT
when /\s/
@@ -513,16 +542,7 @@
state = :DQVALUE
when '>'
attr[attrname.downcase]=true
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
text = ''
state = :TEXT
when /\s/
@@ -533,20 +553,11 @@
when :VALUE #値
case char
when /\s/
- attr[attrname.downcase]=CGI::unescapeHTML(text)
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
state = :SPACE
when '>'
- attr[attrname.downcase]=CGI::unescapeHTML(text)
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
+ make_tag(name, attr)
text = ''
state = :TEXT
else
@@ -554,14 +565,14 @@
end
when :SQVALUE #'値'
if c==39
- attr[attrname.downcase]=CGI::unescapeHTML(text)
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
state = :SPACE
else
text << char
end
when :DQVALUE #"値"
if c==34
- attr[attrname.downcase]=CGI::unescapeHTML(text)
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
state = :SPACE
else
text << char
@@ -634,68 +645,23 @@
when :TAGNAME
@document << CharacterData.new('<'+text)
when :SPACE #属性間の空白
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
when :ATTRNAME #属性名
attr[attrname.downcase]=true
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
when :BEFOREEQUAL #=
attr[attrname.downcase]=true
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
when :AFTEREQUAL #=
attr[attrname.downcase]=true
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ make_tag(name, attr)
when :VALUE #値
- attr[attrname.downcase]=CGI::unescapeHTML(text)
- name.downcase!
- if EMPTY.include?(name)
- @document << EmptyElementTag.new(name,attr)
- else
- if name[0,1]=='/'
- @document << EndTag.new(name[1..-1])
- else
- @document << StartTag.new(name,attr)
- end
- end
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
+ make_tag(name, attr)
when :SQVALUE #'値'
- attr[attrname.downcase]=CGI::unescapeHTML(text)
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
when :DQVALUE #"値"
- attr[attrname.downcase]=CGI::unescapeHTML(text)
+ attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
when :COMMENT
if text=~/^#[a-zA-Z]+/ #SSI
@document << SSI.new(text)
diff -Naurw --minimal htmlsplit-1.0.2.orig/splittest.rb htmlsplit-1.0.2/splittest.rb
--- htmlsplit-1.0.2.orig/splittest.rb 2000-08-24 21:17:24.000000000 +0900
+++ htmlsplit-1.0.2/splittest.rb 1970-01-01 09:00:00.000000000 +0900
@@ -1,7 +0,0 @@
-#!/usr/bin/ruby
-require "htmlsplit"
-
-obj = HTMLSplit.new(ARGF.read)
-obj.document.each {|e|
- print e.to_s
-}
diff -Naurw --minimal htmlsplit-1.0.2.orig/test/ref-test.rb htmlsplit-1.0.2/test/ref-test.rb
--- htmlsplit-1.0.2.orig/test/ref-test.rb 1970-01-01 09:00:00.000000000 +0900
+++ htmlsplit-1.0.2/test/ref-test.rb 2004-10-28 20:45:35.000000000 +0900
@@ -0,0 +1,59 @@
+
+require "test/unit/testcase"
+require "test/unit/ui/console/testrunner"
+
+require "htmlsplit"
+
+# JIS XML 目次
+# http://www.y-adagio.com/public/standards/jis_xml/toc.html
+
+# @IT:やさしく読む「XML 1.0勧告」 第22回 物理構造における「文字参照」と「実体参照」
+# http://www.atmarkit.co.jp/fxml/rensai/w3cread22/w3cread22_1.html
+
+class RefTest < Test::Unit::TestCase
+ # 内容
+ def test11
+ html = "hoge<hoge"
+ o = HTMLSplit.new(html)
+ assert_equal("hoge<hoge", o.document[1].to_s) # 取り込まない
+ end
+
+ def test12
+ html = "hoge)hoge"
+ o = HTMLSplit.new(html)
+ assert_equal("hoge)hoge", o.document[1].to_s) # 取り込まない
+ end
+
+ # 属性値
+ def test13
+ html = ""
+ o = HTMLSplit.new(html)
+ assert_equal("hoge & ", o.document[0].text)
+ end
+
+ # 不正なコメント。Webブラウザでは、"-->"までコメントになる。
+ def test12
+ html = ""
+ o = HTMLSplit.new(html)
+ assert_equal(" foo -- bar ", o.document[0].text)
+ end
+
+ # 処理命令 (PI)
+ def test21
+ html = " & ?>"
+ o = HTMLSplit.new(html)
+ assert_instance_of(PHP, o.document[0])
+ assert_equal("hoge & ", o.document[0].text) # PITargetは認識しない
+ end
+
+=begin
+ # CDATAセクション
+ def test31
+ end
+=end
+
+ # 文書型宣言
+ def test41
+ html = ''
+ o = HTMLSplit.new(html)
+ assert_instance_of(Declaration, o.document[0])
+ assert_equal('DOCTYPE greeting SYSTEM "hello.dtd"', o.document[0].text)
+ end
+
+=begin
+ # 内部サブセット
+ def test42
+ html = ' ]>'
+ o = HTMLSplit.new(html)
+ assert_instance_of(Declaration, o.document[0])
+ assert_equal('DOCTYPE greeting SYSTEM "hello.dtd"', o.document[0].text)
+ end
+=end
+
+ # 開始タグ
+ def test51
+ html = "" # '...'で囲んでもよい
+ o = HTMLSplit.new(html)
+ tag = o.document[0]
+ assert_instance_of(StartTag, tag)
+ assert_equal({"bar" => "hoge"}, tag.attr)
+ end
+
+ # 空要素タグ (XMLスタイル)
+ def test52
+ html = ""
+ o = HTMLSplit.new(html)
+ tag = o.document[0]
+ assert_instance_of(StartTag, tag) # やむなしか?
+ assert_equal({"/" => true}, tag.attr)
+ end
+
+ # 不正なタグ
+ def test53
+ html = "<0> < <>"
+ o = HTMLSplit.new(html)
+ assert_instance_of(CharacterData, o.document[0])
+ assert_equal("<0> ", o.document[0].text)
+ assert_instance_of(CharacterData, o.document[1])
+ assert_equal("< ", o.document[1].text)
+ assert_instance_of(CharacterData, o.document[2])
+ assert_equal("<>", o.document[2].text)
+ end
+
+ # 終了タグ
+ def test54
+ html = ""
+ o = HTMLSplit.new(html)
+ tag = o.document[0]
+ assert_instance_of(EndTag, tag)
+ assert_equal("foo", tag.name)
+ end
+
+ # 閉じない開始タグ SGMLで規定。Webブラウザでも解釈している
+ def test61
+ html = ""
+ o = HTMLSplit.new(html)
+ tag = o.document[0]
+ assert_instance_of(StartTag, tag)
+ assert_equal("foo", tag.name)
+ tag = o.document[1]
+ assert_instance_of(StartTag, tag)
+ assert_equal("bar", tag.name)
+ end
+
+ # 閉じない終了タグ SGMLで規定。Webブラウザでも解釈している
+ def test62
+ html = ""
+ o = HTMLSplit.new(html)
+ tag = o.document[0]
+ assert_instance_of(EndTag, tag)
+ assert_equal("foo", tag.name)
+ tag = o.document[1]
+ assert_instance_of(EndTag, tag)
+ assert_equal("bar", tag.name)
+ end
+end
+
+Test::Unit::UI::Console::TestRunner.new(RefTest).start() # run the RefTest test case class from this file