2004.10.29  (q1)
    - HTMLデータの文字コードが$KCODE依存になっていたが、明示的に指定できるようにした。
      HTMLSplit.new()の第2引数で指定する (省略時は従来どおり$KCODEを使う)。
      UTF-8は、$KCODEの表記に合わせて "UTF8" と指定する。
    - HTMLデータの文字コードがUTF-8のとき、属性値中のコードポイント0x100以上の文字参照を
      取り込むようにした (要素の内容に現れる文字参照は取り込まない)。
    - XMLスタイルの空要素タグを解釈するようにした。ただし、StartTagと見なす。
    - 不正なタグに対処した。'<'のみ、'<>'、'<0'など。
    - 閉じない開始タグ、閉じない終了タグに対処した。
    - テストケースを追加した。

diff -Naurw --minimal htmlsplit-1.0.2.orig/htmlsplit.rb htmlsplit-1.0.2/htmlsplit.rb
--- htmlsplit-1.0.2.orig/htmlsplit.rb	2000-09-26 22:48:30.000000000 +0900
+++ htmlsplit-1.0.2/htmlsplit.rb	2004-10-29 22:01:55.336407656 +0900
@@ -382,9 +382,55 @@
 </dl>
 =end
 class HTMLSplit
+  def HTMLSplit.make_char(encoding, code, unmatch)
+    if code == 0x9 || code == 0xa || code == 0xd || code >= 0x20 && code <= 0xff
+      code.chr
+    else
+      if (code <= 0xd7ff ||
+          code >= 0xe000 && code <= 0x10ffff && code != 0xfffe && code != 0xffff) && 
+         encoding == "UTF8"
+        [code].pack("U")
+      else
+        unmatch
+      end
+    end
+  end
+
+  def HTMLSplit.unescapeHTML(string, encoding)
+    string.gsub(/&(.*?);/n) {
+      match = $1.dup
+      case match
+      when /\Aamp\z/ni           then '&'
+      when /\Aquot\z/ni          then '"'
+      when /\Aapos\z/ni          then "'"
+      when /\Agt\z/ni            then '>'
+      when /\Alt\z/ni            then '<'
+      when /\A#0*(\d+)\z/n
+        HTMLSplit.make_char(encoding, Integer($1), "&##{$1};")
+      when /\A#x([0-9a-f]+)\z/ni
+        HTMLSplit.make_char(encoding, $1.hex, "&#x#{$1};")
+      else
+        "&#{match};"
+      end
+    }
+  end
+
+  def make_tag(name, attr)
+    name.downcase!
+    if EMPTY.include?(name)
+      @document << EmptyElementTag.new(name, attr)
+    else
+      if name[0, 1] == '/'
+        @document << EndTag.new(name[1..-1])
+      else
+        @document << StartTag.new(name, attr)
+      end
+    end
+  end
+
 	EMPTY = %w(area base basefont bgsound br col frame hr img input isindex 
 	           keygen link meta nextid param spacer wbr)
-	def initialize(html)
+  def initialize(html, encoding = $KCODE)
 		@document = []	#パースしたHTMLのリスト
 		name = ''
 		text = ''
@@ -407,20 +453,8 @@
 					text << char
 				end
 			when :TAGNAME
+        if name.length == 0
 				case char
-				when '>'
-					name.downcase!
-					if EMPTY.include?(name)
-						@document << EmptyElementTag.new(name,nil)
-					else
-						if name[0,1]=='/'
-							@document << EndTag.new(name[1..-1])
-						else
-							@document << StartTag.new(name,nil)
-						end
-					end
-					text = ''
-					state = :TEXT
 				when '!'
 					text = ''
 					state = :DECLARE
@@ -430,27 +464,40 @@
 				when '?'
 					text = ''
 					state = :PHP
+          when /[a-zA-Z_:\/]/
+            name << char
+          else
+            text = '<' + char
+            state = :TEXT
+          end
+        else
+          case char
+          when '>'
+            make_tag(name, nil)
+            text = ''
+            state = :TEXT
 				when /\s/
 					text=''
 					state = :SPACE
-				else
+          when /[a-zA-Z0-9\._:-]/
 					name << char
+          else
+            text = ''
+            state = :SPACE
+            redo
+          end
 				end
 			when :SPACE	#属性間の空白
 				case char
 				when '>'
-					name.downcase!
-					if EMPTY.include?(name)
-						@document << EmptyElementTag.new(name,attr)
-					else
-						if name[0,1]=='/'
-							@document << EndTag.new(name[1..-1])
-						else
-							@document << StartTag.new(name,attr)
-						end
-					end
+          make_tag(name, attr)
 					text = ''
 					state = :TEXT
+        when '<'  # 閉じない開始タグ
+          make_tag(name, attr)
+          name = ''
+          attr = {}
+          state = :TAGNAME
 				when /\s/
 				else
 					attrname=char
@@ -464,16 +511,7 @@
 					state = :AFTEREQUAL
 				when '>'
 					attr[attrname.downcase]=true
-					name.downcase!
-					if EMPTY.include?(name)
-						@document << EmptyElementTag.new(name,attr)
-					else
-						if name[0,1]=='/'
-							@document << EndTag.new(name[1..-1])
-						else
-							@document << StartTag.new(name,attr)
-						end
-					end
+          make_tag(name, attr)
 					text = ''
 					state = :TEXT
 				else
@@ -485,16 +523,7 @@
 					state = :AFTEREQUAL
 				when '>'
 					attr[attrname.downcase]=true
-					name.downcase!
-					if EMPTY.include?(name)
-						@document << EmptyElementTag.new(name,attr)
-					else
-						if name[0,1]=='/'
-							@document << EndTag.new(name[1..-1])
-						else
-							@document << StartTag.new(name,attr)
-						end
-					end
+          make_tag(name, attr)
 					text = ''
 					state = :TEXT
 				when /\s/
@@ -513,16 +542,7 @@
 					state = :DQVALUE
 				when '>'
 					attr[attrname.downcase]=true
-					name.downcase!
-					if EMPTY.include?(name)
-						@document << EmptyElementTag.new(name,attr)
-					else
-						if name[0,1]=='/'
-							@document << EndTag.new(name[1..-1])
-						else
-							@document << StartTag.new(name,attr)
-						end
-					end
+          make_tag(name, attr)
 					text = ''
 					state = :TEXT
 				when /\s/
@@ -533,20 +553,11 @@
 			when :VALUE		#値
 				case char
 				when /\s/
-					attr[attrname.downcase]=CGI::unescapeHTML(text)
+          attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
 					state = :SPACE
 				when '>'
-					attr[attrname.downcase]=CGI::unescapeHTML(text)
-					name.downcase!
-					if EMPTY.include?(name)
-						@document << EmptyElementTag.new(name,attr)
-					else
-						if name[0,1]=='/'
-							@document << EndTag.new(name[1..-1])
-						else
-							@document << StartTag.new(name,attr)
-						end
-					end
+          attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
+          make_tag(name, attr)
 					text = ''
 					state = :TEXT
 				else
@@ -554,14 +565,14 @@
 				end
 			when :SQVALUE	#'値'
 				if c==39
-					attr[attrname.downcase]=CGI::unescapeHTML(text)
+          attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
 					state = :SPACE
 				else
 					text << char
 				end
 			when :DQVALUE	#"値"
 				if c==34
-					attr[attrname.downcase]=CGI::unescapeHTML(text)
+          attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
 					state = :SPACE
 				else
 					text << char
@@ -634,68 +645,23 @@
 		when :TAGNAME
 			@document << CharacterData.new('<'+text)
 		when :SPACE	#属性間の空白
-			name.downcase!
-			if EMPTY.include?(name)
-				@document << EmptyElementTag.new(name,attr)
-			else
-				if name[0,1]=='/'
-					@document << EndTag.new(name[1..-1])
-				else
-					@document << StartTag.new(name,attr)
-				end
-			end
+      make_tag(name, attr)
 		when :ATTRNAME	#属性名
 			attr[attrname.downcase]=true
-			name.downcase!
-			if EMPTY.include?(name)
-				@document << EmptyElementTag.new(name,attr)
-			else
-				if name[0,1]=='/'
-					@document << EndTag.new(name[1..-1])
-				else
-					@document << StartTag.new(name,attr)
-				end
-			end
+      make_tag(name, attr)
 		when :BEFOREEQUAL	#=
 			attr[attrname.downcase]=true
-			name.downcase!
-			if EMPTY.include?(name)
-				@document << EmptyElementTag.new(name,attr)
-			else
-				if name[0,1]=='/'
-					@document << EndTag.new(name[1..-1])
-				else
-					@document << StartTag.new(name,attr)
-				end
-			end
+      make_tag(name, attr)
 		when :AFTEREQUAL	#=
 			attr[attrname.downcase]=true
-			name.downcase!
-			if EMPTY.include?(name)
-				@document << EmptyElementTag.new(name,attr)
-			else
-				if name[0,1]=='/'
-					@document << EndTag.new(name[1..-1])
-				else
-					@document << StartTag.new(name,attr)
-				end
-			end
+      make_tag(name, attr)
 		when :VALUE		#値
-			attr[attrname.downcase]=CGI::unescapeHTML(text)
-			name.downcase!
-			if EMPTY.include?(name)
-				@document << EmptyElementTag.new(name,attr)
-			else
-				if name[0,1]=='/'
-					@document << EndTag.new(name[1..-1])
-				else
-					@document << StartTag.new(name,attr)
-				end
-			end
+      attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
+      make_tag(name, attr)
 		when :SQVALUE	#'値'
-			attr[attrname.downcase]=CGI::unescapeHTML(text)
+      attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
 		when :DQVALUE	#"値"
-			attr[attrname.downcase]=CGI::unescapeHTML(text)
+      attr[attrname.downcase] = HTMLSplit.unescapeHTML(text, encoding)
 		when :COMMENT
 			if text=~/^#[a-zA-Z]+/	#SSI
 				@document << SSI.new(text)
diff -Naurw --minimal htmlsplit-1.0.2.orig/splittest.rb htmlsplit-1.0.2/splittest.rb
--- htmlsplit-1.0.2.orig/splittest.rb	2000-08-24 21:17:24.000000000 +0900
+++ htmlsplit-1.0.2/splittest.rb	1970-01-01 09:00:00.000000000 +0900
@@ -1,7 +0,0 @@
-#!/usr/bin/ruby
-require "htmlsplit"
-
-obj = HTMLSplit.new(ARGF.read)
-obj.document.each {|e|
-	print e.to_s
-}
diff -Naurw --minimal htmlsplit-1.0.2.orig/test/ref-test.rb htmlsplit-1.0.2/test/ref-test.rb
--- htmlsplit-1.0.2.orig/test/ref-test.rb	1970-01-01 09:00:00.000000000 +0900
+++ htmlsplit-1.0.2/test/ref-test.rb	2004-10-28 20:45:35.000000000 +0900
@@ -0,0 +1,59 @@
+
+require "test/unit/testcase"
+require "test/unit/ui/console/testrunner"
+
+require "htmlsplit"
+
+# JIS XML 目次
+# http://www.y-adagio.com/public/standards/jis_xml/toc.html
+
+# ＠IT：やさしく読む「XML 1.0勧告」 第22回　物理構造における「文字参照」と「実体参照」
+# http://www.atmarkit.co.jp/fxml/rensai/w3cread22/w3cread22_1.html
+
+class RefTest < Test::Unit::TestCase
+  # 内容
+  def test11
+    html = "<e>hoge&lt;hoge</e>"
+    o = HTMLSplit.new(html)
+    assert_equal("hoge&lt;hoge", o.document[1].to_s)   # 取り込まない
+  end
+
+  def test12
+    html = "<e>hoge&#41;hoge</e>"
+    o = HTMLSplit.new(html)
+    assert_equal("hoge&#41;hoge", o.document[1].to_s)  # 取り込まない
+  end
+
+  # 属性値
+  def test13
+    html = "<e foo='hoge&lt;hoge'>"
+    o = HTMLSplit.new(html)
+    assert_equal("hoge<hoge", o.document[0].attr["foo"])  # 取り込む
+  end
+
+  def test14
+    html = "<e foo='&#41;'>"
+    o = HTMLSplit.new(html)
+    assert_equal(")", o.document[0].attr["foo"])   # 取り込む
+  end
+
+  # 第2引数を追加
+  def test15
+    html = "<e foo='&#x4100;'>"
+    o = HTMLSplit.new(html)
+    assert_equal("&#x4100;", o.document[0].attr["foo"])
+    o = HTMLSplit.new(html, "UTF8")                        # UTF8の場合のみ取り込む
+    assert_equal([0x4100].pack("U"), o.document[0].attr["foo"])
+  end
+
+  def test21
+    html = "<e foo='&#xd7ff;'>"
+    o = HTMLSplit.new(html, "UTF8")
+    assert_equal([0xd7ff].pack("U"), o.document[0].attr["foo"])
+    html = "<e foo='&#xd800;'>"
+    o = HTMLSplit.new(html, "UTF8")
+    assert_equal("&#xd800;", o.document[0].attr["foo"])   # サロゲート領域は取り込まない
+  end
+end
+
+Test::Unit::UI::Console::TestRunner.new(RefTest).start()
diff -Naurw --minimal htmlsplit-1.0.2.orig/test/struct-test.rb htmlsplit-1.0.2/test/struct-test.rb
--- htmlsplit-1.0.2.orig/test/struct-test.rb	1970-01-01 09:00:00.000000000 +0900
+++ htmlsplit-1.0.2/test/struct-test.rb	2004-10-29 21:59:22.134697856 +0900
@@ -0,0 +1,118 @@
+
+require "test/unit/testcase"
+require "test/unit/ui/console/testrunner"
+
+require "htmlsplit"
+
+class StructureTest < Test::Unit::TestCase
+  # コメント
+  def test11
+    html = "<!--  <foo> & <bar> -->"
+    o = HTMLSplit.new(html)
+    assert_equal("  <foo> & <bar> ", o.document[0].text)
+  end
+
+  # 不正なコメント。Webブラウザでは、"-->"までコメントになる。
+  def test12
+    html = "<!-- foo -- bar -->"
+    o = HTMLSplit.new(html)
+    assert_equal(" foo -- bar ", o.document[0].text)
+  end
+
+  # 処理命令 (PI)
+  def test21
+    html = "<?hoge <foo> & <bar> ?>"
+    o = HTMLSplit.new(html)
+    assert_instance_of(PHP, o.document[0])
+    assert_equal("hoge <foo> & <bar> ", o.document[0].text)   # PITargetは認識しない
+  end
+
+=begin
+  # CDATAセクション
+  def test31
+  end
+=end
+
+  # 文書型宣言
+  def test41
+    html = '<!DOCTYPE greeting SYSTEM "hello.dtd">'
+    o = HTMLSplit.new(html)
+    assert_instance_of(Declaration, o.document[0])
+    assert_equal('DOCTYPE greeting SYSTEM "hello.dtd"', o.document[0].text)
+  end
+
+=begin
+  # 内部サブセット
+  def test42
+    html = '<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)> ]>'
+    o = HTMLSplit.new(html)
+    assert_instance_of(Declaration, o.document[0])
+    assert_equal('DOCTYPE greeting SYSTEM "hello.dtd"', o.document[0].text)
+  end
+=end
+
+  # 開始タグ
+  def test51
+    html = "<foo bar='hoge'>"    # '...'で囲んでもよい
+    o = HTMLSplit.new(html)
+    tag = o.document[0]
+    assert_instance_of(StartTag, tag)
+    assert_equal({"bar" => "hoge"}, tag.attr)
+  end
+
+  # 空要素タグ (XMLスタイル)
+  def test52
+    html = "<foo/>"
+    o = HTMLSplit.new(html)
+    tag = o.document[0]
+    assert_instance_of(StartTag, tag)      # やむなしか？
+    assert_equal({"/" => true}, tag.attr)
+  end
+
+  # 不正なタグ
+  def test53
+    html = "<0> < <>"
+    o = HTMLSplit.new(html)
+    assert_instance_of(CharacterData, o.document[0])
+    assert_equal("<0> ", o.document[0].text)
+    assert_instance_of(CharacterData, o.document[1])
+    assert_equal("< ", o.document[1].text)
+    assert_instance_of(CharacterData, o.document[2])
+    assert_equal("<>", o.document[2].text)
+  end
+
+  # 終了タグ
+  def test54
+    html = "</foo>"
+    o = HTMLSplit.new(html)
+    tag = o.document[0]
+    assert_instance_of(EndTag, tag)
+    assert_equal("foo", tag.name)
+  end
+
+  # 閉じない開始タグ     SGMLで規定。Webブラウザでも解釈している
+  def test61
+    html = "<foo<bar>"
+    o = HTMLSplit.new(html)
+    tag = o.document[0]
+    assert_instance_of(StartTag, tag)
+    assert_equal("foo", tag.name)
+    tag = o.document[1]
+    assert_instance_of(StartTag, tag)
+    assert_equal("bar", tag.name)
+  end
+
+  # 閉じない終了タグ    SGMLで規定。Webブラウザでも解釈している
+  def test62
+    html = "</foo</bar>"
+    o = HTMLSplit.new(html)
+    tag = o.document[0]
+    assert_instance_of(EndTag, tag)
+    assert_equal("foo", tag.name)
+    tag = o.document[1]
+    assert_instance_of(EndTag, tag)
+    assert_equal("bar", tag.name)
+  end
+end
+
+Test::Unit::UI::Console::TestRunner.new(StructureTest).start()