# 異体字の統合
# TODO: replace_text 統合テーブルによる置換
# TODO: 調査 説解部分の置換の有無

require 'rexml/document'

# Input and output directories, both located under ../htdocs relative to the
# directory containing this script:
#   swjz    - HTML files to read
#   unified - where the rewritten copies are written
htdocs = File.join(File.dirname(__FILE__), '..', 'htdocs')
SOURCE_DIR = File.join(htdocs, 'swjz')
TARGET_DIR = File.join(htdocs, 'unified')

# Rewrites one HTML file from SOURCE_DIR, replacing variant character forms
# with their unified forms inside selected elements, and writes the result
# under the same file name in TARGET_DIR.
class Unifier
  # Variant character => unified character.
  # No unified form is itself a key, so a single substitution pass yields the
  # same result as applying the entries one after another.
  UNIFICATION_TABLE = {
    '告' => '吿',
    '㑹' => '會',
    '緫' => '總',
    '尢' => '尤',
    '𡵉' => '𡴆',
    '備' => '僃',
    '彚' => '彙',
    '𨕖' => '選',
    '䖍' => '虔'
  }.freeze

  # Matches any single character that has an entry in UNIFICATION_TABLE.
  VARIANT_PATTERN = Regexp.union(UNIFICATION_TABLE.keys).freeze

  # Parses SOURCE_DIR/filename into an in-memory REXML document.
  #
  # filename - name of the file, relative to SOURCE_DIR (and later TARGET_DIR).
  #
  # The source file is closed as soon as parsing finishes. The output file is
  # not touched here; it is only created when #unify runs.
  def initialize(filename)
    @filename = filename
    @doc = File.open(File.join(SOURCE_DIR, filename)) do |source_file|
      REXML::Document.new(source_file)
    end
  end

  # Replaces every match of pattern in s with char, modifying s in place.
  # Returns s if a substitution was made, nil otherwise (String#gsub!).
  # Kept for callers that need a one-off substitution; replace_text no longer
  # goes through it.
  def replace_char(s, pattern, char)
    s.gsub!(pattern, char)
  end

  # Replaces every variant character in s with its unified form, in place.
  #
  # s - a mutable String.
  #
  # Returns s.
  def replace_text(s)
    s.gsub!(VARIANT_PATTERN, UNIFICATION_TABLE)
    s
  end

  # Swaps the text node `text`, a child of element `td`, for a new text node
  # holding the unified content.
  def replace(td, text)
    # Work on a copy so the string handed out by the original node is not
    # mutated behind REXML's back.
    content = text.value.dup
    replace_text(content)
    td.replace_child(text, REXML::Text.new(content))
  end

  # Unifies the direct text children of every element matched by the XPath
  # expression `path`. Text inside nested elements is not visited.
  def unify_texts(path)
    REXML::XPath.each(@doc.root, path) do |td|
      REXML::XPath.each(td, 'text()') do |text|
        replace(td, text)
      end
    end
  end

  # Applies the unification to the three kinds of elements handled by this
  # script and writes the resulting document to TARGET_DIR.
  def unify
    unify_texts("//td[@class = 'body']") # commentary cells
    unify_texts("//table[@class = 'part_wordnum']/tr/td") # cells at the end of a radical section
    unify_texts("//span[@class = 'explanation']") # explanation spans

    # Serialize before opening the target so a failure here cannot leave a
    # truncated file behind; the block form flushes and closes the file.
    serialized = @doc.to_s
    File.open(File.join(TARGET_DIR, @filename), 'w') do |out|
      out.print(serialized)
    end
  end
end

# Run the unifier over every entry of SOURCE_DIR whose name matches the
# .html pattern; all other entries (including "." and "..") are skipped.
Dir.foreach(SOURCE_DIR) do |entry|
  next unless entry =~ /^.*\.html$/

  Unifier.new(entry).unify
end
