# 標点ファイルの生成
#  巻別に標点を付けるためのファイルを生成する
#  出力はYAML
#  History
#    21 July 11: part_wordnum, volume_wordnumの追加

# require 'yaml'
require 'rexml/parsers/sax2parser'
require 'rexml/sax2listener'

# Input and output locations, resolved relative to this script's directory.
SOURCE_DIR = File.join(File.dirname(__FILE__), *%w[.. data])
TARGET_DIR = File.join(File.dirname(__FILE__), *%w[.. yml swjz])
# Source XML; the handle is opened for reading as soon as this file is loaded.
SWJZ = File.open(File.join(SOURCE_DIR, 'swjz.xml'))

# SAX2 listener that writes the contents of the parsed XML as YAML text.
#
# A new output file is opened in TARGET_DIR for every <chaptertitle> found
# inside a <chapter>; the file is named after the title's "id" attribute.
# Each <shuowen>, <part_wordnum> and <volume_wordnum> element starts a new
# YAML document ("--- #n") in the current file.
#
# NOTE: element text is written verbatim; no YAML quoting or escaping is
# applied.  All writes go to @out, which is nil until the first
# <chaptertitle> inside a <chapter> has been seen.
class SwjzCollector
  include REXML::SAX2Listener

  def initialize
    @out = nil       # currently open output file
    @doc_count = 0   # YAML documents written to the current file
    @sw_count = 0    # <shuowen> elements seen so far
    @word_count = 0  # <wordhead> elements with an id seen so far

    # State flags: true while the parser is inside the named element.
    @in_chapter = false
    @in_chaptertitle = false
    @in_shuowen = false
    @in_wordhead = false
    @in_explanation = false
    @in_duan_note = false
    @in_part_wordnum = false
    @in_volume_wordnum = false
    @in_wordnum = false

    # Text buffers and attributes of the element currently being collected.
    @wordhead = ''
    @wordhead_id = nil
    @position = nil
    @chaptertitle = ''
    @explanation = ''
    @duan_note = ''
    @wordnum = ''
  end

  #--- SAX2 callbacks ---
  # Every callback is fanned out to all element handlers; each handler
  # checks the element name itself and ignores elements it does not own.
  def start_element(uri, localname, qname, attributes)
    before_chapter(qname)
    before_chaptertitle(qname, attributes)
    before_shuowen(qname)
    before_wordhead(qname, attributes)
    before_explanation(qname)
    before_duan_note(qname)
    before_part_wordnum(qname)
    before_volume_wordnum(qname)
  end

  def end_element(uri, localname, qname)
    after_chapter(qname)
    after_chaptertitle(qname)
    after_shuowen(qname)
    after_wordhead(qname)
    after_explanation(qname)
    after_duan_note(qname)
    after_part_wordnum(qname)
    after_volume_wordnum(qname)
  end

  # Appends the whitespace-stripped text to every buffer whose element is
  # currently open.
  def characters(text)
    # Work on a stripped copy: the string belongs to the parser and must not
    # be mutated in place (it may be shared or frozen).
    stripped = text.strip
    push_chaptertitle(stripped)
    push_wordhead(stripped)
    push_explanation(stripped)
    push_duan_note(stripped)
    push_wordnum(stripped)
  end

  # Prints the totals to stdout and closes the last output file so that all
  # buffered output reaches the disk.
  def end_document
    printf("\nSW: %s\n", @sw_count)
    printf("Word: %s\n", @word_count)
    close_outfile
  end

  #--- Process elements ---
  # chapter: resets the per-file document counter.
  def before_chapter(qname)
    return unless qname == 'chapter'

    @in_chapter = true
    @doc_count = 0
  end

  def after_chapter(qname)
    @in_chapter = false if qname == 'chapter'
  end

  # chaptertitle: opens the output file named after the "id" attribute.
  def before_chaptertitle(qname, attributes)
    return unless qname == 'chaptertitle' && @in_chapter

    @in_chaptertitle = true
    setup_outfile(attributes['id'])
  end

  def after_chaptertitle(qname)
    return unless qname == 'chaptertitle' && @in_chapter

    @out.printf("chapter: %s\n", @chaptertitle + @duan_note)
    @chaptertitle = ''
    @in_chaptertitle = false
  end

  def push_chaptertitle(text)
    @chaptertitle += text if @in_chaptertitle
  end

  # shuowen: starts a new YAML document with a "content:" sequence.
  def before_shuowen(qname)
    return unless qname == 'shuowen'

    @in_shuowen = true
    @sw_count += 1
    @doc_count += 1
    @out.printf("\n--- #%i\ncontent:\n", @doc_count)  # sequence of sw's
  end

  def after_shuowen(qname)
    @in_shuowen = false if qname == 'shuowen'
  end

  # wordhead: remembers the "id" and "img" attributes until the end tag.
  def before_wordhead(qname, attributes)
    return unless qname == 'wordhead'

    @in_wordhead = true
    @wordhead_id = attributes['id']
    @position = attributes['img']
  end

  def after_wordhead(qname)
    return unless qname == 'wordhead'

    @out.printf("  - word: %s\n", @wordhead)
    @out.printf("    position: %s\n", @position) if @position
    if @wordhead_id
      @out.printf("    id: %s\n", @wordhead_id)
      @word_count += 1  # only word heads that carry an id are counted
    end
    @out.printf("    content:\n")
    # Reset state for the next word head.
    @in_wordhead = false
    @position = nil
    @wordhead = ''
    @wordhead_id = nil
  end

  def push_wordhead(text)
    @wordhead += text if @in_wordhead
  end

  # explanation: written only when it occurs inside a shuowen.
  def before_explanation(qname)
    @in_explanation = true if qname == 'explanation'
  end

  def after_explanation(qname)
    return unless qname == 'explanation'

    @out.printf("      - ex: %s\n", @explanation) if @in_shuowen
    @in_explanation = false
    @explanation = ''
  end

  def push_explanation(text)
    @explanation += text if @in_explanation
  end

  # duan_note
  def before_duan_note(qname)
    return unless qname == 'duan_note'

    @in_duan_note = true
    # A note ends the pending word-number text: flush it before the note.
    after_wordnum
  end

  def after_duan_note(qname)
    return unless qname == 'duan_note'

    if @in_shuowen
      @out.printf("      - dn: %s\n", @duan_note)
    elsif @in_part_wordnum || @in_volume_wordnum
      @out.printf("  - dn: %s\n", @duan_note)
    end
    @duan_note = ''
    @in_duan_note = false
  end

  def push_duan_note(text)
    @duan_note += text if @in_duan_note
  end

  # part_wordnum: starts a new YAML document with a "part:" sequence.
  def before_part_wordnum(qname)
    return unless qname == 'part_wordnum'

    @in_part_wordnum = true
    before_wordnum
    @doc_count += 1
    @out.printf("\n--- #%i\npart:\n", @doc_count)  # sequence number
  end

  def after_part_wordnum(qname)
    return unless qname == 'part_wordnum'

    @in_part_wordnum = false
    after_wordnum
  end

  # volume_wordnum: starts a new YAML document with a "volume:" sequence.
  def before_volume_wordnum(qname)
    return unless qname == 'volume_wordnum'

    @in_volume_wordnum = true
    before_wordnum
    @doc_count += 1
    @out.printf("\n--- #%i\nvolume:\n", @doc_count)  # sequence number
  end

  def after_volume_wordnum(qname)
    return unless qname == 'volume_wordnum'

    @in_volume_wordnum = false
    after_wordnum
  end

  #--- virtual element processing: wordnum ---
  # "wordnum" is not an XML element: it is the text collected inside a
  # part_wordnum / volume_wordnum up to the first duan_note or the end tag.
  def before_wordnum
    @in_wordnum = true
  end

  # Writes the collected word-number text (if any) and stops collecting.
  def after_wordnum
    @out.printf("  - ex: %s\n", @wordnum) unless @wordnum.empty?
    @in_wordnum = false
    @wordnum = ''
  end

  def push_wordnum(text)
    @wordnum += text if @in_wordnum
  end

  #--- utilities ---
  # Opens TARGET_DIR/<chapter_id>.yml for writing, closing the previous
  # chapter's file first so that only one handle is open at a time.
  def setup_outfile(chapter_id)
    close_outfile
    @out = File.open(File.join(TARGET_DIR, chapter_id + '.yml'), 'w')
  end

  # Closes the current output file, if one is open.
  def close_outfile
    @out.close if @out && !@out.closed?
    @out = nil
  end
end

# Run the conversion: stream the source XML through the collector.
parser = REXML::Parsers::SAX2Parser.new(SWJZ)
listener = SwjzCollector.new
parser.listen(listener)
begin
  parser.parse
ensure
  # SWJZ is opened when the constant is defined; release the handle even if
  # parsing fails.
  SWJZ.close
end
