#!/usr/bin/ruby
# -*- coding: euc-jp -*-
# bogofilter-nihongo.rb - read mail from stdin, translate to EUC and make wakati-gaki then send to bogofilter
# $Id: bogofilter-nihongo.rb,v 0.3 2012/07/16 hiromatsu $
#   modified for ruby 1.9.3
# $Id: bogofilter-nihongo.rb,v 0.2 2009/06/04 hiromatsu $
#   modified for gems, which open4 and rmail depend on.
# $Id: bogofilter-nihongo.rb,v 0.1 2006/12/03 14:33:20 hiromatsu $
=begin
Copyright (C) 2006 Takashi Hiromatsu.  All rights reserved.

This software is based on prepare.rb. Copyright notices are also upon
it.

prepare.rb - read mail from stdin, write prepared to stdout
$Id: prepare.rb,v 1.7 2006/09/04 03:04:55 tominaga Exp $

Copyright (C) 2006 Kazuto Tominaga.  All rights reserved.

You can do to this software any combination of the following:
use, copy, modify, and distribute (henceforth called Activity),
provided that the purpose of your Activity is not directly related
to any of the following:

* aggravating confrontation among any groups of people
* exacerbating the situation of poverty and famine
* encouraging discrimination among any groups of people
* causing distrust, anxiety, or hostility among any people
* hurting somebody physically or mentally

This software is here as it is.  It has no specification that
it shall abide by, it has no correct behavior in any sense,
and it does not have any kind of expressed or implied warranty.
I have no responsibility for any result, any trouble, any damage,
or any loss related to your Activity.
=end

require 'timeout'
require 'MeCab'
require 'kconv'
require 'shellwords'

require 'rubygems'
require 'rmail'
require 'open4'

# Usage text printed (ahead of bogofilter's own output) when --help is given.
$HelpMsg = <<'EOS'
bogofilter-nihongo version 0.3

Usage: ruby bogofilter-nihongo.rb [options] < message

bogofilter-nihongo original options:
  --filter                  - work like as encoder and wakati-gaki filter
  --help                    - show this message
  --debug                   - show bogofilter process status

other options will be passed through bogofilter

EOS

# Script-wide state, all starting at 0:
#   $EXIT_STATUS - exit code of the bogofilter child; used as our own exit code
#   $USER_DEBUG  - becomes 1 when --debug is on the command line
#   $FILTER      - becomes 1 when --filter is on the command line
#   $HELP        - becomes 1 when --help is on the command line
$EXIT_STATUS = $USER_DEBUG = $FILTER = $HELP = 0

# Splits text that embeds a mail message into the text in front of the
# embedded message and the embedded message itself.
#
# The first line starting with "mime-version:" (case-insensitive) is taken
# as belonging to the embedded message's header.  The cut is made right
# after the last empty line that precedes that marker and is still followed
# by more text; everything up to and including that empty line is the
# preamble, the remainder is the embedded message.
#
# s:: text to split
#
# Returns [preamble, embedded].  When no suitable empty line precedes the
# marker, preamble is '' and embedded is the whole of s.
# When s contains no marker at all, 'internal error' is written to stderr
# and :LogicInconsistency is thrown.
def splitbody(s)
  marker = /^mime-version:/i.match(s)
  if marker.nil?
    $stderr.puts 'internal error'
    throw :LogicInconsistency
  end

  before_marker = marker.pre_match
  from_marker = marker[0] + marker.post_match

  # Index of the "\n" that forms the last empty line with text after it.
  cut = before_marker.rindex(/^$\n^/)
  return ['', before_marker + from_marker] if cut.nil?

  [before_marker[0..cut], before_marker[(cut + 1)..-1] + from_marker]
end

# Prepares one non-multipart message part for bogofilter.
#
# What happens depends on the part's Content-Type:
# * neither text/*, message/* nor untyped: the body is replaced by the
#   placeholder 'body-of-this-part-deleted';
# * message/rfc822 (or rfc2822): the body is parsed as a message, processed
#   with multipart and serialized back into this part's body;
# * text/rfc822-headers: the body is transfer-decoded and kept as is;
# * a decoded body containing a line that starts with "mime-version:" is
#   treated as text followed by an embedded mail (heuristic, for qmail
#   daemon reports): a new two-part message is built from it and processed;
# * anything else: tags are stripped (text/html) or 61-character lines are
#   dropped (other types), the text is converted to EUC-JP, segmented by
#   MeCab in wakati mode and stored one token per line.
#
# msg:: message part (an RMail message in this script); modified in place
#
# Returns msg, except in the embedded-mail case, where the newly built
# wrapper message is returned instead.
def singlepart(msg)
  typ = (msg.header['content-type'] || '').downcase

  if typ != '' && typ !~ /^text/ && typ !~ /^message/
    msg.body = 'body-of-this-part-deleted'
    return msg
  end

  if typ =~ /^message\/rfc2?822/
    inner = multipart(RMail::Parser.read(msg.body))
    msg.body = RMail::Serialize.write('', inner)	# cliche; see RMail::Serialize
    return msg
  end

  # Some mailers (e.g., Mew) add comments to the Content-Transfer-Encoding
  # value and rubymail doesn't handle that; rewrite the header to the bare
  # encoding name.
  case msg.header['content-transfer-encoding']
  when /quoted-printable/i
    msg.header.delete('content-transfer-encoding')
    msg.header['Content-Transfer-Encoding'] = 'quoted-printable'
  when /base64/i
    msg.header.delete('content-transfer-encoding')
    msg.header['Content-Transfer-Encoding'] = 'base64'
  end

  rawbody = msg.body.nil? ? '' : msg.decode

  if typ =~ /^text\/rfc2?822-headers/
    msg.body = rawbody
    return msg
  end

  # Check if there is embedded mail (heuristic, for qmail daemon report).
  if /^mime-version:/i =~ rawbody
    preamble, embedded = splitbody(rawbody)
    wrapper = RMail::Message.new
    msg.body = preamble
    wrapper.add_part(msg)
    wrapper.add_part(RMail::Parser.read(embedded))
    multipart(wrapper)	# processes the wrapper's parts in place
    return wrapper
  end

  if typ =~ /^text\/html/
    # Re-tag the bytes as binary instead of transcoding them:
    # String#encode("ASCII-8BIT") raises on any non-ASCII character, whereas
    # the tag stripping only needs to see raw bytes.
    rawbody = rawbody.dup.force_encoding('ASCII-8BIT').gsub(/<[^>]+>/, '')
  else
    # Drops lines made of exactly 61 graphic characters
    # (NOTE(review): looks aimed at uuencoded data lines -- confirm).
    rawbody = rawbody.gsub(/^[[:graph:]]{61}$/, '')
  end

  # Line breaks are removed before segmentation; MeCab's wakati output is
  # space separated, so turning spaces into newlines yields one token per line.
  wakati = MeCab::Tagger.new('-O wakati').parse(rawbody.toeuc.gsub(/\n/, ''))
  msg.body = wakati.gsub(' ', "\n")
  msg
end

# Recursively prepares a message for bogofilter.
#
# A multipart message gets each of its parts replaced by the processed
# version of that part (parts may themselves be multipart).  Anything else
# is handed to singlepart.
#
# msg:: message or message part; a multipart message is modified in place
#
# Returns msg for a multipart message, otherwise whatever singlepart returns.
def multipart(msg)
  return singlepart(msg) unless msg.multipart?

  msg.body = msg.body.map { |part| multipart(part) }
  msg
end

# Runs bogofilter with the given options, feeding it body on stdin.
#
# body::     text written to bogofilter's standard input
# bogoflag:: option string appended to the "bogofilter" command; it is
#            interpolated into a single command string
#
# Returns bogofilter's standard output with surrounding whitespace stripped.
# Side effects: stores the child's exit status in $EXIT_STATUS and, when
# $USER_DEBUG is 1, prints the pid, both output streams and the status.
def execbogofilter(body, bogoflag)
  # Presumably so that output already buffered on our stdout is written
  # before the child is spawned -- confirm.
  $stdout.flush

  pid, stdin, stdout, stderr = Open4.popen4 "bogofilter #{bogoflag}"

  # Drain both output pipes while the child runs.  Waiting for the child to
  # exit before reading deadlocks as soon as it writes more than a pipe
  # buffer holds: the child blocks on write while we block in waitpid.
  out_reader = Thread.new { stdout.read }
  err_reader = Thread.new { stderr.read }

  stdin.puts body
  stdin.close

  s = out_reader.value.strip
  err = err_reader.value.strip
  stdout.close
  stderr.close

  # Both pipes hit EOF, so reaping the child cannot block on its output.
  _, status = Process.waitpid2(pid)
  $EXIT_STATUS = status.exitstatus

  if $USER_DEBUG == 1
    puts "pid        : #{ pid }"
    puts "stdout     : #{ s }"
    puts "stderr     : #{ err }"
    puts "status     : #{ status.inspect }"
    puts "exitstatus : #{ status.exitstatus }"
  end

  s
end

# Interprets the command-line arguments and builds the option string that
# is forwarded to bogofilter.
#
# * "--filter" sets $FILTER to 1 and is not forwarded.
# * "--debug"  sets $USER_DEBUG to 1 and is not forwarded.
# * "--help"   sets $HELP to 1 and is forwarded as well.
# * Every other argument is forwarded.
#
# Forwarded arguments are shell-escaped, because the result is interpolated
# into a command string; without escaping, an argument containing whitespace
# or shell metacharacters would be split or interpreted on the way to
# bogofilter.
#
# bogoflag:: array of argument strings (ARGV in this script)
#
# Returns the forwarded arguments as one string, each followed by a space.
def bogoarg(bogoflag)
  forwarded = []

  bogoflag.each do |arg|
    case arg
    when '--filter'
      $FILTER = 1
    when '--debug'
      $USER_DEBUG = 1
    else
      $HELP = 1 if arg == '--help'
      forwarded << Shellwords.escape(arg)
    end
  end

  forwarded.map { |arg| arg + ' ' }.join
end

# Script driver.
#
# Parses ARGV with bogoarg, then:
# * with --help: prints this script's help text followed by the output of
#   bogofilter run with the forwarded options and empty input;
# * otherwise: reads a mail from stdin, processes it with multipart,
#   serializes it and either prints it directly (--filter) or pipes it
#   through bogofilter and prints bogofilter's output.
#
# Exits with $EXIT_STATUS, which execbogofilter sets from bogofilter's exit
# status (it stays 0 when bogofilter is not run).
def main
  # Garbage collection is switched off for the whole run.
  GC.disable
  forwarded = bogoarg(ARGV)

  output =
    if $HELP != 0
      $HelpMsg + execbogofilter('', forwarded)
    else
      prepared = multipart(RMail::Parser.read($stdin))
      serialized = RMail::Serialize.write('', prepared)
      $FILTER == 0 ? execbogofilter(serialized, forwarded) : serialized
    end

  puts output
  exit($EXIT_STATUS)
end

# Invoke the driver as soon as this file is loaded.
main
