# -*- coding: utf-8 -*-

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import MeCab


# Tokens that are dropped from MeCab's output instead of being returned as
# words (see MeCabAdaptor.parse). These are not arguments passed to MeCab.
NG_WORDS = ('*', '?')
# Disabled replacement table, kept for reference:
#REPLACE_WORDS = (('スペース',' '),)

class MeCabAdaptor(object):
	"""
	Thin wrapper around MeCab.Tagger.

	The tagger is created with the '-Owakati' option and its output is
	split on whitespace to obtain the individual words.
	"""
	def __init__(self):
		self.adaptor = MeCab.Tagger('-Owakati')

	def parse(self, txt):
		"""
		Split txt into words with MeCab.

		txt is the string handed to the tagger. The return value is a
		list of the whitespace-separated tokens of the tagger output, in
		order, with every token listed in NG_WORDS removed.

		A real list is returned (not a generator) so that callers can
		test it for emptiness, take its len(), index it and iterate it
		more than once.
		"""
		# NOTE(review): earlier revisions experimented with wrapping this
		# call in adaptor.lock()/unlock(). MTJSplitter shares one adaptor
		# at class level -- confirm the tagger's thread-safety before
		# using it from several threads.
		words = self.adaptor.parse(txt).split()
		# str.split() without arguments never yields empty strings, so
		# only the NG_WORDS filter is needed here.
		return [w for w in words if w not in NG_WORDS]

class MTJSplitter(object):
	"""
	MTJSplitter: MeCab-based Japanese word splitter.

	Declares ISplitter through the __implements__ attribute and hands
	the actual word splitting to a MeCabAdaptor.
	"""
	__implements__ = ISplitter

	# Built once, when the class body is executed, and therefore shared
	# by every MTJSplitter instance.
	mecabAdaptor = MeCabAdaptor()

	def process(self, text, glob=0):
		"""
		Split the given text into words.

		text is an iterable of strings; the strings are joined with a
		single space and the combined string is passed to the shared
		adaptor, whose result is returned unchanged.

		glob is accepted as part of the signature but is not used here.
		"""
		return self.mecabAdaptor.parse(' '.join(text))

# Module-level side effect: importing this module registers the MTJSplitter
# class with the ZCTextIndex element factory, under the 'Word Splitter'
# group and the name 'MTJSplitter'.
element_factory.registerFactory('Word Splitter', 'MTJSplitter', MTJSplitter)
