--- CSVTokenizer.java	Fri Apr 25 19:14:10 2003
+++ ../../src/jp/riken/brain/ni/samuraigraph/base/SGCSVTokenizer.java	Wed Aug 17 13:12:43 2005
@@ -1,308 +1,282 @@
-package org.arhyme.csv;
-
-import java.util.*;
+ /* ------------------------------
+  * CSVTokenizer.java
+  * ------------------------------
+  * (C)opyright 2003, abupon (Manabu Hashimoto)
+  * This class is based on the CSV tokenizer found at
+  * http://sourceforge.net/projects/csvtokenizer/
+   */
+ 
+package jp.riken.brain.ni.samuraigraph.base;
+
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.NoSuchElementException;
 
 /**
- * The csv tokenizer class allows an application to break a 
- * Comma Separated Value format into tokens. 
- * The tokenization method is much simpler than 
- * the one used by the <code>StringTokenizer</code> class. The 
- * <code>CSVTokenizer</code> methods do not distinguish among 
- * identifiers, numbers, and quoted strings, nor do they recognize 
- * and skip comments. 
+ * The csv tokenizer class allows an application to break a Comma Separated
+ * Value format into tokens. The tokenization method is much simpler than the
+ * one used by the <code>StringTokenizer</code> class. The
+ * <code>CSVTokenizer</code> methods do not distinguish among identifiers,
+ * numbers, and quoted strings, nor do they recognize and skip comments.
+ * <p>
+ * The separator is chosen when the tokenizer is created: a comma if the record
+ * contains a comma outside double quotes, and white space otherwise.
  * <p>
- * The set of separator (the characters that separate tokens) may 
- * be specified either at creation time or on a per-token basis. 
+ * A <tt>CSVTokenizer</tt> object internally maintains a current position
+ * within the string to be tokenized. Some operations advance this current
+ * position past the characters processed.
  * <p>
- * An instance of <code>CSVTokenizer</code> behaves in one of two 
- * ways, depending on whether it was created with the 
- * <code>returnSeparators</code> flag having the value <code>true</code> 
- * or <code>false</code>: 
- * <ul>
- * <li>If the flag is <code>false</code>, delimiter characters serve to 
- *     separate tokens. A token is a maximal sequence of consecutive 
- *     characters that are not separator. 
- * <li>If the flag is <code>true</code>, delimiter characters are themselves 
- *     considered to be tokens. A token is thus either one delimiter 
- *     character, or a maximal sequence of consecutive characters that are 
- *     not separator.
- * </ul><p>
- * A <tt>CSVTokenizer</tt> object internally maintains a current 
- * position within the string to be tokenized. Some operations advance this 
- * current position past the characters processed.<p>
- * A token is returned by taking a substring of the string that was used to 
+ * A token is returned by taking a substring of the string that was used to
  * create the <tt>CSVTokenizer</tt> object.
  * <p>
  * The following is one example of the use of the tokenizer. The code:
- * <blockquote><pre>
- *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
- *     while (csvt.hasMoreTokens()) {
- *         println(csvt.nextToken());
- *     }
- * </pre></blockquote>
+ * <blockquote>
+ * 
+ * <pre>
+ * SGCSVTokenizer csvt = new SGCSVTokenizer(&quot;this,is,a,test&quot;, false);
+ * while (csvt.hasMoreTokens()) {
+ * 	println(csvt.nextToken());
+ * }
+ * </pre>
+ * 
+ * </blockquote>
  * <p>
- * prints the following output:
- * <blockquote><pre>
- *     this
- *     is
- *     a
- *     test
- * </pre></blockquote>
- * @author  abupon
+ * prints the following output: <blockquote>
+ * 
+ * <pre>
+ * 
+ *      this
+ *      is
+ *      a
+ *      test
+ *  
+ * </pre>
+ * 
+ * </blockquote>
+ * 
+ * @author abupon
  * @version
- * @see     
- * @since   
-*/
-public class CSVTokenizer implements Enumeration {
+ * @see
+ * @since
+ */
+public class SGCSVTokenizer implements Enumeration {
+	
 	private String record;
-	private String separator;
-	private String quate;
-	private boolean returnSeparators;
-
 	private int currentIndex;
 
-	private static final String COMMA = ",";
-	private static final String TAB = "\t";
-	private static final String SPACE = " ";
-
 	private static final String DOUBLE_QUATE = "\"";
-	private static final String SINGLE_QUATE = "'";
+	private static final String WHITE_SPACE = " \t\n\r\f,";
+	private static final String SEPARATOR_COMMA = ",";
+	private static final int SEPARATOR_LEN = 1;
+	private static final int DOUBLE_QUATE_LEN = 1;
+
+	private ArrayList mWhiteSpaceList = new ArrayList();
+	private boolean is_csv_mode = false;
+	private boolean is_comment_line = false;
 
 	/**
-	 * Constructs a csv tokenizer for the specified string.   
-	 * <code>theSeparator</code> argument is the separator 
-	 * for separating tokens. 
-	 * <p>
-	 * If the <code>returnSeparators</code> flag is <code>true</code>, 
-	 * then the separator string is also returned as tokens. 
-	 * separator is returned as a string. If the flag is 
-	 * <code>false</code>, the separator string is skipped and only 
-	 * serve as separator between tokens. 
-	 *
-	 * @param	aString			a string to be parsed.
-	 * @param	theSeparator	the separator 
-	 * 							(CSVTokenizer.COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
-	 * @param	theQuate		the quate 
-	 * 							(CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
-	 * @param	fragReturnSeparators	flag indicating whether to return the separator
-	 * 							as tokens.
+	 * Constructs a csv tokenizer for the specified string. The record is
+	 * split on commas if it contains a comma outside double quotes, and on
+	 * white space otherwise.
+	 * 
+	 * @param aString
+	 *            a string to be parsed.
+	 * @param isDataFile
+	 *            if <code>true</code>, a record starting with "#" is a comment line and yields no tokens
 	 */
-	public CSVTokenizer(
-		String aString,
-		String theSeparator,
-		String theQuate,
-		boolean fragReturnSeparators) {
+	public SGCSVTokenizer(final String aString, final boolean isDataFile) {
 		this.record = aString.trim();
-		this.separator = theSeparator;
-		this.quate = theQuate;
-		this.returnSeparators = fragReturnSeparators;
 		this.currentIndex = 0;
+		for (int ii = 0; ii < WHITE_SPACE.length(); ii++)
+			this.mWhiteSpaceList.add(new Character(WHITE_SPACE.charAt(ii)));
+		// check comment line
+		if ( isDataFile && this.record.startsWith("#") )
+			is_comment_line = true;
+		// check comma separated mode
+		char c;
+		boolean in_quote = false;
+		for (int ii = 0; ii < this.record.length(); ii++){
+			c = this.record.charAt(ii);
+			if(in_quote){
+				if( c == '"')
+					in_quote = false;
+			}else{
+				if( c == '"'){
+					in_quote = true;
+				}else if( c == ',' ){
+					is_csv_mode = true;
+				}
+			}
+		}
 	}
 
-	/**
-	 * Constructs a csv tokenizer for the specified string. All  
-	 * characters in the <code>theSeparator</code> argument are the separator 
-	 * for separating tokens. 
-	 * <p>
-	 * If the <code>returnSeparators</code> flag is <code>true</code>, then 
-	 * the separator string is also returned as tokens.
-	 * separator is returned as a string. If the flag is 
-	 * <code>false</code>, the Separator string is skipped and only 
-	 * serve as separator between tokens. 
-	 *
-	 * @param	aString			a string to be parsed.
-	 * @param	theSeparator	the separator 
-	 * 							(CSVTokenizer.COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
-	 * @param	fragReturnSeparators	flag indicating whether to return the separator
-	 * 							as tokens.
-	 */
-	public CSVTokenizer(
-		String aString,
-		String theSeparator,
-		boolean fragReturnSeparators) {
-		this(
-			aString,
-			theSeparator,
-			CSVTokenizer.DOUBLE_QUATE,
-			fragReturnSeparators);
-	}
-
-	/**
-	 * Constructs a csv tokenizer for the specified string. The 
-	 * characters in the <code>theSeparator</code> argument are 
-	 * the separator for separating tokens. 
-	 * Separator string themselves will not be treated as tokens.
-	 * 
-	 * @param	aString			a string to be parsed.
-	 * @param	theSeparator	the separator 
-	 * 							(CSVTokenizer.COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
-	 */
-	public CSVTokenizer(String aString, String theSeparator) {
-		this(aString, theSeparator, false);
-	}
-
-	/**
-	 * Constructs a string tokenizer for the specified string. The 
-	 * tokenizer uses the default separator set, which is 
-	 * <code>CSVTokenizer.COMMA</code>. 
-	 * Separator string themselves will not be treated as tokens.
-	 * 
-	 * @param	aString			a string to be parsed.
-	 * 
-	 */
-	public CSVTokenizer(String aString) {
-		this(aString, CSVTokenizer.COMMA);
-	}
 
 	/**
-	 * Tests if there are more tokens available from this tokenizer's string. 
-	 * If this method returns <tt>true</tt>, then a subsequent call to 
+	 * Tests if there are more tokens available from this tokenizer's string. If
+	 * this method returns <tt>true</tt>, then a subsequent call to
 	 * <tt>nextToken</tt> with no argument will successfully return a token.
-	 *
-	 * @return  <code>true</code> if and only if there is at least one token 
-	 *          in the string after the current position; <code>false</code> 
-	 *          otherwise.
+	 * 
+	 * @return <code>true</code> if and only if there is at least one token in
+	 *         the string after the current position; <code>false</code>
+	 *         otherwise.
 	 */
 	public boolean hasMoreTokens() {
-		return (this.currentIndex < this.record.length());
+		if( is_comment_line )
+			return false;
+		return (this.currentIndex >= 0);
 	}
 
 	/**
 	 * Returns the next token from this string tokenizer.
-	 *
-	 * @return     the next token from this string tokenizer.
-	 * @exception  NoSuchElementException  if there are no more tokens in this
-	 *               tokenizer's string.
-	 * @exception  IllegalArgumentException if given parameter string format was wrong  
+	 * 
+	 * @return the next token from this string tokenizer.
+	 * @exception NoSuchElementException
+	 *                if there are no more tokens in this tokenizer's string.
+	 * @exception IllegalArgumentException
+	 *                if given parameter string format was wrong
 	 */
-	public String nextToken()
-		throws NoSuchElementException, IllegalArgumentException {
+	public String nextToken() throws NoSuchElementException, IllegalArgumentException {
 		String token = null;
 		int start;
 		int end;
-
 		if (!this.hasMoreTokens()) {
 			throw new NoSuchElementException();
+		}
+		if (this.record.startsWith(SGCSVTokenizer.DOUBLE_QUATE, this.currentIndex)) {
+		    String rec = this.record.substring(this.currentIndex + SGCSVTokenizer.DOUBLE_QUATE_LEN);
+		    token = "";
+		    for (;;) {
+		        end = rec.indexOf(SGCSVTokenizer.DOUBLE_QUATE);
+		        if (end < 0) {
+		            throw new IllegalArgumentException("Illegal format");
+		        }
+		        if (!rec.startsWith(SGCSVTokenizer.DOUBLE_QUATE, end + 1)) {
+		            token = token + rec.substring(0, end);
+		            break;
+		        }
+		        token = token + rec.substring(0, end + 1);
+		        rec = rec.substring(end + SGCSVTokenizer.DOUBLE_QUATE_LEN * 2);
+		        this.currentIndex++;
+		    }
+		    // don't trim string
+		    this.currentIndex += (token.length() + SGCSVTokenizer.DOUBLE_QUATE_LEN * 2	+ SGCSVTokenizer.SEPARATOR_LEN);
+		    if ( !is_csv_mode ){
+		        this.currentIndex = nextTokenIndexOf(this.currentIndex);
+		    }
+		    if(this.currentIndex >= this.record.length())
+		        this.currentIndex = -1;
 		} else {
-			if (this.record.startsWith(this.quate, this.currentIndex)) {
-				String rec = this.record.substring(this.currentIndex + this.quate.length());
-				token = "";
-				for (;;) {
-					end = rec.indexOf(this.quate);
-					if (end < 0) {
-						throw new IllegalArgumentException("Illegal format");
-					}
-					if (!rec.startsWith(this.quate, end + 1)) {
-						token = token + rec.substring(0, end);
-						break;
-					}
-					token = token + rec.substring(0, end + 1);
-					rec = rec.substring(end + this.quate.length() * 2);
-					this.currentIndex++;
-				}
-				this.currentIndex += (token.length() + this.quate.length() * 2 + this.separator.length());
-			} else if (
-				(end = this.record.indexOf(this.separator, this.currentIndex))
-					>= 0) {
-				start = this.currentIndex;
-				token = this.record.substring(start, end);
-				this.currentIndex = end + separator.length();
-			} else {
-				start = this.currentIndex;
-				token = this.record.substring(start);
-				this.currentIndex = this.record.length();
-			}
+		    start = this.currentIndex;
+		    if ( is_csv_mode )
+		        end = this.record.indexOf(SEPARATOR_COMMA, this.currentIndex);
+		    else
+		        end = nextSeparatorIndexOf(this.currentIndex);
+		    if (end >= 0) {
+		        token = this.record.substring(start, end);
+		        if ( is_csv_mode ){
+		            this.currentIndex = end + SEPARATOR_LEN;
+		        } else {
+		            this.currentIndex = nextTokenIndexOf(end);
+		            if(this.currentIndex == this.record.length())
+		                this.currentIndex = -1;
+		        }
+		    } else {
+		        // end of line reached
+		        if(this.currentIndex == this.record.length())
+		            token = "";
+		        else
+		            token = this.record.substring(start);
+		        this.currentIndex = -1;
+		    }
+		    token = token.trim();
 		}
-
 		return token;
 	}
-
-	/**
-	 * Returns the next token in this string tokenizer's string. First, 
-	 * the set of characters considered to be separator by this 
-	 * <tt>CSVTokenizer</tt> object is changed to be the characters in 
-	 * the string <tt>separator</tt>. Then the next token in the string
-	 * after the current position is returned. The current position is 
-	 * advanced beyond the recognized token.  The new delimiter set 
-	 * remains the default after this call. 
-	 *
-	 * @param      theSeparator   the new separator.
-	 * @return     the next token, after switching to the new delimiter set.
-	 * @exception  NoSuchElementException  if there are no more tokens in this
-	 *               tokenizer's string.
-	 */
-	public String nextToken(String theSeparator) {
-		separator = theSeparator;
-		return nextToken();
+	
+	private int nextSeparatorIndexOf(int fromIndex) {
+		char c;
+		int cnt = 0;
+		int ii;
+		int len = this.record.length();
+		if(len == fromIndex) return -1;
+		for (ii = fromIndex; ii < len; ii++) {
+			c = this.record.charAt(ii);
+			if ( this.mWhiteSpaceList.contains(new Character(c)) )
+				break;
+			cnt++;
+		}
+		if(ii == len) return -1;
+		return cnt + fromIndex;
+	}
+	
+	private int nextTokenIndexOf(int fromIndex) {
+		char c;
+		int cnt = 0;
+		int len = this.record.length();
+		for (int ii = fromIndex; ii < len; ii++) {
+			c = this.record.charAt(ii);
+			if ( !this.mWhiteSpaceList.contains(new Character(c)) ) {
+				break;
+			}
+			cnt++;
+		}
+		return cnt + fromIndex;
 	}
 
 	/**
-	 * Returns the same value as the <code>hasMoreTokens</code>
-	 * method. It exists so that this class can implement the
-	 * <code>Enumeration</code> interface. 
-	 *
-	 * @return  <code>true</code> if there are more tokens;
-	 *          <code>false</code> otherwise.
-	 * @see     java.util.Enumeration
-	 * @see     java.util.CSVTokenizer#hasMoreTokens()
+	 * Returns the same value as the <code>hasMoreTokens</code> method. It
+	 * exists so that this class can implement the <code>Enumeration</code>
+	 * interface.
+	 * 
+	 * @return <code>true</code> if there are more tokens; <code>false</code>
+	 *         otherwise.
+	 * @see java.util.Enumeration
+	 * @see #hasMoreTokens()
 	 */
 	public boolean hasMoreElements() {
 		return hasMoreTokens();
 	}
 
 	/**
-	 * Returns the same value as the <code>nextToken</code> method,
-	 * except that its declared return value is <code>Object</code> rather than
+	 * Returns the same value as the <code>nextToken</code> method, except
+	 * that its declared return value is <code>Object</code> rather than
 	 * <code>String</code>. It exists so that this class can implement the
-	 * <code>Enumeration</code> interface. 
-	 *
-	 * @return     the next token in the string.
-	 * @exception  NoSuchElementException  if there are no more tokens in this
-	 *               tokenizer's string.
-	 * @see        java.util.Enumeration
-	 * @see        java.util.CSVTokenizer#nextToken()
+	 * <code>Enumeration</code> interface.
+	 * 
+	 * @return the next token in the string.
+	 * @exception NoSuchElementException
+	 *                if there are no more tokens in this tokenizer's string.
+	 * @see java.util.Enumeration
+	 * @see #nextToken()
 	 */
 	public Object nextElement() {
 		return nextToken();
 	}
 
-	/**
-	 * Calculates the number of times that this tokenizer's 
-	 * <code>nextToken</code> method can be called before it generates an 
-	 * exception. The current position is not advanced.
-	 *
-	 * @return  the number of tokens remaining in the string using the current
-	 *          delimiter set.
-	 * @see     java.util.CSVTokenizer#nextToken()
-	 */
-	public int countTokens() {
-		int count = 0;
-		
-		// <TODO>
-		int preserve = this.currentIndex;
-		while (this.hasMoreTokens()) {
-			this.nextToken();
-			count++;
+	public static void main(String[] args) {
+		int i = 1;
+		String str;
+		String expect;
+		String result;
+
+		str = "1, \t 2, \"\" 3,  4, \"a,  \"\"\\hoge\"";
+		str = "1, \t 2, \"\" 3, b 4, a hoge";
+		System.out.println("String : [" + str + "]");
+		SGCSVTokenizer csvt = new SGCSVTokenizer(str, true);
+		i = 1;
+		while (csvt.hasMoreTokens()) {
+			try {
+				expect = String.valueOf(i++);
+				result = csvt.nextToken();
+				System.out.print(expect + ": [");
+				System.out.println(result + "]");
+			} catch (NoSuchElementException e) {
+				e.printStackTrace();
+				System.exit(-1);
+			}
 		}
-		this.currentIndex = preserve;
-		// </TODO>
 		
-		return count;
-	}
 
-	/**
-	 * Returns the quate.
-	 * @return char
-	 */
-	public String getQuate() {
-		return this.quate;
 	}
 
-	/**
-	 * Sets the quate.
-	 * @param quate The quate to set
-	 */
-	public void setQuate(String quate) {
-		this.quate = quate;
-	}
 }
