001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.List;
022import java.util.ListIterator;
023import java.util.NoSuchElementException;
024
025import org.apache.commons.text.matcher.StringMatcher;
026import org.apache.commons.text.matcher.StringMatcherFactory;
027
028/**
029 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
030 * <p>
031 * This class can split a String into many smaller strings. It aims to do a similar job to
032 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
033 * implementing the <code>ListIterator</code> interface. By default, it is set up like <code>StringTokenizer</code>.
034 * <p>
035 * The input String is split into a number of <i>tokens</i>. Each token is separated from the next String by a
036 * <i>delimiter</i>. One or more delimiter characters must be specified.
037 * <p>
038 * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
039 * escaped within a quoted section by duplicating itself.
040 * <p>
041 * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
042 * specifies these characters. One usage might be to trim whitespace characters.
043 * <p>
044 * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
045 * these characters to be removed. One usage might be to remove new line characters.
046 * <p>
047 * Empty tokens may be removed or returned as null.
048 *
049 * <pre>
050 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
051 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
052 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
053 * </pre>
054 *
055 * <table>
056 * <caption>StringTokenizer properties and options</caption>
057 * <tr>
058 * <th>Property</th>
059 * <th>Type</th>
060 * <th>Default</th>
061 * </tr>
062 * <tr>
063 * <td>delim</td>
064 * <td>CharSetMatcher</td>
065 * <td>{ \t\n\r\f}</td>
066 * </tr>
067 * <tr>
068 * <td>quote</td>
069 * <td>NoneMatcher</td>
070 * <td>{}</td>
071 * </tr>
072 * <tr>
073 * <td>ignore</td>
074 * <td>NoneMatcher</td>
075 * <td>{}</td>
076 * </tr>
077 * <tr>
078 * <td>emptyTokenAsNull</td>
079 * <td>boolean</td>
080 * <td>false</td>
081 * </tr>
082 * <tr>
083 * <td>ignoreEmptyTokens</td>
084 * <td>boolean</td>
085 * <td>true</td>
086 * </tr>
087 * </table>
088 *
089 * @since 1.3
090 */
091public class StringTokenizer implements ListIterator<String>, Cloneable {
092
    /** Comma separated values tokenizer internal variable. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        // CSV prototype: comma delimited, double-quoted, whitespace trimmed.
        // Empty tokens are kept and returned as "" (not dropped, not nulled).
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        // TSV prototype: identical to CSV except the delimiter is a tab.
        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }
114
    /** The text to work on; null means no text to parse. */
    private char[] chars;
    /** The parsed tokens; null until {@code checkTokenized()} runs (lazy tokenization). */
    private String[] tokens;
    /** The current iteration position; an index into {@code tokens}. */
    private int tokenPos;

    /** The delimiter matcher; defaults to splitting on whitespace as per StringTokenizer. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
    /** The quote matcher; defaults to no quoting. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The ignored matcher; defaults to ignoring nothing. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The trimmer matcher; defaults to trimming nothing. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;
135
136    // -----------------------------------------------------------------------
137
138    /**
139     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
140     *
141     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
142     */
143    private static StringTokenizer getCSVClone() {
144        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
145    }
146
147    /**
148     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
149     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
150     * setTrimmer method).
151     * <p>
152     * You must call a "reset" method to set the string which you want to parse.
153     *
154     * @return a new tokenizer instance which parses Comma Separated Value strings
155     */
156    public static StringTokenizer getCSVInstance() {
157        return getCSVClone();
158    }
159
160    /**
161     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
162     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
163     * setTrimmer method).
164     *
165     * @param input
166     *            the text to parse
167     * @return a new tokenizer instance which parses Comma Separated Value strings
168     */
169    public static StringTokenizer getCSVInstance(final String input) {
170        final StringTokenizer tok = getCSVClone();
171        tok.reset(input);
172        return tok;
173    }
174
175    /**
176     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
177     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
178     * setTrimmer method).
179     *
180     * @param input
181     *            the text to parse
182     * @return a new tokenizer instance which parses Comma Separated Value strings
183     */
184    public static StringTokenizer getCSVInstance(final char[] input) {
185        final StringTokenizer tok = getCSVClone();
186        tok.reset(input);
187        return tok;
188    }
189
190    /**
191     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
192     *
193     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
194     */
195    private static StringTokenizer getTSVClone() {
196        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
197    }
198
199    /**
200     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
201     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
202     * <p>
203     * You must call a "reset" method to set the string which you want to parse.
204     *
205     * @return a new tokenizer instance which parses Tab Separated Value strings.
206     */
207    public static StringTokenizer getTSVInstance() {
208        return getTSVClone();
209    }
210
211    /**
212     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
213     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
214     *
215     * @param input
216     *            the string to parse
217     * @return a new tokenizer instance which parses Tab Separated Value strings.
218     */
219    public static StringTokenizer getTSVInstance(final String input) {
220        final StringTokenizer tok = getTSVClone();
221        tok.reset(input);
222        return tok;
223    }
224
225    /**
226     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
227     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
228     *
229     * @param input
230     *            the string to parse
231     * @return a new tokenizer instance which parses Tab Separated Value strings.
232     */
233    public static StringTokenizer getTSVInstance(final char[] input) {
234        final StringTokenizer tok = getTSVClone();
235        tok.reset(input);
236        return tok;
237    }
238
239    // -----------------------------------------------------------------------
240    /**
241     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
242     * tokenize.
243     * <p>
244     * This constructor is normally used with {@link #reset(String)}.
245     */
246    public StringTokenizer() {
247        super();
248        this.chars = null;
249    }
250
251    /**
252     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
253     *
254     * @param input
255     *            the string which is to be parsed
256     */
257    public StringTokenizer(final String input) {
258        super();
259        if (input != null) {
260            chars = input.toCharArray();
261        } else {
262            chars = null;
263        }
264    }
265
266    /**
267     * Constructs a tokenizer splitting on the specified delimiter character.
268     *
269     * @param input
270     *            the string which is to be parsed
271     * @param delim
272     *            the field delimiter character
273     */
274    public StringTokenizer(final String input, final char delim) {
275        this(input);
276        setDelimiterChar(delim);
277    }
278
279    /**
280     * Constructs a tokenizer splitting on the specified delimiter string.
281     *
282     * @param input
283     *            the string which is to be parsed
284     * @param delim
285     *            the field delimiter string
286     */
287    public StringTokenizer(final String input, final String delim) {
288        this(input);
289        setDelimiterString(delim);
290    }
291
292    /**
293     * Constructs a tokenizer splitting using the specified delimiter matcher.
294     *
295     * @param input
296     *            the string which is to be parsed
297     * @param delim
298     *            the field delimiter matcher
299     */
300    public StringTokenizer(final String input, final StringMatcher delim) {
301        this(input);
302        setDelimiterMatcher(delim);
303    }
304
305    /**
306     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
307     * quote character.
308     *
309     * @param input
310     *            the string which is to be parsed
311     * @param delim
312     *            the field delimiter character
313     * @param quote
314     *            the field quoted string character
315     */
316    public StringTokenizer(final String input, final char delim, final char quote) {
317        this(input, delim);
318        setQuoteChar(quote);
319    }
320
321    /**
322     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
323     * quote matcher.
324     *
325     * @param input
326     *            the string which is to be parsed
327     * @param delim
328     *            the field delimiter matcher
329     * @param quote
330     *            the field quoted string matcher
331     */
332    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
333        this(input, delim);
334        setQuoteMatcher(quote);
335    }
336
337    /**
338     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
339     *
340     * @param input
341     *            the string which is to be parsed, not cloned
342     */
343    public StringTokenizer(final char[] input) {
344        super();
345        if (input == null) {
346            this.chars = null;
347        } else {
348            this.chars = input.clone();
349        }
350    }
351
352    /**
353     * Constructs a tokenizer splitting on the specified character.
354     *
355     * @param input
356     *            the string which is to be parsed, not cloned
357     * @param delim
358     *            the field delimiter character
359     */
360    public StringTokenizer(final char[] input, final char delim) {
361        this(input);
362        setDelimiterChar(delim);
363    }
364
365    /**
366     * Constructs a tokenizer splitting on the specified string.
367     *
368     * @param input
369     *            the string which is to be parsed, not cloned
370     * @param delim
371     *            the field delimiter string
372     */
373    public StringTokenizer(final char[] input, final String delim) {
374        this(input);
375        setDelimiterString(delim);
376    }
377
378    /**
379     * Constructs a tokenizer splitting using the specified delimiter matcher.
380     *
381     * @param input
382     *            the string which is to be parsed, not cloned
383     * @param delim
384     *            the field delimiter matcher
385     */
386    public StringTokenizer(final char[] input, final StringMatcher delim) {
387        this(input);
388        setDelimiterMatcher(delim);
389    }
390
391    /**
392     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
393     * quote character.
394     *
395     * @param input
396     *            the string which is to be parsed, not cloned
397     * @param delim
398     *            the field delimiter character
399     * @param quote
400     *            the field quoted string character
401     */
402    public StringTokenizer(final char[] input, final char delim, final char quote) {
403        this(input, delim);
404        setQuoteChar(quote);
405    }
406
407    /**
408     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
409     * quote matcher.
410     *
411     * @param input
412     *            the string which is to be parsed, not cloned
413     * @param delim
414     *            the field delimiter character
415     * @param quote
416     *            the field quoted string character
417     */
418    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
419        this(input, delim);
420        setQuoteMatcher(quote);
421    }
422
423    // API
424    // -----------------------------------------------------------------------
425    /**
426     * Gets the number of tokens found in the String.
427     *
428     * @return the number of matched tokens
429     */
430    public int size() {
431        checkTokenized();
432        return tokens.length;
433    }
434
435    /**
436     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
437     * {@link NoSuchElementException} when no tokens remain.
438     *
439     * @return the next sequential token, or null when no more tokens are found
440     */
441    public String nextToken() {
442        if (hasNext()) {
443            return tokens[tokenPos++];
444        }
445        return null;
446    }
447
448    /**
449     * Gets the previous token from the String.
450     *
451     * @return the previous sequential token, or null when no more tokens are found
452     */
453    public String previousToken() {
454        if (hasPrevious()) {
455            return tokens[--tokenPos];
456        }
457        return null;
458    }
459
460    /**
461     * Gets a copy of the full token list as an independent modifiable array.
462     *
463     * @return the tokens as a String array
464     */
465    public String[] getTokenArray() {
466        checkTokenized();
467        return tokens.clone();
468    }
469
470    /**
471     * Gets a copy of the full token list as an independent modifiable list.
472     *
473     * @return the tokens as a String array
474     */
475    public List<String> getTokenList() {
476        checkTokenized();
477        final List<String> list = new ArrayList<>(tokens.length);
478        Collections.addAll(list, tokens);
479
480        return list;
481    }
482
483    /**
484     * Resets this tokenizer, forgetting all parsing and iteration already completed.
485     * <p>
486     * This method allows the same tokenizer to be reused for the same String.
487     *
488     * @return this, to enable chaining
489     */
490    public StringTokenizer reset() {
491        tokenPos = 0;
492        tokens = null;
493        return this;
494    }
495
496    /**
497     * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
498     * same settings on multiple input lines.
499     *
500     * @param input
501     *            the new string to tokenize, null sets no text to parse
502     * @return this, to enable chaining
503     */
504    public StringTokenizer reset(final String input) {
505        reset();
506        if (input != null) {
507            this.chars = input.toCharArray();
508        } else {
509            this.chars = null;
510        }
511        return this;
512    }
513
514    /**
515     * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
516     * same settings on multiple input lines.
517     *
518     * @param input
519     *            the new character array to tokenize, not cloned, null sets no text to parse
520     * @return this, to enable chaining
521     */
522    public StringTokenizer reset(final char[] input) {
523        reset();
524        if (input != null) {
525            this.chars = input.clone();
526        } else {
527            this.chars = null;
528        }
529        return this;
530    }
531
532    // ListIterator
533    // -----------------------------------------------------------------------
534    /**
535     * Checks whether there are any more tokens.
536     *
537     * @return true if there are more tokens
538     */
539    @Override
540    public boolean hasNext() {
541        checkTokenized();
542        return tokenPos < tokens.length;
543    }
544
545    /**
546     * Gets the next token.
547     *
548     * @return the next String token
549     * @throws NoSuchElementException
550     *             if there are no more elements
551     */
552    @Override
553    public String next() {
554        if (hasNext()) {
555            return tokens[tokenPos++];
556        }
557        throw new NoSuchElementException();
558    }
559
560    /**
561     * Gets the index of the next token to return.
562     *
563     * @return the next token index
564     */
565    @Override
566    public int nextIndex() {
567        return tokenPos;
568    }
569
570    /**
571     * Checks whether there are any previous tokens that can be iterated to.
572     *
573     * @return true if there are previous tokens
574     */
575    @Override
576    public boolean hasPrevious() {
577        checkTokenized();
578        return tokenPos > 0;
579    }
580
581    /**
582     * Gets the token previous to the last returned token.
583     *
584     * @return the previous token
585     */
586    @Override
587    public String previous() {
588        if (hasPrevious()) {
589            return tokens[--tokenPos];
590        }
591        throw new NoSuchElementException();
592    }
593
594    /**
595     * Gets the index of the previous token.
596     *
597     * @return the previous token index
598     */
599    @Override
600    public int previousIndex() {
601        return tokenPos - 1;
602    }
603
604    /**
605     * Unsupported ListIterator operation.
606     *
607     * @throws UnsupportedOperationException
608     *             always
609     */
610    @Override
611    public void remove() {
612        throw new UnsupportedOperationException("remove() is unsupported");
613    }
614
615    /**
616     * Unsupported ListIterator operation.
617     *
618     * @param obj
619     *            this parameter ignored.
620     * @throws UnsupportedOperationException
621     *             always
622     */
623    @Override
624    public void set(final String obj) {
625        throw new UnsupportedOperationException("set() is unsupported");
626    }
627
628    /**
629     * Unsupported ListIterator operation.
630     *
631     * @param obj
632     *            this parameter ignored.
633     * @throws UnsupportedOperationException
634     *             always
635     */
636    @Override
637    public void add(final String obj) {
638        throw new UnsupportedOperationException("add() is unsupported");
639    }
640
641    // Implementation
642    // -----------------------------------------------------------------------
643    /**
644     * Checks if tokenization has been done, and if not then do it.
645     */
646    private void checkTokenized() {
647        if (tokens == null) {
648            if (chars == null) {
649                // still call tokenize as subclass may do some work
650                final List<String> split = tokenize(null, 0, 0);
651                tokens = split.toArray(new String[split.size()]);
652            } else {
653                final List<String> split = tokenize(chars, 0, chars.length);
654                tokens = split.toArray(new String[split.size()]);
655            }
656        }
657    }
658
659    /**
660     * Internal method to performs the tokenization.
661     * <p>
662     * Most users of this class do not need to call this method. This method will be called automatically by other
663     * (public) methods when required.
664     * <p>
665     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
666     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
667     * strings. It is also be possible to filter the results.
668     * <p>
669     * <code>StrTokenizer</code> will always pass a zero offset and a count equal to the length of the array to this
670     * method, however a subclass may pass other values, or even an entirely different array.
671     *
672     * @param srcChars
673     *            the character array being tokenized, may be null
674     * @param offset
675     *            the start position within the character array, must be valid
676     * @param count
677     *            the number of characters to tokenize, must be valid
678     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
679     */
680    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
681        if (srcChars == null || count == 0) {
682            return Collections.emptyList();
683        }
684        final TextStringBuilder buf = new TextStringBuilder();
685        final List<String> tokenList = new ArrayList<>();
686        int pos = offset;
687
688        // loop around the entire buffer
689        while (pos >= 0 && pos < count) {
690            // find next token
691            pos = readNextToken(srcChars, pos, count, buf, tokenList);
692
693            // handle case where end of string is a delimiter
694            if (pos >= count) {
695                addToken(tokenList, "");
696            }
697        }
698        return tokenList;
699    }
700
701    /**
702     * Adds a token to a list, paying attention to the parameters we've set.
703     *
704     * @param list
705     *            the list to add to
706     * @param tok
707     *            the token to add
708     */
709    private void addToken(final List<String> list, String tok) {
710        if (tok == null || tok.length() == 0) {
711            if (isIgnoreEmptyTokens()) {
712                return;
713            }
714            if (isEmptyTokenAsNull()) {
715                tok = null;
716            }
717        }
718        list.add(tok);
719    }
720
721    /**
722     * Reads character by character through the String to get the next token.
723     *
724     * @param srcChars
725     *            the character array being tokenized
726     * @param start
727     *            the first character of field
728     * @param len
729     *            the length of the character array being tokenized
730     * @param workArea
731     *            a temporary work area
732     * @param tokenList
733     *            the list of parsed tokens
734     * @return the starting position of the next field (the character immediately after the delimiter), or -1 if end of
735     *         string found
736     */
737    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
738            final List<String> tokenList) {
739        // skip all leading whitespace, unless it is the
740        // field delimiter or the quote character
741        while (start < len) {
742            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
743                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
744            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
745                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
746                break;
747            }
748            start += removeLen;
749        }
750
751        // handle reaching end
752        if (start >= len) {
753            addToken(tokenList, "");
754            return -1;
755        }
756
757        // handle empty token
758        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
759        if (delimLen > 0) {
760            addToken(tokenList, "");
761            return start + delimLen;
762        }
763
764        // handle found token
765        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
766        if (quoteLen > 0) {
767            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
768        }
769        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
770    }
771
772    /**
773     * Reads a possibly quoted string token.
774     *
775     * @param srcChars
776     *            the character array being tokenized
777     * @param start
778     *            the first character of field
779     * @param len
780     *            the length of the character array being tokenized
781     * @param workArea
782     *            a temporary work area
783     * @param tokenList
784     *            the list of parsed tokens
785     * @param quoteStart
786     *            the start position of the matched quote, 0 if no quoting
787     * @param quoteLen
788     *            the length of the matched quote, 0 if no quoting
789     * @return the starting position of the next field (the character immediately after the delimiter, or if end of
790     *         string found, then the length of string
791     */
792    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
793            final List<String> tokenList, final int quoteStart, final int quoteLen) {
794        // Loop until we've found the end of the quoted
795        // string or the end of the input
796        workArea.clear();
797        int pos = start;
798        boolean quoting = quoteLen > 0;
799        int trimStart = 0;
800
801        while (pos < len) {
802            // quoting mode can occur several times throughout a string
803            // we must switch between quoting and non-quoting until we
804            // encounter a non-quoted delimiter, or end of string
805            if (quoting) {
806                // In quoting mode
807
808                // If we've found a quote character, see if it's
809                // followed by a second quote. If so, then we need
810                // to actually put the quote character into the token
811                // rather than end the token.
812                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
813                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
814                        // matched pair of quotes, thus an escaped quote
815                        workArea.append(srcChars, pos, quoteLen);
816                        pos += quoteLen * 2;
817                        trimStart = workArea.size();
818                        continue;
819                    }
820
821                    // end of quoting
822                    quoting = false;
823                    pos += quoteLen;
824                    continue;
825                }
826
827                // copy regular character from inside quotes
828                workArea.append(srcChars[pos++]);
829                trimStart = workArea.size();
830
831            } else {
832                // Not in quoting mode
833
834                // check for delimiter, and thus end of token
835                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
836                if (delimLen > 0) {
837                    // return condition when end of token found
838                    addToken(tokenList, workArea.substring(0, trimStart));
839                    return pos + delimLen;
840                }
841
842                // check for quote, and thus back into quoting mode
843                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
844                    quoting = true;
845                    pos += quoteLen;
846                    continue;
847                }
848
849                // check for ignored (outside quotes), and ignore
850                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
851                if (ignoredLen > 0) {
852                    pos += ignoredLen;
853                    continue;
854                }
855
856                // check for trimmed character
857                // don't yet know if its at the end, so copy to workArea
858                // use trimStart to keep track of trim at the end
859                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
860                if (trimmedLen > 0) {
861                    workArea.append(srcChars, pos, trimmedLen);
862                    pos += trimmedLen;
863                    continue;
864                }
865
866                // copy regular character from outside quotes
867                workArea.append(srcChars[pos++]);
868                trimStart = workArea.size();
869            }
870        }
871
872        // return condition when end of string found
873        addToken(tokenList, workArea.substring(0, trimStart));
874        return -1;
875    }
876
877    /**
878     * Checks if the characters at the index specified match the quote already matched in readNextToken().
879     *
880     * @param srcChars
881     *            the character array being tokenized
882     * @param pos
883     *            the position to check for a quote
884     * @param len
885     *            the length of the character array being tokenized
886     * @param quoteStart
887     *            the start position of the matched quote, 0 if no quoting
888     * @param quoteLen
889     *            the length of the matched quote, 0 if no quoting
890     * @return true if a quote is matched
891     */
892    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
893            final int quoteLen) {
894        for (int i = 0; i < quoteLen; i++) {
895            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
896                return false;
897            }
898        }
899        return true;
900    }
901
902    // Delimiter
903    // -----------------------------------------------------------------------
904    /**
905     * Gets the field delimiter matcher.
906     *
907     * @return the delimiter matcher in use
908     */
909    public StringMatcher getDelimiterMatcher() {
910        return this.delimMatcher;
911    }
912
913    /**
914     * Sets the field delimiter matcher.
915     * <p>
916     * The delimiter is used to separate one token from another.
917     *
918     * @param delim
919     *            the delimiter matcher to use
920     * @return this, to enable chaining
921     */
922    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
923        if (delim == null) {
924            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
925        } else {
926            this.delimMatcher = delim;
927        }
928        return this;
929    }
930
931    /**
932     * Sets the field delimiter character.
933     *
934     * @param delim
935     *            the delimiter character to use
936     * @return this, to enable chaining
937     */
938    public StringTokenizer setDelimiterChar(final char delim) {
939        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
940    }
941
942    /**
943     * Sets the field delimiter string.
944     *
945     * @param delim
946     *            the delimiter string to use
947     * @return this, to enable chaining
948     */
949    public StringTokenizer setDelimiterString(final String delim) {
950        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
951    }
952
953    // Quote
954    // -----------------------------------------------------------------------
955    /**
956     * Gets the quote matcher currently in use.
957     * <p>
958     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
959     * default value is '"' (double quote).
960     *
961     * @return the quote matcher in use
962     */
963    public StringMatcher getQuoteMatcher() {
964        return quoteMatcher;
965    }
966
967    /**
968     * Set the quote matcher to use.
969     * <p>
970     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
971     *
972     * @param quote
973     *            the quote matcher to use, null ignored
974     * @return this, to enable chaining
975     */
976    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
977        if (quote != null) {
978            this.quoteMatcher = quote;
979        }
980        return this;
981    }
982
983    /**
984     * Sets the quote character to use.
985     * <p>
986     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
987     *
988     * @param quote
989     *            the quote character to use
990     * @return this, to enable chaining
991     */
992    public StringTokenizer setQuoteChar(final char quote) {
993        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
994    }
995
996    // Ignored
997    // -----------------------------------------------------------------------
998    /**
999     * Gets the ignored character matcher.
1000     * <p>
1001     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
1002     * is not to ignore anything.
1003     *
1004     * @return the ignored matcher in use
1005     */
1006    public StringMatcher getIgnoredMatcher() {
1007        return ignoredMatcher;
1008    }
1009
1010    /**
1011     * Set the matcher for characters to ignore.
1012     * <p>
1013     * These characters are ignored when parsing the String, unless they are within a quoted region.
1014     *
1015     * @param ignored
1016     *            the ignored matcher to use, null ignored
1017     * @return this, to enable chaining
1018     */
1019    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1020        if (ignored != null) {
1021            this.ignoredMatcher = ignored;
1022        }
1023        return this;
1024    }
1025
1026    /**
1027     * Set the character to ignore.
1028     * <p>
1029     * This character is ignored when parsing the String, unless it is within a quoted region.
1030     *
1031     * @param ignored
1032     *            the ignored character to use
1033     * @return this, to enable chaining
1034     */
1035    public StringTokenizer setIgnoredChar(final char ignored) {
1036        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
1037    }
1038
1039    // Trimmer
1040    // -----------------------------------------------------------------------
1041    /**
1042     * Gets the trimmer character matcher.
1043     * <p>
1044     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
1045     * value is not to trim anything.
1046     *
1047     * @return the trimmer matcher in use
1048     */
1049    public StringMatcher getTrimmerMatcher() {
1050        return trimmerMatcher;
1051    }
1052
1053    /**
1054     * Sets the matcher for characters to trim.
1055     * <p>
1056     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1057     *
1058     * @param trimmer
1059     *            the trimmer matcher to use, null ignored
1060     * @return this, to enable chaining
1061     */
1062    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1063        if (trimmer != null) {
1064            this.trimmerMatcher = trimmer;
1065        }
1066        return this;
1067    }
1068
1069    // -----------------------------------------------------------------------
1070    /**
1071     * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
1072     *
1073     * @return true if empty tokens are returned as null
1074     */
1075    public boolean isEmptyTokenAsNull() {
1076        return this.emptyAsNull;
1077    }
1078
1079    /**
1080     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
1081     *
1082     * @param emptyAsNull
1083     *            whether empty tokens are returned as null
1084     * @return this, to enable chaining
1085     */
1086    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1087        this.emptyAsNull = emptyAsNull;
1088        return this;
1089    }
1090
1091    // -----------------------------------------------------------------------
1092    /**
1093     * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
1094     *
1095     * @return true if empty tokens are not returned
1096     */
1097    public boolean isIgnoreEmptyTokens() {
1098        return ignoreEmptyTokens;
1099    }
1100
1101    /**
1102     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1103     *
1104     * @param ignoreEmptyTokens
1105     *            whether empty tokens are not returned
1106     * @return this, to enable chaining
1107     */
1108    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1109        this.ignoreEmptyTokens = ignoreEmptyTokens;
1110        return this;
1111    }
1112
1113    // -----------------------------------------------------------------------
1114    /**
1115     * Gets the String content that the tokenizer is parsing.
1116     *
1117     * @return the string content being parsed
1118     */
1119    public String getContent() {
1120        if (chars == null) {
1121            return null;
1122        }
1123        return new String(chars);
1124    }
1125
1126    // -----------------------------------------------------------------------
1127    /**
1128     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
1129     * list. If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1130     *
1131     * @return a new instance of this Tokenizer which has been reset.
1132     */
1133    @Override
1134    public Object clone() {
1135        try {
1136            return cloneReset();
1137        } catch (final CloneNotSupportedException ex) {
1138            return null;
1139        }
1140    }
1141
1142    /**
1143     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
1144     * list.
1145     *
1146     * @return a new instance of this Tokenizer which has been reset.
1147     * @throws CloneNotSupportedException
1148     *             if there is a problem cloning
1149     */
1150    Object cloneReset() throws CloneNotSupportedException {
1151        // this method exists to enable 100% test coverage
1152        final StringTokenizer cloned = (StringTokenizer) super.clone();
1153        if (cloned.chars != null) {
1154            cloned.chars = cloned.chars.clone();
1155        }
1156        cloned.reset();
1157        return cloned;
1158    }
1159
1160    // -----------------------------------------------------------------------
1161    /**
1162     * Gets the String content that the tokenizer is parsing.
1163     *
1164     * @return the string content being parsed
1165     */
1166    @Override
1167    public String toString() {
1168        if (tokens == null) {
1169            return "StringTokenizer[not tokenized yet]";
1170        }
1171        return "StringTokenizer" + getTokenList();
1172    }
1173
1174}