001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.io.UnsupportedEncodingException;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.LinkedHashSet;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.Objects;
030import java.util.Set;
031
/**
 * <p>
 * Converts text from one alphabet to another, with the possibility of leaving
 * certain characters unencoded.
 * </p>
 *
 * <p>
 * The target (encoding) alphabet and the 'do not encode' characters are
 * expected to be in the Unicode BMP; the source alphabet does not have to be.
 * </p>
 *
 * <p>
 * Every encoded letter has the same fixed length, except for the
 * 'do not encode' characters, which are emitted unchanged.
 * </p>
 *
 * <h3>Sample usage</h3>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state.
 * </p>
 *
 * @since 1.0
 *
 */
public final class AlphabetConverter {

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;
    /**
     * Maps each encoded string back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;
    /**
     * Length, in chars, of an encoded letter.
     */
    private final int encodedLetterLength;
    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded map from original code point to encoded string
     * @param encodedToOriginal map from encoded string to original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Encode a given string.
     *
     * @param original the string to be encoded
     * @return the encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // Walk the input by code point so supplementary characters in the
        // source alphabet are looked up as a single letter.
        for (int i = 0; i < original.length();) {
            final int codepoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codepoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codepoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codepoint);
        }

        return sb.toString();
    }

    /**
     * Decode a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return the decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int codePoint = encoded.codePointAt(j);
            final String s = codePointToString(codePoint);

            if (s.equals(originalToEncoded.get(codePoint))) {
                // A letter that maps to itself was passed through unencoded.
                result.append(s);
                // Advance by the full width of the code point: a
                // supplementary character occupies two chars, and stepping
                // by one would land on its low surrogate.
                j += s.length();
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Get the length of characters in the encoded alphabet that are necessary
     * for each character in the original
     * alphabet.
     *
     * @return the length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Get the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from
     * serialized map.
     *
     * @return an unmodifiable view of the original-to-encoded map
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    /**
     * Recursive method used when creating encoder/decoder. It builds every
     * encoded string of {@code encodedLetterLength} letters, one letter per
     * recursion level, and assigns each finished string to the next letter
     * taken from {@code originals}.
     *
     * @param level number of letters still to be appended to
     *              {@code currentEncoding}; at 0 the encoding is complete
     * @param currentEncoding the encoding built so far
     * @param encoding letters of the encoding alphabet, as code points
     * @param originals iterator over the original alphabet, shared by all
     *                  recursion levels
     * @param doNotEncodeMap map of values that should not be encoded
     */
    @SuppressWarnings("PMD")
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (originals.hasNext()) {

                    // this skips the doNotEncode chars if they are in the
                    // leftmost place
                    if (level != encodedLetterLength
                            || !doNotEncodeMap.containsKey(encodingLetter)) {
                        addSingleEncoding(level - 1,
                                currentEncoding
                                        + codePointToString(encodingLetter),
                                encoding,
                                originals,
                                doNotEncodeMap
                        );
                    }
                } else {
                    return; // done encoding all the original alphabet
                }
            }
        } else {
            Integer next = originals.next();

            // 'do not encode' letters map to themselves and do not consume
            // the encoding that was just built; keep pulling originals until
            // one that needs an encoding turns up.
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Lists every mapping as {@code original -> encoded}, one per line.
     *
     * @return a multi-line description of the original-to-encoded map
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();

        for (final Entry<Integer, String> entry
                : originalToEncoded.entrySet()) {
            sb.append(codePointToString(entry.getKey()))
                    .append(ARROW)
                    .append(entry.getValue()).append(System.lineSeparator());
        }

        return sb.toString();
    }

    /**
     * Two converters are equal when both of their maps and their encoded
     * letter lengths are equal.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is an equal AlphabetConverter
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false; // also covers obj == null
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    // -- static methods

    /**
     * Create a new converter from a map.
     *
     * <p>The given map is copied, so changing it afterwards does not affect
     * the returned converter.</p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return the reconstructed AlphabetConverter
     * @throws NullPointerException if {@code originalToEncoded} is null
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(
            final Map<Integer, String> originalToEncoded) {
        Objects.requireNonNull(originalToEncoded, "originalToEncoded");

        // Defensive copy: the reverse map and the letter length below are
        // derived from this snapshot and must not drift from it if the
        // caller later modifies the map that was passed in.
        final Map<Integer, String> snapshot =
                new LinkedHashMap<>(originalToEncoded);
        final Map<Integer, String> unmodifiableOriginalToEncoded =
                Collections.unmodifiableMap(snapshot);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        // The encoded letter length is the longest encoded value found.
        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e
                : unmodifiableOriginalToEncoded.entrySet()) {
            final String encodedLetter = e.getValue();
            encodedToOriginal.put(encodedLetter,
                    codePointToString(e.getKey()));

            if (encodedLetter.length() > encodedLetterLength) {
                encodedLetterLength = encodedLetter.length();
            }
        }

        return new AlphabetConverter(unmodifiableOriginalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Convert characters to integers.
     *
     * @param chars array of characters, may be {@code null}
     * @return an equivalent array of integers, empty if {@code chars} is
     *         {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        for (int i = 0; i < chars.length; i++) {
            integers[i] = (int) chars[i];
        }
        return integers;
    }

    /**
     * Collect code points into an insertion-ordered set, dropping duplicates.
     *
     * @param codePoints array of code points, may be {@code null}
     * @return the distinct code points in first-seen order, empty if
     *         {@code codePoints} is {@code null}
     */
    private static Set<Integer> toOrderedSet(final Integer[] codePoints) {
        if (codePoints == null) {
            return new LinkedHashSet<>();
        }
        return new LinkedHashSet<>(Arrays.<Integer> asList(codePoints));
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.
     * A {@code null} array is treated as an empty alphabet, as
     * {@link #createConverterFromChars} does.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 codepoints
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in codepoints
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                   constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        final Set<Integer> originalCopy = toOrderedSet(original);
        final Set<Integer> encodingCopy = toOrderedSet(encoding);
        final Set<Integer> doNotEncodeCopy = toOrderedSet(doNotEncode);

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        // Every 'do not encode' letter must exist in both alphabets.
        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // The encoding alphabet is large enough for a one-to-one mapping.
            final int encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString =
                        codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter,
                            originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString,
                            originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // 'do not encode' letters are reserved for themselves.
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter,
                            originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded,
                    encodedToOriginal,
                    encodedLetterLength);

        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        } else {
            // we start with one which is our minimum, and because we do the
            // first division outside the loop
            int lettersSoFar = 1;

            // the first division takes into account that the doNotEncode
            // letters can't be in the leftmost place
            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                    / (encodingCopy.size() - doNotEncodeCopy.size());

            // every further position may use the whole encoding alphabet
            while (lettersLeft / encodingCopy.size() >= 1) {
                lettersLeft = lettersLeft / encodingCopy.size();
                lettersSoFar++;
            }

            final int encodedLetterLength = lettersSoFar + 1;

            final AlphabetConverter ac =
                    new AlphabetConverter(originalToEncoded,
                            encodedToOriginal,
                            encodedLetterLength);

            // Populates the two maps held by the converter just created.
            ac.addSingleEncoding(encodedLetterLength,
                    "",
                    encodingCopy,
                    originalCopy.iterator(),
                    doNotEncodeMap);

            return ac;
        }
    }

    /**
     * Create new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }
}