1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.fileupload2.core;
18
19 import java.io.ByteArrayOutputStream;
20 import java.io.IOException;
21 import java.io.UnsupportedEncodingException;
22 import java.nio.charset.StandardCharsets;
23 import java.text.ParseException;
24 import java.util.Base64;
25 import java.util.HashMap;
26 import java.util.Locale;
27 import java.util.Map;
28
29 /**
30 * Utility class to decode MIME texts.
31 */
32 final class MimeUtils {
33
34 /**
35 * The marker to indicate text is encoded with BASE64 algorithm.
36 */
37 private static final String BASE64_ENCODING_MARKER = "B";
38
39 /**
40 * The marker to indicate text is encoded with QuotedPrintable algorithm.
41 */
42 private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
43
44 /**
45 * If the text contains any encoded tokens, those tokens will be marked with "=?".
46 */
47 private static final String ENCODED_TOKEN_MARKER = "=?";
48
49 /**
50 * If the text contains any encoded tokens, those tokens will terminate with "=?".
51 */
52 private static final String ENCODED_TOKEN_FINISHER = "?=";
53
54 /**
55 * The linear whitespace chars sequence.
56 */
57 private static final String LINEAR_WHITESPACE = " \t\r\n";
58
59 /**
60 * Mappings between MIME and Java charset.
61 */
62 private static final Map<String, String> MIME2JAVA = new HashMap<>();
63
64 static {
65 MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
66 MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
67 MIME2JAVA.put("utf-8", "UTF8");
68 MIME2JAVA.put("utf8", "UTF8");
69 MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
70 MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
71 MIME2JAVA.put("euc-kr", "KSC5601");
72 MIME2JAVA.put("euckr", "KSC5601");
73 MIME2JAVA.put("us-ascii", StandardCharsets.ISO_8859_1.name());
74 MIME2JAVA.put("x-us-ascii", StandardCharsets.ISO_8859_1.name());
75 }
76
77 /**
78 * Decodes a string of text obtained from a mail header into its proper form. The text generally will consist of a string of tokens, some of which may be
79 * encoded using base64 encoding.
80 *
81 * @param text The text to decode.
82 *
83 * @return The decoded text string.
84 * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
85 */
86 static String decodeText(final String text) throws UnsupportedEncodingException {
87 // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
88 // source string doesn't contain that sequent, no decoding is required.
89 if (!text.contains(ENCODED_TOKEN_MARKER)) {
90 return text;
91 }
92
93 var offset = 0;
94 final var endOffset = text.length();
95
96 var startWhiteSpace = -1;
97 var endWhiteSpace = -1;
98
99 final var decodedText = new StringBuilder(text.length());
100
101 var previousTokenEncoded = false;
102
103 while (offset < endOffset) {
104 var ch = text.charAt(offset);
105
106 // is this a whitespace character?
107 if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
108 startWhiteSpace = offset;
109 while (offset < endOffset) {
110 // step over the white space characters.
111 ch = text.charAt(offset);
112 if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
113 // record the location of the first non lwsp and drop down to process the
114 // token characters.
115 endWhiteSpace = offset;
116 break;
117 }
118 offset++;
119 }
120 } else {
121 // we have a word token. We need to scan over the word and then try to parse it.
122 final var wordStart = offset;
123
124 while (offset < endOffset) {
125 // step over the non white space characters.
126 ch = text.charAt(offset);
127 if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
128 break;
129 }
130 offset++;
131
132 // NB: Trailing whitespace on these header strings will just be discarded.
133 }
134 // pull out the word token.
135 final var word = text.substring(wordStart, offset);
136 // is the token encoded? decode the word
137 if (word.startsWith(ENCODED_TOKEN_MARKER)) {
138 try {
139 // if this gives a parsing failure, treat it like a non-encoded word.
140 final var decodedWord = decodeWord(word);
141
142 // are any whitespace characters significant? Append 'em if we've got 'em.
143 if (!previousTokenEncoded && startWhiteSpace != -1) {
144 decodedText.append(text, startWhiteSpace, endWhiteSpace);
145 startWhiteSpace = -1;
146 }
147 // this is definitely a decoded token.
148 previousTokenEncoded = true;
149 // and add this to the text.
150 decodedText.append(decodedWord);
151 // we continue parsing from here...we allow parsing errors to fall through
152 // and get handled as normal text.
153 continue;
154
155 } catch (final ParseException ignored) {
156 // just ignore it, skip to next word
157 }
158 }
159 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
160 // if we have it.
161 if (startWhiteSpace != -1) {
162 decodedText.append(text, startWhiteSpace, endWhiteSpace);
163 startWhiteSpace = -1;
164 }
165 // this is not a decoded token.
166 previousTokenEncoded = false;
167 decodedText.append(word);
168 }
169 }
170
171 return decodedText.toString();
172 }
173
174 /**
175 * Decodes a string using the RFC 2047 rules for an "encoded-word" type. This encoding has the syntax:
176 *
177 * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
178 *
179 * @param word The possibly encoded word value.
180 *
181 * @return The decoded word.
182 * @throws ParseException in case of a parse error of the RFC 2047.
183 * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found.
184 */
185 private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
186 // encoded words start with the characters "=?". If this not an encoded word, we throw a
187 // ParseException for the caller.
188
189 final var etmPos = word.indexOf(ENCODED_TOKEN_MARKER);
190 if (etmPos != 0) {
191 throw new ParseException("Invalid RFC 2047 encoded-word: " + word, etmPos);
192 }
193
194 final var charsetPos = word.indexOf('?', 2);
195 if (charsetPos == -1) {
196 throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word, charsetPos);
197 }
198
199 // pull out the character set information (this is the MIME name at this point).
200 final var charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH);
201
202 // now pull out the encoding token the same way.
203 final var encodingPos = word.indexOf('?', charsetPos + 1);
204 if (encodingPos == -1) {
205 throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word, encodingPos);
206 }
207
208 final var encoding = word.substring(charsetPos + 1, encodingPos);
209
210 // and finally the encoded text.
211 final var encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
212 if (encodedTextPos == -1) {
213 throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word, encodedTextPos);
214 }
215
216 final var encodedText = word.substring(encodingPos + 1, encodedTextPos);
217
218 // seems a bit silly to encode a null string, but easy to deal with.
219 if (encodedText.isEmpty()) {
220 return "";
221 }
222
223 try {
224 // the decoder writes directly to an output stream.
225 final var out = new ByteArrayOutputStream(encodedText.length());
226
227 final var encodedData = encodedText.getBytes(StandardCharsets.US_ASCII);
228
229 // Base64 encoded?
230 if (encoding.equals(BASE64_ENCODING_MARKER)) {
231 out.write(Base64.getMimeDecoder().decode(encodedData));
232 } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
233 QuotedPrintableDecoder.decode(encodedData, out);
234 } else {
235 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
236 }
237 // get the decoded byte data and convert into a string.
238 final var decodedData = out.toByteArray();
239 return new String(decodedData, javaCharset(charset));
240 } catch (final IOException e) {
241 throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
242 }
243 }
244
245 /**
246 * Translate a MIME standard character set name into the Java equivalent.
247 *
248 * @param charset The MIME standard name.
249 *
250 * @return The Java equivalent for this name.
251 */
252 private static String javaCharset(final String charset) {
253 // nothing in, nothing out.
254 if (charset == null) {
255 return null;
256 }
257 final var mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
258 // if there is no mapping, then the original name is used. Many of the MIME character set
259 // names map directly back into Java. The reverse isn't necessarily true.
260 return mappedCharset == null ? charset : mappedCharset;
261 }
262
263 /**
264 * Hidden constructor, this class must not be instantiated.
265 */
266 private MimeUtils() {
267 // do nothing
268 }
269
270 }