/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/
019
020 package org.apache.james.mime4j.stream;
021
022 import java.util.ArrayList;
023 import java.util.BitSet;
024 import java.util.List;
025
026 import org.apache.james.mime4j.MimeException;
027 import org.apache.james.mime4j.util.ByteSequence;
028 import org.apache.james.mime4j.util.CharsetUtil;
029 import org.apache.james.mime4j.util.ContentUtil;
030
031 /**
032 * Low level parser for header field elements. The parsing routines of this class are designed
033 * to produce near zero intermediate garbage and make no intermediate copies of input data.
034 * <p/>
035 * This class is immutable and thread safe.
036 */
037 public class RawFieldParser {
038
039 public static BitSet INIT_BITSET(int ... b) {
040 BitSet bitset = new BitSet(b.length);
041 for (int i = 0; i < b.length; i++) {
042 bitset.set(b[i]);
043 }
044 return bitset;
045 }
046
047 static final BitSet COLON = INIT_BITSET(':');
048 static final BitSet EQUAL_OR_SEMICOLON = INIT_BITSET('=', ';');
049 static final BitSet SEMICOLON = INIT_BITSET(';');
050
051 public static final RawFieldParser DEFAULT = new RawFieldParser();
052
053 /**
054 * Parses the sequence of bytes into {@link RawField}.
055 *
056 * @throws MimeException if the input data does not contain a valid MIME field.
057 */
058 public RawField parseField(final ByteSequence raw) throws MimeException {
059 if (raw == null) {
060 return null;
061 }
062 ParserCursor cursor = new ParserCursor(0, raw.length());
063 String name = parseToken(raw, cursor, COLON);
064 if (cursor.atEnd()) {
065 throw new MimeException("Invalid MIME field: no name/value separator found: " +
066 raw.toString());
067 }
068 return new RawField(raw, cursor.getPos(), name, null);
069 }
070
071 /**
072 * Parses the field body containing a value with parameters into {@link RawBody}.
073 *
074 * @param field unstructured (raw) field
075 */
076 public RawBody parseRawBody(final RawField field) {
077 ByteSequence buf = field.getRaw();
078 int pos = field.getDelimiterIdx() + 1;
079 if (buf == null) {
080 String body = field.getBody();
081 if (body == null) {
082 return new RawBody("", null);
083 }
084 buf = ContentUtil.encode(body);
085 pos = 0;
086 }
087 ParserCursor cursor = new ParserCursor(pos, buf.length());
088 return parseRawBody(buf, cursor);
089 }
090
091 /**
092 * Parses the sequence of bytes containing a value with parameters into {@link RawBody}.
093 *
094 * @param buf buffer with the sequence of bytes to be parsed
095 * @param cursor defines the bounds and current position of the buffer
096 */
097 public RawBody parseRawBody(final ByteSequence buf, final ParserCursor cursor) {
098 String value = parseToken(buf, cursor, SEMICOLON);
099 if (cursor.atEnd()) {
100 return new RawBody(value, new ArrayList<NameValuePair>());
101 }
102 cursor.updatePos(cursor.getPos() + 1);
103 List<NameValuePair> params = parseParameters(buf, cursor);
104 return new RawBody(value, params);
105 }
106
107 /**
108 * Parses the sequence of bytes containing field parameters delimited with semicolon into
109 * a list of {@link NameValuePair}s.
110 *
111 * @param buf buffer with the sequence of bytes to be parsed
112 * @param cursor defines the bounds and current position of the buffer
113 */
114 public List<NameValuePair> parseParameters(final ByteSequence buf, final ParserCursor cursor) {
115 List<NameValuePair> params = new ArrayList<NameValuePair>();
116 skipWhiteSpace(buf, cursor);
117 while (!cursor.atEnd()) {
118 NameValuePair param = parseParameter(buf, cursor);
119 params.add(param);
120 }
121 return params;
122 }
123
124 /**
125 * Parses the sequence of bytes containing a field parameter delimited with semicolon into
126 * {@link NameValuePair}.
127 *
128 * @param buf buffer with the sequence of bytes to be parsed
129 * @param cursor defines the bounds and current position of the buffer
130 */
131 public NameValuePair parseParameter(final ByteSequence buf, final ParserCursor cursor) {
132 String name = parseToken(buf, cursor, EQUAL_OR_SEMICOLON);
133 if (cursor.atEnd()) {
134 return new NameValuePair(name, null);
135 }
136 int delim = buf.byteAt(cursor.getPos());
137 cursor.updatePos(cursor.getPos() + 1);
138 if (delim == ';') {
139 return new NameValuePair(name, null);
140 }
141 String value = parseValue(buf, cursor, SEMICOLON);
142 if (!cursor.atEnd()) {
143 cursor.updatePos(cursor.getPos() + 1);
144 }
145 return new NameValuePair(name, value);
146 }
147
148 /**
149 * Extracts from the sequence of bytes a token terminated with any of the given delimiters
150 * discarding semantically insignificant whitespace characters and comments.
151 *
152 * @param buf buffer with the sequence of bytes to be parsed
153 * @param cursor defines the bounds and current position of the buffer
154 * @param delimiters set of delimiting characters. Can be <code>null</code> if the token
155 * is not delimited by any character.
156 */
157 public String parseToken(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
158 StringBuilder dst = new StringBuilder();
159 boolean whitespace = false;
160 while (!cursor.atEnd()) {
161 char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
162 if (delimiters != null && delimiters.get(current)) {
163 break;
164 } else if (CharsetUtil.isWhitespace(current)) {
165 skipWhiteSpace(buf, cursor);
166 whitespace = true;
167 } else if (current == '(') {
168 skipComment(buf, cursor);
169 } else {
170 if (dst.length() > 0 && whitespace) {
171 dst.append(' ');
172 }
173 copyContent(buf, cursor, delimiters, dst);
174 whitespace = false;
175 }
176 }
177 return dst.toString();
178 }
179
180 /**
181 * Extracts from the sequence of bytes a value which can be enclosed in quote marks and
182 * terminated with any of the given delimiters discarding semantically insignificant
183 * whitespace characters and comments.
184 *
185 * @param buf buffer with the sequence of bytes to be parsed
186 * @param cursor defines the bounds and current position of the buffer
187 * @param delimiters set of delimiting characters. Can be <code>null</code> if the value
188 * is not delimited by any character.
189 */
190 public String parseValue(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
191 StringBuilder dst = new StringBuilder();
192 boolean whitespace = false;
193 while (!cursor.atEnd()) {
194 char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
195 if (delimiters != null && delimiters.get(current)) {
196 break;
197 } else if (CharsetUtil.isWhitespace(current)) {
198 skipWhiteSpace(buf, cursor);
199 whitespace = true;
200 } else if (current == '(') {
201 skipComment(buf, cursor);
202 } else if (current == '\"') {
203 if (dst.length() > 0 && whitespace) {
204 dst.append(' ');
205 }
206 copyQuotedContent(buf, cursor, dst);
207 whitespace = false;
208 } else {
209 if (dst.length() > 0 && whitespace) {
210 dst.append(' ');
211 }
212 copyContent(buf, cursor, delimiters, dst);
213 whitespace = false;
214 }
215 }
216 return dst.toString();
217 }
218
219 /**
220 * Skips semantically insignificant whitespace characters and moves the cursor to the closest
221 * non-whitespace character.
222 *
223 * @param buf buffer with the sequence of bytes to be parsed
224 * @param cursor defines the bounds and current position of the buffer
225 */
226 public void skipWhiteSpace(final ByteSequence buf, final ParserCursor cursor) {
227 int pos = cursor.getPos();
228 int indexFrom = cursor.getPos();
229 int indexTo = cursor.getUpperBound();
230 for (int i = indexFrom; i < indexTo; i++) {
231 char current = (char) (buf.byteAt(i) & 0xff);
232 if (!CharsetUtil.isWhitespace(current)) {
233 break;
234 } else {
235 pos++;
236 }
237 }
238 cursor.updatePos(pos);
239 }
240
241 /**
242 * Skips semantically insignificant content if the current position is positioned at the
243 * beginning of a comment and moves the cursor past the end of the comment.
244 * Nested comments and escaped characters are recognized and handled appropriately.
245 *
246 * @param buf buffer with the sequence of bytes to be parsed
247 * @param cursor defines the bounds and current position of the buffer
248 */
249 public void skipComment(final ByteSequence buf, final ParserCursor cursor) {
250 if (cursor.atEnd()) {
251 return;
252 }
253 int pos = cursor.getPos();
254 int indexFrom = cursor.getPos();
255 int indexTo = cursor.getUpperBound();
256 char current = (char) (buf.byteAt(pos) & 0xff);
257 if (current != '(') {
258 return;
259 }
260 pos++;
261 indexFrom++;
262
263 int level = 1;
264 boolean escaped = false;
265 for (int i = indexFrom; i < indexTo; i++, pos++) {
266 current = (char) (buf.byteAt(i) & 0xff);
267 if (escaped) {
268 escaped = false;
269 } else {
270 if (current == '\\') {
271 escaped = true;
272 } else if (current == '(') {
273 level++;
274 } else if (current == ')') {
275 level--;
276 }
277 }
278 if (level <= 0) {
279 pos++;
280 break;
281 }
282 }
283 cursor.updatePos(pos);
284 }
285
286 /**
287 * Skips semantically insignificant whitespace characters and comments and moves the cursor
288 * to the closest semantically significant non-whitespace character.
289 * Nested comments and escaped characters are recognized and handled appropriately.
290 *
291 * @param buf buffer with the sequence of bytes to be parsed
292 * @param cursor defines the bounds and current position of the buffer
293 */
294 public void skipAllWhiteSpace(final ByteSequence buf, final ParserCursor cursor) {
295 while (!cursor.atEnd()) {
296 char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
297 if (CharsetUtil.isWhitespace(current)) {
298 skipWhiteSpace(buf, cursor);
299 } else if (current == '(') {
300 skipComment(buf, cursor);
301 } else {
302 break;
303 }
304 }
305 }
306
307 /**
308 * Transfers content into the destination buffer until a whitespace character, a comment,
309 * or any of the given delimiters is encountered.
310 *
311 * @param buf buffer with the sequence of bytes to be parsed
312 * @param cursor defines the bounds and current position of the buffer
313 * @param delimiters set of delimiting characters. Can be <code>null</code> if the value
314 * is delimited by a whitespace or a comment only.
315 * @param dst destination buffer
316 */
317 public void copyContent(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters,
318 final StringBuilder dst) {
319 int pos = cursor.getPos();
320 int indexFrom = cursor.getPos();
321 int indexTo = cursor.getUpperBound();
322 for (int i = indexFrom; i < indexTo; i++) {
323 char current = (char) (buf.byteAt(i) & 0xff);
324 if ((delimiters != null && delimiters.get(current))
325 || CharsetUtil.isWhitespace(current) || current == '(') {
326 break;
327 } else {
328 pos++;
329 dst.append(current);
330 }
331 }
332 cursor.updatePos(pos);
333 }
334
335 /**
336 * Transfers content enclosed with quote marks into the destination buffer.
337 *
338 * @param buf buffer with the sequence of bytes to be parsed
339 * @param cursor defines the bounds and current position of the buffer
340 * @param dst destination buffer
341 */
342 public void copyQuotedContent(final ByteSequence buf, final ParserCursor cursor,
343 final StringBuilder dst) {
344 if (cursor.atEnd()) {
345 return;
346 }
347 int pos = cursor.getPos();
348 int indexFrom = cursor.getPos();
349 int indexTo = cursor.getUpperBound();
350 char current = (char) (buf.byteAt(pos) & 0xff);
351 if (current != '\"') {
352 return;
353 }
354 pos++;
355 indexFrom++;
356 boolean escaped = false;
357 for (int i = indexFrom; i < indexTo; i++, pos++) {
358 current = (char) (buf.byteAt(i) & 0xff);
359 if (escaped) {
360 if (current != '\"' && current != '\\') {
361 dst.append('\\');
362 }
363 dst.append(current);
364 escaped = false;
365 } else {
366 if (current == '\"') {
367 pos++;
368 break;
369 }
370 if (current == '\\') {
371 escaped = true;
372 } else if (current != '\r' && current != '\n') {
373 dst.append(current);
374 }
375 }
376 }
377 cursor.updatePos(pos);
378 }
379
380 }