001 /****************************************************************
002 * Licensed to the Apache Software Foundation (ASF) under one *
003 * or more contributor license agreements. See the NOTICE file *
004 * distributed with this work for additional information *
005 * regarding copyright ownership. The ASF licenses this file *
006 * to you under the Apache License, Version 2.0 (the *
007 * "License"); you may not use this file except in compliance *
008 * with the License. You may obtain a copy of the License at *
009 * *
010 * http://www.apache.org/licenses/LICENSE-2.0 *
011 * *
012 * Unless required by applicable law or agreed to in writing, *
013 * software distributed under the License is distributed on an *
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
015 * KIND, either express or implied. See the License for the *
016 * specific language governing permissions and limitations *
017 * under the License. *
018 ****************************************************************/
019
020 package org.apache.james.mime4j.stream;
021
022 import java.io.IOException;
023 import java.io.InputStream;
024 import java.io.InputStreamReader;
025 import java.io.Reader;
026 import java.nio.charset.Charset;
027 import java.util.LinkedList;
028
029 import org.apache.james.mime4j.MimeException;
030 import org.apache.james.mime4j.codec.DecodeMonitor;
031 import org.apache.james.mime4j.io.LineNumberInputStream;
032 import org.apache.james.mime4j.io.LineNumberSource;
033 import org.apache.james.mime4j.util.CharsetUtil;
034
035 /**
036 * <p>
037 * Parses MIME (or RFC822) message streams of bytes or characters.
038 * The stream is converted into an event stream.
039 * <p>
040 * <p>
041 * Typical usage:
042 * </p>
043 * <pre>
044 * MimeTokenStream stream = new MimeTokenStream();
045 * InputStream instream = new FileInputStream("mime.msg");
046 * try {
047 * stream.parse(instream);
048 * for (int state = stream.getState();
049 * state != MimeTokenStream.T_END_OF_STREAM;
050 * state = stream.next()) {
051 * switch (state) {
052 * case MimeTokenStream.T_BODY:
053 * System.out.println("Body detected, contents = "
054 * + stream.getInputStream() + ", header data = "
055 * + stream.getBodyDescriptor());
056 * break;
057 * case MimeTokenStream.T_FIELD:
058 * System.out.println("Header field detected: "
059 * + stream.getField());
060 * break;
061 * case MimeTokenStream.T_START_MULTIPART:
062 * System.out.println("Multipart message detexted,"
063 * + " header data = "
064 * + stream.getBodyDescriptor());
065 * ...
066 * }
067 * }
068 * } finally {
069 * instream.close();
070 * }
071 * </pre>
072 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
073 * method {@link #parse(InputStream)} resets the token streams internal
074 * state. However, they are definitely <em>not</em> thread safe. If you
075 * have a multi threaded application, then the suggested use is to have
076 * one instance per thread.</p>
077 */
078 public class MimeTokenStream {
079
080 private final MimeConfig config;
081 private final DecodeMonitor monitor;
082 private final FieldBuilder fieldBuilder;
083 private final BodyDescriptorBuilder bodyDescBuilder;
084 private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>();
085
086 private EntityState state = EntityState.T_END_OF_STREAM;
087 private EntityStateMachine currentStateMachine;
088 private RecursionMode recursionMode = RecursionMode.M_RECURSE;
089 private MimeEntity rootentity;
090
091 /**
092 * Constructs a standard (lax) stream.
093 * Optional validation events will be logged only.
094 * Use {@link MimeConfig#setStrictParsing(boolean)} to turn on strict
095 * parsing mode and pass the config object to
096 * {@link MimeTokenStream#MimeTokenStream(MimeConfig)} to create
097 * a stream that strictly validates the input.
098 */
099 public MimeTokenStream() {
100 this(null);
101 }
102
103 public MimeTokenStream(final MimeConfig config) {
104 this(config, null, null, null);
105 }
106
107 public MimeTokenStream(
108 final MimeConfig config,
109 final BodyDescriptorBuilder bodyDescBuilder) {
110 this(config, null, null, bodyDescBuilder);
111 }
112
113 public MimeTokenStream(
114 final MimeConfig config,
115 final DecodeMonitor monitor,
116 final BodyDescriptorBuilder bodyDescBuilder) {
117 this(config, monitor, null, bodyDescBuilder);
118 }
119
120 public MimeTokenStream(
121 final MimeConfig config,
122 final DecodeMonitor monitor,
123 final FieldBuilder fieldBuilder,
124 final BodyDescriptorBuilder bodyDescBuilder) {
125 super();
126 this.config = config != null ? config : new MimeConfig();
127 this.fieldBuilder = fieldBuilder != null ? fieldBuilder :
128 new DefaultFieldBuilder(this.config.getMaxHeaderLen());
129 this.monitor = monitor != null ? monitor :
130 (this.config.isStrictParsing() ? DecodeMonitor.STRICT : DecodeMonitor.SILENT);
131 this.bodyDescBuilder = bodyDescBuilder != null ? bodyDescBuilder :
132 new FallbackBodyDescriptorBuilder();
133 }
134
135 /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
136 * If the {@code MimeTokenStream} has already been in use, resets the streams
137 * internal state.
138 */
139 public void parse(InputStream stream) {
140 doParse(stream, EntityState.T_START_MESSAGE);
141 }
142
143 /**
144 * <p>Instructs the {@code MimeTokenStream} to parse the given content with
145 * the content type. The message stream is assumed to have no message header
146 * and is expected to begin with a message body. This can be the case when
147 * the message content is transmitted using a different transport protocol
148 * such as HTTP.</p>
149 * <p>If the {@code MimeTokenStream} has already been in use, resets the
150 * streams internal state.</p>
151 * @return a parsed Field representing the input contentType
152 */
153 public Field parseHeadless(InputStream stream, String contentType) {
154 if (contentType == null) {
155 throw new IllegalArgumentException("Content type may not be null");
156 }
157 Field newContentType;
158 try {
159 RawField rawContentType = new RawField("Content-Type", contentType);
160 newContentType = bodyDescBuilder.addField(rawContentType);
161 if (newContentType == null) newContentType = rawContentType;
162 } catch (MimeException ex) {
163 // should never happen
164 throw new IllegalArgumentException(ex.getMessage());
165 }
166
167 doParse(stream, EntityState.T_END_HEADER);
168 try {
169 next();
170 } catch (IOException e) {
171 // Should never happend: the first next after END_HEADER does not produce IO
172 throw new IllegalStateException(e);
173 } catch (MimeException e) {
174 // This should never happen
175 throw new IllegalStateException(e);
176 }
177 return newContentType;
178 }
179
180 private void doParse(InputStream stream, EntityState start) {
181 LineNumberSource lineSource = null;
182 if (config.isCountLineNumbers()) {
183 LineNumberInputStream lineInput = new LineNumberInputStream(stream);
184 lineSource = lineInput;
185 stream = lineInput;
186 }
187
188 rootentity = new MimeEntity(
189 lineSource,
190 stream,
191 config,
192 start,
193 EntityState.T_END_MESSAGE,
194 monitor,
195 fieldBuilder,
196 bodyDescBuilder);
197
198 rootentity.setRecursionMode(recursionMode);
199 currentStateMachine = rootentity;
200 entities.clear();
201 entities.add(currentStateMachine);
202 state = currentStateMachine.getState();
203 }
204
205 /**
206 * Determines if this parser is currently in raw mode.
207 *
208 * @return <code>true</code> if in raw mode, <code>false</code>
209 * otherwise.
210 * @see #setRecursionMode(RecursionMode)
211 */
212 public boolean isRaw() {
213 return recursionMode == RecursionMode.M_RAW;
214 }
215
216 /**
217 * Gets the current recursion mode.
218 * The recursion mode specifies the approach taken to parsing parts.
219 * {@link RecursionMode#M_RAW} mode does not parse the part at all.
220 * {@link RecursionMode#M_RECURSE} mode recursively parses each mail
221 * when an <code>message/rfc822</code> part is encountered;
222 * {@link RecursionMode#M_NO_RECURSE} does not.
223 * @return {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or
224 * {@link RecursionMode#M_NO_RECURSE}
225 */
226 public RecursionMode getRecursionMode() {
227 return recursionMode;
228 }
229
230 /**
231 * Sets the current recursion.
232 * The recursion mode specifies the approach taken to parsing parts.
233 * {@link RecursionMode#M_RAW} mode does not parse the part at all.
234 * {@link RecursionMode#M_RECURSE} mode recursively parses each mail
235 * when an <code>message/rfc822</code> part is encountered;
236 * {@link RecursionMode#M_NO_RECURSE} does not.
237 * @param mode {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or
238 * {@link RecursionMode#M_NO_RECURSE}
239 */
240 public void setRecursionMode(RecursionMode mode) {
241 recursionMode = mode;
242 if (currentStateMachine != null) {
243 currentStateMachine.setRecursionMode(mode);
244 }
245 }
246
247 /**
248 * Finishes the parsing and stops reading lines.
249 * NOTE: No more lines will be parsed but the parser
250 * will still trigger 'end' events to match previously
251 * triggered 'start' events.
252 */
253 public void stop() {
254 rootentity.stop();
255 }
256
257 /**
258 * Returns the current state.
259 */
260 public EntityState getState() {
261 return state;
262 }
263
264 /**
265 * This method returns the raw entity, preamble, or epilogue contents.
266 * <p/>
267 * This method is valid, if {@link #getState()} returns either of
268 * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or
269 * {@link EntityState#T_EPILOGUE}.
270 *
271 * @return Data stream, depending on the current state.
272 * @throws IllegalStateException {@link #getState()} returns an
273 * invalid value.
274 */
275 public InputStream getInputStream() {
276 return currentStateMachine.getContentStream();
277 }
278
279 /**
280 * This method returns a transfer decoded stream based on the MIME
281 * fields with the standard defaults.
282 * <p/>
283 * This method is valid, if {@link #getState()} returns either of
284 * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or
285 * {@link EntityState#T_EPILOGUE}.
286 *
287 * @return Data stream, depending on the current state.
288 * @throws IllegalStateException {@link #getState()} returns an
289 * invalid value.
290 */
291 public InputStream getDecodedInputStream() {
292 return currentStateMachine.getDecodedContentStream();
293 }
294
295 /**
296 * Gets a reader configured for the current body or body part.
297 * The reader will return a transfer and charset decoded
298 * stream of characters based on the MIME fields with the standard
299 * defaults.
300 * This is a conveniance method and relies on {@link #getInputStream()}.
301 * Consult the javadoc for that method for known limitations.
302 *
303 * @return <code>Reader</code>, not null
304 * @see #getInputStream
305 * @throws IllegalStateException {@link #getState()} returns an
306 * invalid value
307 * @throws UnsupportedCharsetException if there is no JVM support
308 * for decoding the charset
309 * @throws IllegalCharsetNameException if the charset name specified
310 * in the mime type is illegal
311 */
312 public Reader getReader() {
313 final BodyDescriptor bodyDescriptor = getBodyDescriptor();
314 final String mimeCharset = bodyDescriptor.getCharset();
315 final Charset charset;
316 if (mimeCharset == null || "".equals(mimeCharset)) {
317 charset = CharsetUtil.US_ASCII;
318 } else {
319 charset = Charset.forName(mimeCharset);
320 }
321 final InputStream instream = getDecodedInputStream();
322 return new InputStreamReader(instream, charset);
323 }
324
325 /**
326 * <p>Gets a descriptor for the current entity.
327 * This method is valid if {@link #getState()} returns:</p>
328 * <ul>
329 * <li>{@link EntityState#T_BODY}</li>
330 * <li>{@link EntityState#T_START_MULTIPART}</li>
331 * <li>{@link EntityState#T_EPILOGUE}</li>
332 * <li>{@link EntityState#T_PREAMBLE}</li>
333 * </ul>
334 * @return <code>BodyDescriptor</code>, not nulls
335 */
336 public BodyDescriptor getBodyDescriptor() {
337 return currentStateMachine.getBodyDescriptor();
338 }
339
340 /**
341 * This method is valid, if {@link #getState()} returns {@link EntityState#T_FIELD}.
342 * @return String with the fields raw contents.
343 * @throws IllegalStateException {@link #getState()} returns another
344 * value than {@link EntityState#T_FIELD}.
345 */
346 public Field getField() {
347 return currentStateMachine.getField();
348 }
349
350 /**
351 * This method advances the token stream to the next token.
352 * @throws IllegalStateException The method has been called, although
353 * {@link #getState()} was already {@link EntityState#T_END_OF_STREAM}.
354 */
355 public EntityState next() throws IOException, MimeException {
356 if (state == EntityState.T_END_OF_STREAM || currentStateMachine == null) {
357 throw new IllegalStateException("No more tokens are available.");
358 }
359 while (currentStateMachine != null) {
360 EntityStateMachine next = currentStateMachine.advance();
361 if (next != null) {
362 entities.add(next);
363 currentStateMachine = next;
364 }
365 state = currentStateMachine.getState();
366 if (state != EntityState.T_END_OF_STREAM) {
367 return state;
368 }
369 entities.removeLast();
370 if (entities.isEmpty()) {
371 currentStateMachine = null;
372 } else {
373 currentStateMachine = entities.getLast();
374 currentStateMachine.setRecursionMode(recursionMode);
375 }
376 }
377 state = EntityState.T_END_OF_STREAM;
378 return state;
379 }
380
381 /**
382 * Renders a state as a string suitable for logging.
383 * @param state
384 * @return rendered as string, not null
385 */
386 public static final String stateToString(EntityState state) {
387 return MimeEntity.stateToString(state);
388 }
389
390
391 public MimeConfig getConfig() {
392 return config;
393 }
394 }