001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.HttpURLConnection;
028import java.net.URL;
029import java.net.URLConnection;
030import java.nio.charset.Charset;
031import java.nio.charset.StandardCharsets;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.text.MessageFormat;
035import java.util.Locale;
036import java.util.Objects;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.apache.commons.io.ByteOrderMark;
041import org.apache.commons.io.Charsets;
042import org.apache.commons.io.IOUtils;
043import org.apache.commons.io.build.AbstractStreamBuilder;
044import org.apache.commons.io.function.IOConsumer;
045import org.apache.commons.io.output.XmlStreamWriter;
046
047/**
048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
049 * <p>
050 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
051 * </p>
052 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the document as valid XML. This is not
 * 100% true, but it's close enough (the UTF-8 BOM is not handled by all parsers right now; XmlStreamReader handles it and things work in all parsers).
055 * </p>
056 * <p>
057 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
058 * </p>
059 * <p>
 * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for strict detection (following the HTTP MIME and
 * XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
063 * </p>
064 * <p>
065 * To build an instance, see {@link Builder}.
066 * </p>
067 * <p>
068 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under Apache License 2.0.
069 * </p>
070 *
071 * @see org.apache.commons.io.output.XmlStreamWriter
072 * @since 2.0
073 */
074public class XmlStreamReader extends Reader {
075
076    /**
077     * Builds a new {@link XmlStreamWriter} instance.
078     *
079     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
080     * <p>
081     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
082     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
083     * </p>
084     * <p>
085     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
086     * </p>
087     * <p>
088     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
089     * </p>
090     * <p>
091     * Else if the XML prolog had a charset encoding that encoding is used.
092     * </p>
093     * <p>
094     * Else if the content type had a charset encoding that encoding is used.
095     * </p>
096     * <p>
097     * Else 'UTF-8' is used.
098     * </p>
099     * <p>
100     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
101     * </p>
102     * <p>
103     * For example:
104     * </p>
105     *
106     * <pre>{@code
107     * XmlStreamReader r = XmlStreamReader.builder().setPath(path).setCharset(StandardCharsets.UTF_8).get();
108     * }
109     * </pre>
110     *
111     * @since 2.12.0
112     */
113    public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
114
115        private boolean nullCharset = true;
116        private boolean lenient = true;
117        private String httpContentType;
118
119        /**
120         * Constructs a new instance.
121         * <p>
122         * This builder use the aspect InputStream, OpenOption[], httpContentType, lenient, and defaultEncoding.
123         * </p>
124         * <p>
125         * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an
126         * {@link UnsupportedOperationException}.
127         * </p>
128         *
129         * @return a new instance.
130         * @throws UnsupportedOperationException if the origin cannot provide an InputStream.
131         * @throws IOException                   thrown if there is a problem reading the stream.
132         * @throws XmlStreamReaderException      thrown if the charset encoding could not be determined according to the specification.
133         * @see #getInputStream()
134         */
135        @SuppressWarnings("resource")
136        @Override
137        public XmlStreamReader get() throws IOException {
138            final String defaultEncoding = nullCharset ? null : getCharset().name();
139            // @formatter:off
140            return httpContentType == null
141                    ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
142                    : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
143            // @formatter:on
144        }
145
146        @Override
147        public Builder setCharset(final Charset charset) {
148            nullCharset = charset == null;
149            return super.setCharset(charset);
150        }
151
152        @Override
153        public Builder setCharset(final String charset) {
154            nullCharset = charset == null;
155            return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
156        }
157
158        /**
159         * Sets the HTTP content type.
160         *
161         * @param httpContentType the HTTP content type.
162         * @return this.
163         */
164        public Builder setHttpContentType(final String httpContentType) {
165            this.httpContentType = httpContentType;
166            return this;
167        }
168
169        /**
170         * Sets the lenient toggle.
171         *
172         * @param lenient the lenient toggle.
173         * @return this.
174         */
175        public Builder setLenient(final boolean lenient) {
176            this.lenient = lenient;
177            return this;
178        }
179
180    }
181
182    private static final String UTF_8 = StandardCharsets.UTF_8.name();
183
184    private static final String US_ASCII = StandardCharsets.US_ASCII.name();
185
186    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
187
188    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
189
190    private static final String UTF_32BE = "UTF-32BE";
191
192    private static final String UTF_32LE = "UTF-32LE";
193
194    private static final String UTF_16 = StandardCharsets.UTF_16.name();
195
196    private static final String UTF_32 = "UTF-32";
197
198    private static final String EBCDIC = "CP1047";
199
200    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
201            ByteOrderMark.UTF_32LE };
202
203    /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. */
204    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
205            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
206            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
207            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
208            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
209
210    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
211
212    /**
213     * Pattern capturing the encoding of the "xml" processing instruction.
214     * <p>
215     * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>.
216     * </p>
217     */
218    public static final Pattern ENCODING_PATTERN = Pattern.compile(
219    // @formatter:off
220            "^<\\?xml\\s+"
221            + "version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+"
222            + "encoding\\s*=\\s*"
223            + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
224            +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
225            Pattern.MULTILINE);
226    // N.B. the documented pattern is
227    // EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
228    // However this does not match all the aliases that are supported by Java.
229    // e.g.  '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'
230    // @formatter:on
231
232    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
233
234    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
235
236    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";
237
238    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
239
240    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
241
242    /**
243     * Constructs a new {@link Builder}.
244     *
245     * @return a new {@link Builder}.
246     * @since 2.12.0
247     */
248    public static Builder builder() {
249        return new Builder();
250    }
251
252    /**
253     * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
254     *
255     * @param httpContentType the HTTP content type
256     * @return The content type encoding (upcased)
257     */
258    static String getContentTypeEncoding(final String httpContentType) {
259        String encoding = null;
260        if (httpContentType != null) {
261            final int i = httpContentType.indexOf(";");
262            if (i > -1) {
263                final String postMime = httpContentType.substring(i + 1);
264                final Matcher m = CHARSET_PATTERN.matcher(postMime);
265                encoding = m.find() ? m.group(1) : null;
266                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
267            }
268        }
269        return encoding;
270    }
271
272    /**
273     * Gets the MIME type or {@code null} if httpContentType is {@code null}.
274     *
275     * @param httpContentType the HTTP content type
276     * @return The mime content type
277     */
278    static String getContentTypeMime(final String httpContentType) {
279        String mime = null;
280        if (httpContentType != null) {
281            final int i = httpContentType.indexOf(";");
282            if (i >= 0) {
283                mime = httpContentType.substring(0, i);
284            } else {
285                mime = httpContentType;
286            }
287            mime = mime.trim();
288        }
289        return mime;
290    }
291
292    /**
293     * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
294     *
295     * @param inputStream InputStream to create the reader from.
296     * @param guessedEnc  guessed encoding
297     * @return the encoding declared in the <?xml encoding=...?>
298     * @throws IOException thrown if there is a problem reading the stream.
299     */
300    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
301        String encoding = null;
302        if (guessedEnc != null) {
303            final byte[] bytes = IOUtils.byteArray();
304            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
305            int offset = 0;
306            int max = IOUtils.DEFAULT_BUFFER_SIZE;
307            int c = inputStream.read(bytes, offset, max);
308            int firstGT = -1;
309            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
310            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
311                offset += c;
312                max -= c;
313                c = inputStream.read(bytes, offset, max);
314                xmlProlog = new String(bytes, 0, offset, guessedEnc);
315                firstGT = xmlProlog.indexOf('>');
316            }
317            if (firstGT == -1) {
318                if (c == -1) {
319                    throw new IOException("Unexpected end of XML stream");
320                }
321                throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
322            }
323            final int bytesRead = offset;
324            if (bytesRead > 0) {
325                inputStream.reset();
326                final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
327                final StringBuilder prolog = new StringBuilder();
328                IOConsumer.forEach(bReader.lines(), prolog::append);
329                final Matcher m = ENCODING_PATTERN.matcher(prolog);
330                if (m.find()) {
331                    encoding = m.group(1).toUpperCase(Locale.ROOT);
332                    encoding = encoding.substring(1, encoding.length() - 1);
333                }
334            }
335        }
336        return encoding;
337    }
338
339    /**
340     * Tests if the MIME type belongs to the APPLICATION XML family.
341     *
342     * @param mime The mime type
343     * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
344     */
345    static boolean isAppXml(final String mime) {
346        return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
347                || mime.startsWith("application/") && mime.endsWith("+xml"));
348    }
349
350    /**
351     * Tests if the MIME type belongs to the TEXT XML family.
352     *
353     * @param mime The mime type
354     * @return true if the mime type belongs to the TEXT XML family, otherwise false
355     */
356    static boolean isTextXml(final String mime) {
357        return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
358    }
359
360    private final Reader reader;
361
362    private final String encoding;
363
364    private final String defaultEncoding;
365
366    /**
367     * Constructs a Reader for a File.
368     * <p>
369     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
370     * </p>
371     * <p>
372     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
373     * </p>
374     *
375     * @param file File to create a Reader from.
376     * @throws NullPointerException if the input is {@code null}.
377     * @throws IOException          thrown if there is a problem reading the file.
378     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
379     */
380    @Deprecated
381    public XmlStreamReader(final File file) throws IOException {
382        this(Objects.requireNonNull(file, "file").toPath());
383    }
384
385    /**
386     * Constructs a Reader for a raw InputStream.
387     * <p>
388     * It follows the same logic used for files.
389     * </p>
390     * <p>
391     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
392     * </p>
393     *
394     * @param inputStream InputStream to create a Reader from.
395     * @throws NullPointerException if the input stream is {@code null}.
396     * @throws IOException          thrown if there is a problem reading the stream.
397     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
398     */
399    @Deprecated
400    public XmlStreamReader(final InputStream inputStream) throws IOException {
401        this(inputStream, true);
402    }
403
404    /**
405     * Constructs a Reader for a raw InputStream.
406     * <p>
407     * It follows the same logic used for files.
408     * </p>
409     * <p>
410     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
411     * </p>
412     * <p>
413     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
414     * </p>
415     * <p>
416     * Else if the XML prolog had a charset encoding that encoding is used.
417     * </p>
418     * <p>
419     * Else if the content type had a charset encoding that encoding is used.
420     * </p>
421     * <p>
422     * Else 'UTF-8' is used.
423     * </p>
424     * <p>
425     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
426     * </p>
427     *
428     * @param inputStream InputStream to create a Reader from.
429     * @param lenient     indicates if the charset encoding detection should be relaxed.
430     * @throws NullPointerException     if the input stream is {@code null}.
431     * @throws IOException              thrown if there is a problem reading the stream.
432     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
433     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
434     */
435    @Deprecated
436    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
437        this(inputStream, lenient, null);
438    }
439
440    /**
441     * Constructs a Reader for a raw InputStream.
442     * <p>
443     * It follows the same logic used for files.
444     * </p>
445     * <p>
446     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
447     * </p>
448     * <p>
449     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
450     * </p>
451     * <p>
452     * Else if the XML prolog had a charset encoding that encoding is used.
453     * </p>
454     * <p>
455     * Else if the content type had a charset encoding that encoding is used.
456     * </p>
457     * <p>
458     * Else 'UTF-8' is used.
459     * </p>
460     * <p>
461     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
462     * </p>
463     *
464     * @param inputStream     InputStream to create a Reader from.
465     * @param lenient         indicates if the charset encoding detection should be relaxed.
466     * @param defaultEncoding The default encoding
467     * @throws NullPointerException     if the input stream is {@code null}.
468     * @throws IOException              thrown if there is a problem reading the stream.
469     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
470     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
471     */
472    @Deprecated
473    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
474    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
475        this.defaultEncoding = defaultEncoding;
476        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
477                false, BOMS);
478        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
479        this.encoding = processHttpStream(bom, pis, lenient);
480        this.reader = new InputStreamReader(pis, encoding);
481    }
482
483    /**
484     * Constructs a Reader using an InputStream and the associated content-type header.
485     * <p>
486     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
487     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
488     * </p>
489     * <p>
490     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
491     * </p>
492     *
493     * @param inputStream     InputStream to create the reader from.
494     * @param httpContentType content-type header to use for the resolution of the charset encoding.
495     * @throws NullPointerException if the input stream is {@code null}.
496     * @throws IOException          thrown if there is a problem reading the file.
497     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
498     */
499    @Deprecated
500    public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
501        this(inputStream, httpContentType, true);
502    }
503
504    /**
505     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
506     * <p>
507     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
508     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
509     * </p>
510     * <p>
511     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
512     * </p>
513     * <p>
514     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
515     * </p>
516     * <p>
517     * Else if the XML prolog had a charset encoding that encoding is used.
518     * </p>
519     * <p>
520     * Else if the content type had a charset encoding that encoding is used.
521     * </p>
522     * <p>
523     * Else 'UTF-8' is used.
524     * </p>
525     * <p>
526     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
527     * </p>
528     *
529     * @param inputStream     InputStream to create the reader from.
530     * @param httpContentType content-type header to use for the resolution of the charset encoding.
531     * @param lenient         indicates if the charset encoding detection should be relaxed.
532     * @throws NullPointerException     if the input stream is {@code null}.
533     * @throws IOException              thrown if there is a problem reading the file.
534     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
535     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
536     */
537    @Deprecated
538    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
539        this(inputStream, httpContentType, lenient, null);
540    }
541
542    /**
543     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
544     * <p>
545     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
546     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
547     * </p>
548     * <p>
549     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
550     * </p>
551     * <p>
552     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
553     * </p>
554     * <p>
555     * Else if the XML prolog had a charset encoding that encoding is used.
556     * </p>
557     * <p>
558     * Else if the content type had a charset encoding that encoding is used.
559     * </p>
560     * <p>
561     * Else 'UTF-8' is used.
562     * </p>
563     * <p>
564     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
565     * </p>
566     *
567     * @param inputStream     InputStream to create the reader from.
568     * @param httpContentType content-type header to use for the resolution of the charset encoding.
569     * @param lenient         indicates if the charset encoding detection should be relaxed.
570     * @param defaultEncoding The default encoding
571     * @throws NullPointerException     if the input stream is {@code null}.
572     * @throws IOException              thrown if there is a problem reading the file.
573     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
574     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
575     */
576    @Deprecated
577    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
578    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
579            throws IOException {
580        this.defaultEncoding = defaultEncoding;
581        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
582                false, BOMS);
583        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
584        this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
585        this.reader = new InputStreamReader(pis, encoding);
586    }
587
588    /**
589     * Constructs a Reader for a File.
590     * <p>
591     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
592     * </p>
593     * <p>
594     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
595     * </p>
596     *
597     * @param file File to create a Reader from.
598     * @throws NullPointerException if the input is {@code null}.
599     * @throws IOException          thrown if there is a problem reading the file.
600     * @since 2.11.0
601     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
602     */
603    @Deprecated
604    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
605    public XmlStreamReader(final Path file) throws IOException {
606        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
607    }
608
609    /**
610     * Constructs a Reader using the InputStream of a URL.
611     * <p>
612     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
613     * </p>
614     * <p>
615     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
616     * </p>
617     * <p>
618     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
619     * </p>
620     *
621     * @param url URL to create a Reader from.
622     * @throws NullPointerException if the input is {@code null}.
623     * @throws IOException          thrown if there is a problem reading the stream of the URL.
624     */
625    public XmlStreamReader(final URL url) throws IOException {
626        this(Objects.requireNonNull(url, "url").openConnection(), null);
627    }
628
629    /**
630     * Constructs a Reader using the InputStream of a URLConnection.
631     * <p>
632     * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
633     * </p>
634     * <p>
635     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
636     * content-type.
637     * </p>
638     * <p>
639     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
640     * </p>
641     *
642     * @param urlConnection   URLConnection to create a Reader from.
643     * @param defaultEncoding The default encoding
644     * @throws NullPointerException if the input is {@code null}.
645     * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
646     */
647    public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
648        Objects.requireNonNull(urlConnection, "urlConnection");
649        this.defaultEncoding = defaultEncoding;
650        final boolean lenient = true;
651        final String contentType = urlConnection.getContentType();
652        final InputStream inputStream = urlConnection.getInputStream();
653        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
654        // @formatter:off
655        final BOMInputStream bomInput = BOMInputStream.builder()
656            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
657            .setInclude(false)
658            .setByteOrderMarks(BOMS)
659            .get();
660        @SuppressWarnings("resource")
661        final BOMInputStream piInput = BOMInputStream.builder()
662            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
663            .setInclude(true)
664            .setByteOrderMarks(XML_GUESS_BYTES)
665            .get();
666        // @formatter:on
667        if (urlConnection instanceof HttpURLConnection || contentType != null) {
668            this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
669        } else {
670            this.encoding = processHttpStream(bomInput, piInput, lenient);
671        }
672        this.reader = new InputStreamReader(piInput, encoding);
673    }
674
675    /**
676     * Calculates the HTTP encoding.
677     * @param bomEnc          BOM encoding
678     * @param xmlGuessEnc     XML Guess encoding
679     * @param xmlEnc          XML encoding
680     * @param lenient         indicates if the charset encoding detection should be relaxed.
681     * @param httpContentType The HTTP content type
682     *
683     * @return the HTTP encoding
684     * @throws IOException thrown if there is a problem reading the stream.
685     */
686    String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
687            throws IOException {
688
689        // Lenient and has XML encoding
690        if (lenient && xmlEnc != null) {
691            return xmlEnc;
692        }
693
694        // Determine mime/encoding content types from HTTP Content Type
695        final String cTMime = getContentTypeMime(httpContentType);
696        final String cTEnc = getContentTypeEncoding(httpContentType);
697        final boolean appXml = isAppXml(cTMime);
698        final boolean textXml = isTextXml(cTMime);
699
700        // Mime type NOT "application/xml" or "text/xml"
701        if (!appXml && !textXml) {
702            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
703            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
704        }
705
706        // No content type encoding
707        if (cTEnc == null) {
708            if (appXml) {
709                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
710            }
711            return defaultEncoding == null ? US_ASCII : defaultEncoding;
712        }
713
714        // UTF-16BE or UTF-16LE content type encoding
715        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
716            if (bomEnc != null) {
717                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
718                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
719            }
720            return cTEnc;
721        }
722
723        // UTF-16 content type encoding
724        if (cTEnc.equals(UTF_16)) {
725            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
726                return bomEnc;
727            }
728            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
729            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
730        }
731
732        // UTF-32BE or UTF-132E content type encoding
733        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
734            if (bomEnc != null) {
735                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
736                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
737            }
738            return cTEnc;
739        }
740
741        // UTF-32 content type encoding
742        if (cTEnc.equals(UTF_32)) {
743            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
744                return bomEnc;
745            }
746            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
747            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
748        }
749
750        return cTEnc;
751    }
752
753    /**
754     * Calculate the raw encoding.
755     *
756     * @param bomEnc      BOM encoding
757     * @param xmlGuessEnc XML Guess encoding
758     * @param xmlEnc      XML encoding
759     * @return the raw encoding
760     * @throws IOException thrown if there is a problem reading the stream.
761     */
762    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
763
764        // BOM is Null
765        if (bomEnc == null) {
766            if (xmlGuessEnc == null || xmlEnc == null) {
767                return defaultEncoding == null ? UTF_8 : defaultEncoding;
768            }
769            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
770                return xmlGuessEnc;
771            }
772            return xmlEnc;
773        }
774
775        // BOM is UTF-8
776        if (bomEnc.equals(UTF_8)) {
777            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
778                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
779                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
780            }
781            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
782                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
783                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
784            }
785            return bomEnc;
786        }
787
788        // BOM is UTF-16BE or UTF-16LE
789        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
790            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
791                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
792                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
793            }
794            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
795                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
796                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
797            }
798            return bomEnc;
799        }
800
801        // BOM is UTF-32BE or UTF-32LE
802        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
803            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
804                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
805                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
806            }
807            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
808                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
809                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
810            }
811            return bomEnc;
812        }
813
814        // BOM is something else
815        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
816        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
817    }
818
819    /**
820     * Closes the XmlStreamReader stream.
821     *
822     * @throws IOException thrown if there was a problem closing the stream.
823     */
824    @Override
825    public void close() throws IOException {
826        reader.close();
827    }
828
829    /**
830     * Does lenient detection.
831     *
832     * @param httpContentType content-type header to use for the resolution of the charset encoding.
833     * @param ex              The thrown exception
834     * @return the encoding
835     * @throws IOException thrown if there is a problem reading the stream.
836     */
837    private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
838        if (httpContentType != null && httpContentType.startsWith("text/html")) {
839            httpContentType = httpContentType.substring("text/html".length());
840            httpContentType = "text/xml" + httpContentType;
841            try {
842                return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
843            } catch (final XmlStreamReaderException ex2) {
844                ex = ex2;
845            }
846        }
847        String encoding = ex.getXmlEncoding();
848        if (encoding == null) {
849            encoding = ex.getContentTypeEncoding();
850        }
851        if (encoding == null) {
852            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
853        }
854        return encoding;
855    }
856
857    /**
858     * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
859     * <p>
860     * If it is {@code null} the content-type based rules are used.
861     * </p>
862     *
863     * @return the default encoding to use.
864     */
865    public String getDefaultEncoding() {
866        return defaultEncoding;
867    }
868
869    /**
870     * Gets the charset encoding of the XmlStreamReader.
871     *
872     * @return charset encoding.
873     */
874    public String getEncoding() {
875        return encoding;
876    }
877
878    /**
879     * Process the raw stream.
880     *
881     * @param bomInput     BOMInputStream to detect byte order marks
882     * @param piInput     BOMInputStream to guess XML encoding
883     * @param lenient indicates if the charset encoding detection should be relaxed.
884     * @return the encoding to be used
885     * @throws IOException thrown if there is a problem reading the stream.
886     */
887    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
888        final String bomEnc = bomInput.getBOMCharsetName();
889        final String xmlGuessEnc = piInput.getBOMCharsetName();
890        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
891        try {
892            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
893        } catch (final XmlStreamReaderException ex) {
894            if (lenient) {
895                return doLenientDetection(null, ex);
896            }
897            throw ex;
898        }
899    }
900
901    /**
902     * Processes an HTTP stream.
903     *
904     * @param bomInput        BOMInputStream to detect byte order marks
905     * @param piInput         BOMInputStream to guess XML encoding
906     * @param lenient         indicates if the charset encoding detection should be relaxed.
907     * @param httpContentType The HTTP content type
908     * @return the encoding to be used
909     * @throws IOException thrown if there is a problem reading the stream.
910     */
911    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
912            throws IOException {
913        final String bomEnc = bomInput.getBOMCharsetName();
914        final String xmlGuessEnc = piInput.getBOMCharsetName();
915        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
916        try {
917            return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
918        } catch (final XmlStreamReaderException ex) {
919            if (lenient) {
920                return doLenientDetection(httpContentType, ex);
921            }
922            throw ex;
923        }
924    }
925
926    /**
927     * Reads the underlying reader's {@code read(char[], int, int)} method.
928     *
929     * @param buf    the buffer to read the characters into
930     * @param offset The start offset
931     * @param len    The number of bytes to read
932     * @return the number of characters read or -1 if the end of stream
933     * @throws IOException if an I/O error occurs.
934     */
935    @Override
936    public int read(final char[] buf, final int offset, final int len) throws IOException {
937        return reader.read(buf, offset, len);
938    }
939
940}