/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.parser.html;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.CharsetUtils;

/**
 * Encoding detector that looks for a character set declared inside an HTML
 * {@code <meta>} tag near the start of a stream.
 *
 * <p>Up to {@code markLimit} bytes are read from the stream, decoded as
 * US-ASCII and scanned for {@code charset=...} attributes inside
 * {@code <meta ...>} tags. The stream is marked before reading and reset
 * afterwards. Text inside HTML comments is ignored on the first pass; if
 * that pass finds nothing, the raw text (comments included) is scanned as a
 * fallback.
 */
public class HtmlEncodingDetector implements EncodingDetector {
    /** Class-path resource (relative to this class) listing charset names to ignore. */
    private static final String UNSUPPORTED_CHARSETS_RESOURCE =
            "StandardCharsets_unsupported_by_IANA.txt";

    /** Lower-cased charset names that are skipped when found in a meta tag. */
    private static final Set<String> CHARSETS_UNSUPPORTED_BY_IANA = loadUnsupportedCharsets();

    /** Default number of bytes inspected at the start of the stream. */
    private static final int DEFAULT_MARK_LIMIT = 8192;

    /** Matches a {@code <meta ...} tag and captures its attribute text in group 1. */
    private static final Pattern HTTP_META_PATTERN =
            Pattern.compile("(?is)<\\s*meta(?:/|\\s+)([^<>]+)");

    /** Matches {@code charset = [quote] name} and captures the name in group 1. */
    private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN =
            Pattern.compile("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)");

    /**
     * Matches an HTML comment, closed or running to the end of the text.
     * DOTALL ({@code (?s)}) is required so that comments spanning several
     * lines are matched as well.
     */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("(?s)<!--.*?(-->|$)");

    /** Charset used to decode the head of the stream before pattern matching. */
    private static final Charset ASCII = StandardCharsets.US_ASCII;

    /** Maximum number of bytes read from the stream; also passed to {@code mark}. */
    @Field
    private int markLimit = DEFAULT_MARK_LIMIT;

    /**
     * Tries to find a supported charset declared in an HTML meta tag within
     * the first {@code markLimit} bytes of the stream.
     *
     * @param input    stream to inspect; it is marked before reading and reset
     *                 afterwards. May be {@code null}.
     * @param metadata not used by this detector
     * @return the first acceptable declared charset, or {@code null} if
     *         {@code input} is {@code null} or no such charset is found
     * @throws IOException if reading from or resetting the stream fails
     */
    @Override
    public Charset detect(InputStream input, Metadata metadata) throws IOException {
        if (input == null) {
            return null;
        }

        input.mark(this.markLimit);
        byte[] buffer = new byte[this.markLimit];
        int filled = 0;
        // A single read may return fewer bytes than requested, so keep reading
        // until the buffer is full or the stream is exhausted.
        while (filled < buffer.length) {
            int count = input.read(buffer, filled, buffer.length - filled);
            if (count == -1) {
                break;
            }
            filled += count;
        }
        input.reset();

        String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, filled)).toString();

        // Prefer declarations outside comments; fall back to the raw head only
        // when the comment-free text yields nothing.
        String headNoComments = COMMENT_PATTERN.matcher(head).replaceAll(" ");
        Charset charset = findCharset(headNoComments);
        if (charset == null) {
            return findCharset(head);
        }
        return charset;
    }

    /**
     * Scans the given text for meta tags carrying a charset declaration.
     *
     * @param s text to scan
     * @return the first declared charset that is not in the unsupported list
     *         and that {@link CharsetUtils} reports as supported, or
     *         {@code null} if there is none
     */
    private Charset findCharset(String s) {
        Matcher metaMatcher = HTTP_META_PATTERN.matcher(s);
        Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
        while (metaMatcher.find()) {
            charsetMatcher.reset(metaMatcher.group(1));
            while (charsetMatcher.find()) {
                String candCharset = charsetMatcher.group(1);
                if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
                    continue;
                }
                // "x-user-defined" is mapped to windows-1252 before lookup.
                if ("x-user-defined".equalsIgnoreCase(candCharset)) {
                    candCharset = "windows-1252";
                }
                if (!CharsetUtils.isSupported(candCharset)) {
                    continue;
                }
                try {
                    return CharsetUtils.forName(candCharset);
                } catch (Exception ignored) {
                    // Best effort: a candidate that cannot be resolved is
                    // skipped so that later declarations can still be tried.
                }
            }
        }
        return null;
    }

    /**
     * Sets how many bytes are read from the start of the stream when looking
     * for a charset declaration. The value is stored without validation.
     *
     * @param markLimit number of bytes to inspect
     */
    @Field
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }

    /**
     * Returns the number of bytes inspected at the start of the stream.
     *
     * @return the current mark limit
     */
    public int getMarkLimit() {
        return this.markLimit;
    }

    /**
     * Loads the list of charset names to ignore from the class path.
     *
     * <p>Lines starting with {@code #} are skipped; every other line is
     * trimmed and, if non-empty, added in lower case.
     *
     * @return an unmodifiable set of lower-cased charset names
     * @throws IllegalArgumentException if the resource is missing or cannot be read
     */
    private static Set<String> loadUnsupportedCharsets() {
        // getResourceAsStream returns null for a missing resource; check it
        // explicitly so the failure carries a meaningful message.
        InputStream stream =
                HtmlEncodingDetector.class.getResourceAsStream(UNSUPPORTED_CHARSETS_RESOURCE);
        if (stream == null) {
            throw new IllegalArgumentException(
                    "couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path");
        }

        Set<String> unsupported = new HashSet<>();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.startsWith("#")) {
                    continue;
                }
                String name = line.trim();
                if (!name.isEmpty()) {
                    unsupported.add(name.toLowerCase(Locale.US));
                }
            }
        } catch (IOException e) {
            throw new IllegalArgumentException(
                    "couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path", e);
        }
        return Collections.unmodifiableSet(unsupported);
    }
}

