package org.codelibs.robot.transformer.impl;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Resource;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.IOUtils;
import org.apache.xpath.CachedXPathAPI;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.robot.Constants;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.container.RobotContainer;
import org.codelibs.robot.entity.AccessResultData;
import org.codelibs.robot.entity.RequestData;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.entity.ResultData;
import org.codelibs.robot.exception.RobotCrawlAccessException;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.robot.helper.EncodingHelper;
import org.codelibs.robot.helper.UrlConvertHelper;
import org.codelibs.robot.util.CharUtil;
import org.codelibs.robot.util.ResponseDataUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/* loaded from: input_file:org/codelibs/robot/transformer/impl/HtmlTransformer.class */
/**
 * Transformer that stores the raw response body and, for HTML/XHTML
 * responses, extracts child URLs from the parsed document.
 *
 * <p>Child URLs are collected with the rules in {@link #childUrlRuleMap}:
 * each key is evaluated as an XPath expression against the parsed document
 * and each value names the attribute read from every selected element.
 */
public class HtmlTransformer extends AbstractTransformer {
    private static final Logger logger = LoggerFactory.getLogger(HtmlTransformer.class);

    /** Key looked up in the response meta data to find a redirect target. */
    protected static final String LOCATION_HEADER = "Location";

    /** Finds a {@code ; charset=...} declaration in the head of the content. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("; *charset *= *([a-zA-Z0-9\\-_]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Charset used to decode the preloaded bytes for charset sniffing.
     * ISO-8859-1 maps every byte to exactly one char, so the ASCII-only
     * {@link #CHARSET_PATTERN} works regardless of the platform default charset.
     */
    private static final Charset SNIFFING_CHARSET = Charset.forName("ISO-8859-1");

    @Resource
    protected RobotContainer robotContainer;

    /** Encoding applied when none is declared in the content; may be {@code null}. */
    protected String defaultEncoding;

    /** Parser features; a value of "true" (case-insensitive) enables the feature. */
    protected Map<String, String> featureMap = new HashMap<>();

    /** Parser properties passed to the DOM parser as-is. */
    protected Map<String, String> propertyMap = new HashMap<>();

    /** XPath expression to attribute name, applied in insertion order. */
    protected Map<String, String> childUrlRuleMap = new LinkedHashMap<>();

    /** Number of bytes read from the head of the body when sniffing the charset. */
    protected int preloadSizeForCharset = 2048;

    /** Attribute values matching this pattern are never treated as child URLs. */
    protected Pattern invalidUrlPattern = Pattern.compile("^\\s*javascript:|^\\s*mailto:|^\\s*irc:|^\\s*skype:|^\\s*about:|^\\s*fscommand:|^\\s*aim:|^\\s*msnim:|^\\s*news:|^\\s*tel:|^\\s*unsaved:|^\\s*callto:", Pattern.CASE_INSENSITIVE);

    /** Per-thread XPath helper; cleared at the end of {@link #storeChildUrls}. */
    private final ThreadLocal<CachedXPathAPI> xpathAPI = new ThreadLocal<>();

    /**
     * Transforms a response into a {@link ResultData}.
     *
     * <p>The body is first copied to a temporary file, which is then re-read
     * up to three times: to detect the charset, to store the raw bytes and,
     * for HTML responses, to extract child URLs. A {@code Location} meta data
     * entry, if it is a string, is added as an additional URL.
     *
     * <p>The temporary file is deleted exactly once on every exit path. On
     * return, the response body of {@code responseData} refers to a stream
     * that has already been closed.
     *
     * @param responseData the response to transform
     * @return the populated result
     * @throws RobotCrawlAccessException if the response or its body is {@code null}
     * @throws RobotSystemException if the body cannot be loaded or stored
     */
    @Override
    public ResultData transform(ResponseData responseData) {
        if (responseData == null || responseData.getResponseBody() == null) {
            throw new RobotCrawlAccessException("No response body.");
        }

        final File bodyFile = ResponseDataUtil.createResponseBodyFile(responseData);
        try {
            final ResultData resultData;

            // Phase 1: charset detection.
            try {
                FileInputStream in = new FileInputStream(bodyFile);
                try {
                    responseData.setResponseBody(in);
                    updateCharset(responseData);
                } finally {
                    IOUtils.closeQuietly(in);
                }
                resultData = new ResultData();
                resultData.setTransformerName(getName());
            } catch (RobotSystemException e) {
                throw e;
            } catch (Exception e) {
                throw new RobotSystemException("Could not load response data: " + responseData.getUrl(), e);
            }

            // Phase 2: raw data, child URLs and redirect target.
            try {
                FileInputStream in = new FileInputStream(bodyFile);
                try {
                    responseData.setResponseBody(in);
                    storeData(responseData, resultData);
                } finally {
                    IOUtils.closeQuietly(in);
                }

                if (isHtml(responseData)) {
                    in = new FileInputStream(bodyFile);
                    try {
                        responseData.setResponseBody(in);
                        storeChildUrls(responseData, resultData);
                    } finally {
                        IOUtils.closeQuietly(in);
                    }
                }

                Object location = responseData.getMetaDataMap().get(LOCATION_HEADER);
                if (location instanceof String) {
                    UrlConvertHelper urlConvertHelper =
                            (UrlConvertHelper) this.robotContainer.getComponent("urlConvertHelper");
                    resultData.addUrl(RequestDataBuilder.newRequestData().get()
                            .url(urlConvertHelper.convert((String) location)).build());
                }
            } catch (RobotSystemException e) {
                throw e;
            } catch (Exception e) {
                throw new RobotSystemException("Could not store data.", e);
            }

            return resultData;
        } finally {
            // Single cleanup point: avoids repeated delete() calls that would
            // log a bogus warning for a file that is already gone.
            if (!bodyFile.delete()) {
                logger.warn("Could not delete a temp file: " + bodyFile);
            }
        }
    }

    /**
     * Tells whether the response is an HTML or XHTML document.
     *
     * @param responseData the response whose MIME type is inspected
     * @return {@code true} for {@code text/html} or {@code application/xhtml+xml}
     */
    protected boolean isHtml(ResponseData responseData) {
        String mimeType = responseData.getMimeType();
        return "text/html".equals(mimeType) || "application/xhtml+xml".equals(mimeType);
    }

    /**
     * Registers a child URL extraction rule. Blank arguments are ignored.
     *
     * @param xpath XPath expression selecting elements
     * @param attributeName attribute of the selected elements that holds the URL
     */
    public void addChildUrlRule(String xpath, String attributeName) {
        if (StringUtil.isNotBlank(xpath) && StringUtil.isNotBlank(attributeName)) {
            this.childUrlRuleMap.put(xpath, attributeName);
        }
    }

    /**
     * Returns the XPath helper bound to the current thread, creating it on
     * first use.
     *
     * @return the thread-local {@link CachedXPathAPI}
     */
    public CachedXPathAPI getXPathAPI() {
        CachedXPathAPI api = this.xpathAPI.get();
        if (api == null) {
            api = new CachedXPathAPI();
            this.xpathAPI.set(api);
        }
        return api;
    }

    /**
     * Parses the response body and adds the discovered child URLs to the result.
     *
     * <p>Failures while parsing or extracting are logged and do not abort
     * the transformation; whatever was collected so far is still added. The
     * URL of the request itself and its trailing-slash variant are removed
     * from the result.
     *
     * @param responseData the response whose body is parsed
     * @param resultData the result receiving the child URLs
     */
    protected void storeChildUrls(ResponseData responseData, ResultData resultData) {
        List<RequestData> childUrls = new ArrayList<>();
        try {
            DOMParser parser = getDomParser();
            parser.parse(new InputSource(responseData.getResponseBody()));
            Document document = parser.getDocument();

            // A <base href> overrides the response URL as resolution base.
            String baseHref = getBaseHref(document);
            URL baseUrl = new URL(baseHref == null ? responseData.getUrl() : baseHref);

            for (Map.Entry<String, String> rule : this.childUrlRuleMap.entrySet()) {
                List<String> urls = getUrlFromTagAttribute(baseUrl, document, rule.getKey(),
                        rule.getValue(), responseData.getCharSet());
                for (String childUrl : urls) {
                    childUrls.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
                }
            }
            childUrls = convertChildUrlList(childUrls);
        } catch (Exception e) {
            logger.warn("Could not create child urls.", e);
        } finally {
            // Drop the per-thread XPath cache so the parsed document is not retained.
            this.xpathAPI.remove();
        }

        resultData.addAllUrl(childUrls);
        resultData.addAllUrl(responseData.getChildUrlSet());

        RequestData requestData = responseData.getRequestData();
        resultData.removeUrl(requestData);
        resultData.removeUrl(getDuplicateUrl(requestData));
    }

    /**
     * Rewrites the URL of every entry with the {@code urlConvertHelper}
     * component. Conversion is best effort: on failure the list is returned
     * as it is at that point (entries already processed stay converted).
     *
     * @param urlList the requests to convert in place
     * @return the same list instance
     */
    protected List<RequestData> convertChildUrlList(List<RequestData> urlList) {
        try {
            UrlConvertHelper urlConvertHelper =
                    (UrlConvertHelper) this.robotContainer.getComponent("urlConvertHelper");
            for (RequestData requestData : urlList) {
                requestData.setUrl(urlConvertHelper.convert(requestData.getUrl()));
            }
        } catch (Exception e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Could not convert child urls.", e);
            }
        }
        return urlList;
    }

    /**
     * Copies the response body bytes and the response charset into the result.
     *
     * @param responseData the source response
     * @param resultData the result to fill
     */
    protected void storeData(ResponseData responseData, ResultData resultData) {
        resultData.setData(InputStreamUtil.getBytes(responseData.getResponseBody()));
        resultData.setEncoding(responseData.getCharSet());
    }

    /**
     * Determines the charset of the response and stores it on the response.
     *
     * <p>A charset declared in the content wins. Otherwise UTF-8 is set when
     * no default encoding is configured (replacing any charset already on the
     * response), or the default encoding is set when the response has no
     * charset yet. A charset that is not supported by the JVM is finally
     * replaced by UTF-8.
     *
     * @param responseData the response to inspect and update
     */
    protected void updateCharset(ResponseData responseData) {
        String sniffed = loadCharset(responseData.getResponseBody());
        if (sniffed != null) {
            responseData.setCharSet(sniffed.trim());
        } else if (this.defaultEncoding == null) {
            responseData.setCharSet(Constants.UTF_8);
        } else if (responseData.getCharSet() == null) {
            responseData.setCharSet(this.defaultEncoding);
        }

        if (!isSupportedCharset(responseData.getCharSet())) {
            responseData.setCharSet(Constants.UTF_8);
        }
    }

    /**
     * Tells whether the given charset name can be resolved by this JVM.
     *
     * @param charsetName the name to check; may be {@code null}
     * @return {@code true} if {@link Charset#forName(String)} accepts the name
     */
    protected boolean isSupportedCharset(String charsetName) {
        if (charsetName == null) {
            return false;
        }
        try {
            Charset.forName(charsetName);
            return true;
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            return false;
        }
    }

    /**
     * Reads up to {@link #preloadSizeForCharset} bytes from the stream and
     * looks for a charset declaration, then passes the result through the
     * {@code encodingHelper} component.
     *
     * @param inputStream the content stream; it is read but not closed here
     * @return the normalized charset name, the raw name if normalization
     *         fails, or {@code null} if none was found
     * @throws RobotCrawlAccessException if the stream cannot be read
     */
    protected String loadCharset(InputStream inputStream) {
        String encoding = null;
        try {
            BufferedInputStream buffered = new BufferedInputStream(inputStream);
            byte[] head = new byte[this.preloadSizeForCharset];
            int size = buffered.read(head);
            if (size != -1) {
                // Fixed single-byte charset: result must not depend on the platform default.
                encoding = parseCharset(new String(head, 0, size, SNIFFING_CHARSET));
            }
        } catch (IOException e) {
            throw new RobotCrawlAccessException("Could not load a content.", e);
        }

        try {
            EncodingHelper encodingHelper =
                    (EncodingHelper) this.robotContainer.getComponent("encodingHelper");
            return encodingHelper.normalize(encoding);
        } catch (Exception e) {
            // Best effort: fall back to the name exactly as found in the content.
            if (logger.isDebugEnabled()) {
                logger.debug("Could not normalize the encoding: " + encoding, e);
            }
            return encoding;
        }
    }

    /**
     * Extracts the charset name from the first {@code ; charset=...}
     * declaration in the given text.
     *
     * @param content the text to search
     * @return the charset name, or {@code null} if there is no declaration
     */
    protected String parseCharset(String content) {
        Matcher matcher = CHARSET_PATTERN.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Toggles the trailing slash of the request URL.
     *
     * <p>NOTE(review): this modifies and returns the given instance rather
     * than a copy, so the caller's request data changes as well - confirm
     * this is intended.
     *
     * @param requestData the request whose URL is rewritten in place
     * @return the same instance, with a trailing slash added or removed
     */
    protected RequestData getDuplicateUrl(RequestData requestData) {
        String url = requestData.getUrl();
        if (url.endsWith("/")) {
            requestData.setUrl(url.substring(0, url.length() - 1));
        } else {
            requestData.setUrl(url + "/");
        }
        return requestData;
    }

    /**
     * Creates a DOM parser configured with {@link #featureMap} and
     * {@link #propertyMap}.
     *
     * @return a new parser instance
     * @throws RobotSystemException if a feature or property is rejected
     */
    public DOMParser getDomParser() {
        DOMParser parser = new DOMParser();
        try {
            for (Map.Entry<String, String> feature : this.featureMap.entrySet()) {
                parser.setFeature(feature.getKey(), "true".equalsIgnoreCase(feature.getValue()));
            }
            for (Map.Entry<String, String> property : this.propertyMap.entrySet()) {
                parser.setProperty(property.getKey(), property.getValue());
            }
        } catch (Exception e) {
            throw new RobotSystemException("Invalid parser configuration.", e);
        }
        return parser;
    }

    /**
     * Returns the {@code href} of the first {@code BASE} element.
     *
     * @param document the parsed document
     * @return the base URL (prefixed with {@code http://} when it starts with
     *         {@code www.}), or {@code null} if absent, blank or unreadable
     */
    protected String getBaseHref(Document document) {
        try {
            NodeList baseNodes = getXPathAPI().selectNodeList(document, "//BASE");
            if (baseNodes.getLength() <= 0) {
                return null;
            }
            String href = ((Element) baseNodes.item(0)).getAttribute("href");
            if (!StringUtil.isNotBlank(href)) {
                return null;
            }
            // Scheme-less "www." hosts are treated as plain http.
            return href.startsWith("www.") ? "http://" + href : href;
        } catch (Exception e) {
            logger.warn("Could not get a base tag. ", e);
            return null;
        }
    }

    /**
     * Collects URLs from an attribute of all elements selected by an XPath
     * expression.
     *
     * @param url base URL used to resolve relative references
     * @param document the parsed document
     * @param xpath XPath expression selecting the elements
     * @param attributeName attribute holding the URL
     * @param encoding charset used to percent-encode non-URL characters
     * @return the resolved URLs; empty if the XPath evaluation fails
     */
    protected List<String> getUrlFromTagAttribute(URL url, Document document, String xpath,
            String attributeName, String encoding) {
        if (logger.isDebugEnabled()) {
            logger.debug("Base URL: " + url);
        }
        List<String> urlList = new ArrayList<>();
        try {
            NodeList nodes = getXPathAPI().selectNodeList(document, xpath);
            for (int i = 0; i < nodes.getLength(); i++) {
                String attrValue = ((Element) nodes.item(i)).getAttribute(attributeName);
                if (isValidPath(attrValue)) {
                    addChildUrlFromTagAttribute(urlList, url, attrValue, encoding);
                }
            }
        } catch (TransformerException e) {
            logger.warn("Could not get urls: (" + xpath + ", " + attributeName + ")", e);
        }
        return urlList;
    }

    /**
     * Resolves an attribute value against the base URL, normalizes and
     * encodes it, and appends it to the list unless the result is blank.
     * Malformed URLs are logged and skipped.
     *
     * @param urlList the list receiving the URL
     * @param url base URL
     * @param attrValue raw attribute value
     * @param encoding charset used to percent-encode non-URL characters
     */
    protected void addChildUrlFromTagAttribute(List<String> urlList, URL url, String attrValue,
            String encoding) {
        try {
            String path = attrValue.trim();
            // A bare query string is appended to the full base URL.
            URL resolved = path.startsWith("?")
                    ? new URL(url.toExternalForm() + path)
                    : new URL(url, path);
            String childUrl = encodeUrl(normalizeUrl(resolved.toExternalForm()), encoding);
            if (logger.isDebugEnabled()) {
                logger.debug(attrValue + " -> " + childUrl);
            }
            if (StringUtil.isNotBlank(childUrl)) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Add Child: " + childUrl);
                }
                urlList.add(childUrl);
            } else if (logger.isDebugEnabled()) {
                logger.debug("Skip Child: " + childUrl);
            }
        } catch (MalformedURLException e) {
            logger.warn("Malformed URL: " + attrValue, e);
        }
    }

    /**
     * Percent-encodes every character that is not a URL character.
     *
     * <p>The text is processed by Unicode code point so that supplementary
     * characters are encoded as one unit instead of two lone surrogates.
     *
     * @param url the URL to encode; returned unchanged if blank
     * @param enc charset name for the encoding; if blank the URL is returned unchanged
     * @return the encoded URL
     */
    protected String encodeUrl(String url, String enc) {
        if (StringUtil.isBlank(url) || StringUtil.isBlank(enc)) {
            return url;
        }
        StringBuilder buf = new StringBuilder(url.length() + 100);
        int index = 0;
        while (index < url.length()) {
            int codePoint = url.codePointAt(index);
            int width = Character.charCount(codePoint);
            if (width == 1 && CharUtil.isUrlChar((char) codePoint)) {
                buf.append((char) codePoint);
            } else {
                try {
                    buf.append(URLEncoder.encode(url.substring(index, index + width), enc));
                } catch (UnsupportedEncodingException e) {
                    // Best effort: the character is dropped, as before.
                    if (logger.isDebugEnabled()) {
                        logger.debug("Unsupported encoding: " + enc, e);
                    }
                }
            }
            index += width;
        }
        return buf.toString();
    }

    /**
     * Normalizes a URL: strips the fragment and {@code ;jsessionid=...},
     * collapses {@code /./} and {@code /x/../} segments and repeated slashes
     * (except directly after a colon).
     *
     * @param url the URL to normalize; may be {@code null}
     * @return the normalized URL, or {@code null} if the input is
     *         {@code null} or still contains a space
     */
    protected String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }

        String normalized = url.trim();

        int fragmentPos = normalized.indexOf('#');
        if (fragmentPos >= 0) {
            normalized = normalized.substring(0, fragmentPos);
        }

        normalized = normalized.replace("/./", "/");

        if (normalized.indexOf(";jsessionid") >= 0) {
            normalized = normalized.replaceFirst(";jsessionid=[a-zA-Z0-9\\.]*", Constants.EMPTY_STRING);
        }

        if (normalized.indexOf(' ') >= 0) {
            if (logger.isDebugEnabled()) {
                logger.debug("INVALID URL: " + normalized);
            }
            return null;
        }

        // Resolve parent references one at a time; stop when nothing changes
        // so an unresolvable "/../" cannot loop forever.
        String previous = null;
        while (normalized.indexOf("/../") >= 0 && !normalized.equals(previous)) {
            previous = normalized;
            normalized = normalized.replaceFirst("/[^/]+/\\.\\./", "/");
        }

        return normalized.replaceAll("([^:])/+", "$1/");
    }

    /**
     * Tells whether an attribute value may be used as a child URL.
     *
     * @param path the raw attribute value
     * @return {@code false} if blank or matched by {@link #invalidUrlPattern}
     */
    protected boolean isValidPath(String path) {
        if (StringUtil.isBlank(path)) {
            return false;
        }
        return !this.invalidUrlPattern.matcher(path).find();
    }

    /**
     * Adds a parser feature.
     *
     * @param key feature name
     * @param value "true" (case-insensitive) to enable, anything else to disable
     * @throws RobotSystemException if key or value is blank
     */
    public void addFeature(String key, String value) {
        if (StringUtil.isBlank(key) || StringUtil.isBlank(value)) {
            throw new RobotSystemException("key or value is null.");
        }
        this.featureMap.put(key, value);
    }

    /**
     * Adds a parser property.
     *
     * @param key property name
     * @param value property value
     * @throws RobotSystemException if key or value is blank
     */
    public void addProperty(String key, String value) {
        if (StringUtil.isBlank(key) || StringUtil.isBlank(value)) {
            throw new RobotSystemException("key or value is null.");
        }
        this.propertyMap.put(key, value);
    }

    /**
     * Decodes the stored bytes of an access result into a string.
     *
     * @param accessResultData data previously produced by this transformer
     * @return the decoded content, or {@code null} if no data is stored;
     *         UTF-8 is used when the encoding is missing or unsupported
     * @throws RobotSystemException if the data belongs to another transformer
     */
    @Override
    public Object getData(AccessResultData accessResultData) {
        if (!getName().equals(accessResultData.getTransformerName())) {
            throw new RobotSystemException("Transformer is invalid. Use " + accessResultData.getTransformerName() + ". This transformer is " + getName() + ".");
        }

        byte[] data = accessResultData.getData();
        if (data == null) {
            return null;
        }

        String encoding = accessResultData.getEncoding();
        try {
            return new String(data, encoding == null ? Constants.UTF_8 : encoding);
        } catch (UnsupportedEncodingException e) {
            if (logger.isInfoEnabled()) {
                logger.info("Invalid charsetName: " + encoding + ". Changed to " + Constants.UTF_8, e);
            }
            return new String(data, Constants.UTF_8_CHARSET);
        }
    }

    /** @return the live map of parser features */
    public Map<String, String> getFeatureMap() {
        return this.featureMap;
    }

    /** @param featureMap the map of parser features to use */
    public void setFeatureMap(Map<String, String> featureMap) {
        this.featureMap = featureMap;
    }

    /** @return the live map of parser properties */
    public Map<String, String> getPropertyMap() {
        return this.propertyMap;
    }

    /** @param propertyMap the map of parser properties to use */
    public void setPropertyMap(Map<String, String> propertyMap) {
        this.propertyMap = propertyMap;
    }

    /** @return the live map of child URL rules (XPath to attribute name) */
    public Map<String, String> getChildUrlRuleMap() {
        return this.childUrlRuleMap;
    }

    /** @param childUrlRuleMap the child URL rules to use */
    public void setChildUrlRuleMap(Map<String, String> childUrlRuleMap) {
        this.childUrlRuleMap = childUrlRuleMap;
    }

    /** @return the default encoding, or {@code null} if none is configured */
    public String getDefaultEncoding() {
        return this.defaultEncoding;
    }

    /** @param defaultEncoding the encoding used when the content declares none */
    public void setDefaultEncoding(String defaultEncoding) {
        this.defaultEncoding = defaultEncoding;
    }

    /** @return the number of bytes preloaded for charset detection */
    public int getPreloadSizeForCharset() {
        return this.preloadSizeForCharset;
    }

    /** @param preloadSizeForCharset the number of bytes to preload for charset detection */
    public void setPreloadSizeForCharset(int preloadSizeForCharset) {
        this.preloadSizeForCharset = preloadSizeForCharset;
    }

    /** @return the pattern identifying attribute values that are not crawlable URLs */
    public Pattern getInvalidUrlPattern() {
        return this.invalidUrlPattern;
    }

    /** @param invalidUrlPattern the pattern identifying non-crawlable attribute values */
    public void setInvalidUrlPattern(Pattern invalidUrlPattern) {
        this.invalidUrlPattern = invalidUrlPattern;
    }
}
