package de.l3s.icrawl.contentanalysis;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;
import de.l3s.icrawl.util.DateUtils;
import de.l3s.icrawl.util.WebPageUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.chrono.ChronoZonedDateTime;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.hadoop.mapreduce.Mapper;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/l3s/icrawl/contentanalysis/WebPageDateExtractor.class */
public class WebPageDateExtractor {
    /** Keeps only candidates whose date is accepted by {@code DateUtils.isValidDate}. */
    private static final Predicate<WebPageDate> VALID_DATE_PREDICATE =
            candidate -> DateUtils.isValidDate(candidate.getDate());

    // Declared before NAMES_TO_MONTH on purpose: namesMap() logs through it during static initialisation.
    private static final Logger logger = LoggerFactory.getLogger(WebPageDateExtractor.class);

    /**
     * English and German words that hint at a creation/modification date in the surrounding text.
     * Matched case-insensitively with Unicode-aware character classes.
     */
    @VisibleForTesting
    static final Pattern DATE_TRIGGERS = Pattern.compile(
            "created?|updated?|modified|last modifi|letzte? (ge|ver)?änder|publi(z|sh)",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);

    /** Element names handed to {@code TreeWalker} when walking the DOM for text nodes. */
    private static final Set<String> SKIPPED_ELEMENTS = ImmutableSet.of("script", "style", "pre");

    /** Month name to month number, loaded from a classpath resource by {@link #namesMap()}. */
    private static final Map<String, Integer> NAMES_TO_MONTH = namesMap();

    /** Date patterns tried in order by {@link #findDateMatch(String)}. */
    static final List<Pattern> DATE_PATTERNS = buildDatePattern();

    /**
     * Where an extracted page date came from.
     *
     * <p>The declaration order is significant: {@code getBestDateMatch} compares candidates by
     * this enum's natural order and prefers the constant declared earlier. The constants are
     * also used as counter keys in {@code incrementCount}.
     */
    public enum DateSource {
        /** Date parsed from the page URL via {@code DateUtils.extractDateFromUrl}. */
        URL,
        /** Date parsed from the {@code datetime} attribute of a {@code <time>} element. */
        TIME,
        /** Date parsed from the {@code content} attribute of a date-related {@code <meta>} element. */
        META,
        /** Date found in the paragraph-level parent of a text node matching {@code DATE_TRIGGERS}. */
        TRIGGER_WORD,
        /** Date matched in an arbitrary text node, without a trigger word. */
        TEXT_DATE,
        /**
         * Date built from the epoch-millisecond value passed to {@code getModifiedDate}.
         * NOTE(review): presumably taken from an HTTP response header - confirm with callers.
         */
        HEADER,
        /** Not referenced inside this class; presumably used by callers when no date was found. */
        NOT_FOUND
    }

    /** Unchecked exception type for failures during date extraction. */
    public static class ExtractionException extends RuntimeException {
        private static final long serialVersionUID = 1L;

        /**
         * @param str description of the failure
         */
        public ExtractionException(String str) {
            super(str);
        }

        /**
         * @param th underlying cause; its string form becomes the message
         */
        public ExtractionException(Throwable th) {
            super(th);
        }

        /**
         * @param str description of the failure
         * @param th underlying cause
         */
        public ExtractionException(String str, Throwable th) {
            super(str, th);
        }
    }

    /** Immutable pair of an extracted date and the {@link DateSource} it was obtained from. */
    public static final class WebPageDate {
        private final DateSource dateSource;
        private final ZonedDateTime date;

        /**
         * @param zonedDateTime the extracted date
         * @param dateSource where the date was found
         */
        public WebPageDate(ZonedDateTime zonedDateTime, DateSource dateSource) {
            this.dateSource = dateSource;
            this.date = zonedDateTime;
        }

        /** @return the extracted date */
        public ZonedDateTime getDate() {
            return date;
        }

        /** @return where the date was found */
        public DateSource getDateSource() {
            return dateSource;
        }

        /** @return the date followed by the source in square brackets */
        @Override
        public String toString() {
            return date + " [" + dateSource + "]";
        }
    }

    /**
     * Builds the ordered list of date patterns used by {@code findDateMatch}.
     *
     * <p>Every pattern exposes the named groups {@code year}, {@code month} and {@code day};
     * only the long textual pattern additionally exposes {@code hour} and {@code minute}.
     * Order matters because {@code findDateMatch} returns the first pattern that yields a
     * valid date.
     *
     * @return immutable list of compiled patterns, in matching priority order
     */
    private static List<Pattern> buildDatePattern() {
        final int flags = Pattern.UNICODE_CHARACTER_CLASS;
        ImmutableList.Builder<Pattern> builder = ImmutableList.builder();
        // ISO style: 2014-03-25
        builder.add(Pattern.compile("(?<year>\\d{4})-(?<month>\\d{2})-(?<day>\\d{2})", flags));
        // Dotted numeric: 25.03.2014 / 25. 3. 14
        builder.add(Pattern.compile("(?<day>\\d{1,2})\\.\\s*(?<month>\\d{1,2})\\.\\s*(?<year>\\d{2,4})", flags));
        // Day, month word, year and optional time: 25. März 2014, 13:45 Uhr
        builder.add(Pattern.compile("(?<day>\\d{1,2})\\.?\\s*(?<month>\\w+)\\.?\\s+(?<year>\\d{2,4})(,\\s*(?<hour>\\d{1,2}):(?<minute>\\d{1,2})( Uhr)?)?", flags));
        // Same without the time part; reached when the previous match had an unusable time.
        builder.add(Pattern.compile("(?<day>\\d{1,2})\\.?\\s*(?<month>\\w+)\\.?\\s+(?<year>\\d{2,4})", flags));
        // Month word first: March 25, 2014
        builder.add(Pattern.compile("(?<month>\\w+)\\s+(?<day>\\d{1,2})[\\.,]\\s*(?<year>\\d{4})", flags));
        // Slashes, month first: 3/25/2014
        builder.add(Pattern.compile("(?<month>\\d{1,2})/(?<day>\\d{1,2})/(?<year>\\d{4})", flags));
        // Slashes, day first: only relevant when the month-first reading is not a valid date.
        builder.add(Pattern.compile("(?<day>\\d{1,2})/(?<month>\\d{1,2})/(?<year>\\d{4})", flags));
        // Year-day-month followed by a slash. The year group used to be written with a doubled
        // escape ("\\\\d{4}"), which matches a literal backslash followed by "dddd" and therefore
        // never matched a real date.
        // NOTE(review): the trailing '/' is kept from the original pattern - confirm it is intended.
        builder.add(Pattern.compile("(?<year>\\d{4})-(?<day>\\d{1,2})-(?<month>\\d{1,2})/", flags));
        return builder.build();
    }

    /**
     * Loads the month-name to month-number mapping from the classpath resource
     * {@code de/l3s/icrawl/month_mappings.tsv}, read as UTF-8.
     *
     * <p>Each line is expected to hold a name and an integer separated by a tab. Lines without
     * a tab (for example blank lines) are skipped with a warning instead of aborting class
     * initialisation with an {@code ArrayIndexOutOfBoundsException}.
     *
     * @return immutable map from name to month number; an empty map if the resource cannot be read
     */
    private static Map<String, Integer> namesMap() {
        try {
            return Resources.readLines(
                    Resources.getResource("de/l3s/icrawl/month_mappings.tsv"),
                    StandardCharsets.UTF_8,
                    new LineProcessor<Map<String, Integer>>() {
                        private final ImmutableMap.Builder<String, Integer> namesBuilder = ImmutableMap.builder();

                        @Override
                        public boolean processLine(String line) throws IOException {
                            String[] columns = line.split("\t", 2);
                            if (columns.length < 2) {
                                logger.warn("Skipping malformed month mapping line: '{}'", line);
                                return true;
                            }
                            this.namesBuilder.put(columns[0], Integer.valueOf(Integer.parseInt(columns[1])));
                            return true;
                        }

                        // Must be named getResult to implement LineProcessor; the decompiler had
                        // renamed it, leaving the interface method unimplemented.
                        @Override
                        public Map<String, Integer> getResult() {
                            return this.namesBuilder.build();
                        }
                    });
        } catch (IOException e) {
            logger.warn("Cannot initialize date extractor: ", e);
            return Collections.emptyMap();
        }
    }

    /**
     * Extracts the most plausible date from the content of a parsed page.
     *
     * <p>Structured and trigger-word candidates are tried first; only when none of them carries
     * a valid date are all text nodes scanned for date-like strings.
     *
     * @param document the parsed page
     * @return the best-ranked valid date, or {@code null} if none was found
     * @throws InterruptedException if the current thread is interrupted while scanning the DOM
     */
    public static WebPageDate extractModifiedDate(Document document) throws InterruptedException {
        Map<Element, WebPageDate> candidates = findCandidateElements(document);
        logger.trace("Found {} candidates: {}", candidates.size(), candidates);

        Map<Element, WebPageDate> validCandidates = Maps.filterValues(candidates, VALID_DATE_PREDICATE);
        if (!validCandidates.isEmpty()) {
            return getBestDateMatch(validCandidates);
        }
        // Fallback: any date-like text anywhere in the document.
        Map<Element, WebPageDate> validTextDates =
                Maps.filterValues(findElementsWithDate(document), VALID_DATE_PREDICATE);
        return getBestDateMatch(validTextDates);
    }

    /**
     * Picks the highest-ranked candidate according to {@link #compareByPreference}.
     *
     * @param map candidate dates keyed by the element they were found in
     * @return the preferred candidate, or {@code null} when {@code map} is empty
     */
    private static WebPageDate getBestDateMatch(Map<Element, WebPageDate> map) {
        Optional<WebPageDate> best = map.values().stream()
                .collect(Collectors.maxBy(WebPageDateExtractor::compareByPreference));
        return best.orElse(null);
    }

    /**
     * Ranks two candidates; a positive result means {@code first} is preferred.
     *
     * <p>Ranking rules, in order: a source declared earlier in {@link DateSource} wins; a date
     * with a time of day wins over one at exactly midnight; otherwise the later date wins.
     */
    private static int compareByPreference(WebPageDate first, WebPageDate second) {
        int bySource = first.getDateSource().compareTo(second.getDateSource());
        if (bySource != 0) {
            // Inverted: the smaller enum constant is the better source.
            return -bySource;
        }
        boolean firstAtMidnight = first.getDate().toLocalTime().equals(LocalTime.MIDNIGHT);
        boolean secondAtMidnight = second.getDate().toLocalTime().equals(LocalTime.MIDNIGHT);
        if (firstAtMidnight != secondAtMidnight) {
            return firstAtMidnight ? -1 : 1;
        }
        return first.getDate().compareTo(second.getDate());
    }

    /**
     * Scans every text node below the DOM root for a date-like string.
     *
     * <p>Each hit is recorded under the element returned by
     * {@code WebPageUtils.findParagraphParent} with source {@link DateSource#TEXT_DATE};
     * a later hit for the same element replaces the earlier one.
     *
     * @param document the parsed page
     * @return matches in document traversal order
     */
    private static Map<Element, WebPageDate> findElementsWithDate(Document document) {
        Map<Element, WebPageDate> datedElements = new LinkedHashMap<>();
        Iterator<Node> nodes = new TreeWalker(findDomRoot(document), SKIPPED_ELEMENTS).iterator();
        while (nodes.hasNext()) {
            // Keep the Node type here; the decompiled code assigned it straight to a TextNode
            // variable, which does not compile.
            Node node = nodes.next();
            if (!(node instanceof TextNode)) {
                continue;
            }
            ZonedDateTime date = findDateMatch(((TextNode) node).text());
            if (date != null) {
                datedElements.put(WebPageUtils.findParagraphParent(node, -1), new WebPageDate(date, DateSource.TEXT_DATE));
            }
        }
        return datedElements;
    }

    /**
     * Collects date candidates from the structured and semi-structured parts of a page.
     *
     * <p>Three passes, in this order: {@code <time>} elements, {@code <meta>} elements, and text
     * nodes containing one of the {@code DATE_TRIGGERS} words. Unparsable candidates are dropped.
     *
     * @param document the parsed page
     * @return candidates in discovery order, keyed by the element they belong to
     * @throws InterruptedException if the current thread is interrupted during the text-node pass
     */
    private static Map<Element, WebPageDate> findCandidateElements(Document document) throws InterruptedException {
        Map<Element, WebPageDate> candidates = new LinkedHashMap<>();

        for (Element timeElement : document.getElementsByTag("time")) {
            ZonedDateTime timeDate = getTimeElementDate(timeElement);
            if (timeDate != null) {
                candidates.put(timeElement, new WebPageDate(timeDate, DateSource.TIME));
            }
        }

        for (Element metaElement : document.getElementsByTag("meta")) {
            ZonedDateTime metaDate = getMetaElementDate(metaElement);
            if (metaDate != null) {
                candidates.put(metaElement, new WebPageDate(metaDate, DateSource.META));
            }
        }

        Iterator<Node> nodes = new TreeWalker(findDomRoot(document), SKIPPED_ELEMENTS).iterator();
        while (nodes.hasNext()) {
            Node node = nodes.next();
            // Checked once per node so a long DOM walk can be cancelled; this also clears the
            // thread's interrupt flag, as Thread.interrupted() does.
            if (Thread.interrupted()) {
                throw new InterruptedException();
            }
            if (node instanceof TextNode) {
                TextNode textNode = (TextNode) node;
                if (DATE_TRIGGERS.matcher(textNode.text()).find()) {
                    extractDateFromTextNode(textNode, candidates);
                }
            }
        }
        return candidates;
    }

    /**
     * Looks for a date in the full text of the paragraph-level parent of a trigger-word text
     * node and, if one is found, records it with source {@link DateSource#TRIGGER_WORD}.
     *
     * @param textNode text node that matched a trigger word
     * @param map receives the candidate, keyed by the parent element
     */
    private static void extractDateFromTextNode(TextNode textNode, Map<Element, WebPageDate> map) {
        Element paragraph = WebPageUtils.findParagraphParent((Node) textNode, -1);
        ZonedDateTime paragraphDate = findDateMatch(paragraph.text());
        if (paragraphDate == null) {
            return;
        }
        map.put(paragraph, new WebPageDate(paragraphDate, DateSource.TRIGGER_WORD));
    }

    /**
     * Chooses the node from which text traversal starts.
     *
     * @param document the parsed page
     * @return the document's body element, or the owner document when there is no body
     */
    static Node findDomRoot(Document document) {
        // body() yields an Element; the decompiled code declared this local as Document,
        // which does not compile.
        Element body = document.body();
        if (body != null) {
            return body;
        }
        return document.ownerDocument();
    }

    /**
     * Tries to read a date from a {@code <meta>} element.
     *
     * <p>For each attribute name in {@code DateUtils.META_ATTRIBUTE_NAMES}, the attribute value
     * is checked with {@code DateUtils.dateMetaKey}; on a hit the element's {@code content}
     * attribute is parsed leniently.
     *
     * @param element a {@code <meta>} element
     * @return the first date that could be parsed, or {@code null} if there is none
     */
    private static ZonedDateTime getMetaElementDate(Element element) {
        for (String attributeName : DateUtils.META_ATTRIBUTE_NAMES) {
            String key = element.attr(attributeName);
            if (key == null || !DateUtils.dateMetaKey(key)) {
                continue;
            }
            ZonedDateTime parsed = DateUtils.liberalParseDate(element.attr("content"));
            if (parsed != null) {
                return parsed;
            }
        }
        return null;
    }

    /**
     * Reads the date from the {@code datetime} attribute of a {@code <time>} element.
     *
     * @param element a {@code <time>} element
     * @return the result of {@code DateUtils.liberalParseDate} on the attribute value, or
     *         {@code null} when the element has no {@code datetime} attribute
     */
    private static ZonedDateTime getTimeElementDate(Element element) {
        if (!element.hasAttr("datetime")) {
            // Log message typo fixed ("attribte" -> "attribute").
            logger.trace("Expected attribute 'datetime' on element '{}'", element);
            return null;
        }
        return DateUtils.liberalParseDate(element.attr("datetime"));
    }

    /**
     * Determines the date of a page, trying the sources in a fixed order: the URL, then the
     * page content, then the supplied timestamp. The first source that yields a date accepted
     * by {@code DateUtils.isValidDate} wins, and its {@link DateSource} counter is incremented.
     *
     * @param str the page URL
     * @param document the parsed page
     * @param l fallback timestamp in epoch milliseconds, may be {@code null}
     *        (NOTE(review): presumably taken from an HTTP response header - confirm with callers)
     * @param context counter sink, may be {@code null} to disable counting
     * @return the date with its source, or {@code null} if no valid date was found
     * @throws InterruptedException if the current thread is interrupted during content extraction
     */
    public static WebPageDate getModifiedDate(String str, Document document, Long l, Mapper<?, ?, ?, ?>.Context context) throws InterruptedException {
        LocalDate urlDate = DateUtils.extractDateFromUrl(str);
        if (urlDate != null) {
            ZonedDateTime urlDateTime = urlDate.atStartOfDay().atZone(ZoneOffset.UTC);
            if (DateUtils.isValidDate(urlDateTime)) {
                incrementCount(context, DateSource.URL);
                return new WebPageDate(urlDateTime, DateSource.URL);
            }
        }

        WebPageDate contentDate = extractModifiedDate(document);
        if (contentDate != null && DateUtils.isValidDate(contentDate.getDate())) {
            incrementCount(context, contentDate.getDateSource());
            return contentDate;
        }

        if (l != null) {
            ZonedDateTime headerDateTime = Instant.ofEpochMilli(l.longValue()).atZone(ZoneOffset.UTC);
            if (DateUtils.isValidDate(headerDateTime)) {
                incrementCount(context, DateSource.HEADER);
                return new WebPageDate(headerDateTime, DateSource.HEADER);
            }
        }

        logger.debug("No date found for URL {}", str);
        return null;
    }

    /**
     * Adds one to the counter identified by the given enum constant.
     *
     * @param context counter sink; when {@code null} the call does nothing
     * @param r5 enum constant identifying the counter
     */
    private static void incrementCount(Mapper<?, ?, ?, ?>.Context context, Enum<?> r5) {
        if (context == null) {
            return;
        }
        context.getCounter(r5).increment(1L);
    }

    /** A month written as one or two ASCII digits; compiled once instead of on every match. */
    private static final Pattern NUMERIC_MONTH = Pattern.compile("\\d{1,2}");

    /**
     * Finds the first date-like string in a text.
     *
     * <p>The patterns in {@code DATE_PATTERNS} are tried in order. For each pattern only its
     * first match in the text is examined; if that match is not a usable date, the next
     * pattern is tried.
     *
     * @param str text to search
     * @return the date in UTC (midnight unless the match carried a time), or {@code null}
     *         if no pattern produced a valid date
     */
    @VisibleForTesting
    static ZonedDateTime findDateMatch(String str) {
        for (Pattern pattern : DATE_PATTERNS) {
            Matcher matcher = pattern.matcher(str);
            if (matcher.find()) {
                ZonedDateTime date = toDate(matcher);
                if (date != null) {
                    return date;
                }
            }
        }
        return null;
    }

    /** Converts the named groups of a successful match into a date; {@code null} if unusable. */
    private static ZonedDateTime toDate(Matcher matcher) {
        int year;
        int month;
        int day;
        int hour = 0;
        int minute = 0;
        try {
            year = expandShortYear(Integer.parseInt(matcher.group("year")));
            Integer resolvedMonth = resolveMonth(matcher.group("month"));
            if (resolvedMonth == null) {
                return null;
            }
            month = resolvedMonth.intValue();
            day = Integer.parseInt(matcher.group("day"));
            // Only the pattern with more than the three date groups defines "hour"/"minute";
            // asking any other pattern for them would throw IllegalArgumentException.
            if (matcher.groupCount() > 3 && matcher.group("hour") != null && matcher.group("minute") != null) {
                hour = Integer.parseInt(matcher.group("hour"));
                minute = Integer.parseInt(matcher.group("minute"));
            }
        } catch (NumberFormatException e) {
            // With UNICODE_CHARACTER_CLASS, \d also matches digits outside the Basic
            // Multilingual Plane, which Integer.parseInt rejects. Treat them as "not a date"
            // instead of letting the exception abort the extraction.
            logger.trace("Could not parse date components of '{}': ", matcher.group(), e);
            return null;
        }
        try {
            return ZonedDateTime.of(year, month, day, hour, minute, 0, 0, ZoneOffset.UTC);
        } catch (DateTimeException e) {
            logger.trace("Could not use as a date: {}-{}-{}: ", year, month, day, e);
            return null;
        }
    }

    /**
     * Expands a short year: values below 15 map to 20xx, values from 15 to 99 map to 19xx.
     * NOTE(review): the pivot of 15 is hard-coded - confirm it is still appropriate.
     */
    private static int expandShortYear(int year) {
        if (year < 15) {
            return year + 2000;
        }
        if (year < 100) {
            return year + 1900;
        }
        return year;
    }

    /** Resolves a month name via {@code NAMES_TO_MONTH}, or a numeric month; {@code null} if neither. */
    private static Integer resolveMonth(String month) {
        Integer byName = NAMES_TO_MONTH.get(month.toLowerCase(Locale.ENGLISH));
        if (byName != null) {
            return byName;
        }
        if (NUMERIC_MONTH.matcher(month).matches()) {
            return Integer.valueOf(Integer.parseInt(month));
        }
        return null;
    }
}
