package de.l3s.icrawl.crawler.urls;

import com.google.common.collect.ImmutableMultimap;
import io.mola.galimatias.GalimatiasParseException;
import io.mola.galimatias.NameValue;
import io.mola.galimatias.URL;
import io.mola.galimatias.URLSearchParameters;
import io.mola.galimatias.canonicalize.CombinedCanonicalizer;
import io.mola.galimatias.canonicalize.RFC3986Canonicalizer;
import io.mola.galimatias.canonicalize.StripPartCanonicalizer;
import io.mola.galimatias.canonicalize.URLCanonicalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/l3s/icrawl/crawler/urls/UrlCanonicalizerNormalizer.class */
/**
 * {@link UrlNormalizer} that brings URLs into a canonical textual form.
 *
 * <p>The canonicalization pipeline runs three steps in order: query parameters
 * whose name matches {@link #EXCLUDE_PATTERN} are removed (and the remaining
 * ones sorted by name), the fragment is stripped, and finally the galimatias
 * {@code RFC3986Canonicalizer} is applied.
 */
public class UrlCanonicalizerNormalizer implements UrlNormalizer {
    /**
     * Names of query parameters to drop: anything starting with {@code utm_} or
     * containing {@code sessid}/{@code sessionid}, compared case-insensitively.
     */
    private static final Pattern EXCLUDE_PATTERN =
            Pattern.compile("^utm_|sess(ion)?id", Pattern.CASE_INSENSITIVE);
    private static final Logger logger = LoggerFactory.getLogger(UrlCanonicalizerNormalizer.class);

    /** Pipeline applied by {@link #normalize(String)}; the steps run in array order. */
    private final URLCanonicalizer canonicalizer = new CombinedCanonicalizer(new URLCanonicalizer[]{
            new StripParametersCanonicalizer(EXCLUDE_PATTERN),
            new StripPartCanonicalizer(StripPartCanonicalizer.Part.FRAGMENT),
            new RFC3986Canonicalizer()});

    /**
     * Canonicalizer that removes unwanted query parameters and emits the
     * remaining ones sorted by parameter name.
     */
    public static class StripParametersCanonicalizer implements URLCanonicalizer {
        private final Pattern excludePattern;

        /**
         * @param pattern parameters whose name contains a match for this pattern
         *                (checked with {@code Matcher.find()}) are removed
         */
        public StripParametersCanonicalizer(Pattern pattern) {
            this.excludePattern = pattern;
        }

        /**
         * Returns a copy of {@code url} whose query has been filtered and sorted.
         *
         * @param url the URL to canonicalize
         * @return the URL with the rewritten query; the query is set to
         *         {@code null} when no parameter survives the filter
         * @throws GalimatiasParseException if the rewritten query is rejected by
         *         {@code URL.withQuery}
         */
        @Override
        public URL canonicalize(URL url) throws GalimatiasParseException {
            return url.withQuery(canonicalizeQuery(new URLSearchParameters(url.query())));
        }

        /**
         * Builds the canonical query string for the given parameters.
         *
         * @return the retained parameters joined with {@code '&'}, ordered by
         *         name (values of the same name keep their original relative
         *         order), or {@code null} if nothing is retained
         */
        private String canonicalizeQuery(URLSearchParameters parameters) {
            ImmutableMultimap.Builder<String, String> retainedBuilder = ImmutableMultimap.builder();
            Iterator<NameValue> it = parameters.iterator();
            while (it.hasNext()) {
                NameValue parameter = it.next();
                if (!this.excludePattern.matcher(parameter.name()).find()) {
                    retainedBuilder.put(parameter.name(), parameter.value());
                }
            }
            ImmutableMultimap<String, String> retained = retainedBuilder.build();
            if (retained.isEmpty()) {
                // null (rather than "") so that the resulting URL carries no query part at all
                return null;
            }

            ArrayList<String> names = new ArrayList<>(retained.keySet());
            Collections.sort(names);

            // NOTE(review): names and values are appended exactly as NameValue returns them,
            // without re-encoding. If URLSearchParameters percent-decodes its input, a value
            // containing '&' or '=' would change the meaning of the query - confirm.
            StringBuilder query = new StringBuilder(100);
            for (String name : names) {
                for (String value : retained.get(name)) {
                    if (query.length() > 0) {
                        query.append('&');
                    }
                    query.append(name);
                    // "name=" and "name" are both written as plain "name"
                    if (!value.isEmpty()) {
                        query.append('=');
                        query.append(value);
                    }
                }
            }
            return query.toString();
        }
    }

    /**
     * Normalizes the given URL string.
     *
     * @param str the URL to normalize
     * @return the canonicalized URL; {@code null} if {@code str} cannot be parsed
     *         as a URL (the URL is dropped); or {@code str} unchanged if it
     *         parses but canonicalization fails
     */
    @Override // de.l3s.icrawl.crawler.urls.UrlNormalizer
    public String normalize(String str) {
        // Parsing and canonicalizing are handled separately: both signal failure with
        // GalimatiasParseException, but only a parse failure means the URL is unusable.
        final URL parsed;
        try {
            parsed = URL.parse(str);
        } catch (GalimatiasParseException e) {
            logger.trace("Invalid URL '{}', dropping", str, e);
            return null;
        }
        try {
            return this.canonicalizer.canonicalize(parsed).toString();
        } catch (GalimatiasParseException e) {
            logger.debug("Could not canonicalize URL '{}', returning unchanged ", str, e);
            return str;
        }
    }
}
