package tri.promptfx.docs;

import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.CopyOption;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import javafx.fxml.FXMLLoader;
import kotlin.Metadata;
import kotlin.Unit;
import kotlin.collections.CollectionsKt;
import kotlin.io.FilesKt;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.text.Regex;
import kotlin.text.StringsKt;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;

/* compiled from: TextCrawler.kt */
@Metadata(mv = {1, 8, 0}, k = 1, xi = 48, d1 = {"��<\n\u0002\u0018\u0002\n\u0002\u0010��\n\u0002\b\u0002\n\u0002\u0010\b\n��\n\u0002\u0010\u000b\n��\n\u0002\u0010\u0002\n��\n\u0002\u0010\u000e\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010#\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0003\bÆ\u0002\u0018��2\u00020\u0001B\u0007\b\u0002¢\u0006\u0002\u0010\u0002J0\u0010\u0007\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\n2\b\b\u0002\u0010\u000b\u001a\u00020\u00042\u0006\u0010\f\u001a\u00020\r2\u000e\b\u0002\u0010\u000e\u001a\b\u0012\u0004\u0012\u00020\n0\u000fJ \u0010\u0010\u001a\u00020\b2\u0006\u0010\u0011\u001a\u00020\u00122\u0006\u0010\u0013\u001a\u00020\n2\u0006\u0010\f\u001a\u00020\rH\u0002J.\u0010\u0014\u001a\u00020\b2\u0006\u0010\u0011\u001a\u00020\u00122\u0006\u0010\u000b\u001a\u00020\u00042\u0006\u0010\f\u001a\u00020\r2\f\u0010\u000e\u001a\b\u0012\u0004\u0012\u00020\n0\u000fH\u0002R\u000e\u0010\u0003\u001a\u00020\u0004X\u0082T¢\u0006\u0002\n��R\u000e\u0010\u0005\u001a\u00020\u0006X\u0082T¢\u0006\u0002\n��¨\u0006\u0015"}, d2 = {"Ltri/promptfx/docs/TextCrawler;", "", "()V", "CRAWL_LIMIT_LINKS", "", "REQUIRE_ARTICLE", "", "crawlWebsite", "", "url", "", "depth", "targetFolder", "Ljava/io/File;", "scraped", "", "saveTextToFile", "docNode", "Lorg/jsoup/nodes/Element;", "title", "scrapeLinks", "promptfx"})
@SourceDebugExtension({"SMAP\nTextCrawler.kt\nKotlin\n*S Kotlin\n*F\n+ 1 TextCrawler.kt\ntri/promptfx/docs/TextCrawler\n+ 2 fake.kt\nkotlin/jvm/internal/FakeKt\n+ 3 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n*L\n1#1,83:1\n1#2:84\n1549#3:85\n1620#3,3:86\n1855#3,2:89\n*S KotlinDebug\n*F\n+ 1 TextCrawler.kt\ntri/promptfx/docs/TextCrawler\n*L\n76#1:85\n76#1:86,3\n79#1:89,2\n*E\n"})
/* loaded from: input_file:tri/promptfx/docs/TextCrawler.class */
public final class TextCrawler {

    @NotNull
    public static final TextCrawler INSTANCE = new TextCrawler();
    private static final boolean REQUIRE_ARTICLE = true;
    private static final int CRAWL_LIMIT_LINKS = 100;

    private TextCrawler() {
    }

    public final void crawlWebsite(@NotNull String url, int i, @NotNull File targetFolder, @NotNull Set<String> scraped) {
        String str;
        Intrinsics.checkNotNullParameter(url, "url");
        Intrinsics.checkNotNullParameter(targetFolder, "targetFolder");
        Intrinsics.checkNotNullParameter(scraped, "scraped");
        if (StringsKt.isBlank(url) || scraped.contains(url)) {
            return;
        }
        if (StringsKt.endsWith$default(url, ".pdf", false, 2, (Object) null)) {
            System.out.println((Object) ("Downloading PDF from " + url + "..."));
            Files.copy(new URL(url).openStream(), new File(targetFolder, StringsKt.substringAfterLast$default(url, PackagingURIHelper.FORWARD_SLASH_STRING, (String) null, 2, (Object) null)).toPath(), new CopyOption[0]);
            return;
        }
        System.out.println((Object) ("Scraping text and links from " + url + "..."));
        Document document = Jsoup.connect(url).get();
        Elements select = document.select("article");
        Intrinsics.checkNotNullExpressionValue(select, "doc.select(\"article\")");
        Element element = (Element) CollectionsKt.firstOrNull((List) select);
        if (element == null) {
            return;
        }
        TextCrawler textCrawler = this;
        Element element2 = element;
        String title = document.title();
        if (StringsKt.isBlank(title)) {
            textCrawler = textCrawler;
            element2 = element2;
            str = StringsKt.substringAfterLast$default(StringsKt.removeSuffix(url, (CharSequence) PackagingURIHelper.FORWARD_SLASH_STRING), PackagingURIHelper.FORWARD_SLASH_STRING, (String) null, 2, (Object) null);
        } else {
            str = title;
        }
        Intrinsics.checkNotNullExpressionValue(str, "doc.title().ifBlank { ur…substringAfterLast(\"/\") }");
        textCrawler.saveTextToFile(element2, str, targetFolder);
        scraped.add(url);
        if (i > 0) {
            scrapeLinks(element, i, targetFolder, scraped);
        }
    }

    public static /* synthetic */ void crawlWebsite$default(TextCrawler textCrawler, String str, int i, File file, Set set, int i2, Object obj) {
        if ((i2 & 2) != 0) {
            i = 0;
        }
        if ((i2 & 8) != 0) {
            set = new LinkedHashSet();
        }
        textCrawler.crawlWebsite(str, i, file, set);
    }

    private final void saveTextToFile(Element element, String str, File file) {
        element.select(CompressorStreamFactory.BROTLI).before("\\n");
        element.select("p").before("\\n");
        String html = element.html();
        Intrinsics.checkNotNullExpressionValue(html, "docNode.apply {\n        …e(\"\\\\n\")\n        }.html()");
        String replace$default = StringsKt.replace$default(html, "\\n", "\n", false, 4, (Object) null);
        Safelist none = Safelist.none();
        Document.OutputSettings outputSettings = new Document.OutputSettings();
        outputSettings.prettyPrint(false);
        Unit unit = Unit.INSTANCE;
        String clean = Jsoup.clean(replace$default, "", none, outputSettings);
        Intrinsics.checkNotNullExpressionValue(clean, "clean(nodeHtml, \"\", Safe…yPrint(false) }\n        )");
        String replace = new Regex("\n{3,}").replace(clean, "\n\n");
        if (replace.length() > 0) {
            FilesKt.writeText$default(new File(file, new Regex("_{2,}").replace(new Regex("[^a-zA-Z0-9.-]").replace(str, "_"), "_") + ".txt"), replace, null, 2, null);
        }
    }

    private final void scrapeLinks(Element element, int i, File file, Set<String> set) {
        Elements select = element.select("a[href]");
        Intrinsics.checkNotNullExpressionValue(select, "docNode.select(\"a[href]\")");
        Elements elements = select;
        ArrayList arrayList = new ArrayList(CollectionsKt.collectionSizeOrDefault(elements, 10));
        Iterator<Element> it = elements.iterator();
        while (it.hasNext()) {
            String absUrl = it.next().absUrl("href");
            Intrinsics.checkNotNullExpressionValue(absUrl, "it.absUrl(\"href\")");
            arrayList.add(StringsKt.substringBeforeLast$default(absUrl, FXMLLoader.CONTROLLER_METHOD_PREFIX, (String) null, 2, (Object) null));
        }
        Iterator it2 = CollectionsKt.take(CollectionsKt.toSet(arrayList), 100).iterator();
        while (it2.hasNext()) {
            INSTANCE.crawlWebsite((String) it2.next(), i - 1, file, set);
        }
    }
}
