diff --git a/README.adoc b/README.adoc index bf6cb662..0a67cf25 100644 --- a/README.adoc +++ b/README.adoc @@ -85,6 +85,34 @@ INDEXING_REPORTING_GITHUB_ISSUE_ID={github-issue-id-in-your-repository} INDEXING_REPORTING_GITHUB_WARNING_REPEAT_DELAY=10m ---- +=== Updating the built-in docs.quarkiverse.io sample + +By default, the app will use the built-in docs.quarkiverse.io sample containing a snapshot of docs.quarkiverse.io. + +Quarkiverse guides are indexed from the GitHub workflow artifact created as part of +the https://github.com/quarkiverse/quarkiverse-docs/actions/workflows/publish.yml[publish action]. +To update the included sample: go to the https://github.com/quarkiverse/quarkiverse-docs/actions/workflows/publish.yml?query=is%3Asuccess[page], +open the first run at the top (latest successful execution) and download the `github-pages` artifact at the bottom of the page. + +GitHub has a 100 MB limit on the files we can include in the repository. This means that the downloaded artifact has to be cleaned up +a bit. Unpack the archive and remove some of the extensions (in particular, try dropping the ones taking the most space, e.g. `quarkus-backstage`). +Make sure to keep the Amazon S3 guides, as those are used in tests. +Once you are done removing extensions, archive the remaining guides again and replace the existing file. 
+ +=== Downloading the latest docs.quarkiverse.io artifact + +If during the development there's a need to use a fresh docs.quarkiverse.io artifact downloaded by the app itself during the indexing process, +add the following properties to your `.env` file: + +[source,properties] +---- +# see about tokens https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +QUARKIVERSEIO_GITHUB_ARTIFACT_TOKEN={your-generated-github-token} +QUARKIVERSEIO_GITHUB_ARTIFACT_REPOSITORY=quarkiverse/quarkiverse-docs +QUARKIVERSEIO_GITHUB_ARTIFACT_ACTION_NAME=Publish website +QUARKIVERSEIO_GITHUB_ARTIFACT_ARTIFACT_NAME=github-pages +---- + [[testing]] == Testing diff --git a/pom.xml b/pom.xml index 2390cba6..aed522dd 100644 --- a/pom.xml +++ b/pom.xml @@ -112,6 +112,11 @@ lodash 4.17.13 + + org.apache.commons + commons-compress + 1.27.1 + @@ -204,6 +209,10 @@ io.quarkiverse.helm quarkus-helm + + org.apache.commons + commons-compress + diff --git a/src/main/java/io/quarkus/search/app/fetching/FetchingService.java b/src/main/java/io/quarkus/search/app/fetching/FetchingService.java index fca3409e..2bfe8112 100644 --- a/src/main/java/io/quarkus/search/app/fetching/FetchingService.java +++ b/src/main/java/io/quarkus/search/app/fetching/FetchingService.java @@ -1,9 +1,11 @@ package io.quarkus.search.app.fetching; +import static io.quarkus.search.app.util.FileUtils.untar; import static io.quarkus.search.app.util.FileUtils.unzip; import java.io.IOException; import java.net.URI; +import java.nio.file.Files; import java.nio.file.Path; import java.util.LinkedHashMap; import java.util.Map; @@ -32,6 +34,12 @@ import org.hibernate.search.util.common.impl.Closer; import org.hibernate.search.util.common.impl.SuppressingCloser; +import org.kohsuke.github.GHArtifact; +import org.kohsuke.github.GHRepository; +import org.kohsuke.github.GHWorkflowRun; +import org.kohsuke.github.GitHub; +import org.kohsuke.github.GitHubBuilder; + import 
io.vertx.core.impl.ConcurrentHashSet; @ApplicationScoped @@ -49,8 +57,63 @@ public class FetchingService { private final Map detailsCache = new ConcurrentHashMap<>(); private final Set tempDirectories = new ConcurrentHashSet<>(); - public QuarkiverseIO fetchQuarkiverseIo(FailureCollector failureCollector) { - return new QuarkiverseIO(quarkiverseIOConfig, failureCollector); + /** + * Prepares the Quarkiverse guides for indexing from a zipped GitHub Pages artifact. + * <p> + * When a GitHub artifact configuration is present, the artifact is downloaded from the first successful workflow run + * whose name matches the configured action name; otherwise the configured zip file is used. The zip is unpacked into + * a temporary directory and the {@code artifact.tar} found there is extracted into its {@code pages} subdirectory. + * + * @param failureCollector the collector passed on to the created {@link QuarkiverseIO} + * @return a {@link QuarkiverseIO} reading from the extracted pages; it receives the temporary directory and closes it + * @throws IOException if creating the temporary directory, downloading or unpacking fails + * @throws IllegalStateException if neither source is configured, or the configured GitHub artifact is not found + */ + public QuarkiverseIO fetchQuarkiverseIo(FailureCollector failureCollector) throws IOException { + CloseableDirectory tempDir = CloseableDirectory.temp("quarkiverse-io"); + try { + Path artifact = null; + if (quarkiverseIOConfig.githubArtifact().isPresent()) { + QuarkiverseIOConfig.GithubArtifact ghConfig = quarkiverseIOConfig.githubArtifact().get(); + GitHub github = new GitHubBuilder().withOAuthToken(ghConfig.token()).build(); + GHRepository repository = github.getRepository(ghConfig.repository()); + for (GHWorkflowRun run : repository.queryWorkflowRuns() + .conclusion(GHWorkflowRun.Conclusion.SUCCESS) + .list().withPageSize(10)) { + if (ghConfig.actionName().equals(run.getName())) { + for (GHArtifact ghArtifact : run.listArtifacts().withPageSize(5).toList()) { + if (ghConfig.artifactName().equals(ghArtifact.getName())) { + artifact = tempDir.path().resolve(ghArtifact.getName() + ".zip"); + final Path finalArtifact = artifact; + Log.infof("Downloading Quarkiverse %s artifact #%s", ghConfig.artifactName(), ghArtifact.getId()); + ghArtifact.download(is -> Files.copy(is, finalArtifact)); + break; + } + } + break; + } + } + if (artifact == null) { + throw new IllegalStateException("Artifact " + ghConfig.artifactName() + " not found."); + } + } else if (quarkiverseIOConfig.zip().isPresent()) { + artifact = quarkiverseIOConfig.zip().get().path(); + } else { + throw new IllegalStateException( + "Cannot fetch Quarkiverse guides as neither zip nor GitHub configuration was supplied"); + } + + unzip(artifact, tempDir.path()); + Path pages = tempDir.path().resolve("pages"); + 
untar(tempDir.path().resolve("artifact.tar"), pages); + + return new QuarkiverseIO(quarkiverseIOConfig.enabled(), pages, quarkiverseIOConfig.baseUri(), failureCollector, + tempDir); + } catch (RuntimeException | IOException e) { + // nobody else holds the temporary directory yet: close it here so a failed fetch does not leak it + new SuppressingCloser(e).push(CloseableDirectory::close, tempDir); + throw e; + } } public QuarkusIO fetchQuarkusIo(FailureCollector failureCollector) { diff --git a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java index bfb8a2af..cc8b658b 100644 --- a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java +++ b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java @@ -2,23 +2,18 @@ import java.io.Closeable; import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; -import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.UUID; import java.util.stream.Stream; -import jakarta.ws.rs.core.UriBuilder; - import io.quarkus.search.app.entity.Guide; import io.quarkus.search.app.hibernate.InputProvider; import io.quarkus.search.app.indexing.IndexableGuides; @@ -31,110 +26,86 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; public class QuarkiverseIO implements IndexableGuides, Closeable { public static final String QUARKIVERSE_ORIGIN = "quarkiverse-hub"; - private final URI quarkiverseDocsIndex; private final FailureCollector failureCollector; private final List<Guide> quarkiverseGuides = new ArrayList<>(); private final boolean enabled; - private final CloseableDirectory guideHtmls; + private final Path pages; + private final URI baseUri; + private 
final CloseableDirectory tempDir; - public QuarkiverseIO(QuarkiverseIOConfig config, FailureCollector failureCollector) { - this.quarkiverseDocsIndex = config.webUri(); - this.enabled = config.enabled(); + public QuarkiverseIO(boolean enabled, Path pages, URI baseUri, FailureCollector failureCollector, + CloseableDirectory tempDir) { this.failureCollector = failureCollector; - try { - guideHtmls = CloseableDirectory.temp("quarkiverse_htmls_"); - } catch (IOException e) { - throw new IllegalStateException("Failed to fetch quarkiverse guides: %s".formatted(e.getMessage()), e); - } + this.enabled = enabled; + this.pages = pages; + this.baseUri = baseUri; + this.tempDir = tempDir; } public void parseGuides() { - Document index = null; - try { - index = Jsoup.connect(quarkiverseDocsIndex.toString()).get(); + try (Stream<Path> stream = Files.list(pages)) { + List<Path> guideDirectories = stream.filter(Files::isDirectory) + .filter(dir -> dir.getFileName().toString().startsWith("quarkus")) + .toList(); + for (Path directory : guideDirectories) { + Files.walkFileTree( + directory, new SimpleFileVisitor<>() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (file.getFileName().toString().endsWith(".html")) { + quarkiverseGuides.add(readGuide(file)); + } + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + // some extensions have these extra directories that are not "visible" from the docs pages themselves, + // but are still deployed (and hence accessible). 
We want to ignore those: + if (dir.getFileName().toString().equals("includes")) { + return FileVisitResult.SKIP_SUBTREE; + } + return super.preVisitDirectory(dir, attrs); + } + }); + } } catch (IOException e) { - failureCollector.critical(FailureCollector.Stage.PARSING, "Unable to fetch the Quarkiverse Docs index page.", e); + failureCollector.critical(FailureCollector.Stage.PARSING, "Unable to read the unpacked Quarkiverse guides.", e); - // no point in doing anything else here: - return; - } - - // find links to quarkiverse extension docs: - Elements quarkiverseGuideIndexLinks = index.select("ul.components li.component a.title"); - - for (Element quarkiverseGuideIndexLink : quarkiverseGuideIndexLinks) { - Guide guide = new Guide(); - String topLevelTitle = quarkiverseGuideIndexLink.text(); - guide.title.set(topLevelTitle); - - Document extensionIndex = null; - try { - extensionIndex = readGuide(guide, quarkiverseGuideIndexLink.absUrl("href"), Optional.empty()); - } catch (URISyntaxException | IOException e) { - failureCollector.warning(FailureCollector.Stage.PARSING, - "Unable to fetch guide: " + topLevelTitle, e); - continue; - } - - quarkiverseGuides.add(guide); - - // find other sub-pages on the left side - Map<URI, String> indexLinks = new HashMap<>(); - Elements extensionSubGuides = extensionIndex.select("nav.nav-menu .nav-item a"); - for (Element element : extensionSubGuides) { - String href = element.absUrl("href"); - URI uri = UriBuilder.fromUri(href).replaceQuery(null).fragment(null).build(); - indexLinks.computeIfAbsent(uri, u -> element.text()); - } - - for (Map.Entry<URI, String> entry : indexLinks.entrySet()) { - Guide sub = new Guide(); - sub.title.set(entry.getValue()); - try { - readGuide(sub, entry.getKey().toString(), Optional.of(topLevelTitle)); - } catch (URISyntaxException | IOException e) { - failureCollector.warning(FailureCollector.Stage.PARSING, - "Unable to fetch guide: " + topLevelTitle, e); - continue; - } - quarkiverseGuides.add(sub); - } } } - private Document readGuide(Guide guide, String link, Optional<String> titlePrefix) throws URISyntaxException, IOException { - guide.url = 
new URI(link); + private Guide readGuide(Path file) throws IOException { + Guide guide = new Guide(); + guide.url = baseUri.resolve(pages.relativize(file).toString()); guide.type = "reference"; guide.origin = QUARKIVERSE_ORIGIN; - Document extensionIndex = Jsoup.connect(link).get(); - Elements content = extensionIndex.select("div.content"); + Document document = Jsoup.parse(file); - String title = content.select("h1.page").text(); - if (!title.isBlank()) { - String actualTitle = titlePrefix.map(prefix -> "%s: %s".formatted(prefix, title)).orElse(title); - guide.title.set(actualTitle); + String version = pages.relativize(file).getName(1).toString(); // second path element below the pages directory + + String title = document.select("h1.page").text(); + if (title.isBlank()) { + title = document.select("nav.breadcrumbs").text(); // no page heading: fall back to the breadcrumbs text + } + if (title.isBlank()) { + title = document.select("h3.title").text(); // last resort: the h3 title element(s) } - guide.summary.set(content.select("div#preamble").text()); - guide.htmlFullContentProvider.set(new FileInputProvider(link, dumpHtmlToFile(content.html()))); + // we always add version to the title, since Quarkiverse extensions can have multiple docs + // with the same titles for different versions, and it's hard to distinguish them otherwise. 
+ guide.title.set("%s (%s)".formatted(title.trim(), version)); - Log.debugf("Parsed guide: %s", guide.url); - return extensionIndex; - } + guide.summary.set(document.select("div#preamble").text()); + guide.htmlFullContentProvider.set(new FileInputProvider(file)); - private Path dumpHtmlToFile(String html) throws IOException { - Path path = guideHtmls.path().resolve(UUID.randomUUID().toString()); - try (FileOutputStream fos = new FileOutputStream(path.toFile())) { - fos.write(html.getBytes(StandardCharsets.UTF_8)); - } - return path; + Log.debugf("Parsed guide: %s", guide.url); + return guide; } public Stream guides() { @@ -147,12 +118,12 @@ public Stream guides() { @Override public void close() throws IOException { try (var closer = new Closer()) { - closer.push(CloseableDirectory::close, guideHtmls); closer.push(List::clear, quarkiverseGuides); + closer.push(CloseableDirectory::close, tempDir); } } - private record FileInputProvider(String link, Path content) implements InputProvider { + private record FileInputProvider(Path content) implements InputProvider { @Override public InputStream open() throws IOException { diff --git a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java index e463619f..b82a06e6 100644 --- a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java +++ b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java @@ -1,18 +1,39 @@ package io.quarkus.search.app.quarkiverseio; import java.net.URI; +import java.nio.file.Path; +import java.util.Optional; import io.smallrye.config.ConfigMapping; import io.smallrye.config.WithDefault; @ConfigMapping(prefix = "quarkiverseio") public interface QuarkiverseIOConfig { - String WEB_URI_DEFAULT_STRING = "https://docs.quarkiverse.io/index/explore/index.html"; - URI WEB_URI_DEFAULT = URI.create(WEB_URI_DEFAULT_STRING); - @WithDefault(WEB_URI_DEFAULT_STRING) - URI webUri(); 
+ Optional<Zip> zip(); + + Optional<GithubArtifact> githubArtifact(); @WithDefault("true") boolean enabled(); + + @WithDefault("https://docs.quarkiverse.io/") + URI baseUri(); + + interface GithubArtifact { + String token(); + + String repository(); + + //@WithDefault( "Publish website" ) + String actionName(); + + //@WithDefault( "github-pages" ) + String artifactName(); + } + + interface Zip { + Path path(); + } + } diff --git a/src/main/java/io/quarkus/search/app/util/FileUtils.java b/src/main/java/io/quarkus/search/app/util/FileUtils.java index c0da6f84..25d1479a 100644 --- a/src/main/java/io/quarkus/search/app/util/FileUtils.java +++ b/src/main/java/io/quarkus/search/app/util/FileUtils.java @@ -1,6 +1,8 @@ package io.quarkus.search.app.util; +import java.io.BufferedInputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.file.CopyOption; import java.nio.file.FileSystem; import java.nio.file.FileSystems; @@ -11,6 +13,10 @@ import java.nio.file.attribute.BasicFileAttributes; import java.util.Map; +import org.apache.commons.compress.archivers.ArchiveInputStream; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; + public final class FileUtils { private FileUtils() { @@ -32,6 +38,40 @@ public static void unzip(Path sourceFile, Path targetDir) throws IOException { } } + /** + * Extracts a tar archive into a directory. + * + * @param sourceFile the tar archive to read + * @param targetDir the directory to extract into; created if it does not exist yet + * @throws IOException if the archive cannot be read, an entry cannot be written, + * or an entry name resolves to a location outside of {@code targetDir} + */ + public static void untar(Path sourceFile, Path targetDir) throws IOException { + try (InputStream fi = Files.newInputStream(sourceFile); + InputStream bi = new BufferedInputStream(fi); + ArchiveInputStream<TarArchiveEntry> archive = new TarArchiveInputStream(bi)) { + if (!Files.exists(targetDir)) { + Files.createDirectories(targetDir); + } + Path root = targetDir.toAbsolutePath().normalize(); + TarArchiveEntry entry; + while ((entry = archive.getNextEntry()) != null) { + Path extractTo = root.resolve(entry.getName()).normalize(); + // the archive may be downloaded content: reject entry names such as "../x" that escape the target directory + if (!extractTo.startsWith(root)) { + throw new IOException("Tar entry is outside of the target directory: " + entry.getName()); + } + if (entry.isDirectory()) { + Files.createDirectories(extractTo); + } else { + // a file entry may appear without (or before) an entry for its parent directory + Files.createDirectories(extractTo.getParent()); + Files.copy(archive, extractTo); + } + } + } + } + static void 
copyRecursively(Path source, Path target, CopyOption... options) throws IOException { Files.walkFileTree(source, new SimpleFileVisitor() { @Override diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 6ef26330..268da75a 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -116,7 +116,7 @@ quarkus.hibernate-search-standalone.elasticsearch.max-connections=90 %dev,staging.quarkus.http.header."Access-Control-Allow-Private-Network".value=true # disable indexing and fetching of html quarkiverse guides in tests/dev %dev,test.quarkiverseio.enabled=false - +%dev,test.quarkiverseio.zip.path=${maven.project.testResourceDirectory}/github-pages.zip ######################## # Logging diff --git a/src/test/resources/github-pages.zip b/src/test/resources/github-pages.zip new file mode 100644 index 00000000..6a0c3e80 Binary files /dev/null and b/src/test/resources/github-pages.zip differ