diff --git a/README.adoc b/README.adoc
index bf6cb662..0a67cf25 100644
--- a/README.adoc
+++ b/README.adoc
@@ -85,6 +85,34 @@ INDEXING_REPORTING_GITHUB_ISSUE_ID={github-issue-id-in-your-repository}
INDEXING_REPORTING_GITHUB_WARNING_REPEAT_DELAY=10m
----
+=== Updating the built-in docs.quarkiverse.io sample
+
+In dev mode and in tests, the app uses the built-in sample by default: a trimmed-down snapshot of docs.quarkiverse.io.
+
+Quarkiverse guides are indexed from the GitHub workflow artifact created as part of
+the https://github.com/quarkiverse/quarkiverse-docs/actions/workflows/publish.yml[publish action].
+To update the included sample, go to the https://github.com/quarkiverse/quarkiverse-docs/actions/workflows/publish.yml?query=is%3Asuccess[list of successful runs],
+open the first run at the top (the latest successful execution) and download the `github-pages` artifact at the bottom of the page.
+
+GitHub has a 100 MB limit on the files we can include in the repository, so the downloaded artifact has to be trimmed down
+a bit. Unpack the archive and remove some of the extensions (in particular, try dropping the ones taking the most space, e.g. `quarkus-backstage`).
+Make sure to keep the Amazon S3 guides, as those are used in tests.
+Once you are done removing extensions, re-create the archive with the same layout (a zip file containing `artifact.tar`) and replace the existing `src/test/resources/github-pages.zip`.
+
+=== Downloading the latest docs.quarkiverse.io artifact
+
+If, during development, you need the app itself to download a fresh docs.quarkiverse.io artifact as part of the indexing process,
+add the following properties to your `.env` file:
+
+[source,properties]
+----
+# see about tokens https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens
+QUARKIVERSEIO_GITHUB_ARTIFACT_TOKEN={your-generated-github-token}
+QUARKIVERSEIO_GITHUB_ARTIFACT_REPOSITORY=quarkiverse/quarkiverse-docs
+QUARKIVERSEIO_GITHUB_ARTIFACT_ACTION_NAME=Publish website
+QUARKIVERSEIO_GITHUB_ARTIFACT_ARTIFACT_NAME=github-pages
+----
+
[[testing]]
== Testing
diff --git a/pom.xml b/pom.xml
index 2390cba6..aed522dd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -112,6 +112,11 @@
lodash
4.17.13
+
+ org.apache.commons
+ commons-compress
+ 1.27.1
+
@@ -204,6 +209,10 @@
io.quarkiverse.helm
quarkus-helm
+
+ org.apache.commons
+ commons-compress
+
diff --git a/src/main/java/io/quarkus/search/app/fetching/FetchingService.java b/src/main/java/io/quarkus/search/app/fetching/FetchingService.java
index fca3409e..2bfe8112 100644
--- a/src/main/java/io/quarkus/search/app/fetching/FetchingService.java
+++ b/src/main/java/io/quarkus/search/app/fetching/FetchingService.java
@@ -1,9 +1,11 @@
package io.quarkus.search.app.fetching;
+import static io.quarkus.search.app.util.FileUtils.untar;
import static io.quarkus.search.app.util.FileUtils.unzip;
import java.io.IOException;
import java.net.URI;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashMap;
import java.util.Map;
@@ -32,6 +34,12 @@
import org.hibernate.search.util.common.impl.Closer;
import org.hibernate.search.util.common.impl.SuppressingCloser;
+import org.kohsuke.github.GHArtifact;
+import org.kohsuke.github.GHRepository;
+import org.kohsuke.github.GHWorkflowRun;
+import org.kohsuke.github.GitHub;
+import org.kohsuke.github.GitHubBuilder;
+
import io.vertx.core.impl.ConcurrentHashSet;
@ApplicationScoped
@@ -49,8 +57,45 @@ public class FetchingService {
private final Map detailsCache = new ConcurrentHashMap<>();
private final Set tempDirectories = new ConcurrentHashSet<>();
- public QuarkiverseIO fetchQuarkiverseIo(FailureCollector failureCollector) {
- return new QuarkiverseIO(quarkiverseIOConfig, failureCollector);
+    /**
+     * Downloads the configured artifact of the first successful workflow run whose name matches
+     * the configured action name.
+     * <p>
+     * Only that first matching run is inspected (presumably the most recent one; the ordering is
+     * whatever the GitHub API returns for the query).
+     *
+     * @param ghConfig GitHub access settings: token, repository, workflow name and artifact name
+     * @param targetDir directory in which the downloaded {@code <artifact name>.zip} is stored
+     * @return path to the downloaded zip file
+     * @throws IOException if talking to GitHub or writing the file fails
+     * @throws IllegalStateException if no matching run/artifact is found
+     */
+    private static Path downloadQuarkiverseArtifact(QuarkiverseIOConfig.GithubArtifact ghConfig, Path targetDir)
+            throws IOException {
+        GitHub github = new GitHubBuilder().withOAuthToken(ghConfig.token()).build();
+        GHRepository repository = github.getRepository(ghConfig.repository());
+        for (GHWorkflowRun run : repository.queryWorkflowRuns()
+                .conclusion(GHWorkflowRun.Conclusion.SUCCESS)
+                .list().withPageSize(10)) {
+            if (!ghConfig.actionName().equals(run.getName())) {
+                continue;
+            }
+            for (GHArtifact ghArtifact : run.listArtifacts().withPageSize(5).toList()) {
+                if (ghConfig.artifactName().equals(ghArtifact.getName())) {
+                    Path artifact = targetDir.resolve(ghArtifact.getName() + ".zip");
+                    Log.infof("Downloading Quarkiverse %s artifact #%s", ghConfig.artifactName(), ghArtifact.getId());
+                    ghArtifact.download(is -> Files.copy(is, artifact));
+                    return artifact;
+                }
+            }
+            // do not fall back to older runs: only the first matching run is considered
+            break;
+        }
+        throw new IllegalStateException("Artifact " + ghConfig.artifactName() + " not found.");
+    }
+
+    /**
+     * Prepares the Quarkiverse guides for indexing.
+     * <p>
+     * The docs artifact is either downloaded from GitHub (when the {@code githubArtifact} configuration
+     * group is present) or taken from the locally configured zip file. It is unzipped into a fresh
+     * temporary directory, and the {@code artifact.tar} found inside is extracted into a {@code pages}
+     * subdirectory of that temporary directory.
+     * <p>
+     * The temporary directory is handed over to the returned {@link QuarkiverseIO}. If anything fails
+     * before that hand-over, the directory is closed here so that it does not leak.
+     *
+     * @param failureCollector collector passed on to the returned {@link QuarkiverseIO}
+     * @return a {@link QuarkiverseIO} backed by the extracted pages
+     * @throws IOException if the temporary directory cannot be created, or downloading/extracting fails
+     * @throws IllegalStateException if neither a zip nor a GitHub artifact is configured,
+     *         or the configured GitHub artifact cannot be found
+     */
+    public QuarkiverseIO fetchQuarkiverseIo(FailureCollector failureCollector) throws IOException {
+        CloseableDirectory tempDir = CloseableDirectory.temp("quarkiverse-io");
+        try {
+            Path artifact;
+            if (quarkiverseIOConfig.githubArtifact().isPresent()) {
+                artifact = downloadQuarkiverseArtifact(quarkiverseIOConfig.githubArtifact().get(), tempDir.path());
+            } else if (quarkiverseIOConfig.zip().isPresent()) {
+                artifact = quarkiverseIOConfig.zip().get().path();
+            } else {
+                throw new IllegalStateException(
+                        "Cannot fetch Quarkiverse guides as neither zip nor GitHub configuration was supplied");
+            }
+
+            unzip(artifact, tempDir.path());
+            Path pages = tempDir.path().resolve("pages");
+            untar(tempDir.path().resolve("artifact.tar"), pages);
+
+            return new QuarkiverseIO(quarkiverseIOConfig.enabled(), pages, quarkiverseIOConfig.baseUri(), failureCollector,
+                    tempDir);
+        } catch (RuntimeException | IOException e) {
+            // ownership of tempDir was never transferred: close it, keeping the original failure as the main one
+            new SuppressingCloser(e).push(CloseableDirectory::close, tempDir);
+            throw e;
+        }
}
public QuarkusIO fetchQuarkusIo(FailureCollector failureCollector) {
diff --git a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java
index bfb8a2af..cc8b658b 100644
--- a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java
+++ b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java
@@ -2,23 +2,18 @@
import java.io.Closeable;
import java.io.FileInputStream;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
-import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.UUID;
import java.util.stream.Stream;
-import jakarta.ws.rs.core.UriBuilder;
-
import io.quarkus.search.app.entity.Guide;
import io.quarkus.search.app.hibernate.InputProvider;
import io.quarkus.search.app.indexing.IndexableGuides;
@@ -31,110 +26,86 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
public class QuarkiverseIO implements IndexableGuides, Closeable {
public static final String QUARKIVERSE_ORIGIN = "quarkiverse-hub";
- private final URI quarkiverseDocsIndex;
private final FailureCollector failureCollector;
private final List quarkiverseGuides = new ArrayList<>();
private final boolean enabled;
- private final CloseableDirectory guideHtmls;
+ private final Path pages;
+ private final URI baseUri;
+ private final CloseableDirectory tempDir;
- public QuarkiverseIO(QuarkiverseIOConfig config, FailureCollector failureCollector) {
- this.quarkiverseDocsIndex = config.webUri();
- this.enabled = config.enabled();
+    /**
+     * Creates a guide source on top of an already extracted docs site.
+     *
+     * @param enabled flag stored as-is; how it is consulted is not visible in this hunk
+     * @param pages directory whose sub-directories are scanned for guide HTML files by {@link #parseGuides()}
+     * @param baseUri base against which the path of each HTML file (relative to {@code pages}) is resolved
+     *        to build the guide URL
+     * @param failureCollector receives the failures reported while parsing
+     * @param tempDir directory closed by {@link #close()}; presumably the one containing {@code pages}
+     */
+    public QuarkiverseIO(boolean enabled, Path pages, URI baseUri, FailureCollector failureCollector,
+            CloseableDirectory tempDir) {
this.failureCollector = failureCollector;
-        try {
-            guideHtmls = CloseableDirectory.temp("quarkiverse_htmls_");
-        } catch (IOException e) {
-            throw new IllegalStateException("Failed to fetch quarkiverse guides: %s".formatted(e.getMessage()), e);
-        }
+        this.enabled = enabled;
+        this.pages = pages;
+        this.baseUri = baseUri;
+        this.tempDir = tempDir;
}
+    /**
+     * Collects a {@link Guide} for every {@code .html} file found under the extension directories
+     * (direct sub-directories of {@code pages} whose name starts with {@code quarkus}).
+     * <p>
+     * Directories named {@code includes} are skipped entirely. A page that cannot be read is reported
+     * as a warning and skipped; a failure to list or walk the directories is reported as critical.
+     */
public void parseGuides() {
-        Document index = null;
-        try {
-            index = Jsoup.connect(quarkiverseDocsIndex.toString()).get();
+        try (Stream<Path> stream = Files.list(pages)) {
+            List<Path> guideDirectories = stream.filter(Files::isDirectory)
+                    .filter(dir -> dir.getFileName().toString().startsWith("quarkus"))
+                    .toList();
+            for (Path directory : guideDirectories) {
+                Files.walkFileTree(
+                        directory, new SimpleFileVisitor<>() {
+                            @Override
+                            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                                if (file.getFileName().toString().endsWith(".html")) {
+                                    try {
+                                        quarkiverseGuides.add(readGuide(file));
+                                    } catch (IOException e) {
+                                        // a single unreadable page must not abort indexing of all the other guides
+                                        failureCollector.warning(FailureCollector.Stage.PARSING,
+                                                "Unable to parse guide: " + file, e);
+                                    }
+                                }
+                                return FileVisitResult.CONTINUE;
+                            }
+
+                            @Override
+                            public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+                                // some extensions have these extra directories that are not "visible" from the docs page themselves,
+                                // but are still deployed (and hence accessible). We want to ignore those:
+                                if (dir.getFileName().toString().equals("includes")) {
+                                    return FileVisitResult.SKIP_SUBTREE;
+                                }
+                                return super.preVisitDirectory(dir, attrs);
+                            }
+                        });
+            }
} catch (IOException e) {
-            failureCollector.critical(FailureCollector.Stage.PARSING, "Unable to fetch the Quarkiverse Docs index page.", e);
+            failureCollector.critical(FailureCollector.Stage.PARSING,
+                    "Unable to read the Quarkiverse guides from the extracted docs artifact.", e);
-            // no point in doing anything else here:
-            return;
-        }
-
-        // find links to quarkiverse extension docs:
-        Elements quarkiverseGuideIndexLinks = index.select("ul.components li.component a.title");
-
-        for (Element quarkiverseGuideIndexLink : quarkiverseGuideIndexLinks) {
-            Guide guide = new Guide();
-            String topLevelTitle = quarkiverseGuideIndexLink.text();
-            guide.title.set(topLevelTitle);
-
-            Document extensionIndex = null;
-            try {
-                extensionIndex = readGuide(guide, quarkiverseGuideIndexLink.absUrl("href"), Optional.empty());
-            } catch (URISyntaxException | IOException e) {
-                failureCollector.warning(FailureCollector.Stage.PARSING,
-                        "Unable to fetch guide: " + topLevelTitle, e);
-                continue;
-            }
-
-            quarkiverseGuides.add(guide);
-
-            // find other sub-pages on the left side
-            Map indexLinks = new HashMap<>();
-            Elements extensionSubGuides = extensionIndex.select("nav.nav-menu .nav-item a");
-            for (Element element : extensionSubGuides) {
-                String href = element.absUrl("href");
-                URI uri = UriBuilder.fromUri(href).replaceQuery(null).fragment(null).build();
-                indexLinks.computeIfAbsent(uri, u -> element.text());
-            }
-
-            for (Map.Entry entry : indexLinks.entrySet()) {
-                Guide sub = new Guide();
-                sub.title.set(entry.getValue());
-                try {
-                    readGuide(sub, entry.getKey().toString(), Optional.of(topLevelTitle));
-                } catch (URISyntaxException | IOException e) {
-                    failureCollector.warning(FailureCollector.Stage.PARSING,
-                            "Unable to fetch guide: " + topLevelTitle, e);
-                    continue;
-                }
-                quarkiverseGuides.add(sub);
-            }
-        }
}
}
-    private Document readGuide(Guide guide, String link, Optional titlePrefix) throws URISyntaxException, IOException {
-        guide.url = new URI(link);
+    /**
+     * Builds a {@link Guide} from a single HTML page of the extracted docs site.
+     * <p>
+     * The guide URL is the page path relative to {@code pages}, resolved against {@code baseUri}.
+     * The title is taken from the first non-blank of {@code h1.page}, {@code nav.breadcrumbs} and
+     * {@code h3.title}, and is always suffixed with the version.
+     *
+     * @param file HTML file located under {@code pages}
+     * @return the populated guide
+     * @throws IOException if the file cannot be read/parsed
+     */
+    private Guide readGuide(Path file) throws IOException {
+        Guide guide = new Guide();
+        guide.url = baseUri.resolve(pages.relativize(file).toString());
guide.type = "reference";
guide.origin = QUARKIVERSE_ORIGIN;
-        Document extensionIndex = Jsoup.connect(link).get();
-        Elements content = extensionIndex.select("div.content");
+        Document document = Jsoup.parse(file);
-        String title = content.select("h1.page").text();
-        if (!title.isBlank()) {
-            String actualTitle = titlePrefix.map(prefix -> "%s: %s".formatted(prefix, title)).orElse(title);
-            guide.title.set(actualTitle);
+        // second element of the path relative to the pages root;
+        // assumes a <extension>/<version>/... layout -- TODO confirm against the artifact
+        String version = pages.relativize(file).getName(1).toString();
+
+        String title = document.select("h1.page").text();
+        if (title.isBlank()) {
+            // first fallback: the breadcrumbs text (the result must be assigned, otherwise this fallback is a no-op)
+            title = document.select("nav.breadcrumbs").text();
+        }
+        if (title.isBlank()) {
+            title = document.select("h3.title").text();
}
-        guide.summary.set(content.select("div#preamble").text());
-        guide.htmlFullContentProvider.set(new FileInputProvider(link, dumpHtmlToFile(content.html())));
+        // we always add version to the title, since Quarkiverse extensions can have multiple docs
+        // with the same titles for different versions, and it's hard to distinguish them otherwise.
+        guide.title.set("%s (%s)".formatted(title.trim(), version));
-        Log.debugf("Parsed guide: %s", guide.url);
-        return extensionIndex;
-    }
+        guide.summary.set(document.select("div#preamble").text());
+        guide.htmlFullContentProvider.set(new FileInputProvider(file));
-    private Path dumpHtmlToFile(String html) throws IOException {
-        Path path = guideHtmls.path().resolve(UUID.randomUUID().toString());
-        try (FileOutputStream fos = new FileOutputStream(path.toFile())) {
-            fos.write(html.getBytes(StandardCharsets.UTF_8));
-        }
-        return path;
+        Log.debugf("Parsed guide: %s", guide.url);
+        return guide;
}
public Stream guides() {
@@ -147,12 +118,12 @@ public Stream guides() {
+    /**
+     * Clears the list of parsed guides and closes the temporary directory handed to the constructor.
+     * Both operations are registered on a single {@code Closer}.
+     */
@Override
public void close() throws IOException {
try (var closer = new Closer()) {
-            closer.push(CloseableDirectory::close, guideHtmls);
closer.push(List::clear, quarkiverseGuides);
+            closer.push(CloseableDirectory::close, tempDir);
}
}
- private record FileInputProvider(String link, Path content) implements InputProvider {
+ private record FileInputProvider(Path content) implements InputProvider {
@Override
public InputStream open() throws IOException {
diff --git a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java
index e463619f..b82a06e6 100644
--- a/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java
+++ b/src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIOConfig.java
@@ -1,18 +1,39 @@
package io.quarkus.search.app.quarkiverseio;
import java.net.URI;
+import java.nio.file.Path;
+import java.util.Optional;
import io.smallrye.config.ConfigMapping;
import io.smallrye.config.WithDefault;
+/**
+ * Configuration of the Quarkiverse guides source (prefix {@code quarkiverseio}).
+ * <p>
+ * The guides come from a docs artifact that is either downloaded from GitHub ({@link #githubArtifact()})
+ * or read from a local zip file ({@link #zip()}).
+ */
@ConfigMapping(prefix = "quarkiverseio")
public interface QuarkiverseIOConfig {
-    String WEB_URI_DEFAULT_STRING = "https://docs.quarkiverse.io/index/explore/index.html";
-    URI WEB_URI_DEFAULT = URI.create(WEB_URI_DEFAULT_STRING);
-    @WithDefault(WEB_URI_DEFAULT_STRING)
-    URI webUri();
+    /**
+     * Local docs artifact to use when no GitHub artifact is configured.
+     */
+    Optional<Zip> zip();
+
+    /**
+     * Settings for downloading the docs artifact from a GitHub workflow run.
+     */
+    Optional<GithubArtifact> githubArtifact();
@WithDefault("true")
boolean enabled();
+
+    /**
+     * Base URI against which the relative path of each extracted page is resolved to build the guide URL.
+     */
+    @WithDefault("https://docs.quarkiverse.io/")
+    URI baseUri();
+
+    interface GithubArtifact {
+        /**
+         * OAuth token used to access the GitHub API.
+         */
+        String token();
+
+        /**
+         * Repository to query, e.g. {@code quarkiverse/quarkiverse-docs}.
+         */
+        String repository();
+
+        /**
+         * Name of the workflow whose successful runs are inspected, e.g. {@code Publish website}.
+         * No default is declared, so it has to be configured explicitly.
+         */
+        String actionName();
+
+        /**
+         * Name of the artifact to download from the matching run, e.g. {@code github-pages}.
+         * No default is declared, so it has to be configured explicitly.
+         */
+        String artifactName();
+    }
+
+    interface Zip {
+        /**
+         * Path to the zip file containing the docs artifact.
+         */
+        Path path();
+    }
+
}
diff --git a/src/main/java/io/quarkus/search/app/util/FileUtils.java b/src/main/java/io/quarkus/search/app/util/FileUtils.java
index c0da6f84..25d1479a 100644
--- a/src/main/java/io/quarkus/search/app/util/FileUtils.java
+++ b/src/main/java/io/quarkus/search/app/util/FileUtils.java
@@ -1,6 +1,8 @@
package io.quarkus.search.app.util;
+import java.io.BufferedInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.file.CopyOption;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
@@ -11,6 +13,10 @@
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Map;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+
public final class FileUtils {
private FileUtils() {
@@ -32,6 +38,25 @@ public static void unzip(Path sourceFile, Path targetDir) throws IOException {
}
}
+    /**
+     * Extracts an (uncompressed) tar archive into the given directory, creating the directory if needed.
+     * <p>
+     * Every entry is resolved against the target directory and must stay inside it: an entry whose name
+     * would escape the target (e.g. through {@code ..} segments or an absolute name) aborts the extraction.
+     *
+     * @param sourceFile the tar file to read
+     * @param targetDir the directory to extract into
+     * @throws IOException if reading the archive or writing an entry fails, or if an entry
+     *         resolves outside of {@code targetDir}
+     */
+    public static void untar(Path sourceFile, Path targetDir) throws IOException {
+        try (InputStream fi = Files.newInputStream(sourceFile);
+                InputStream bi = new BufferedInputStream(fi);
+                TarArchiveInputStream archive = new TarArchiveInputStream(bi)) {
+            if (!Files.exists(targetDir)) {
+                Files.createDirectories(targetDir);
+            }
+            Path root = targetDir.toAbsolutePath().normalize();
+            TarArchiveEntry entry;
+            while ((entry = archive.getNextEntry()) != null) {
+                Path extractTo = root.resolve(entry.getName()).normalize();
+                // guard against path traversal: never write outside of the target directory
+                if (!extractTo.startsWith(root)) {
+                    throw new IOException(
+                            "Tar entry '" + entry.getName() + "' resolves outside of the target directory " + root);
+                }
+                if (entry.isDirectory()) {
+                    Files.createDirectories(extractTo);
+                } else {
+                    // archives are not required to list a directory entry before the files it contains
+                    Path parent = extractTo.getParent();
+                    if (parent != null && !Files.exists(parent)) {
+                        Files.createDirectories(parent);
+                    }
+                    Files.copy(archive, extractTo);
+                }
+            }
+        }
+    }
+
static void copyRecursively(Path source, Path target, CopyOption... options) throws IOException {
Files.walkFileTree(source, new SimpleFileVisitor() {
@Override
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties
index 6ef26330..268da75a 100644
--- a/src/main/resources/application.properties
+++ b/src/main/resources/application.properties
@@ -116,7 +116,7 @@ quarkus.hibernate-search-standalone.elasticsearch.max-connections=90
%dev,staging.quarkus.http.header."Access-Control-Allow-Private-Network".value=true
# disable indexing and fetching of html quarkiverse guides in tests/dev
%dev,test.quarkiverseio.enabled=false
-
+%dev,test.quarkiverseio.zip.path=${maven.project.testResourceDirectory}/github-pages.zip
########################
# Logging
diff --git a/src/test/resources/github-pages.zip b/src/test/resources/github-pages.zip
new file mode 100644
index 00000000..6a0c3e80
Binary files /dev/null and b/src/test/resources/github-pages.zip differ