diff --git a/.github/labels.yml b/.github/labels.yml new file mode 100644 index 0000000..82bb9b9 --- /dev/null +++ b/.github/labels.yml @@ -0,0 +1,66 @@ +--- +# Label names are important as they are used by Release Drafter to decide +# where to record them in the changelog, or whether to skip them. +# +# The repository labels will be automatically configured using this file and +# the GitHub Action https://github.com/marketplace/actions/github-labeler. +- name: breaking + description: Breaking Changes + color: bfd4f2 +- name: bug + description: Something isn't working + color: d73a4a +- name: build + description: Build System and Dependencies + color: bfdadc +- name: ci + description: Continuous Integration + color: 4a97d6 +- name: dependencies + description: Pull requests that update a dependency file + color: 0366d6 +- name: documentation + description: Improvements or additions to documentation + color: 0075ca +- name: duplicate + description: This issue or pull request already exists + color: cfd3d7 +- name: enhancement + description: New feature or request + color: a2eeef +- name: github_actions + description: Pull requests that update GitHub Actions code + color: "000000" +- name: good first issue + description: Good for newcomers + color: 7057ff +- name: help wanted + description: Extra attention is needed + color: 008672 +- name: invalid + description: This doesn't seem right + color: e4e669 +- name: performance + description: Performance + color: "016175" +- name: python + description: Pull requests that update Python code + color: 2b67c6 +- name: question + description: Further information is requested + color: d876e3 +- name: refactoring + description: Refactoring + color: ef67c4 +- name: removal + description: Removals and Deprecations + color: 9ae7ea +- name: style + description: Style + color: c120e5 +- name: testing + description: Testing + color: b1fc6f +- name: wontfix + description: This will not be worked on + color: ffffff \ No newline at end of file diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 0000000..f86c79d --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,59 @@ +categories: + - title: ":boom: Breaking Changes" + label: "breaking" + - title: ":rocket: Features" + label: "enhancement" + - title: ":fire: Removals and Deprecations" + label: "removal" + - title: ":beetle: Fixes" + label: "bug" + - title: ":racehorse: Performance" + label: "performance" + - title: ":rotating_light: Testing" + label: "testing" + - title: ":construction_worker: Continuous Integration" + label: "ci" + - title: ":books: Documentation" + label: "documentation" + - title: ":hammer: Refactoring" + label: "refactoring" + - title: ":lipstick: Style" + label: "style" + - title: ":package: Dependencies" + labels: + - "dependencies" + - "build" + +autolabeler: + - label: 'documentation' + branch: + - '/docs{0,1}\/.+/' + - label: 'bug' + branch: + - '/fix\/.+/' + title: + - '/fix/i' + - label: 'enhancement' + branch: + - '/feat\/.+/' + body: + - '/JIRA-[0-9]{1,4}/' + - label: 'refactoring' + branch: + - '/refactor\/.+/' + title: + - '/^refactor/i' + - label: 'testing' + branch: + - '/test\/.+/' + - label: 'breaking' + title: + - '/breaking change/i' + - label: 'ci' + files: + - '.github/*' + +template: | + ## Changes + + $CHANGES diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..411675f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,44 @@ +name: Build + +on: + push: +
branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write + + steps: + - uses: actions/checkout@v3 + + - name: Set up JDK 21 + uses: actions/setup-java@v3 + with: + java-version: 21 + distribution: zulu + + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v1.1.1 + with: + workload_identity_provider: "projects/848539402404/locations/global/workloadIdentityPools/gh-actions/providers/gh-actions" + service_account: "gh-actions-dapla-pseudo@artifact-registry-5n.iam.gserviceaccount.com" + token_format: access_token + + - name: Cache Maven packages + uses: actions/cache@v3 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} + restore-keys: ${{ runner.os }}-m2 + + - name: Build with Maven and deploy to Artifact Registry + run: mvn --batch-mode -P ssb-bip deploy + diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000..19d5246 --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,19 @@ +name: Labeler + +on: + push: + branches: + - master + +jobs: + labeler: + runs-on: ubuntu-latest + steps: + - name: Check out the repository + uses: actions/checkout@v3 + + # Reads labels from .github/labels.yml + - name: Run Labeler + uses: crazy-max/ghaction-github-labeler@v4 + with: + skip-delete: true diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 0000000..5dea360 --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,30 @@ +name: Release Drafter + +on: + push: + branches: + - master + # pull_request event is required only for autolabeler + pull_request: + types: + - opened + - reopened + - synchronize + +permissions: + contents: read + +jobs: + update_release_draft: + permissions: + # write permission is required to create a GitHub release + contents: write + # write permission is required for autolabeler + # otherwise, read permission is required at least + pull-requests: write + runs-on: ubuntu-latest + steps: + # Draft the next Release notes as Pull Requests are merged into main + - uses: release-drafter/release-drafter@v5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..6f352fb --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,100 @@ +name: Release + +on: + push: + branches: + - release + +jobs: + release: + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + packages: write + + steps: + - uses: actions/create-github-app-token@v1 + id: app-token + with: + app-id: ${{ secrets.DAPLA_BOT_APP_ID }} + private-key: ${{ secrets.DAPLA_BOT_PRIVATE_KEY }} + + - uses: actions/checkout@v3 + with: + token: ${{ steps.app-token.outputs.token }} + ref: refs/heads/master + + - name: Set up JDK 21 + uses: actions/setup-java@v3 + with: + java-version: 21 + distribution: zulu + server-id: github + + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v1.1.1 + with: + workload_identity_provider: "projects/848539402404/locations/global/workloadIdentityPools/gh-actions/providers/gh-actions" + service_account: "gh-actions-dapla-pseudo@artifact-registry-5n.iam.gserviceaccount.com" + token_format: access_token + + - name: Cache Maven packages + uses: actions/cache@v3 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ 
hashFiles('**/pom.xml') }} + restore-keys: ${{ runner.os }}-m2 + + - name: Configure Git user + run: | + git config user.name "dapla-bot[bot]" + git config user.email "143391972+dapla-bot[bot]@users.noreply.github.com" + + - name: Setup Maven authentication to GitHub packages + uses: s4u/maven-settings-action@v2.8.0 + id: maven_settings + with: + servers: '[{"id": "github","configuration": {"httpHeaders": {"property": {"name": "Authorization","value": "Bearer ${{ secrets.GITHUB_TOKEN }}"}}}}]' + + - name: Maven release and deploy to GitHub packages + id: release_jar + env: + GITHUB_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + # Get the release version from the pom.xml before the next snapshot increment + VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout | sed "s/-SNAPSHOT//") + echo "version=${VERSION}" >> $GITHUB_OUTPUT + # Perform the release/deploy and increment the version to the next snapshot + mvn --batch-mode release:prepare -Darguments="-Dmaven.deploy.skip=true -DskipTests" + mvn --batch-mode release:perform + TAG=$(git describe --abbrev=0 --tags) + echo "tag=${TAG}" >> $GITHUB_OUTPUT + + - name: Create GitHub release draft + uses: release-drafter/release-drafter@v5 + id: create_github_release + env: + GITHUB_TOKEN: ${{ steps.app-token.outputs.token }} + with: + tag: ${{ steps.release_jar.outputs.tag }} + + - name: Upload assets to GitHub release draft + env: + GITHUB_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + ARTIFACT_ID=$(mvn help:evaluate -Dexpression=project.artifactId -q -DforceStdout) + # Get all files matching the artifact id and version (source, javadoc, etc.) + ARTIFACT_GLOB=(./target/$ARTIFACT_ID-${{ steps.release_jar.outputs.version }}*.jar) + for file in "${ARTIFACT_GLOB[@]}"; do + echo "Uploading $file" + gh release upload ${{ steps.create_github_release.outputs.tag_name }} $file + done + + - name: Publish GitHub release + uses: eregon/publish-release@v1 + env: + GITHUB_TOKEN: ${{ steps.app-token.outputs.token }} + with: + release_id: ${{ steps.create_github_release.outputs.id }} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 139e5c0..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,32 +0,0 @@ -# Changelog - -## [0.2.5](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/tree/0.2.5) (2021-04-29) - -[Full Changelog](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/compare/0.2.4...0.2.5) - -**Closed issues:** - -- Add version endpoint [\#8](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/issues/8) - -## [0.2.4](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/tree/0.2.4) (2021-04-26) - -[Full Changelog](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/compare/0.2.3...0.2.4) - -## [0.2.3](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/tree/0.2.3) (2021-04-25) - -[Full Changelog](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/compare/0.2.2...0.2.3) - -**Closed issues:** - -- Export: upload "export report" along with exported archives [\#6](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/issues/6) -- Export: deduce export destination path from dataset source path [\#5](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/issues/5) -- Support "short format" when specifying dataset paths with timestamps [\#3](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/issues/3) -- Export: support retrieving pseudo rules from another dataset path 
[\#2](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/issues/2) - -**Merged pull requests:** - -- Feature/support multiple pseudo rules sources [\#7](https://github.com/statisticsnorway/dapla-dlp-pseudo-service/pull/7) ([kschulst](https://github.com/kschulst)) - - - -\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Makefile b/Makefile index 7f6d980..d8bffc8 100644 --- a/Makefile +++ b/Makefile @@ -22,11 +22,7 @@ release-dryrun: ## Simulate a release in order to detect any issues .PHONY: release release: ## Release a new version. Update POMs and tag the new version in git - ./mvnw release:prepare release:perform -Darguments="-Dmaven.deploy.skip=true -Dmaven.javadoc.skip=true" - -.PHONY: changelog -changelog: ## Generate CHANGELOG.md - github_changelog_generator -u statisticsnorway -p dapla-dlp-pseudo-service + git push origin master:release .PHONY: help help: diff --git a/pom.xml b/pom.xml index b78179c..5921028 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 no.ssb.dlp.pseudo.service dapla-dlp-pseudo-service - 2.2.7-SNAPSHOT + 3.0.0-SNAPSHOT io.micronaut @@ -12,31 +12,61 @@ - 17 + 21 ${jdk.version} ${jdk.version} UTF-8 UTF-8 + artifactregistry://europe-north1-maven.pkg.dev/artifact-registry-5n/dapla-pseudo-maven/ + statisticsnorway/dapla-dlp-pseudo-service + + 2.1.4 + 2.0.0 3.8.7 4.6.0 3.8.1 1.1.0 1.0.1 no.ssb.dlp.pseudo.service.Application - 2.22.2 - 3.2.4 - 2.22.2 - 1.18.26 + 3.2.2 + 3.5.1 + 3.0.1 + 3.2.2 + 1.18.30 + 5.7.0 - scm:git:git://github.com/statisticsnorway/dapla-dlp-pseudo-service.git - scm:git:git@github.com:statisticsnorway/dapla-dlp-pseudo-service.git - https://github.com/statisticsnorway/dapla-dlp-pseudo-service - 2.2.3 + scm:git:https://github.com/${github.repository}.git + https://github.com/${github.repository} + HEAD + + artifact-registry + ${artifact-registry.url} + + true + + + false + + + + artifact-registry-snapshot + ${artifact-registry.url} + + false + + + true + + + + central + https://repo.maven.apache.org/maven2 + jcenter.bintray.com https://jcenter.bintray.com @@ -113,27 +143,17 @@ no.ssb.dapla.dlp.pseudo dapla-dlp-pseudo-core - 1.2.5 - - - no.ssb.avro.convert.core - avro-buddy-core - 1.1.0 + ${dapla-dlp-pseudo-core.version} - no.ssb.dapla.storage - dapla-storage-client - 5.1.2 + javax.inject + javax.inject + 1 - no.ssb.dapla.parquet - parquet-buddy - 1.0.0 - - - no.ssb.dapla - dataset-api - 0.6 + org.apache.commons + commons-lang3 + 3.7 org.projectlombok @@ -205,6 +225,7 @@ org.mockito mockito-core + ${mockito.version} test @@ -275,7 +296,7 @@ org.apache.maven.plugins maven-release-plugin - 3.0.0-M4 + ${maven-release-plugin.version} @{project.version} @@ -428,7 +449,7 @@ com.google.cloud.artifactregistry artifactregistry-maven-wagon - 2.1.5 + ${artifactregistry-maven-wagon.version} @@ -437,43 +458,24 @@ ssb-bip - - - central - https://repo.maven.apache.org/maven2 - + artifact-registry - artifactregistry://europe-north1-maven.pkg.dev/artifact-registry-14da/maven-releases - - true - - - false - - - - artifact-registry-snapshot - artifactregistry://europe-north1-maven.pkg.dev/artifact-registry-14da/maven-snapshots - - false - - - true - + ${artifact-registry.url} - - - artifact-registry - artifactregistry://europe-north1-maven.pkg.dev/artifact-registry-14da/maven-snapshots + artifact-registry-snapshots + ${artifact-registry.url} - - artifact-registry - 
artifactregistry://europe-north1-maven.pkg.dev/artifact-registry-14da/maven-releases - + + + github + GitHub Packages + https://maven.pkg.github.com/${github.repository} + + diff --git a/src/main/java/no/ssb/dlp/pseudo/service/datasetmeta/DatasetMetaService.java b/src/main/java/no/ssb/dlp/pseudo/service/datasetmeta/DatasetMetaService.java deleted file mode 100644 index 6e1f2c8..0000000 --- a/src/main/java/no/ssb/dlp/pseudo/service/datasetmeta/DatasetMetaService.java +++ /dev/null @@ -1,88 +0,0 @@ -package no.ssb.dlp.pseudo.service.datasetmeta; - -import com.google.protobuf.util.JsonFormat; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import no.ssb.dapla.dataset.api.DatasetMeta; -import no.ssb.dapla.dataset.uri.DatasetUri; -import no.ssb.dapla.storage.client.backend.BinaryBackend; -import no.ssb.dlp.pseudo.core.func.PseudoFuncRule; -import no.ssb.dlp.pseudo.service.pseudo.PseudoConfig; - -import javax.inject.Singleton; -import java.io.ByteArrayOutputStream; -import java.nio.ByteBuffer; -import java.nio.channels.SeekableByteChannel; -import java.nio.charset.StandardCharsets; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicInteger; - -@RequiredArgsConstructor -@Singleton -@Slf4j -public class DatasetMetaService { - - private final BinaryBackend storageBackend; - - public Optional readDatasetMeta(DatasetUri datasetUri) { - String datasetMetaPath = datasetUri.toString() + "/.dataset-meta.json"; - log.info("Reading dataset metadata from {}", datasetMetaPath); - - try { - final SeekableByteChannel channel; - try { - channel = storageBackend.read(datasetMetaPath); - } - catch (NullPointerException e) { - log.info("No dataset metadata found"); - return Optional.empty(); - } - - int bufferSize = Math.min(1024, (int) channel.size()); - ByteBuffer buffer = ByteBuffer.allocate(bufferSize); - ByteArrayOutputStream out = new ByteArrayOutputStream(); - - while (channel.read(buffer) > 0) { - out.write(buffer.array(), 0, buffer.position()); - buffer.clear(); - } - String datasetMetaJson = new String(out.toByteArray(), StandardCharsets.UTF_8); - - DatasetMeta.Builder datasetMetaBuilder = DatasetMeta.newBuilder(); - JsonFormat.parser().ignoringUnknownFields().merge(datasetMetaJson, datasetMetaBuilder); - return Optional.of(datasetMetaBuilder.build()); - } - catch (Exception e) { - throw new DatasetMetaReadException("Unable to read dataset meta for " + datasetUri.toString(), e); - } - } - - public PseudoConfig readDatasetPseudoConfig(DatasetUri datasetUri) { - log.debug("Read pseudo rules from " + datasetUri); - DatasetMeta datasetMeta = readDatasetMeta(datasetUri).orElse(null); - return pseudoConfigOf(datasetMeta); - } - - public PseudoConfig pseudoConfigOf(DatasetMeta datasetMeta) { - PseudoConfig pseudoConfig = new PseudoConfig(); - if (datasetMeta == null) { - return pseudoConfig; - } - - AtomicInteger ruleNo = new AtomicInteger(); - pseudoConfig.setRules(datasetMeta.getPseudoConfig().getVarsList().stream() - .map(i -> new PseudoFuncRule("pseudo-rule-" + ruleNo.getAndIncrement(), i.getVar(), i.getPseudoFunc())) - .toList() - ); - - return pseudoConfig; - } - - public static class DatasetMetaReadException extends RuntimeException { - public DatasetMetaReadException(String message, Throwable cause) { - super(message, cause); - } - } - - -} diff --git a/src/main/java/no/ssb/dlp/pseudo/service/export/ExportConfig.java b/src/main/java/no/ssb/dlp/pseudo/service/export/ExportConfig.java deleted file mode 100644 index 4b4ad7e..0000000 --- 
a/src/main/java/no/ssb/dlp/pseudo/service/export/ExportConfig.java +++ /dev/null @@ -1,15 +0,0 @@ -package no.ssb.dlp.pseudo.service.export; - -import io.micronaut.context.annotation.ConfigurationProperties; -import lombok.Data; - -@Data -@ConfigurationProperties("export") -public class ExportConfig { - - /** - * The default root path of exports. E.g. gs://[export-bucket-name]/export - */ - private String defaultTargetRoot; - -} diff --git a/src/main/java/no/ssb/dlp/pseudo/service/export/ExportController.java b/src/main/java/no/ssb/dlp/pseudo/service/export/ExportController.java deleted file mode 100644 index be232db..0000000 --- a/src/main/java/no/ssb/dlp/pseudo/service/export/ExportController.java +++ /dev/null @@ -1,151 +0,0 @@ -package no.ssb.dlp.pseudo.service.export; - - -import io.micronaut.core.annotation.Introspected; -import io.micronaut.http.HttpRequest; -import io.micronaut.http.HttpResponse; -import io.micronaut.http.HttpStatus; -import io.micronaut.http.MediaType; -import io.micronaut.http.annotation.Body; -import io.micronaut.http.annotation.Controller; -import io.micronaut.http.annotation.Error; -import io.micronaut.http.annotation.Post; -import io.micronaut.http.hateoas.JsonError; -import io.micronaut.http.hateoas.Link; -import io.micronaut.scheduling.TaskExecutors; -import io.micronaut.scheduling.annotation.ExecuteOn; -import io.micronaut.security.annotation.Secured; -import io.micronaut.security.rules.SecurityRule; -import io.micronaut.validation.Validated; -import io.swagger.v3.oas.annotations.Operation; -import io.swagger.v3.oas.annotations.media.Schema; -import io.swagger.v3.oas.annotations.tags.Tag; -import lombok.Data; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import no.ssb.dlp.pseudo.core.file.Compression; -import no.ssb.dlp.pseudo.core.file.CompressionEncryptionMethod; -import no.ssb.dlp.pseudo.core.file.MoreMediaTypes; -import no.ssb.dlp.pseudo.core.func.PseudoFuncRule; - -import javax.validation.Valid; -import javax.validation.constraints.Min; -import javax.validation.constraints.NotBlank; -import javax.validation.constraints.NotNull; -import java.security.Principal; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -@RequiredArgsConstructor -@Controller -@Slf4j -@Secured(SecurityRule.IS_AUTHENTICATED) -@Validated -@Tag(name = "Pseudo operations") -public class ExportController { - - private final ExportService exportService; - - @Operation( - summary = "Export", - description = """ - Export a dataset in GCS to CSV or JSON, and optionally depseudonymize the data. The dataset will be - archived in an encrypted zip file protected by a user provided password. - - It is possible to specify `columnSelectors`, that allows for _partial export_, e.g. only specific fields. - This can be applied as a means to perform data minimization. - - Data is exported and stored to a specific, predefined GCS bucket. This is specified in the application - configuration and cannot be overridden. - """ - ) - @Post("/export") - @ExecuteOn(TaskExecutors.IO) - public ExportService.DatasetExportResult export(@Body @Valid ExportRequest request, Principal principal) { - log.info("Export dataset - user={}, dataset={}", principal.getName(), request.getSourceDataset()); - - ExportService.DatasetExport datasetExport = ExportService.DatasetExport.builder() - .userId(principal.getName()) - .sourceDataset(request.getSourceDataset()) - .columnSelectors(request.getColumnSelectors() == null ? 
Set.of() : request.getColumnSelectors()) - .depseudonymize(request.getDepseudonymize()) - .pseudoRules(request.getPseudoRules() == null ? List.of() : request.getPseudoRules()) - .compression(Compression.builder() - .encryption(CompressionEncryptionMethod.AES) - .type(MoreMediaTypes.APPLICATION_ZIP_TYPE) - .password(request.getTargetPassword()) - .build()) - .targetContentName(request.getTargetContentName()) - .targetContentType(request.getTargetContentType()) - .build(); - - return exportService.export(datasetExport); - } - - @Error - public HttpResponse datasetNotFoundError(HttpRequest request, ExportService.DatasetNotFoundException e) { - return error(request, e, HttpStatus.BAD_REQUEST, "dataset not found"); - } - - @Error - public HttpResponse noPseudoRulesFoundError(HttpRequest request, ExportService.NoPseudoRulesFoundException e) { - return error(request, e, HttpStatus.BAD_REQUEST, "no pseudo rules found for dataset"); - } - - static HttpResponse error(HttpRequest request, Exception e, HttpStatus httpStatus, String httpStatusReason) { - JsonError error = new JsonError(e.getMessage()) - .link(Link.SELF, Link.of(request.getUri())); - - return HttpResponse.status(httpStatus, httpStatusReason) - .body(error); - } - - @Data - @Introspected - static class ExportRequest { - - /** Path to dataset to be exported */ - @NotNull - private ExportService.DatasetUri sourceDataset; - - /** - * A set of glob patterns that can be used to specify a subset of all fields to export. - * A dataset can however be exported in its entirety by simply omitting any column selectors. - */ - private Set columnSelectors = Set.of(); - - /** - * Descriptive name of the contents. This will be used as baseline for the target archive name and its contents. - * If not specified then this will be deduced from the source dataset name. - * Should not include file suffixes such as .csv or .json. - */ - private String targetContentName; - - /** - * The content type of the resulting file. - * Defaults to application/json. - */ - @Schema(implementation = String.class, allowableValues = { - MediaType.APPLICATION_JSON, MoreMediaTypes.TEXT_CSV}) - private MediaType targetContentType = MediaType.APPLICATION_JSON_TYPE; - - /** The password for the resulting archive */ - @NotBlank - @Schema(implementation = String.class) - @Min(9) - private char[] targetPassword; - - /** Whether or not to depseudonymize dataset during export */ - private Boolean depseudonymize = false; - - /** - *

Pseudonymization rules to be used to depseudonymize the dataset. This is only - * relevant if depseudonymize=true. - * - * If not specified then pseudonymization rules are read from the - * dataset's dataset-meta.json file.
- */ - private List pseudoRules = new ArrayList<>(); - } -} diff --git a/src/main/java/no/ssb/dlp/pseudo/service/export/ExportService.java b/src/main/java/no/ssb/dlp/pseudo/service/export/ExportService.java deleted file mode 100644 index b41c2a4..0000000 --- a/src/main/java/no/ssb/dlp/pseudo/service/export/ExportService.java +++ /dev/null @@ -1,311 +0,0 @@ -package no.ssb.dlp.pseudo.service.export; - -import com.google.common.base.CharMatcher; -import com.google.common.base.Stopwatch; -import io.micronaut.context.event.ApplicationEventPublisher; -import io.micronaut.core.annotation.Introspected; -import io.micronaut.http.MediaType; -import io.micronaut.runtime.event.annotation.EventListener; -import io.micronaut.scheduling.annotation.Async; -import io.reactivex.Flowable; -import lombok.*; -import lombok.experimental.Accessors; -import lombok.extern.slf4j.Slf4j; -import no.ssb.dapla.dataset.api.DatasetMeta; -import no.ssb.dapla.parquet.FieldInterceptor; -import no.ssb.dapla.storage.client.DatasetStorage; -import no.ssb.dapla.storage.client.backend.gcs.GoogleCloudStorageBackend; -import no.ssb.dlp.pseudo.core.FieldPseudoInterceptor; -import no.ssb.dlp.pseudo.core.func.PseudoFuncRule; -import no.ssb.dlp.pseudo.core.func.PseudoFuncs; -import no.ssb.dlp.pseudo.core.PseudoOperation; -import no.ssb.dlp.pseudo.core.file.Compression; -import no.ssb.dlp.pseudo.core.file.MoreMediaTypes; -import no.ssb.dlp.pseudo.core.map.RecordMapSerializerFactory; -import no.ssb.dlp.pseudo.core.util.Json; -import no.ssb.dlp.pseudo.core.util.PathJoiner; -import no.ssb.dlp.pseudo.core.util.Zips; -import no.ssb.dlp.pseudo.service.datasetmeta.DatasetMetaService; -import no.ssb.dlp.pseudo.service.pseudo.PseudoConfig; -import no.ssb.dlp.pseudo.service.pseudo.PseudoSecrets; - -import javax.inject.Singleton; -import javax.validation.constraints.NotNull; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.time.Instant; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - -import static com.google.common.base.CharMatcher.anyOf; -import static com.google.common.base.CharMatcher.inRange; -import static no.ssb.dlp.pseudo.core.util.Zips.ZipOptions.zipOpts; - -@RequiredArgsConstructor -@Singleton -@Slf4j -public class ExportService { - - private static final CharMatcher ALLOWED_FILENAME_CHARACTERS = inRange('a', 'z') - .or(inRange('A', 'Z')) - .or(inRange('0', '9')) - .or(anyOf("_-")); - - private final ExportConfig exportConfig; - private final DatasetStorage datasetClient; - private final GoogleCloudStorageBackend storageBackend; - private final PseudoSecrets pseudoSecrets; - private final DatasetMetaService datasetMetaService; - private final ApplicationEventPublisher eventPublisher; - - public DatasetExportResult export(DatasetExport e) { - final DatasetExportReport report = new DatasetExportReport(e); - final DatasetMeta sourceDatasetMeta = datasetMetaService.readDatasetMeta(e.getSourceDataset().legacyDatasetUri()).orElse(null); - - // Initialize target names - if (e.getTargetContentName() == null || e.getTargetContentName().isBlank()) { - // if not specified then deduce content name from dataset name - e.setTargetContentName(e.getSourceDataset().getPath().replaceFirst(".*/([^/?]+).*", "$1")); - } - String targetRootLocation = targetRootLocationOf(e.getSourceDataset()); - String targetArchiveUri = 
targetFileLocationOf(targetRootLocation, archiveFilenameOf(e.getTargetContentName(), e.getCompression().getType())); - - eventPublisher.publishEvent(ExportEvent.builder() - .targetRootLocation(targetRootLocation) - .targetArchiveUri(targetArchiveUri) - .datasetExport(e) - .report(report) - .sourceDataset(e.getSourceDataset()) - .sourceDatasetMeta(sourceDatasetMeta) - .build()); - - return DatasetExportResult.builder() - .targetUri(targetArchiveUri) - .build(); - } - - @EventListener - @Async - public void onExportEvent(ExportEvent e) { - final Stopwatch stopwatch = Stopwatch.createStarted(); - final AtomicInteger counter = new AtomicInteger(); - - // Initialize depseudo mechanism if needed - else use a noOp interceptor - FieldInterceptor fieldPseudoInterceptor = initPseudoInterceptor(e.datasetExport(), e.sourceDatasetMeta(), e.report()); - - // Initiate record stream - log.debug("Read parquet records"); - Flowable> records = datasetClient.readParquetRecords(e.sourceDataset().legacyDatasetUri(), e.datasetExport().getColumnSelectors(), fieldPseudoInterceptor) - .doOnNext(s -> { - if (counter.incrementAndGet() % 100000 == 0) { - log.debug(String.format("Processed %,d records", counter.get())); - } - }); - - // Serialize records - MediaType targetContentType = MoreMediaTypes.validContentType(e.datasetExport().getTargetContentType()); - log.debug("Serialize records as %s".formatted(targetContentType)); - Flowable serializedRecords = RecordMapSerializerFactory.newFromMediaType(targetContentType).serialize(records); - - // Encrypt and compress stream contents: - Flowable compressedRecords = encryptAndCompress(e.datasetExport(), serializedRecords); - // Upload stream contents - - storageBackend - .write(e.targetArchiveUri(), compressedRecords) - .timeout(30, TimeUnit.SECONDS) - .doOnError(throwable -> log.error("Upload failed: %s".formatted(e.targetArchiveUri()), throwable)) - .subscribe(() -> { - e.report().setElapsedMillis(stopwatch.stop().elapsed(TimeUnit.MILLISECONDS)); - log.info("Successful upload: %s".formatted(e.targetArchiveUri())); - uploadExportReport(e.report(), e.targetRootLocation()); - }); - } - - void uploadExportReport(DatasetExportReport report, String targetRootLocation) { - String targetUri = targetFileLocationOf(targetRootLocation, ".export-meta.json"); - try { - storageBackend.write(targetUri, Json.prettyFrom(report).getBytes(StandardCharsets.UTF_8)); - } catch (IOException e) { - log.error("Error uploading export report to " + targetUri, e); - } - } - - /** - * Encrypt and compress serialized records to temporary file storage - */ - Flowable encryptAndCompress(DatasetExport e, Flowable serializedRecords) { - Stopwatch stopwatch = Stopwatch.createStarted(); - Zips.ZipOptions zipOptions = zipOpts() - .password(e.getCompression().getPassword()) - .encryptionMethod(e.getCompression().getEncryption()) - .build(); - log.debug("Compress and encrypt serialized stream to temporary file. Encryption type: %s, Content name: %s. 
This can take some time...".formatted(e.getCompression().getEncryption(), e.getTargetContentName())); - Flowable compressedRecords = Zips.zip(serializedRecords, archiveFilenameOf(e.getTargetContentName(), e.getTargetContentType()), zipOptions); - log.debug("Compression/encryption done in %s".formatted(stopwatch.stop().elapsed())); - return compressedRecords; - } - - String targetRootLocationOf(DatasetUri datasetPath) { - return PathJoiner.joinWithoutLeadingOrTrailingSlash( - exportConfig.getDefaultTargetRoot(), - datasetPath.getPath(), - "" + System.currentTimeMillis() - ); - } - - String targetFileLocationOf(String rootLocation, String filename) { - return PathJoiner.joinWithoutLeadingOrTrailingSlash( - rootLocation, - filename - ); - } - - String archiveFilenameOf(String contentName, MediaType contentType) { - String timestamp = DateTimeFormatter.ofPattern("yyyyMMdd") - .withZone(ZoneOffset.UTC) - .format(Instant.now()); - contentName = ALLOWED_FILENAME_CHARACTERS.retainFrom(contentName); - - return "%s-%s.%s".formatted(timestamp, contentName, contentType.getExtension().toLowerCase()); - } - - FieldInterceptor initPseudoInterceptor(DatasetExport e, DatasetMeta datasetMeta, DatasetExportReport report) { - if (!e.getDepseudonymize()) { - return FieldInterceptor.noOp(); - } - - final List pseudoRules; - - // Use any explicitly specified pseudo rules - if (e.getPseudoRules() != null && !e.getPseudoRules().isEmpty()) { - log.debug("Pseudo rules were explicitly specified"); - pseudoRules = e.getPseudoRules(); - } - - // Or try to retrieve pseudo rules from another dataset's metadata file if this has been specified - else if (e.getPseudoRulesDataset() != null) { - log.debug("Retrieve pseudo rules from explicitly specified dataset path {}", e.getPseudoRulesDataset()); - PseudoConfig pseudoConfig = datasetMetaService.readDatasetPseudoConfig(e.getPseudoRulesDataset().legacyDatasetUri()); - pseudoRules = pseudoConfig.getRules(); - } - // Or try to retrieve the pseudo rules from the source dataset's metadata (if present) - else if (e.getSourceDataset() != null) { - log.debug("Retrieve pseudo rules from source dataset at {}", e.getSourceDataset()); - PseudoConfig pseudoConfig = datasetMetaService.pseudoConfigOf(datasetMeta); - pseudoRules = pseudoConfig.getRules(); - } - - // Else proceed without any pseudo rules - else { - pseudoRules = List.of(); - } - - if (e.getDepseudonymize() && pseudoRules.isEmpty()) { - throw new NoPseudoRulesFoundException("no pseudonymization rules found - unable to depseudonymize dataset %s".formatted(e.getSourceDataset())); - } else { - log.info("Pseudo rules: {}", pseudoRules); - } - - report.setAppliedPseudoRules(pseudoRules); - PseudoFuncs pseudoFuncs = new PseudoFuncs(pseudoRules, pseudoSecrets.resolve(), List.of()); - return new FieldPseudoInterceptor(pseudoFuncs, PseudoOperation.DEPSEUDONYMIZE); - } - - @Data - @NoArgsConstructor - @AllArgsConstructor - public static class DatasetUri { - @NonNull - private String root; - @NonNull - private String path; - @NonNull - private String version; - - no.ssb.dapla.dataset.uri.DatasetUri legacyDatasetUri() { - return no.ssb.dapla.dataset.uri.DatasetUri.of(root, path, version); - } - } - - @Data - @Builder - @Introspected - static class DatasetExport { - private String userId; - - @NotNull - private DatasetUri sourceDataset; - private Set columnSelectors; - - @NotNull - private Compression compression; - - private Boolean depseudonymize; - - private List pseudoRules; - private DatasetUri pseudoRulesDataset; - - private String 
targetContentName; - private MediaType targetContentType; - } - - @Value - @Builder - @Accessors(fluent = true) - public static class ExportEvent { - @NonNull DatasetUri sourceDataset; - DatasetMeta sourceDatasetMeta; - @NonNull DatasetExport datasetExport; - @NonNull DatasetExportReport report; - @NonNull String targetRootLocation; - @NonNull String targetArchiveUri; - } - - @Data - @Builder - static class DatasetExportResult { - private String targetUri; - } - - @Data - static class DatasetExportReport { - public DatasetExportReport(DatasetExport e) { - this.userId = e.getUserId(); - this.exportTimestamp = Instant.now().toString(); - this.depseudonymize = e.getDepseudonymize(); - this.sourceDataset = e.getSourceDataset(); - } - - private final DatasetUri sourceDataset; - private final String userId; - private final String exportTimestamp; - private Long elapsedMillis; - private boolean depseudonymize; - private List appliedPseudoRules; - } - - public static class ExportServiceException extends RuntimeException { - public ExportServiceException(String message) { - super(message); - } - } - - public static class DatasetNotFoundException extends ExportServiceException { - public DatasetNotFoundException(String message) { - super(message); - } - } - - public static class NoPseudoRulesFoundException extends ExportServiceException { - public NoPseudoRulesFoundException(String message) { - super(message); - } - } - -} diff --git a/src/main/java/no/ssb/dlp/pseudo/service/pseudo/PseudoController.java b/src/main/java/no/ssb/dlp/pseudo/service/pseudo/PseudoController.java index 216f31a..b02d890 100644 --- a/src/main/java/no/ssb/dlp/pseudo/service/pseudo/PseudoController.java +++ b/src/main/java/no/ssb/dlp/pseudo/service/pseudo/PseudoController.java @@ -22,7 +22,6 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import no.ssb.dapla.dlp.pseudo.func.PseudoFuncFactory; -import no.ssb.dapla.storage.client.backend.gcs.GoogleCloudStorageBackend; import no.ssb.dlp.pseudo.core.PseudoOperation; import no.ssb.dlp.pseudo.core.StreamProcessor; import no.ssb.dlp.pseudo.core.exception.NoSuchPseudoKeyException; @@ -64,7 +63,6 @@ public class PseudoController { private final StreamProcessorFactory streamProcessorFactory; private final RecordMapProcessorFactory recordProcessorFactory; - private final GoogleCloudStorageBackend storageBackend; private final PseudoConfigSplitter pseudoConfigSplitter; /** @@ -133,16 +131,6 @@ public HttpResponse pseudonymizeFile(@Schema(implementation = PseudoRe RecordMapProcessor recordProcessor = recordProcessorFactory.newPseudonymizeRecordProcessor(pseudoConfigs); ProcessFileResult res = processFile(data, PseudoOperation.PSEUDONYMIZE, recordProcessor, req.getTargetContentType(), req.getCompression()); Flowable file = res.getFlowable(); - if (req.getTargetUri() != null) { - URI targetUri = req.getTargetUri(); - Completable fileUpload = storageBackend - .write(targetUri.toString(), file) - .timeout(30, TimeUnit.SECONDS) - .doOnError(throwable -> log.error("Upload failed: %s".formatted(targetUri), throwable)) - .doOnComplete(() -> log.info("Successful upload: %s".formatted(targetUri))); - return HttpResponse.ok(fileUpload.toFlowable()); - } - return HttpResponse.ok(file).contentType(res.getTargetContentType()); } catch (RuntimeException e) { @@ -192,16 +180,6 @@ public HttpResponse depseudonymizeFile(@Schema(implementation = Pseudo RecordMapProcessor recordProcessor = recordProcessorFactory.newDepseudonymizeRecordProcessor(pseudoConfigs); ProcessFileResult res 
= processFile(data, PseudoOperation.DEPSEUDONYMIZE, recordProcessor, req.getTargetContentType(), req.getCompression()); Flowable file = res.getFlowable(); - if (req.getTargetUri() != null) { - URI targetUri = req.getTargetUri(); - Completable fileUpload = storageBackend - .write(targetUri.toString(), file) - .timeout(30, TimeUnit.SECONDS) - .doOnError(throwable -> log.error("Upload failed: %s".formatted(targetUri), throwable)) - .doOnComplete(() -> log.info("Successful upload: %s".formatted(targetUri))); - return HttpResponse.ok(fileUpload.toFlowable()); - } - return HttpResponse.ok(file).contentType(res.getTargetContentType()); } catch (Exception e) { log.error(String.format("Failed to depseudonymize:%nrequest:%n%s", request), e); @@ -249,16 +227,6 @@ public HttpResponse repseudonymizeFile(@Schema(implementation = Repseu RecordMapProcessor recordProcessor = recordProcessorFactory.newRepseudonymizeRecordProcessor(req.getSourcePseudoConfig(), req.getTargetPseudoConfig()); ProcessFileResult res = processFile(data, PseudoOperation.REPSEUDONYMIZE, recordProcessor, req.getTargetContentType(), req.getCompression()); Flowable file = res.getFlowable(); - if (req.getTargetUri() != null) { - URI targetUri = req.getTargetUri(); - Completable fileUpload = storageBackend - .write(targetUri.toString(), file) - .timeout(30, TimeUnit.SECONDS) - .doOnError(throwable -> log.error("Upload failed: %s".formatted(targetUri), throwable)) - .doOnComplete(() -> log.info("Successful upload: %s".formatted(targetUri))); - return HttpResponse.ok(fileUpload.toFlowable()); - } - return HttpResponse.ok(file).contentType(res.getTargetContentType()); } catch (Exception e) { log.error(String.format("Failed to repseudonymize:%nrequest:%n%s", request), e); @@ -368,14 +336,6 @@ public static class PseudoRequest { */ private PseudoConfig pseudoConfig; - /** - * Specify this if you want to stream the result to a specific location such as a GCS bucket. Note that the pseudo - * service needs to have access to the bucket. Leave this unspecified in order to just stream the result back to - * the client. - */ - @Schema(implementation = String.class) - private URI targetUri; - /** * The content type of the resulting file. */ @@ -414,14 +374,6 @@ public static class RepseudoRequest { */ private PseudoConfig targetPseudoConfig; - /** - * Specify this if you want to stream the result to a specific location such as a GCS bucket. Note that the pseudo - * service needs to have access to the bucket. Leave this unspecified in order to just stream the result back to - * the client. - */ - @Schema(implementation = String.class) - private URI targetUri; - /** * The content type of the resulting file. 
*/ diff --git a/src/main/java/no/ssb/dlp/pseudo/service/storage/DatasetStorageFactory.java b/src/main/java/no/ssb/dlp/pseudo/service/storage/DatasetStorageFactory.java deleted file mode 100644 index 939bdcc..0000000 --- a/src/main/java/no/ssb/dlp/pseudo/service/storage/DatasetStorageFactory.java +++ /dev/null @@ -1,22 +0,0 @@ -package no.ssb.dlp.pseudo.service.storage; - -import io.micronaut.context.annotation.Factory; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import no.ssb.dapla.storage.client.DatasetStorage; -import no.ssb.dapla.storage.client.backend.gcs.GoogleCloudStorageBackend; - -import javax.inject.Singleton; - -@Factory -@RequiredArgsConstructor -@Slf4j -public class DatasetStorageFactory { - - private final GoogleCloudStorageBackend googleCloudStorageBackend; - - @Singleton - public DatasetStorage datasetStorage() { - return DatasetStorage.builder().withBinaryBackend(googleCloudStorageBackend).build(); - } -} diff --git a/src/main/java/no/ssb/dlp/pseudo/service/storage/GoogleCloudStorageBackendFactory.java b/src/main/java/no/ssb/dlp/pseudo/service/storage/GoogleCloudStorageBackendFactory.java deleted file mode 100644 index a363639..0000000 --- a/src/main/java/no/ssb/dlp/pseudo/service/storage/GoogleCloudStorageBackendFactory.java +++ /dev/null @@ -1,42 +0,0 @@ -package no.ssb.dlp.pseudo.service.storage; - -import io.micronaut.context.annotation.Factory; -import io.micronaut.context.annotation.Property; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import no.ssb.dapla.storage.client.backend.gcs.GoogleCloudStorageBackend; -import no.ssb.dapla.storage.client.backend.gcs.GoogleCloudStorageBackend.Configuration; - -import javax.annotation.Nullable; -import javax.inject.Singleton; -import java.nio.file.Files; -import java.nio.file.Path; - -@Factory -@RequiredArgsConstructor -@Slf4j -public class GoogleCloudStorageBackendFactory { - - @Nullable - @Property(name = "storage.gcs-service-account-file") - private Path serviceAccountCredentials; - - @Singleton - public GoogleCloudStorageBackend googleCloudStorageBackend() { - Configuration configuration = new Configuration(); - if (serviceAccountCredentials != null) { - if (Files.notExists(serviceAccountCredentials)) { - throw new StorageBackendException("Could not find service account credentials: " + serviceAccountCredentials); - } - configuration.setServiceAccountCredentials(serviceAccountCredentials); - } - return new GoogleCloudStorageBackend(configuration); - } - - public static class StorageBackendException extends RuntimeException { - public StorageBackendException(String message) { - super(message); - } - } - -}