From c999477269fa3a84574c9b8da9cd4d9d422fac72 Mon Sep 17 00:00:00 2001 From: Ciaran Morinan <37743469+CiaranMn@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:08:19 +0000 Subject: [PATCH] H-3618: Add GitHub action step to authenticate with Google Cloud, for `chonky` tests (#5687) Co-authored-by: JesusFileto <91223391+JesusFileto@users.noreply.github.com> Co-authored-by: Jesus Fileto --- .dockerignore | 3 ++ .github/workflows/test.yml | 37 +++++++++++++++++++ .gitignore | 3 ++ .markdownlintignore | 3 ++ .../src/embedding/multi_modal_embedding.rs | 29 +++++++++++++-- 5 files changed, 71 insertions(+), 4 deletions(-) diff --git a/.dockerignore b/.dockerignore index 327fbd7d728..585fb20a4de 100644 --- a/.dockerignore +++ b/.dockerignore @@ -89,6 +89,9 @@ blocks/**/.env .env.local .env.*.local +# Generated credentials from google-github-actions/auth +gha-creds-*.json + # macOS directory file **/.DS_Store diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e0732d6ef2b..8605189c746 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -105,6 +105,11 @@ jobs: fail-fast: false if: needs.setup.outputs.unit-tests != '{"package":[],"include":[]}' runs-on: ubuntu-24.04 + permissions: + # Required to fetch an OIDC token, used to auth with Google Cloud Platform for @rust/chonky tests + id-token: "write" + # Maintain permission to read repo contents + contents: "read" steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -148,6 +153,15 @@ jobs: rm -rf $temp_dir echo "PDFIUM_DYNAMIC_LIB_PATH=$(pwd)/${{ matrix.directory }}/libs/" >> $GITHUB_ENV + # Sets GOOGLE_APPLICATION_CREDENTIALS in the environment, to be consumed by gcloud or client libraries + - name: Generate Google Cloud credential configuration + if: matrix.package == '@rust/chonky' + uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f # v2.1.7 + with: + project_id: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} 
+ service_account: ${{ secrets.GOOGLE_CLOUD_VERTEX_SERVICE_ACCOUNT }} + workload_identity_provider: ${{ secrets.GOOGLE_CLOUD_IDENTITY_PROVIDER }} + - name: Install Rust toolchain if: always() && steps.tests.outputs.has-rust == 'true' uses: ./.github/actions/install-rust-toolchain @@ -178,6 +192,10 @@ jobs: continue-on-error: ${{ steps.tests.outputs.allow-failure == 'true' }} env: TEST_COVERAGE: ${{ github.event_name != 'merge_group' }} + # Variables needed for chonky tests + GOOGLE_PROJECT_ID: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + GOOGLE_DEFAULT_CREDENTIALS: ${{ env.GOOGLE_DEFAULT_CREDENTIALS }} # set by google-github-actions/auth + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} run: | turbo run test:unit --env-mode=loose --filter "${{ matrix.package }}" echo "TRIMMED_PACKAGE_NAME=$(echo "${{ matrix.package }}" | sed 's|@||g' | sed 's|/|.|g')" >> $GITHUB_ENV @@ -225,6 +243,11 @@ jobs: fail-fast: false if: needs.setup.outputs.integration-tests != '{"package":[],"include":[]}' runs-on: ubuntu-24.04 + permissions: + # Required to fetch an OIDC token, used to auth with Google Cloud Platform for @rust/chonky tests + id-token: "write" + # Maintain permission to read repo contents + contents: "read" steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -293,6 +316,15 @@ jobs: rm -rf $temp_dir echo "PDFIUM_DYNAMIC_LIB_PATH=$(pwd)/${{ matrix.directory }}/libs/" >> $GITHUB_ENV + # Sets GOOGLE_APPLICATION_CREDENTIALS in the environment, to be consumed by gcloud or client libraries + - name: Generate Google Cloud credential configuration + if: matrix.package == '@rust/chonky' + uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f # v2.1.7 + with: + project_id: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + service_account: ${{ secrets.GOOGLE_CLOUD_VERTEX_SERVICE_ACCOUNT }} + workload_identity_provider: ${{ secrets.GOOGLE_CLOUD_IDENTITY_PROVIDER }} + - name: Install playwright if: matrix.package == 
'@tests/hash-playwright' uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 @@ -350,6 +382,11 @@ jobs: - name: Run tests continue-on-error: ${{ steps.tests.outputs.allow-failure == 'true' }} + env: + # Variables needed for chonky tests + GOOGLE_PROJECT_ID: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + GOOGLE_DEFAULT_CREDENTIALS: ${{ env.GOOGLE_DEFAULT_CREDENTIALS }} # set by google-github-actions/auth + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} run: | turbo run test:integration --env-mode=loose --filter "${{ matrix.package }}" echo "TRIMMED_PACKAGE_NAME=$(echo "${{ matrix.package }}" | sed 's|@||g' | sed 's|/|.|g')" >> $GITHUB_ENV diff --git a/.gitignore b/.gitignore index a4d01ae697c..cb93555d008 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,9 @@ blocks/**/.env .env.local .env.*.local +# Generated credentials from google-github-actions/auth +gha-creds-*.json + # macOS directory file **/.DS_Store diff --git a/.markdownlintignore b/.markdownlintignore index 4ef93d4f6e2..39fafbff4d9 100644 --- a/.markdownlintignore +++ b/.markdownlintignore @@ -82,6 +82,9 @@ blocks/**/.env .env.local .env.*.local +# Generated credentials from google-github-actions/auth +gha-creds-*.json + # macOS directory file **/.DS_Store diff --git a/libs/chonky/src/embedding/multi_modal_embedding.rs b/libs/chonky/src/embedding/multi_modal_embedding.rs index 05a63b865dd..d510acf3819 100644 --- a/libs/chonky/src/embedding/multi_modal_embedding.rs +++ b/libs/chonky/src/embedding/multi_modal_embedding.rs @@ -210,8 +210,13 @@ pub async fn make_multimodal_api_request( if !response.status().is_success() { return Err( Report::new(ChonkyError::VertexAPI).attach_printable(format!( - "Received the error code {} in the response status", - response.status() + "Received the error code {} in the response status with error text {:?}", + response.status(), + response + .error_for_status() + .change_context(ChonkyError::VertexAPI)? 
+ .text() + .await, )), ); } @@ -289,6 +294,7 @@ pub fn add_structural_embedding( #[cfg(test)] mod tests { use insta::{assert_binary_snapshot, assert_snapshot}; + use serde_json::to_string_pretty; use tokio::fs; use super::*; @@ -301,7 +307,17 @@ mod tests { .await .change_context(ChonkyError::ImageError)?; // source of truth found by decoding base64 encoding to get same image - assert_binary_snapshot!("page_1.json", base64_json(image_data).to_string().into()); + // must use to_string_pretty since there is autoformatting done by compiler with addition of + // newline + assert_binary_snapshot!( + "page_1.json", + format!( + "{}\n", + to_string_pretty(&base64_json(image_data)) + .change_context(ChonkyError::ImageError)? + ) + .into() + ); Ok(()) } @@ -327,8 +343,13 @@ mod tests { .await .change_context(ChonkyError::ImageError)?; + //project id + + let project_id = + std::env::var("GOOGLE_PROJECT_ID").change_context(ChonkyError::VertexAPI)?; + let test_embedding = - make_multimodal_api_request("hash-embed", Some(image_data), None).await?; + make_multimodal_api_request(&project_id, Some(image_data), None).await?; //find cosine similarity of vectors