From 8a9c7aed884d3a3303e13a80ad70b217cc604bf2 Mon Sep 17 00:00:00 2001 From: tefirman Date: Wed, 15 Jan 2025 21:02:41 -0800 Subject: [PATCH 01/21] Adding cache test WDL and GitHub Action --- .github/workflows/test-cromwell-cache.yml | 68 +++++++++++++++++++++++ cacheTest/cacheTest.wdl | 51 +++++++++++++++++ cacheTest/inputs.json | 3 + 3 files changed, 122 insertions(+) create mode 100644 .github/workflows/test-cromwell-cache.yml create mode 100644 cacheTest/cacheTest.wdl create mode 100644 cacheTest/inputs.json diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml new file mode 100644 index 0000000..461a4f3 --- /dev/null +++ b/.github/workflows/test-cromwell-cache.yml @@ -0,0 +1,68 @@ +name: Validate Cromwell Caching + +on: + pull_request: + workflow_dispatch: + +jobs: + validate-cromwell-cache: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Java + uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + + - name: Download Cromwell + run: | + wget https://github.com/broadinstitute/cromwell/releases/download/85/cromwell-85.jar + + - name: First run + id: first-run + run: | + echo "Running first execution..." + start_time=$(date +%s) + java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "first_duration=$duration" >> $GITHUB_OUTPUT + # Extract execution ID from the output file + execution_id=$(grep "Execution ID:" output.txt | cut -d' ' -f3) + echo "first_execution_id=$execution_id" >> $GITHUB_OUTPUT + + - name: Second run (should use cache) + id: second-run + run: | + echo "Running second execution..." 
+ start_time=$(date +%s) + java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "second_duration=$duration" >> $GITHUB_OUTPUT + # Extract execution ID from the output file + execution_id=$(grep "Execution ID:" output.txt | cut -d' ' -f3) + echo "second_execution_id=$execution_id" >> $GITHUB_OUTPUT + + - name: Validate caching behavior + run: | + echo "First run duration: ${{ steps.first-run.outputs.first_duration }} seconds" + echo "Second run duration: ${{ steps.second-run.outputs.second_duration }} seconds" + echo "First execution ID: ${{ steps.first-run.outputs.first_execution_id }}" + echo "Second execution ID: ${{ steps.second-run.outputs.second_execution_id }}" + + # Verify execution IDs match (indicating cache was used) + if [ "${{ steps.first-run.outputs.first_execution_id }}" != "${{ steps.second-run.outputs.second_execution_id }}" ]; then + echo "::error::Cache validation failed! Execution IDs don't match between runs." + exit 1 + fi + + # Verify second run was significantly faster (< 50% of first run time) + if [ ${{ steps.second-run.outputs.second_duration }} -gt $(( ${{ steps.first-run.outputs.first_duration }} / 2 )) ]; then + echo "::error::Cache validation failed! Second run took too long (${steps.second-run.outputs.second_duration}s), suggesting cache wasn't used." + exit 1 + fi + + echo "Cache validation passed! 
✅" diff --git a/cacheTest/cacheTest.wdl b/cacheTest/cacheTest.wdl new file mode 100644 index 0000000..b7e124f --- /dev/null +++ b/cacheTest/cacheTest.wdl @@ -0,0 +1,51 @@ +version 1.0 + +workflow CacheTest { + input { + String message + Int sleep_time = 5 + } + + call GenerateTimestamp { + input: + input_message = message, + sleep_seconds = sleep_time + } + + output { + File output_file = GenerateTimestamp.timestamp_file + String execution_id = GenerateTimestamp.execution_id + } +} + +task GenerateTimestamp { + input { + String input_message + Int sleep_seconds + } + + command <<< + # Sleep to make the task take a noticeable amount of time + sleep ~{sleep_seconds} + + # Generate a unique execution ID + execution_id=$(date +%s%N) + echo "Execution ID: $execution_id" + + # Create output with timestamp and message + echo "Message: ~{input_message}" > output.txt + echo "Timestamp: $(date)" >> output.txt + echo "Execution ID: $execution_id" >> output.txt + >>> + + output { + File timestamp_file = "output.txt" + String execution_id = read_string(stdout()) + } + + runtime { + docker: "ubuntu:latest" + cpu: 1 + memory: "1 GB" + } +} diff --git a/cacheTest/inputs.json b/cacheTest/inputs.json new file mode 100644 index 0000000..44be999 --- /dev/null +++ b/cacheTest/inputs.json @@ -0,0 +1,3 @@ +{ + "CacheTest.message": "This WDL is intended to test caching functionality, we'll see how it works..." 
+} \ No newline at end of file From 89b5331fd23e69a271aa0a22b484f37d3a0cb4e8 Mon Sep 17 00:00:00 2001 From: tefirman Date: Wed, 15 Jan 2025 21:13:05 -0800 Subject: [PATCH 02/21] Adding options.json to cacheTest --- .github/workflows/test-cromwell-cache.yml | 4 ++-- cacheTest/options.json | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 cacheTest/options.json diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 461a4f3..0f02719 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -25,7 +25,7 @@ jobs: run: | echo "Running first execution..." start_time=$(date +%s) - java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json + java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT @@ -38,7 +38,7 @@ jobs: run: | echo "Running second execution..." 
start_time=$(date +%s) - java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json + java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT diff --git a/cacheTest/options.json b/cacheTest/options.json new file mode 100644 index 0000000..d03a5ef --- /dev/null +++ b/cacheTest/options.json @@ -0,0 +1,5 @@ +{ + "workflow_failure_mode": "ContinueWhilePossible", + "write_to_cache": true, + "read_from_cache": true +} From 459177f755d9a5300f3b10404c9c1261c2851619 Mon Sep 17 00:00:00 2001 From: tefirman Date: Wed, 15 Jan 2025 21:26:18 -0800 Subject: [PATCH 03/21] Identifying output.txt file after each task --- .github/workflows/test-cromwell-cache.yml | 29 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 0f02719..1fbb7da 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -29,9 +29,17 @@ jobs: end_time=$(date +%s) duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT - # Extract execution ID from the output file - execution_id=$(grep "Execution ID:" output.txt | cut -d' ' -f3) + + # Find the most recent output file in cromwell-executions + output_file=$(find cromwell-executions -name "output.txt" | sort -r | head -n 1) + if [ -z "$output_file" ]; then + echo "::error::Could not find output.txt in cromwell-executions" + exit 1 + fi + execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) echo "first_execution_id=$execution_id" >> $GITHUB_OUTPUT + echo "First run output file: $output_file" + cat "$output_file" - name: Second run (should use cache) id: second-run @@ -42,9 +50,17 @@ jobs: end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" 
>> $GITHUB_OUTPUT - # Extract execution ID from the output file - execution_id=$(grep "Execution ID:" output.txt | cut -d' ' -f3) + + # Find the most recent output file in cromwell-executions + output_file=$(find cromwell-executions -name "output.txt" | sort -r | head -n 1) + if [ -z "$output_file" ]; then + echo "::error::Could not find output.txt in cromwell-executions" + exit 1 + fi + execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) echo "second_execution_id=$execution_id" >> $GITHUB_OUTPUT + echo "Second run output file: $output_file" + cat "$output_file" - name: Validate caching behavior run: | @@ -53,6 +69,11 @@ jobs: echo "First execution ID: ${{ steps.first-run.outputs.first_execution_id }}" echo "Second execution ID: ${{ steps.second-run.outputs.second_execution_id }}" + if [ -z "${{ steps.first-run.outputs.first_execution_id }}" ] || [ -z "${{ steps.second-run.outputs.second_execution_id }}" ]; then + echo "::error::Failed to extract execution IDs from output files" + exit 1 + fi + # Verify execution IDs match (indicating cache was used) if [ "${{ steps.first-run.outputs.first_execution_id }}" != "${{ steps.second-run.outputs.second_execution_id }}" ]; then echo "::error::Cache validation failed! Execution IDs don't match between runs." From 4666037f7318b78f3e59755cedd70b6e861ead9b Mon Sep 17 00:00:00 2001 From: tefirman Date: Wed, 15 Jan 2025 21:41:06 -0800 Subject: [PATCH 04/21] Identifying output.txt file after each task --- .github/workflows/test-cromwell-cache.yml | 30 ++++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 1fbb7da..903114b 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -25,19 +25,25 @@ jobs: run: | echo "Running first execution..." 
start_time=$(date +%s) - java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json + # Capture Cromwell's output to get workflow ID + workflow_output=$(java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json) end_time=$(date +%s) duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT - # Find the most recent output file in cromwell-executions - output_file=$(find cromwell-executions -name "output.txt" | sort -r | head -n 1) + # Extract workflow ID from Cromwell output + workflow_id=$(echo "$workflow_output" | grep "started workflow" | sed 's/.*workflow \([^)]*\).*/\1/') + echo "First workflow ID: $workflow_id" + + # Find the output file for this specific workflow + output_file=$(find cromwell-executions/CacheTest/"$workflow_id" -name "output.txt") if [ -z "$output_file" ]; then - echo "::error::Could not find output.txt in cromwell-executions" + echo "::error::Could not find output.txt for workflow $workflow_id" exit 1 fi execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) echo "first_execution_id=$execution_id" >> $GITHUB_OUTPUT + echo "first_workflow_id=$workflow_id" >> $GITHUB_OUTPUT echo "First run output file: $output_file" cat "$output_file" @@ -46,19 +52,25 @@ jobs: run: | echo "Running second execution..." 
start_time=$(date +%s) - java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json + # Capture Cromwell's output to get workflow ID + workflow_output=$(java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json) end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT - # Find the most recent output file in cromwell-executions - output_file=$(find cromwell-executions -name "output.txt" | sort -r | head -n 1) + # Extract workflow ID from Cromwell output + workflow_id=$(echo "$workflow_output" | grep "started workflow" | sed 's/.*workflow \([^)]*\).*/\1/') + echo "Second workflow ID: $workflow_id" + + # Find the output file for this specific workflow + output_file=$(find cromwell-executions/CacheTest/"$workflow_id" -name "output.txt") if [ -z "$output_file" ]; then - echo "::error::Could not find output.txt in cromwell-executions" + echo "::error::Could not find output.txt for workflow $workflow_id" exit 1 fi execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) echo "second_execution_id=$execution_id" >> $GITHUB_OUTPUT + echo "second_workflow_id=$workflow_id" >> $GITHUB_OUTPUT echo "Second run output file: $output_file" cat "$output_file" @@ -86,4 +98,4 @@ jobs: exit 1 fi - echo "Cache validation passed! ✅" + echo "Cache validation passed!" 
From 1914d57948568172d8986cb139f018bff065b537 Mon Sep 17 00:00:00 2001 From: tefirman Date: Wed, 15 Jan 2025 21:55:53 -0800 Subject: [PATCH 05/21] Identifying output.txt file after each task --- .github/workflows/test-cromwell-cache.yml | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 903114b..04aef99 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -25,20 +25,20 @@ jobs: run: | echo "Running first execution..." start_time=$(date +%s) - # Capture Cromwell's output to get workflow ID - workflow_output=$(java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json) + java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT - # Extract workflow ID from Cromwell output - workflow_id=$(echo "$workflow_output" | grep "started workflow" | sed 's/.*workflow \([^)]*\).*/\1/') + # Find the most recent workflow directory + workflow_dir=$(ls -td cromwell-executions/CacheTest/*/ | head -1) + workflow_id=$(basename "$workflow_dir") echo "First workflow ID: $workflow_id" - # Find the output file for this specific workflow - output_file=$(find cromwell-executions/CacheTest/"$workflow_id" -name "output.txt") - if [ -z "$output_file" ]; then - echo "::error::Could not find output.txt for workflow $workflow_id" + # Find the output file + output_file="$workflow_dir/call-GenerateTimestamp/execution/output.txt" + if [ ! -f "$output_file" ]; then + echo "::error::Could not find output.txt at $output_file" exit 1 fi execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) @@ -52,20 +52,20 @@ jobs: run: | echo "Running second execution..." 
start_time=$(date +%s) - # Capture Cromwell's output to get workflow ID - workflow_output=$(java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json) + java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT - # Extract workflow ID from Cromwell output - workflow_id=$(echo "$workflow_output" | grep "started workflow" | sed 's/.*workflow \([^)]*\).*/\1/') + # Find the most recent workflow directory (different from the first one) + workflow_dir=$(ls -td cromwell-executions/CacheTest/*/ | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1) + workflow_id=$(basename "$workflow_dir") echo "Second workflow ID: $workflow_id" - # Find the output file for this specific workflow - output_file=$(find cromwell-executions/CacheTest/"$workflow_id" -name "output.txt") - if [ -z "$output_file" ]; then - echo "::error::Could not find output.txt for workflow $workflow_id" + # Find the output file + output_file="$workflow_dir/call-GenerateTimestamp/execution/output.txt" + if [ ! 
-f "$output_file" ]; then + echo "::error::Could not find output.txt at $output_file" exit 1 fi execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) From cae2d065ec2e7585d8b4a5c017b788da7b95fcb3 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 11:38:18 -0800 Subject: [PATCH 06/21] Adding initial version of config file for cache testing --- cacheTest/cromwell.conf | 83 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 cacheTest/cromwell.conf diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf new file mode 100644 index 0000000..97f6865 --- /dev/null +++ b/cacheTest/cromwell.conf @@ -0,0 +1,83 @@ +include required(classpath("application")) + +backend { + default = "LocalExample" + providers { + LocalExample { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + runtime-attributes = """ + Int? cpu + Int? memory_mb + String? docker + """ + + submit-docker = """ + docker run \ + --cidfile ${cwd}/docker_cid \ + -i \ + --rm \ + -v ${cwd}:${docker_cwd} \ + --entrypoint /bin/bash \ + ${docker} \ + -c "${script}" + """ + + # Enable call caching + call-caching { + enabled = true + invalidate-bad-cache-results = true + } + + # File system configuration + filesystem { + local { + localization: [ + "hard-link", "soft-link", "copy" + ] + caching { + duplication-strategy: [ + "hard-link", "soft-link", "copy" + ] + } + } + } + + concurrent-job-limit = 5 + + # Docker-specific configurations + docker { + hash-lookup { + enabled = true + # Set to false if you want to disable docker image caching + caching { + enabled = true + duplication-strategy = ["hard-link", "copy"] + } + } + + # Kill containers after a timeout + kill-after = 5 minutes + } + } + } + } +} + +call-caching { + enabled = true + invalidate-bad-cache-results = true + # How long to keep cache entries (30 days) + ttl = 2592000 +} + +# Database configuration +database { + profile = 
"slick.jdbc.HsqldbProfile$" + db { + driver = "org.hsqldb.jdbcDriver" + url = "jdbc:hsqldb:file:cromwell-db;shutdown=false;hsqldb.tx=mvcc" + connectionTimeout = 120000 + numThreads = 1 + } +} \ No newline at end of file From 41f824ca0dbca47211c36f5f43188239bbdbeb79 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 11:43:53 -0800 Subject: [PATCH 07/21] Updating cache test cromwell config --- cacheTest/cromwell.conf | 56 +++++++++++------------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index 97f6865..013ab60 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -13,22 +13,19 @@ backend { """ submit-docker = """ - docker run \ - --cidfile ${cwd}/docker_cid \ - -i \ - --rm \ - -v ${cwd}:${docker_cwd} \ - --entrypoint /bin/bash \ - ${docker} \ - -c "${script}" + docker run \ + --cidfile ${cwd}/docker_cid \ + -i \ + --rm \ + -v ${cwd}:${docker_cwd} \ + -w ${docker_cwd} \ + ${docker} \ + /bin/bash ${script} """ - # Enable call caching - call-caching { - enabled = true - invalidate-bad-cache-results = true - } - + # The script is written to this path in the container + script-epilogue = "sleep 1" + # File system configuration filesystem { local { @@ -43,21 +40,9 @@ backend { } } - concurrent-job-limit = 5 - - # Docker-specific configurations - docker { - hash-lookup { - enabled = true - # Set to false if you want to disable docker image caching - caching { - enabled = true - duplication-strategy = ["hard-link", "copy"] - } - } - - # Kill containers after a timeout - kill-after = 5 minutes + call-caching { + enabled = true + invalidate-bad-cache-results = true } } } @@ -67,17 +52,4 @@ backend { call-caching { enabled = true invalidate-bad-cache-results = true - # How long to keep cache entries (30 days) - ttl = 2592000 -} - -# Database configuration -database { - profile = "slick.jdbc.HsqldbProfile$" - db { - driver = "org.hsqldb.jdbcDriver" - url = 
"jdbc:hsqldb:file:cromwell-db;shutdown=false;hsqldb.tx=mvcc" - connectionTimeout = 120000 - numThreads = 1 - } } \ No newline at end of file From 8de27fa60b4dc2292f6241d4d426eeee39c0f951 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 11:46:08 -0800 Subject: [PATCH 08/21] Updating cache test cromwell config --- cacheTest/cromwell.conf | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index 013ab60..c1b7560 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -11,35 +11,37 @@ backend { Int? memory_mb String? docker """ + + submit = """ + ${job_shell} ${script} + """ submit-docker = """ - docker run \ - --cidfile ${cwd}/docker_cid \ - -i \ - --rm \ - -v ${cwd}:${docker_cwd} \ - -w ${docker_cwd} \ - ${docker} \ - /bin/bash ${script} + docker run \ + --rm \ + -v ${cwd}:${docker_cwd} \ + -w ${docker_cwd} \ + ${docker} \ + /bin/bash ${docker_script} """ - # The script is written to this path in the container - script-epilogue = "sleep 1" - - # File system configuration + # File system settings filesystem { local { localization: [ "hard-link", "soft-link", "copy" ] - caching { - duplication-strategy: [ - "hard-link", "soft-link", "copy" - ] - } } } + # Docker configuration + docker { + hash-lookup { + enabled = false + } + } + + # Enable call caching call-caching { enabled = true invalidate-bad-cache-results = true From 9781f4b9f4ba020f0a049acd9a1081cedbea8b8f Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 12:01:43 -0800 Subject: [PATCH 09/21] Updating cache test cromwell config --- cacheTest/cromwell.conf | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index c1b7560..385ffe9 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -6,6 +6,9 @@ backend { LocalExample { actor-factory = 
"cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" config { + # Add job id regex pattern + job-id-regex = "(\\d+)" + runtime-attributes = """ Int? cpu Int? memory_mb @@ -54,4 +57,14 @@ backend { call-caching { enabled = true invalidate-bad-cache-results = true +} + +database { + profile = "slick.jdbc.HsqldbProfile$" + db { + driver = "org.hsqldb.jdbcDriver" + url = "jdbc:hsqldb:file:cromwell-cache-db;shutdown=false;hsqldb.tx=mvcc" + connectionTimeout = 120000 + numThreads = 1 + } } \ No newline at end of file From 30aaad22323a3581083bff766e7d9ab2aa09d9cf Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 12:07:54 -0800 Subject: [PATCH 10/21] Updating cache test cromwell config --- cacheTest/cromwell.conf | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index 385ffe9..c506ed8 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -6,8 +6,7 @@ backend { LocalExample { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" config { - # Add job id regex pattern - job-id-regex = "(\\d+)" + job-id-regex = ".*" runtime-attributes = """ Int? 
cpu @@ -43,12 +42,6 @@ backend { enabled = false } } - - # Enable call caching - call-caching { - enabled = true - invalidate-bad-cache-results = true - } } } } From a2de077555e97cbf7d91d95561cde1ece9308945 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 12:16:06 -0800 Subject: [PATCH 11/21] Finally have a successfully caching WDL --- cacheTest/cacheTest.wdl | 15 ++++----------- cacheTest/cromwell.conf | 4 +++- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/cacheTest/cacheTest.wdl b/cacheTest/cacheTest.wdl index b7e124f..8f3d27f 100644 --- a/cacheTest/cacheTest.wdl +++ b/cacheTest/cacheTest.wdl @@ -14,7 +14,6 @@ workflow CacheTest { output { File output_file = GenerateTimestamp.timestamp_file - String execution_id = GenerateTimestamp.execution_id } } @@ -25,22 +24,16 @@ task GenerateTimestamp { } command <<< - # Sleep to make the task take a noticeable amount of time sleep ~{sleep_seconds} - - # Generate a unique execution ID - execution_id=$(date +%s%N) - echo "Execution ID: $execution_id" - - # Create output with timestamp and message + + # Use a deterministic identifier based on inputs echo "Message: ~{input_message}" > output.txt - echo "Timestamp: $(date)" >> output.txt - echo "Execution ID: $execution_id" >> output.txt + echo "Sleep time: ~{sleep_seconds}" >> output.txt + echo "Run ID: ~{input_message}-~{sleep_seconds}" >> output.txt >>> output { File timestamp_file = "output.txt" - String execution_id = read_string(stdout()) } runtime { diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index c506ed8..8a1cf54 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -6,7 +6,7 @@ backend { LocalExample { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" config { - job-id-regex = ".*" + job-id-regex = "(\\d+)" runtime-attributes = """ Int? 
cpu @@ -15,10 +15,12 @@ backend { """ submit = """ + echo "1" # Echo a dummy job ID ${job_shell} ${script} """ submit-docker = """ + echo "1" # Echo a dummy job ID docker run \ --rm \ -v ${cwd}:${docker_cwd} \ From 95148748e97670d6f99aa247bd81640d76728323 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 12:16:31 -0800 Subject: [PATCH 12/21] Updating Cromwell cache GitHub Action --- .github/workflows/test-cromwell-cache.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 04aef99..b2e1e0d 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -25,7 +25,7 @@ jobs: run: | echo "Running first execution..." start_time=$(date +%s) - java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT @@ -52,7 +52,7 @@ jobs: run: | echo "Running second execution..." 
start_time=$(date +%s) - java -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT From 9a4e8e9846e2bb79a8ba48fb9e71aa875a9ecb1d Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 12:18:15 -0800 Subject: [PATCH 13/21] Updating Cromwell cache GitHub Action --- .github/workflows/test-cromwell-cache.yml | 41 ++++++++++++++++------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index b2e1e0d..0d75c22 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -30,13 +30,19 @@ jobs: duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT - # Find the most recent workflow directory - workflow_dir=$(ls -td cromwell-executions/CacheTest/*/ | head -1) - workflow_id=$(basename "$workflow_dir") + # Find the workflow directory more robustly + workflow_dir=$(find . -type d -name "CacheTest" -path "*/cromwell-executions/*" | sort -r | head -1) + if [ -z "$workflow_dir" ]; then + echo "::error::Could not find workflow execution directory" + exit 1 + fi + + latest_run=$(find "$workflow_dir" -mindepth 1 -maxdepth 1 -type d | sort -r | head -1) + workflow_id=$(basename "$latest_run") echo "First workflow ID: $workflow_id" - # Find the output file - output_file="$workflow_dir/call-GenerateTimestamp/execution/output.txt" + # Find and validate output file + output_file="$latest_run/call-GenerateTimestamp/execution/output.txt" if [ ! 
-f "$output_file" ]; then echo "::error::Could not find output.txt at $output_file" exit 1 @@ -57,13 +63,14 @@ jobs: duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT - # Find the most recent workflow directory (different from the first one) - workflow_dir=$(ls -td cromwell-executions/CacheTest/*/ | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1) - workflow_id=$(basename "$workflow_dir") + # Find the most recent workflow directory that's different from the first + workflow_dir=$(find . -type d -name "CacheTest" -path "*/cromwell-executions/*" | sort -r | head -1) + latest_runs=$(find "$workflow_dir" -mindepth 1 -maxdepth 1 -type d | sort -r) + workflow_id=$(echo "$latest_runs" | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1 | xargs basename) echo "Second workflow ID: $workflow_id" - # Find the output file - output_file="$workflow_dir/call-GenerateTimestamp/execution/output.txt" + # Find and validate output file + output_file="$workflow_dir/$workflow_id/call-GenerateTimestamp/execution/output.txt" if [ ! -f "$output_file" ]; then echo "::error::Could not find output.txt at $output_file" exit 1 @@ -81,8 +88,16 @@ jobs: echo "First execution ID: ${{ steps.first-run.outputs.first_execution_id }}" echo "Second execution ID: ${{ steps.second-run.outputs.second_execution_id }}" - if [ -z "${{ steps.first-run.outputs.first_execution_id }}" ] || [ -z "${{ steps.second-run.outputs.second_execution_id }}" ]; then - echo "::error::Failed to extract execution IDs from output files" + # Verify both runs completed + if [ -z "${{ steps.first-run.outputs.first_workflow_id }}" ] || [ -z "${{ steps.second-run.outputs.second_workflow_id }}" ]; then + echo "::error::One or both workflow runs failed to complete" + exit 1 + fi + + # Check for cache hit messages in the logs + second_workflow_dir=$(find . 
-type d -name "${{ steps.second-run.outputs.second_workflow_id }}" -path "*/cromwell-executions/*") + if ! grep -q "Cache hit" "$second_workflow_dir/call-GenerateTimestamp/execution/stdout"; then + echo "::error::No cache hit message found in logs" exit 1 fi @@ -92,7 +107,7 @@ jobs: exit 1 fi - # Verify second run was significantly faster (< 50% of first run time) + # Verify second run was significantly faster if [ ${{ steps.second-run.outputs.second_duration }} -gt $(( ${{ steps.first-run.outputs.first_duration }} / 2 )) ]; then echo "::error::Cache validation failed! Second run took too long (${steps.second-run.outputs.second_duration}s), suggesting cache wasn't used." exit 1 From ebdfbc155222f6a6e7daafb831c9fb0d39dfc93a Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 12:27:13 -0800 Subject: [PATCH 14/21] Updating cache test Cromwell configuration --- cacheTest/cromwell.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index 8a1cf54..7e2fba2 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -33,7 +33,7 @@ backend { filesystem { local { localization: [ - "hard-link", "soft-link", "copy" + "soft-link", "copy" # "hard-link" not permitted in GitHub Actions ] } } From 54040b2458f86aa613d7ed6b08e0f3d9d44da76e Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 13:12:15 -0800 Subject: [PATCH 15/21] Updating Cromwell cache GitHub Action --- .github/workflows/test-cromwell-cache.yml | 4 ++-- cacheTest/cromwell.conf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 0d75c22..99322da 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -69,8 +69,8 @@ jobs: workflow_id=$(echo "$latest_runs" | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1 | xargs basename) echo "Second workflow 
ID: $workflow_id" - # Find and validate output file - output_file="$workflow_dir/$workflow_id/call-GenerateTimestamp/execution/output.txt" + # Update the path to include cacheCopy + output_file="$workflow_dir/$workflow_id/call-GenerateTimestamp/cacheCopy/execution/output.txt" if [ ! -f "$output_file" ]; then echo "::error::Could not find output.txt at $output_file" exit 1 diff --git a/cacheTest/cromwell.conf b/cacheTest/cromwell.conf index 7e2fba2..8a1cf54 100644 --- a/cacheTest/cromwell.conf +++ b/cacheTest/cromwell.conf @@ -33,7 +33,7 @@ backend { filesystem { local { localization: [ - "soft-link", "copy" # "hard-link" not permitted in GitHub Actions + "hard-link", "soft-link", "copy" ] } } From ab11b82280e721f3b249911368923efe796aaa86 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 13:20:31 -0800 Subject: [PATCH 16/21] Fixing cache verification step in GitHub Action --- .github/workflows/test-cromwell-cache.yml | 2 +- cacheTest/cacheTest.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 99322da..f264d37 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -96,7 +96,7 @@ jobs: # Check for cache hit messages in the logs second_workflow_dir=$(find . -type d -name "${{ steps.second-run.outputs.second_workflow_id }}" -path "*/cromwell-executions/*") - if ! grep -q "Cache hit" "$second_workflow_dir/call-GenerateTimestamp/execution/stdout"; then + if ! 
grep -q "Cache hit" "$second_workflow_dir/call-GenerateTimestamp/cacheCopy/execution/stdout"; then echo "::error::No cache hit message found in logs" exit 1 fi diff --git a/cacheTest/cacheTest.wdl b/cacheTest/cacheTest.wdl index 8f3d27f..11f8e19 100644 --- a/cacheTest/cacheTest.wdl +++ b/cacheTest/cacheTest.wdl @@ -3,7 +3,7 @@ version 1.0 workflow CacheTest { input { String message - Int sleep_time = 5 + Int sleep_time = 20 } call GenerateTimestamp { From 66011cc3f06c363295be49bc0fb0ccd5b6c9c3d1 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 13:48:00 -0800 Subject: [PATCH 17/21] Fixing cache verification step in GitHub Action --- .github/workflows/test-cromwell-cache.yml | 31 ++++++----------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index f264d37..e0a32e8 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -41,24 +41,17 @@ jobs: workflow_id=$(basename "$latest_run") echo "First workflow ID: $workflow_id" - # Find and validate output file - output_file="$latest_run/call-GenerateTimestamp/execution/output.txt" - if [ ! -f "$output_file" ]; then - echo "::error::Could not find output.txt at $output_file" - exit 1 - fi - execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) + # Find and validate execution ID from stdout + execution_id=$(cat "$latest_run/call-GenerateTimestamp/execution/stdout") echo "first_execution_id=$execution_id" >> $GITHUB_OUTPUT echo "first_workflow_id=$workflow_id" >> $GITHUB_OUTPUT - echo "First run output file: $output_file" - cat "$output_file" - name: Second run (should use cache) id: second-run run: | echo "Running second execution..." 
start_time=$(date +%s) - java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json 2> cromwell.log end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT @@ -69,17 +62,10 @@ jobs: workflow_id=$(echo "$latest_runs" | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1 | xargs basename) echo "Second workflow ID: $workflow_id" - # Update the path to include cacheCopy - output_file="$workflow_dir/$workflow_id/call-GenerateTimestamp/cacheCopy/execution/output.txt" - if [ ! -f "$output_file" ]; then - echo "::error::Could not find output.txt at $output_file" - exit 1 - fi - execution_id=$(grep "Execution ID:" "$output_file" | cut -d' ' -f3) + # Find and validate execution ID from cached stdout + execution_id=$(cat "$workflow_dir/$workflow_id/call-GenerateTimestamp/cacheCopy/execution/stdout") echo "second_execution_id=$execution_id" >> $GITHUB_OUTPUT echo "second_workflow_id=$workflow_id" >> $GITHUB_OUTPUT - echo "Second run output file: $output_file" - cat "$output_file" - name: Validate caching behavior run: | @@ -94,10 +80,9 @@ jobs: exit 1 fi - # Check for cache hit messages in the logs - second_workflow_dir=$(find . -type d -name "${{ steps.second-run.outputs.second_workflow_id }}" -path "*/cromwell-executions/*") - if ! grep -q "Cache hit" "$second_workflow_dir/call-GenerateTimestamp/cacheCopy/execution/stdout"; then - echo "::error::No cache hit message found in logs" + # Check for cache hit messages in the Cromwell logs from the second run + if ! 
grep -q "cache hit copying success" cromwell.log; then + echo "::error::No cache hit message found in Cromwell logs" exit 1 fi From 11543650fc309543030089416366937be9ff4c96 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 13:57:25 -0800 Subject: [PATCH 18/21] Fixing cache verification step in GitHub Action --- .github/workflows/test-cromwell-cache.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index e0a32e8..17a5fd8 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -51,7 +51,7 @@ jobs: run: | echo "Running second execution..." start_time=$(date +%s) - java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json 2> cromwell.log + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json > cromwell.log 2>&1 end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT From 5eb0f42ef328ba462c1c1b522209c00247394dae Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 14:09:52 -0800 Subject: [PATCH 19/21] Fixing cache verification step in GitHub Action --- .github/workflows/test-cromwell-cache.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 17a5fd8..92e342e 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -40,10 +40,6 @@ jobs: latest_run=$(find "$workflow_dir" -mindepth 1 -maxdepth 1 -type d | sort -r | head -1) workflow_id=$(basename "$latest_run") echo "First workflow ID: $workflow_id" - - # Find and validate execution ID from stdout - execution_id=$(cat 
"$latest_run/call-GenerateTimestamp/execution/stdout") - echo "first_execution_id=$execution_id" >> $GITHUB_OUTPUT echo "first_workflow_id=$workflow_id" >> $GITHUB_OUTPUT - name: Second run (should use cache) @@ -61,10 +57,6 @@ jobs: latest_runs=$(find "$workflow_dir" -mindepth 1 -maxdepth 1 -type d | sort -r) workflow_id=$(echo "$latest_runs" | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1 | xargs basename) echo "Second workflow ID: $workflow_id" - - # Find and validate execution ID from cached stdout - execution_id=$(cat "$workflow_dir/$workflow_id/call-GenerateTimestamp/cacheCopy/execution/stdout") - echo "second_execution_id=$execution_id" >> $GITHUB_OUTPUT echo "second_workflow_id=$workflow_id" >> $GITHUB_OUTPUT - name: Validate caching behavior @@ -86,12 +78,6 @@ jobs: exit 1 fi - # Verify execution IDs match (indicating cache was used) - if [ "${{ steps.first-run.outputs.first_execution_id }}" != "${{ steps.second-run.outputs.second_execution_id }}" ]; then - echo "::error::Cache validation failed! Execution IDs don't match between runs." - exit 1 - fi - # Verify second run was significantly faster if [ ${{ steps.second-run.outputs.second_duration }} -gt $(( ${{ steps.first-run.outputs.first_duration }} / 2 )) ]; then echo "::error::Cache validation failed! Second run took too long (${steps.second-run.outputs.second_duration}s), suggesting cache wasn't used." 
From 98241491baea36f19e1c18fb90c2d23097780264 Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 14:24:13 -0800 Subject: [PATCH 20/21] Adding third run to ensure no cache usage during cacheTest GitHub Action --- .github/workflows/test-cromwell-cache.yml | 54 ++++++++++++++++++----- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 92e342e..1bfaaa8 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -30,7 +30,6 @@ jobs: duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT - # Find the workflow directory more robustly workflow_dir=$(find . -type d -name "CacheTest" -path "*/cromwell-executions/*" | sort -r | head -1) if [ -z "$workflow_dir" ]; then echo "::error::Could not find workflow execution directory" @@ -52,35 +51,70 @@ jobs: duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT - # Find the most recent workflow directory that's different from the first workflow_dir=$(find . -type d -name "CacheTest" -path "*/cromwell-executions/*" | sort -r | head -1) latest_runs=$(find "$workflow_dir" -mindepth 1 -maxdepth 1 -type d | sort -r) workflow_id=$(echo "$latest_runs" | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | head -1 | xargs basename) echo "Second workflow ID: $workflow_id" echo "second_workflow_id=$workflow_id" >> $GITHUB_OUTPUT + - name: Create modified inputs + run: | + echo '{ + "CacheTest.message": "Modified message to invalidate cache", + "CacheTest.sleep_time": 15 + }' > cacheTest/modified_inputs.json + + - name: Third run (should NOT use cache) + id: third-run + run: | + echo "Running third execution with modified inputs..." 
+ start_time=$(date +%s) + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/modified_inputs.json -o cacheTest/options.json > cromwell_modified.log 2>&1 + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "third_duration=$duration" >> $GITHUB_OUTPUT + + workflow_dir=$(find . -type d -name "CacheTest" -path "*/cromwell-executions/*" | sort -r | head -1) + latest_runs=$(find "$workflow_dir" -mindepth 1 -maxdepth 1 -type d | sort -r) + workflow_id=$(echo "$latest_runs" | grep -v "${{ steps.first-run.outputs.first_workflow_id }}" | grep -v "${{ steps.second-run.outputs.second_workflow_id }}" | head -1 | xargs basename) + echo "Third workflow ID: $workflow_id" + echo "third_workflow_id=$workflow_id" >> $GITHUB_OUTPUT + - name: Validate caching behavior run: | echo "First run duration: ${{ steps.first-run.outputs.first_duration }} seconds" echo "Second run duration: ${{ steps.second-run.outputs.second_duration }} seconds" - echo "First execution ID: ${{ steps.first-run.outputs.first_execution_id }}" - echo "Second execution ID: ${{ steps.second-run.outputs.second_execution_id }}" + echo "Third run duration: ${{ steps.third-run.outputs.third_duration }} seconds" - # Verify both runs completed - if [ -z "${{ steps.first-run.outputs.first_workflow_id }}" ] || [ -z "${{ steps.second-run.outputs.second_workflow_id }}" ]; then - echo "::error::One or both workflow runs failed to complete" + # Verify all runs completed + if [ -z "${{ steps.first-run.outputs.first_workflow_id }}" ] || \ + [ -z "${{ steps.second-run.outputs.second_workflow_id }}" ] || \ + [ -z "${{ steps.third-run.outputs.third_workflow_id }}" ]; then + echo "::error::One or more workflow runs failed to complete" exit 1 fi - # Check for cache hit messages in the Cromwell logs from the second run + # Check for cache hit in second run if ! 
grep -q "cache hit copying success" cromwell.log; then - echo "::error::No cache hit message found in Cromwell logs" + echo "::error::No cache hit message found in second run Cromwell logs" + exit 1 + fi + + # Check that third run did NOT use cache + if grep -q "cache hit copying success" cromwell_modified.log; then + echo "::error::Cache hit found in third run when it should have been invalidated" exit 1 fi # Verify second run was significantly faster if [ ${{ steps.second-run.outputs.second_duration }} -gt $(( ${{ steps.first-run.outputs.first_duration }} / 2 )) ]; then - echo "::error::Cache validation failed! Second run took too long (${steps.second-run.outputs.second_duration}s), suggesting cache wasn't used." + echo "::error::Cache validation failed! Second run took too long, suggesting cache wasn't used" + exit 1 + fi + + # Verify third run was NOT cached (should take similar time to first run) + if [ ${{ steps.third-run.outputs.third_duration }} -lt $(( ${{ steps.first-run.outputs.first_duration }} / 2 )) ]; then + echo "::error::Third run was too fast, suggesting cache was incorrectly used" exit 1 fi From 6642f2527599a24ff8e09f76a8f997175f1d668e Mon Sep 17 00:00:00 2001 From: tefirman Date: Thu, 16 Jan 2025 14:39:57 -0800 Subject: [PATCH 21/21] Adding cacheTest README and fixing typo in cache test yml --- .github/workflows/test-cromwell-cache.yml | 10 +- cacheTest/README | 129 ++++++++++++++++++++++ 2 files changed, 134 insertions(+), 5 deletions(-) create mode 100644 cacheTest/README diff --git a/.github/workflows/test-cromwell-cache.yml b/.github/workflows/test-cromwell-cache.yml index 1bfaaa8..b721773 100644 --- a/.github/workflows/test-cromwell-cache.yml +++ b/.github/workflows/test-cromwell-cache.yml @@ -18,14 +18,14 @@ jobs: - name: Download Cromwell run: | - wget https://github.com/broadinstitute/cromwell/releases/download/85/cromwell-85.jar + wget https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar - name: First run id: 
first-run run: | echo "Running first execution..." start_time=$(date +%s) - java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-86.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json end_time=$(date +%s) duration=$((end_time - start_time)) echo "first_duration=$duration" >> $GITHUB_OUTPUT @@ -46,7 +46,7 @@ jobs: run: | echo "Running second execution..." start_time=$(date +%s) - java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json > cromwell.log 2>&1 + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-86.jar run cacheTest/cacheTest.wdl -i cacheTest/inputs.json -o cacheTest/options.json > cromwell.log 2>&1 end_time=$(date +%s) duration=$((end_time - start_time)) echo "second_duration=$duration" >> $GITHUB_OUTPUT @@ -61,7 +61,7 @@ jobs: run: | echo '{ "CacheTest.message": "Modified message to invalidate cache", - "CacheTest.sleep_time": 15 + "CacheTest.sleep_time": 19 }' > cacheTest/modified_inputs.json - name: Third run (should NOT use cache) @@ -69,7 +69,7 @@ jobs: run: | echo "Running third execution with modified inputs..." 
start_time=$(date +%s) - java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-85.jar run cacheTest/cacheTest.wdl -i cacheTest/modified_inputs.json -o cacheTest/options.json > cromwell_modified.log 2>&1 + java -Dconfig.file=cacheTest/cromwell.conf -jar cromwell-86.jar run cacheTest/cacheTest.wdl -i cacheTest/modified_inputs.json -o cacheTest/options.json > cromwell_modified.log 2>&1 end_time=$(date +%s) duration=$((end_time - start_time)) echo "third_duration=$duration" >> $GITHUB_OUTPUT diff --git a/cacheTest/README b/cacheTest/README new file mode 100644 index 0000000..bde694b --- /dev/null +++ b/cacheTest/README @@ -0,0 +1,129 @@ +# Unit Test for Cromwell Call Caching + +## Overview +This workflow is designed to validate Cromwell's call caching functionality through a series of controlled test executions. It verifies that: +- Identical workflow runs properly utilize the cache +- Modified inputs correctly invalidate the cache +- Execution times align with expected caching behavior + +The test consists of three sequential workflow runs: +1. Initial execution to populate the cache +2. Identical execution to verify cache utilization +3. Modified execution to confirm cache invalidation + +## Purpose +This workflow serves as a comprehensive test case for: +- Call caching configuration +- Cache hit detection +- Cache invalidation +- Execution time validation +- Workflow output verification +- Runtime environment consistency +- File system interactions +- Docker container caching + +## Workflow Components + +### Workflow: `CacheTest` +The main workflow demonstrates caching behavior through a simple, deterministic task execution. 
+ +**Inputs:** +- `message`: String - Input message to be written to output +- `sleep_time`: Int - Duration to sleep (defaults to 20 seconds) + +**Outputs:** +- `output_file`: File - Generated output file containing timestamp and input message + +### Tasks + +#### Task: `GenerateTimestamp` +Creates a deterministic output based on input parameters with a controlled execution time. + +**Runtime Requirements:** +- CPU: 1 core +- Memory: 1 GB +- Docker: ubuntu:latest + +## Configuration + +### cromwell.conf +Key configuration elements: +```hocon +call-caching { + enabled = true + invalidate-bad-cache-results = true +} +``` + +### options.json +Testing options: +```json +{ + "workflow_failure_mode": "ContinueWhilePossible", + "write_to_cache": true, + "read_from_cache": true +} +``` + +## GitHub Action Workflow + +The test is automated through a GitHub Action (`test-cromwell-cache.yml`) that: +1. Sets up the Java environment +2. Downloads Cromwell +3. Executes three test runs +4. Validates caching behavior + +### Test Sequence +1. **First Run:** + - Executes with initial inputs + - Measures execution time + - Records workflow ID + +2. **Second Run:** + - Uses identical inputs + - Verifies cache utilization + - Confirms faster execution time + +3. 
**Third Run:** + - Uses modified inputs + - Verifies cache invalidation + - Confirms execution time similar to first run + +### Validation Checks +- Completion of all workflow runs +- Presence of cache hit messages in second run +- Absence of cache hit messages in third run +- Execution time comparisons +- Workflow ID uniqueness + +## Usage + +### Local Testing +```bash +# Execute first run (run all commands from the cacheTest/ directory; replace cromwell.jar with the path to your Cromwell jar, e.g. cromwell-86.jar) +java -Dconfig.file=cromwell.conf -jar cromwell.jar run cacheTest.wdl -i inputs.json -o options.json + +# Execute second run (should use cache) +java -Dconfig.file=cromwell.conf -jar cromwell.jar run cacheTest.wdl -i inputs.json -o options.json + +# Execute third run (with modified inputs; create modified_inputs.json first, since the GitHub Action generates it at runtime and it is not committed) +java -Dconfig.file=cromwell.conf -jar cromwell.jar run cacheTest.wdl -i modified_inputs.json -o options.json +``` + +### GitHub Actions +The test will automatically run on: +- Pull requests +- Manual workflow dispatch + +## Version +- WDL 1.0 +- Cromwell 86 +- GitHub Actions Runner: ubuntu-latest + +## Additional Notes +- Ensures consistent cache behavior across environments +- Validates both positive and negative cache scenarios +- Provides timing-based validation of cache utilization +- Uses deterministic task outputs for reliable testing +- Includes comprehensive error reporting +- Supports both local and CI/CD testing scenarios