Skip to content

Adding GitHub Action for Variant Calling Test Run #48

Adding GitHub Action for Variant Calling Test Run

Adding GitHub Action for Variant Calling Test Run #48

name: Variant Calling Test Run
on:
pull_request:
paths:
- 'variantCalling/variantCalling.wdl'
- '.github/workflows/variant-calling-test-run.yml'
jobs:
test:
runs-on: ubuntu-latest
permissions: write-all
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set Up Java
uses: actions/setup-java@v4
with:
distribution: 'temurin'
java-version: '21'
- name: Install required tools
run: |
# Install system packages
sudo apt-get update
sudo apt-get install -y wget unzip samtools bwa tabix
# Download and install GATK
wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip
unzip gatk-4.4.0.0.zip
sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk
- name: Pull Cromwell Jarfile
run: wget -q https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar
- name: Setup Test Data
run: |
# mkdir -p test/data
cd test/data
# Download reference chromosome 20 from NCBI
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz
gunzip chr20.fa.gz
# Add "chr" prefix to sequence name
sed 's/^>.*/>chr20/' chr20.fa > ref.fasta
rm chr20.fa
# Create sequence dictionary and index
samtools faidx ref.fasta
gatk CreateSequenceDictionary -R ref.fasta -O ref.dict
# Debug: Check the content of our files
echo "=== Reference FASTA header ==="
head -n 1 ref.fasta
echo "=== Reference FAI content ==="
cat ref.fasta.fai
echo "=== Dictionary content ==="
head -n 5 ref.dict
# Create and sort test region bed file
echo -e "chr20\t1000000\t1100000" > test.bed
sort -k1,1 -k2,2n test.bed > sorted.bed
mv sorted.bed test.bed
# Debug: Verify bed file content
echo "=== BED file content ==="
cat test.bed
# Test bed to interval_list conversion directly
echo "=== Testing bed to interval_list conversion ==="
gatk BedToIntervalList \
-I test.bed \
-O test.interval_list \
-SD ref.dict
# Generate BWA indexes
bwa index ref.fasta
# Function to extract reference base at a position
echo "# Create function to get reference base at position"
get_ref_base() {
local pos=$1
local length=$2
samtools faidx ref.fasta chr20:${pos}-$((pos+length-1)) | tail -n1
}
# Create a minimal synthetic dbSNP VCF for testing
echo '##fileformat=VCFv4.2' > dbsnp.vcf
echo '##reference=GRCh38' >> dbsnp.vcf
echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf
# Get actual reference bases and create VCF records
ref1=$(get_ref_base 1000100 1)
ref2=$(get_ref_base 1000200 1)
ref3=$(get_ref_base 1000300 1)
ref4=$(get_ref_base 1000400 1)
ref5=$(get_ref_base 1000500 2)
printf "chr20\t1000100\trs1234567\t${ref1}\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf
printf "chr20\t1000200\trs2345678\t${ref2}\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf
printf "chr20\t1000300\trs3456789\t${ref3}\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf
printf "chr20\t1000400\trs4567890\t${ref4}\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf
printf "chr20\t1000500\trs5678901\t${ref5}\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf
# Index the VCF
gatk IndexFeatureFile -I dbsnp.vcf
# Create synthetic Mills and 1000G indels VCF similarly
echo '##fileformat=VCFv4.2' > mills_1000G.vcf
echo '##reference=GRCh38' >> mills_1000G.vcf
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf
echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf
ref6=$(get_ref_base 1000150 2)
ref7=$(get_ref_base 1000250 1)
ref8=$(get_ref_base 1000350 3)
ref9=$(get_ref_base 1000450 1)
printf "chr20\t1000150\tMILL1\t${ref6}\t${ref6:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf
printf "chr20\t1000250\tMILL2\t${ref7}\t${ref7}TT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf
printf "chr20\t1000350\tG1000_1\t${ref8}\t${ref8:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf
printf "chr20\t1000450\tG1000_2\t${ref9}\t${ref9}AGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf
bgzip mills_1000G.vcf
tabix -p vcf mills_1000G.vcf.gz
# Create synthetic known indels VCF
echo '##fileformat=VCFv4.2' > known_indels.vcf
echo '##reference=GRCh38' >> known_indels.vcf
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf
ref10=$(get_ref_base 1000550 2)
ref11=$(get_ref_base 1000650 1)
ref12=$(get_ref_base 1000750 3)
printf "chr20\t1000550\tindel1\t${ref10}\t${ref10:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
printf "chr20\t1000650\tindel2\t${ref11}\t${ref11}TT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf
printf "chr20\t1000750\tindel3\t${ref12}\t${ref12:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf
bgzip known_indels.vcf
tabix -p vcf known_indels.vcf.gz
# Create test JSON with all reference files
cat << EOF > ../test-inputs.json
{
"PanelBwaGatk4Annovar.sample_batch": [
{
"sample_name": "TESTBAM1",
"bam_file": "test/data/TESTBAM1.unmapped.bam",
"bed_file": "test/data/test.bed"
},
{
"sample_name": "TESTBAM2",
"bam_file": "test/data/TESTBAM2.unmapped.bam",
"bed_file": "test/data/test.bed"
}
],
"PanelBwaGatk4Annovar.reference_genome": {
"ref_name": "hg38",
"ref_fasta": "test/data/ref.fasta",
"ref_fasta_index": "test/data/ref.fasta.fai",
"ref_dict": "test/data/ref.dict",
"ref_pac": "test/data/ref.fasta.pac",
"ref_sa": "test/data/ref.fasta.sa",
"ref_amb": "test/data/ref.fasta.amb",
"ref_ann": "test/data/ref.fasta.ann",
"ref_bwt": "test/data/ref.fasta.bwt",
"dbSNP_vcf": "test/data/dbsnp.vcf",
"dbSNP_vcf_index": "test/data/dbsnp.vcf.idx",
"known_indels_sites_VCFs": [
"test/data/mills_1000G.vcf.gz",
"test/data/known_indels.vcf.gz"
],
"known_indels_sites_indices": [
"test/data/mills_1000G.vcf.gz.tbi",
"test/data/known_indels.vcf.gz.tbi"
],
"annovar_protocols": "refGene",
"annovar_operation": "g"
}
}
EOF
# Add debugging for BQSR inputs
echo "=== Checking BQSR input files ==="
echo "dbSNP VCF:"
head -n 5 dbsnp.vcf
echo "Mills & 1000G VCF:"
zcat mills_1000G.vcf.gz | head -n 5
echo "Known indels VCF:"
zcat known_indels.vcf.gz | head -n 5
# Validate VCF files
echo "=== Validating VCF files ==="
gatk ValidateVariants -V dbsnp.vcf -R ref.fasta
gatk ValidateVariants -V mills_1000G.vcf.gz -R ref.fasta
gatk ValidateVariants -V known_indels.vcf.gz -R ref.fasta
# # Create a test BAM with unmapped reads
# cat << EOF > test.sam
# @HD VN:1.6 SO:queryname
# @SQ SN:chr20 LN:63025520
# @RG ID:test SM:test_sample PL:ILLUMINA LB:lib1 PU:unit1
# read1 77 * 0 0 * * 0 0 $(get_ref_base 1000100 16) FFFFFFFFFFFFFFFF RG:Z:test
# read1 141 * 0 0 * * 0 0 $(get_ref_base 1000200 16) FFFFFFFFFFFFFFFF RG:Z:test
# read2 77 * 0 0 * * 0 0 $(get_ref_base 1000300 16) FFFFFFFFFFFFFFFF RG:Z:test
# read2 141 * 0 0 * * 0 0 $(get_ref_base 1000400 16) FFFFFFFFFFFFFFFF RG:Z:test
# read3 77 * 0 0 * * 0 0 $(get_ref_base 1000500 16) FFFFFFFFFFFFFFFF RG:Z:test
# read3 141 * 0 0 * * 0 0 $(get_ref_base 1000600 16) FFFFFFFFFFFFFFFF RG:Z:test
# EOF
# # Convert SAM to BAM and proper sorting
# samtools view -b test.sam > test.unsorted.bam
# samtools sort -n test.unsorted.bam > test.unmapped.bam
# # Debug: Check the BAM
# echo "=== Checking final BAM structure ==="
# samtools view -H test.unmapped.bam
# echo "=== First few reads ==="
# samtools view test.unmapped.bam | head -n 2
# # Clean up intermediate files
# rm test.sam test.unsorted.bam
# # Validate BAM file
# echo "=== Validating BAM file ==="
# gatk ValidateSamFile -I test.unmapped.bam
# Navigating back to original directory
cd ../../
- name: Run test workflow
run: |
# Create a directory for workflow logs
mkdir -p workflow_logs
echo "=== Running Cromwell with test workflow ==="
# Run with more detailed logging and capture all output
java -Dbackend.providers.Local.config.root="workflow_logs" \
-Dconfig.trace=true \
-Dakka.loglevel=DEBUG \
-Xmx4g \
-jar cromwell-86.jar \
run \
variantCalling/variantCalling.wdl \
--inputs test/test-inputs.json \
--metadata-output workflow_logs/metadata.json \
2>&1 | tee workflow_logs/workflow.log
# If workflow fails, print debug info
if [ $? -ne 0 ]; then
echo "=== Workflow failed, printing debug information ==="
echo "=== Directory contents ==="
ls -R workflow_logs/
echo "=== Last 100 lines of workflow log ==="
tail -n 100 workflow_logs/workflow.log
echo "=== Contents of stderr files if they exist ==="
find workflow_logs -name "stderr" -type f -exec sh -c 'echo "=== File: {}"; cat {}' \;
echo "=== Contents of stdout files if they exist ==="
find workflow_logs -name "stdout" -type f -exec sh -c 'echo "=== File: {}"; cat {}' \;
echo "=== Contents of script files if they exist ==="
find workflow_logs -name "script" -type f -exec sh -c 'echo "=== File: {}"; cat {}' \;
# Also print metadata if it exists
if [ -f workflow_logs/metadata.json ]; then
echo "=== Workflow metadata ==="
cat workflow_logs/metadata.json
fi
# Exit with failure
exit 1
fi
# - name: Run test workflow
# run: |
# java -Xmx4g -jar cromwell-86.jar run variantCalling/variantCalling.wdl --inputs test/test-inputs.json
- name: Check outputs
run: |
# Basic output existence checks
test -f $(find cromwell-executions -name "*.recal.bam")
test -f $(find cromwell-executions -name "*.GATK.vcf")
test -f $(find cromwell-executions -name "*_multianno.txt")