Adding GitHub Action for Variant Calling Test Run #50
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Variant Calling Test Run | |
on: | |
pull_request: | |
paths: | |
- 'variantCalling/variantCalling.wdl' | |
- '.github/workflows/variant-calling-test-run.yml' | |
jobs: | |
test: | |
runs-on: ubuntu-latest | |
permissions: write-all | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
- name: Set Up Java | |
uses: actions/setup-java@v4 | |
with: | |
distribution: 'temurin' | |
java-version: '21' | |
- name: Install required tools | |
run: | | |
# Install system packages | |
sudo apt-get update | |
sudo apt-get install -y wget unzip samtools bwa tabix | |
# Download and install GATK | |
wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip | |
unzip gatk-4.4.0.0.zip | |
sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk | |
- name: Pull Cromwell Jarfile | |
run: wget -q https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar | |
- name: Setup Test Data | |
run: | | |
mkdir -p test/data | |
cd test/data | |
# Download reference chromosome 20 from NCBI | |
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz | |
gunzip chr20.fa.gz | |
# Add "chr" prefix to sequence name | |
sed 's/^>.*/>chr20/' chr20.fa > ref.fasta | |
rm chr20.fa | |
# Create sequence dictionary and index | |
samtools faidx ref.fasta | |
gatk CreateSequenceDictionary -R ref.fasta -O ref.dict | |
# Debug: Check the content of our files | |
echo "=== Reference FASTA header ===" | |
head -n 1 ref.fasta | |
echo "=== Reference FAI content ===" | |
cat ref.fasta.fai | |
echo "=== Dictionary content ===" | |
head -n 5 ref.dict | |
# Create and sort test region bed file | |
echo -e "chr20\t10433000\t10437000" > test.bed | |
sort -k1,1 -k2,2n test.bed > sorted.bed | |
mv sorted.bed test.bed | |
# Debug: Verify bed file content | |
echo "=== BED file content ===" | |
cat test.bed | |
# Test bed to interval_list conversion directly | |
echo "=== Testing bed to interval_list conversion ===" | |
gatk BedToIntervalList \ | |
-I test.bed \ | |
-O test.interval_list \ | |
-SD ref.dict | |
# Generate BWA indexes | |
bwa index ref.fasta | |
# Function to extract reference base at a position | |
echo "# Create function to get reference base at position" | |
get_ref_base() { | |
local pos=$1 | |
local length=$2 | |
samtools faidx ref.fasta chr20:${pos}-$((pos+length-1)) | tail -n1 | |
} | |
# Create a minimal synthetic dbSNP VCF for testing | |
echo '##fileformat=VCFv4.2' > dbsnp.vcf | |
echo '##reference=GRCh38' >> dbsnp.vcf | |
echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf | |
# Get actual reference bases and create VCF records | |
ref1=$(get_ref_base 10433100 1) | |
ref2=$(get_ref_base 10434200 1) | |
ref3=$(get_ref_base 10435300 1) | |
ref4=$(get_ref_base 10436400 1) | |
ref5=$(get_ref_base 10436500 2) | |
printf "chr20\t10433100\trs1234567\t${ref1}\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf | |
printf "chr20\t10434200\trs2345678\t${ref2}\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf | |
printf "chr20\t10435300\trs3456789\t${ref3}\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf | |
printf "chr20\t10436400\trs4567890\t${ref4}\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf | |
printf "chr20\t10436500\trs5678901\t${ref5}\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf | |
# Index the VCF | |
gatk IndexFeatureFile -I dbsnp.vcf | |
# Create synthetic Mills and 1000G indels VCF similarly | |
echo '##fileformat=VCFv4.2' > mills_1000G.vcf | |
echo '##reference=GRCh38' >> mills_1000G.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf | |
ref6=$(get_ref_base 10433150 2) | |
ref7=$(get_ref_base 10434250 1) | |
ref8=$(get_ref_base 10435350 3) | |
ref9=$(get_ref_base 10436450 1) | |
printf "chr20\t10433150\tMILL1\t${ref6}\t${ref6:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t10434250\tMILL2\t${ref7}\t${ref7}TT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t10435350\tG1000_1\t${ref8}\t${ref8:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf | |
printf "chr20\t10436450\tG1000_2\t${ref9}\t${ref9}AGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf | |
# Compress and index | |
bgzip mills_1000G.vcf | |
tabix -p vcf mills_1000G.vcf.gz | |
# Create synthetic known indels VCF | |
echo '##fileformat=VCFv4.2' > known_indels.vcf | |
echo '##reference=GRCh38' >> known_indels.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf | |
ref10=$(get_ref_base 10433550 2) | |
ref11=$(get_ref_base 10434650 1) | |
ref12=$(get_ref_base 10435750 3) | |
printf "chr20\t10433550\tindel1\t${ref10}\t${ref10:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
printf "chr20\t10434650\tindel2\t${ref11}\t${ref11}TT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf | |
printf "chr20\t10435750\tindel3\t${ref12}\t${ref12:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
# Compress and index | |
bgzip known_indels.vcf | |
tabix -p vcf known_indels.vcf.gz | |
# Create test JSON with all reference files | |
cat << EOF > ../test-inputs.json | |
{ | |
"PanelBwaGatk4Annovar.sample_batch": [ | |
{ | |
"sample_name": "test_sample", | |
"bam_file": "test/data/test.unmapped.bam", | |
"bed_file": "test/data/test.bed" | |
} | |
], | |
"PanelBwaGatk4Annovar.reference_genome": { | |
"ref_name": "hg38", | |
"ref_fasta": "test/data/ref.fasta", | |
"ref_fasta_index": "test/data/ref.fasta.fai", | |
"ref_dict": "test/data/ref.dict", | |
"ref_pac": "test/data/ref.fasta.pac", | |
"ref_sa": "test/data/ref.fasta.sa", | |
"ref_amb": "test/data/ref.fasta.amb", | |
"ref_ann": "test/data/ref.fasta.ann", | |
"ref_bwt": "test/data/ref.fasta.bwt", | |
"dbSNP_vcf": "test/data/dbsnp.vcf", | |
"dbSNP_vcf_index": "test/data/dbsnp.vcf.idx", | |
"known_indels_sites_VCFs": [ | |
"test/data/mills_1000G.vcf.gz", | |
"test/data/known_indels.vcf.gz" | |
], | |
"known_indels_sites_indices": [ | |
"test/data/mills_1000G.vcf.gz.tbi", | |
"test/data/known_indels.vcf.gz.tbi" | |
], | |
"annovar_protocols": "refGene", | |
"annovar_operation": "g" | |
} | |
} | |
EOF | |
# Add debugging for BQSR inputs | |
echo "=== Checking BQSR input files ===" | |
echo "dbSNP VCF:" | |
head -n 5 dbsnp.vcf | |
echo "Mills & 1000G VCF:" | |
zcat mills_1000G.vcf.gz | head -n 5 | |
echo "Known indels VCF:" | |
zcat known_indels.vcf.gz | head -n 5 | |
# Validate VCF files | |
echo "=== Validating VCF files ===" | |
gatk ValidateVariants -V dbsnp.vcf -R ref.fasta | |
gatk ValidateVariants -V mills_1000G.vcf.gz -R ref.fasta | |
gatk ValidateVariants -V known_indels.vcf.gz -R ref.fasta | |
# Navigating back to original directory | |
cd ../../ | |
- name: Generate test BAM | |
run: | | |
# Download NA12878 data | |
wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/NA12878/Garvan_NA12878_HG001_HiSeq_Exome/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam | |
# Generate BAM index | |
samtools index project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam | |
# Extract region with known variants, downsample, and convert to fastq | |
# We're using a region known to contain multiple GIAB high-confidence variants | |
samtools view -h -s 0.1 project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam chr20:10433000-10437000 | \ | |
samtools sort -n - | \ | |
samtools fastq -1 test/data/r1.fq -2 test/data/r2.fq -0 /dev/null -s /dev/null -n | |
# Convert back to unmapped BAM with proper read groups | |
gatk FastqToSam \ | |
-F1 test/data/r1.fq \ | |
-F2 test/data/r2.fq \ | |
-O test/data/test.unmapped.bam \ | |
-SM NA12878 \ | |
-RG test \ | |
-PL ILLUMINA \ | |
-LB lib1 \ | |
-PU unit1 | |
# Debug: Check the BAM | |
echo "=== Checking final BAM structure ===" | |
samtools view -H test/data/test.unmapped.bam | |
echo "=== First few reads ===" | |
samtools view test/data/test.unmapped.bam | head -n 2 | |
# Clean up intermediate files | |
rm test/data/r1.fq test/data/r2.fq \ | |
project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam \ | |
project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam.bai | |
# Validate BAM file | |
echo "=== Validating BAM file ===" | |
gatk ValidateSamFile -I test/data/test.unmapped.bam | |
- name: Run test workflow | |
run: | | |
# Create a directory for workflow logs | |
mkdir -p workflow_logs | |
echo "=== Running Cromwell with test workflow ===" | |
# Run with more detailed logging and capture all output | |
java -Dbackend.providers.Local.config.root="workflow_logs" \ | |
-Dconfig.trace=true \ | |
-Dakka.loglevel=DEBUG \ | |
-Xmx4g \ | |
-jar cromwell-86.jar \ | |
run \ | |
variantCalling/variantCalling.wdl \ | |
--inputs test/test-inputs.json \ | |
--metadata-output workflow_logs/metadata.json \ | |
2>&1 | tee workflow_logs/workflow.log | |
# If workflow fails, print debug info | |
if [ $? -ne 0 ]; then | |
echo "=== Workflow failed, printing debug information ===" | |
echo "=== Directory contents ===" | |
ls -R workflow_logs/ | |
echo "=== Last 100 lines of workflow log ===" | |
tail -n 100 workflow_logs/workflow.log | |
echo "=== Contents of stderr files if they exist ===" | |
find workflow_logs -name "stderr" -type f -exec sh -c 'echo "=== File: {}"; cat {}' \; | |
echo "=== Contents of stdout files if they exist ===" | |
find workflow_logs -name "stdout" -type f -exec sh -c 'echo "=== File: {}"; cat {}' \; | |
echo "=== Contents of script files if they exist ===" | |
find workflow_logs -name "script" -type f -exec sh -c 'echo "=== File: {}"; cat {}' \; | |
# Also print metadata if it exists | |
if [ -f workflow_logs/metadata.json ]; then | |
echo "=== Workflow metadata ===" | |
cat workflow_logs/metadata.json | |
fi | |
# Exit with failure | |
exit 1 | |
fi | |
# - name: Run test workflow | |
# run: | | |
# java -Xmx4g -jar cromwell-86.jar run variantCalling/variantCalling.wdl --inputs test/test-inputs.json | |
- name: Check outputs | |
run: | | |
# Basic output existence checks | |
test -f $(find cromwell-executions -name "*.recal.bam") | |
test -f $(find cromwell-executions -name "*.GATK.vcf") | |
test -f $(find cromwell-executions -name "*_multianno.txt") |