Adding GitHub Action for Variant Calling Test Run #41
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Variant Calling Test Run | |
on: | |
pull_request: | |
paths: | |
- 'variantCalling/variantCalling.wdl' | |
- '.github/workflows/variant-calling-test-run.yml' | |
jobs: | |
test: | |
runs-on: ubuntu-latest | |
permissions: write-all | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
- name: Set Up Java | |
uses: actions/setup-java@v4 | |
with: | |
distribution: 'temurin' | |
java-version: '21' | |
- name: Install required tools | |
run: | | |
# Install system packages | |
sudo apt-get update | |
sudo apt-get install -y wget unzip samtools bwa tabix | |
# Download and install GATK | |
wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip | |
unzip gatk-4.4.0.0.zip | |
sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk | |
- name: Pull Cromwell Jarfile | |
run: wget -q https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar | |
- name: Setup Test Data | |
run: | | |
mkdir -p test/data | |
cd test/data | |
# Download reference chromosome 20 from NCBI | |
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz | |
gunzip chr20.fa.gz | |
# Add "chr" prefix to sequence name | |
sed 's/^>.*/>chr20/' chr20.fa > ref.fasta | |
rm chr20.fa | |
# Create sequence dictionary and index | |
samtools faidx ref.fasta | |
gatk CreateSequenceDictionary -R ref.fasta -O ref.dict | |
# Debug: Check the content of our files | |
echo "=== Reference FASTA header ===" | |
head -n 1 ref.fasta | |
echo "=== Reference FAI content ===" | |
cat ref.fasta.fai | |
echo "=== Dictionary content ===" | |
head -n 5 ref.dict | |
# Create and sort test region bed file | |
echo -e "chr20\t1000000\t1100000" > test.bed | |
sort -k1,1 -k2,2n test.bed > sorted.bed | |
mv sorted.bed test.bed | |
# Debug: Verify bed file content | |
echo "=== BED file content ===" | |
cat test.bed | |
# Test bed to interval_list conversion directly | |
echo "=== Testing bed to interval_list conversion ===" | |
gatk BedToIntervalList \ | |
-I test.bed \ | |
-O test.interval_list \ | |
-SD ref.dict | |
# Generate BWA indexes | |
bwa index ref.fasta | |
# Function to extract reference base at a position | |
echo "# Create function to get reference base at position" | |
get_ref_base() { | |
local pos=$1 | |
local length=$2 | |
samtools faidx ref.fasta chr20:${pos}-$((pos+length-1)) | tail -n1 | |
} | |
# Create a minimal synthetic dbSNP VCF for testing | |
echo '##fileformat=VCFv4.2' > dbsnp.vcf | |
echo '##reference=GRCh38' >> dbsnp.vcf | |
echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf | |
# Get actual reference bases and create VCF records | |
ref1=$(get_ref_base 1000100 1) | |
ref2=$(get_ref_base 1000200 1) | |
ref3=$(get_ref_base 1000300 1) | |
ref4=$(get_ref_base 1000400 1) | |
ref5=$(get_ref_base 1000500 2) | |
printf "chr20\t1000100\trs1234567\t${ref1}\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf | |
printf "chr20\t1000200\trs2345678\t${ref2}\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf | |
printf "chr20\t1000300\trs3456789\t${ref3}\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf | |
printf "chr20\t1000400\trs4567890\t${ref4}\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf | |
printf "chr20\t1000500\trs5678901\t${ref5}\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf | |
# Index the VCF | |
gatk IndexFeatureFile -I dbsnp.vcf | |
# Create synthetic Mills and 1000G indels VCF similarly | |
echo '##fileformat=VCFv4.2' > mills_1000G.vcf | |
echo '##reference=GRCh38' >> mills_1000G.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf | |
echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf | |
ref6=$(get_ref_base 1000150 2) | |
ref7=$(get_ref_base 1000250 1) | |
ref8=$(get_ref_base 1000350 3) | |
ref9=$(get_ref_base 1000450 1) | |
printf "chr20\t1000150\tMILL1\t${ref6}\t${ref6:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t1000250\tMILL2\t${ref7}\t${ref7}TT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t1000350\tG1000_1\t${ref8}\t${ref8:0:1}\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf | |
printf "chr20\t1000450\tG1000_2\t${ref9}\t${ref9}AGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf | |
bgzip mills_1000G.vcf | |
tabix -p vcf mills_1000G.vcf.gz | |
# Create synthetic known indels VCF | |
echo '##fileformat=VCFv4.2' > known_indels.vcf | |
echo '##reference=GRCh38' >> known_indels.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf | |
ref10=$(get_ref_base 1000550 2) | |
ref11=$(get_ref_base 1000650 1) | |
ref12=$(get_ref_base 1000750 3) | |
printf "chr20\t1000550\tindel1\t${ref10}\t${ref10:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
printf "chr20\t1000650\tindel2\t${ref11}\t${ref11}TT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf | |
printf "chr20\t1000750\tindel3\t${ref12}\t${ref12:0:1}\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
bgzip known_indels.vcf | |
tabix -p vcf known_indels.vcf.gz | |
# Create test JSON with all reference files | |
cat << EOF > ../test-inputs.json | |
{ | |
"PanelBwaGatk4Annovar.sample_batch": [ | |
{ | |
"sample_name": "test_sample", | |
"bam_file": "test/data/test.unmapped.bam", | |
"bed_file": "test/data/test.bed" | |
} | |
], | |
"PanelBwaGatk4Annovar.reference_genome": { | |
"ref_name": "hg38", | |
"ref_fasta": "test/data/ref.fasta", | |
"ref_fasta_index": "test/data/ref.fasta.fai", | |
"ref_dict": "test/data/ref.dict", | |
"ref_pac": "test/data/ref.fasta.pac", | |
"ref_sa": "test/data/ref.fasta.sa", | |
"ref_amb": "test/data/ref.fasta.amb", | |
"ref_ann": "test/data/ref.fasta.ann", | |
"ref_bwt": "test/data/ref.fasta.bwt", | |
"dbSNP_vcf": "test/data/dbsnp.vcf", | |
"dbSNP_vcf_index": "test/data/dbsnp.vcf.idx", | |
"known_indels_sites_VCFs": [ | |
"test/data/mills_1000G.vcf.gz", | |
"test/data/known_indels.vcf.gz" | |
], | |
"known_indels_sites_indices": [ | |
"test/data/mills_1000G.vcf.gz.tbi", | |
"test/data/known_indels.vcf.gz.tbi" | |
], | |
"annovar_protocols": "refGene", | |
"annovar_operation": "g" | |
} | |
} | |
EOF | |
# Add debugging for BQSR inputs | |
echo "=== Checking BQSR input files ===" | |
echo "dbSNP VCF:" | |
head -n 5 dbsnp.vcf | |
echo "Mills & 1000G VCF:" | |
zcat mills_1000G.vcf.gz | head -n 5 | |
echo "Known indels VCF:" | |
zcat known_indels.vcf.gz | head -n 5 | |
# Validate VCF files | |
echo "=== Validating VCF files ===" | |
gatk ValidateVariants -V dbsnp.vcf -R ref.fasta | |
gatk ValidateVariants -V mills_1000G.vcf.gz -R ref.fasta | |
gatk ValidateVariants -V known_indels.vcf.gz -R ref.fasta | |
# Navigating back to original directory | |
cd ../../ | |
- name: Generate test BAM | |
run: | | |
# Create a more structured test BAM with multiple reads | |
cat << EOF > test/data/test.sam | |
@HD VN:1.6 SO:queryname | |
@SQ SN:chr20 LN:63025520 | |
@RG ID:test SM:test_sample PL:ILLUMINA LB:lib1 PU:unit1 | |
read1 77 * 0 0 * * 0 0 ACTGACTGACTGACTG FFFFFFFFFFFFFFFF RG:Z:test | |
read1 141 * 0 0 * * 0 0 CGTACGTACGTACGTA FFFFFFFFFFFFFFFF RG:Z:test | |
read2 77 * 0 0 * * 0 0 ACTGACTGACTGACTG FFFFFFFFFFFFFFFF RG:Z:test | |
read2 141 * 0 0 * * 0 0 CGTACGTACGTACGTA FFFFFFFFFFFFFFFF RG:Z:test | |
read3 77 * 0 0 * * 0 0 ACTGACTGACTGACTG FFFFFFFFFFFFFFFF RG:Z:test | |
read3 141 * 0 0 * * 0 0 CGTACGTACGTACGTA FFFFFFFFFFFFFFFF RG:Z:test | |
EOF | |
# Convert SAM to BAM and proper sorting | |
samtools view -b test/data/test.sam > test/data/test.unsorted.bam | |
samtools sort -n test/data/test.unsorted.bam > test/data/test.unmapped.bam | |
samtools index test/data/test.unmapped.bam | |
# Debug: Check the BAM | |
echo "=== Checking final BAM structure ===" | |
samtools view -H test/data/test.unmapped.bam | |
echo "=== First few reads ===" | |
samtools view test/data/test.unmapped.bam | head -n 2 | |
# Clean up intermediate files | |
rm test/data/test.sam test/data/test.unsorted.bam | |
# Validate BAM file | |
echo "=== Validating BAM file ===" | |
gatk ValidateSamFile -I test/data/test.unmapped.bam | |
- name: Run test workflow | |
run: | | |
# Create a directory for workflow logs | |
mkdir -p workflow_logs | |
# Run workflow with more verbose logging | |
java -Dbackend.providers.Local.config.root="workflow_logs" \ | |
-Xmx4g -jar cromwell-86.jar run \ | |
variantCalling/variantCalling.wdl \ | |
--inputs test/test-inputs.json \ | |
--metadata-output workflow_logs/metadata.json \ | |
2>&1 | tee workflow_logs/workflow.log | |
# If workflow fails, print additional debug info | |
if [ $? -ne 0 ]; then | |
echo "=== Workflow failed, printing debug information ===" | |
echo "=== Last 50 lines of workflow log ===" | |
tail -n 50 workflow_logs/workflow.log | |
echo "=== Execution directory contents ===" | |
ls -R workflow_logs | |
# Print content of stderr from the failed task if it exists | |
find workflow_logs -name "stderr" -exec sh -c 'echo "=== Content of {}" && cat {}' \; | |
# Exit with failure | |
exit 1 | |
fi | |
# - name: Run test workflow | |
# run: | | |
# java -Xmx4g -jar cromwell-86.jar run variantCalling/variantCalling.wdl --inputs test/test-inputs.json | |
- name: Check outputs | |
run: | | |
# Basic output existence checks | |
test -f $(find cromwell-executions -name "*.recal.bam") | |
test -f $(find cromwell-executions -name "*.GATK.vcf") | |
test -f $(find cromwell-executions -name "*_multianno.txt") |