Adding GitHub Action for Variant Calling Test Run #38
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Variant Calling Test Run | |
on: | |
pull_request: | |
paths: | |
- 'variantCalling/variantCalling.wdl' | |
- '.github/workflows/variant-calling-test-run.yml' | |
jobs: | |
test: | |
runs-on: ubuntu-latest | |
permissions: write-all | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
- name: Set Up Java | |
uses: actions/setup-java@v4 | |
with: | |
distribution: 'temurin' | |
java-version: '21' | |
- name: Install required tools | |
run: | | |
# Install system packages | |
sudo apt-get update | |
sudo apt-get install -y wget unzip samtools bwa tabix | |
# Download and install GATK | |
wget https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip | |
unzip gatk-4.4.0.0.zip | |
sudo ln -s $PWD/gatk-4.4.0.0/gatk /usr/local/bin/gatk | |
- name: Pull Cromwell Jarfile | |
run: wget -q https://github.com/broadinstitute/cromwell/releases/download/86/cromwell-86.jar | |
- name: Setup Test Data | |
run: | | |
mkdir -p test/data | |
cd test/data | |
# Download reference chromosome 20 from NCBI | |
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/Primary_Assembly/assembled_chromosomes/FASTA/chr20.fa.gz | |
gunzip chr20.fa.gz | |
# Add "chr" prefix to sequence name | |
sed 's/^>.*/>chr20/' chr20.fa > ref.fasta | |
rm chr20.fa | |
# Create sequence dictionary and index | |
samtools faidx ref.fasta | |
gatk CreateSequenceDictionary -R ref.fasta -O ref.dict | |
# Debug: Check the content of our files | |
echo "=== Reference FASTA header ===" | |
head -n 1 ref.fasta | |
echo "=== Reference FAI content ===" | |
cat ref.fasta.fai | |
echo "=== Dictionary content ===" | |
head -n 5 ref.dict | |
# Create and sort test region bed file | |
echo -e "chr20\t1000000\t1100000" > test.bed | |
sort -k1,1 -k2,2n test.bed > sorted.bed | |
mv sorted.bed test.bed | |
# Debug: Verify bed file content | |
echo "=== BED file content ===" | |
cat test.bed | |
# Test bed to interval_list conversion directly | |
echo "=== Testing bed to interval_list conversion ===" | |
gatk BedToIntervalList \ | |
-I test.bed \ | |
-O test.interval_list \ | |
-SD ref.dict | |
# Generate BWA indexes | |
bwa index ref.fasta | |
# Create a minimal synthetic dbSNP VCF for testing | |
# Write header lines | |
echo '##fileformat=VCFv4.2' > dbsnp.vcf | |
echo '##reference=GRCh38' >> dbsnp.vcf | |
echo '##INFO=<ID=RS,Number=1,Type=Integer,Description="dbSNP ID">' >> dbsnp.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> dbsnp.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> dbsnp.vcf | |
printf "chr20\t1000100\trs1234567\tA\tG\t.\tPASS\tRS=1234567\n" >> dbsnp.vcf | |
printf "chr20\t1000200\trs2345678\tT\tC\t.\tPASS\tRS=2345678\n" >> dbsnp.vcf | |
printf "chr20\t1000300\trs3456789\tG\tA\t.\tPASS\tRS=3456789\n" >> dbsnp.vcf | |
printf "chr20\t1000400\trs4567890\tC\tT\t.\tPASS\tRS=4567890\n" >> dbsnp.vcf | |
printf "chr20\t1000500\trs5678901\tAG\tA\t.\tPASS\tRS=5678901\n" >> dbsnp.vcf | |
gatk IndexFeatureFile -I dbsnp.vcf | |
# Create synthetic Mills and 1000G indels VCF | |
echo '##fileformat=VCFv4.2' > mills_1000G.vcf | |
echo '##reference=GRCh38' >> mills_1000G.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> mills_1000G.vcf | |
echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Source of variant">' >> mills_1000G.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> mills_1000G.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> mills_1000G.vcf | |
printf "chr20\t1000150\tMILL1\tAT\tA\t.\tPASS\tTYPE=deletion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t1000250\tMILL2\tG\tGTT\t.\tPASS\tTYPE=insertion;SOURCE=MILLS\n" >> mills_1000G.vcf | |
printf "chr20\t1000350\tG1000_1\tCTA\tC\t.\tPASS\tTYPE=deletion;SOURCE=1000G\n" >> mills_1000G.vcf | |
printf "chr20\t1000450\tG1000_2\tT\tTAGC\t.\tPASS\tTYPE=insertion;SOURCE=1000G\n" >> mills_1000G.vcf | |
bgzip mills_1000G.vcf | |
tabix -p vcf mills_1000G.vcf.gz | |
# Create synthetic known indels VCF | |
echo '##fileformat=VCFv4.2' > known_indels.vcf | |
echo '##reference=GRCh38' >> known_indels.vcf | |
echo '##INFO=<ID=TYPE,Number=1,Type=String,Description="Type of variant">' >> known_indels.vcf | |
echo "##contig=<ID=chr20,length=$(grep "^chr20" ref.fasta.fai | cut -f2)>" >> known_indels.vcf | |
printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" >> known_indels.vcf | |
printf "chr20\t1000550\tindel1\tAT\tA\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
printf "chr20\t1000650\tindel2\tG\tGTT\t.\tPASS\tTYPE=insertion\n" >> known_indels.vcf | |
printf "chr20\t1000750\tindel3\tCTA\tC\t.\tPASS\tTYPE=deletion\n" >> known_indels.vcf | |
bgzip known_indels.vcf | |
tabix -p vcf known_indels.vcf.gz | |
# Create test JSON with all reference files | |
cat << EOF > ../test-inputs.json | |
{ | |
"PanelBwaGatk4Annovar.sample_batch": [ | |
{ | |
"sample_name": "test_sample", | |
"bam_file": "test/data/test.unmapped.bam", | |
"bed_file": "test/data/test.bed" | |
} | |
], | |
"PanelBwaGatk4Annovar.reference_genome": { | |
"ref_name": "hg38", | |
"ref_fasta": "test/data/ref.fasta", | |
"ref_fasta_index": "test/data/ref.fasta.fai", | |
"ref_dict": "test/data/ref.dict", | |
"ref_pac": "test/data/ref.fasta.pac", | |
"ref_sa": "test/data/ref.fasta.sa", | |
"ref_amb": "test/data/ref.fasta.amb", | |
"ref_ann": "test/data/ref.fasta.ann", | |
"ref_bwt": "test/data/ref.fasta.bwt", | |
"dbSNP_vcf": "test/data/dbsnp.vcf", | |
"dbSNP_vcf_index": "test/data/dbsnp.vcf.idx", | |
"known_indels_sites_VCFs": [ | |
"test/data/mills_1000G.vcf.gz", | |
"test/data/known_indels.vcf.gz" | |
], | |
"known_indels_sites_indices": [ | |
"test/data/mills_1000G.vcf.gz.tbi", | |
"test/data/known_indels.vcf.gz.tbi" | |
], | |
"annovar_protocols": "refGene", | |
"annovar_operation": "g" | |
} | |
} | |
EOF | |
# Add debugging for BQSR inputs | |
echo "=== Checking BQSR input files ===" | |
echo "dbSNP VCF:" | |
head -n 5 dbsnp.vcf | |
echo "Mills & 1000G VCF:" | |
zcat mills_1000G.vcf.gz | head -n 5 | |
echo "Known indels VCF:" | |
zcat known_indels.vcf.gz | head -n 5 | |
# Validate VCF files | |
echo "=== Validating VCF files ===" | |
gatk ValidateVariants -V dbsnp.vcf -R ref.fasta | |
gatk ValidateVariants -V mills_1000G.vcf.gz -R ref.fasta | |
gatk ValidateVariants -V known_indels.vcf.gz -R ref.fasta | |
# Navigating back to original directory | |
cd ../../ | |
- name: Generate test BAM | |
run: | | |
# Create a more structured test BAM with multiple reads | |
cat << EOF > test/data/test.sam | |
@HD VN:1.6 SO:queryname | |
@SQ SN:chr20 LN:63025520 | |
@RG ID:test SM:test_sample PL:ILLUMINA LB:lib1 PU:unit1 | |
read1 77 * 0 0 * * 0 0 ACTGACTGACTGACTG FFFFFFFFFFFFFFFF RG:Z:test | |
read1 141 * 0 0 * * 0 0 CGTACGTACGTACGTA FFFFFFFFFFFFFFFF RG:Z:test | |
read2 77 * 0 0 * * 0 0 ACTGACTGACTGACTG FFFFFFFFFFFFFFFF RG:Z:test | |
read2 141 * 0 0 * * 0 0 CGTACGTACGTACGTA FFFFFFFFFFFFFFFF RG:Z:test | |
read3 77 * 0 0 * * 0 0 ACTGACTGACTGACTG FFFFFFFFFFFFFFFF RG:Z:test | |
read3 141 * 0 0 * * 0 0 CGTACGTACGTACGTA FFFFFFFFFFFFFFFF RG:Z:test | |
EOF | |
# Convert SAM to BAM and proper sorting | |
samtools view -b test/data/test.sam > test/data/test.unsorted.bam | |
samtools sort -n test/data/test.unsorted.bam > test/data/test.unmapped.bam | |
samtools index test/data/test.unmapped.bam | |
# Debug: Check the BAM | |
echo "=== Checking final BAM structure ===" | |
samtools view -H test/data/test.unmapped.bam | |
echo "=== First few reads ===" | |
samtools view test/data/test.unmapped.bam | head -n 2 | |
# Clean up intermediate files | |
rm test/data/test.sam test/data/test.unsorted.bam | |
# Validate BAM file | |
echo "=== Validating BAM file ===" | |
gatk ValidateSamFile -I test/data/test.unmapped.bam | |
- name: Run test workflow | |
run: | | |
java -Xmx4g -jar cromwell-86.jar run variantCalling/variantCalling.wdl --inputs test/test-inputs.json | |
- name: Check outputs | |
run: | | |
# Basic output existence checks | |
test -f $(find cromwell-executions -name "*.recal.bam") | |
test -f $(find cromwell-executions -name "*.GATK.vcf") | |
test -f $(find cromwell-executions -name "*_multianno.txt") |