diff --git a/contaminome.yml b/contaminome.yml index d6b4486..51f10be 100644 --- a/contaminome.yml +++ b/contaminome.yml @@ -11,7 +11,7 @@ eukaryotes: taxid: 10090 Drosophila melanogaster: URL: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna.gz - vulgarname: drosophila + vulgarname: fly accession: GCF_000001215.4 taxid: 7227 Aedes aegypti: diff --git a/dissectBCL.ini b/dissectBCL.ini index 29cd72d..80ad3b5 100644 --- a/dissectBCL.ini +++ b/dissectBCL.ini @@ -27,6 +27,7 @@ kraken2db=/path/to/kraken2_contaminome/contaminomedb [misc] mpiImg=/path/to/multiqc_headerimg.jpg +krakenExpl=" Kraken is used to classify the reads and to detect contamination.
For this we use a *custom* database, with a simplified taxonomical hierarchy (that no longer resembles any true taxonomical classification.
In brief, by default we screen for:
  • eukaryotes (human, mouse, fly, mosquito, lamprey, medaka, c-elegans, yeast, zebrafish and the moss-piglet)
  • prokaryotes (Ecoli, pseudomonas, mycoplasma and haemophilus influenza)
  • viruses (sars-cov2, influenza A,B & C, norwalk virus, rhinoviruses, drosophila C virus, phiX and lambda phage )
  • custom databases (ERCC spikes, univec core DB)
  • Note that for human, mouse, fly and mosquito we scan for mitochondrial and ribosomal contamination separately).
    Only the top (most abundant) five hits and unclassified hits are shown, all other hits are grouped under an 'other' tag.
    " [communication] deepSeq=email@seqfacility.de diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 389e79d..9f23a5b 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -79,12 +79,30 @@ def parseRunInfo(self): flowcellID = i.text return seqRecipe, lanes, instrument, flowcellID + # Validate successful run. + def validateRunCompletion(self): + """ + validates succesfull completion status in xml. + """ + logging.info("validateRunCompletion") + if self.sequencer == 'Miseq': + tree = ET.parse(self.runCompletionStatus) + root = tree.getroot() + for i in root.iter(): + if i.tag == 'CompletionStatus': + _status = i.text + else: + # no RunCompletionStatus.xml in novaseq, assume succes. + _status = 'SuccessfullyCompleted' + return (_status) + def __init__( self, name, bclPath, origSS, runInfo, + runCompStat, inBaseDir, outBaseDir, logFile, @@ -101,12 +119,14 @@ def __init__( self.bclPath = bclPath self.origSS = origSS self.runInfo = runInfo + self.runCompletionStatus = runCompStat self.inBaseDir = inBaseDir self.outBaseDir = outBaseDir self.logFile = logFile self.config = config # Run filesChecks self.filesExist() + self.succesfullrun = self.validateRunCompletion() # populate runInfo vars. self.seqRecipe, \ self.lanes, \ @@ -121,6 +141,8 @@ def asdict(self): 'bclPath': self.bclPath, 'original sampleSheet': self.origSS, 'runInfo': self.runInfo, + 'runCompletionStatus': self.runCompletionStatus, + 'sucessfulRun': self.succesfullrun, 'inBaseDir': self.inBaseDir, 'outBaseDir': self.outBaseDir, 'dissect logFile': self.logFile, diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 572824c..75a9695 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -441,150 +441,180 @@ def parseStats(outputFolder, ssdf): def demux(sampleSheet, flowcell, config): logging.warning("Demux module") - for outLane in sampleSheet.ssDic: - logging.info("Demuxing {}".format(outLane)) - # Check outDir - outputFolder = os.path.join(flowcell.outBaseDir, outLane) - if not os.path.exists(outputFolder): - os.mkdir(outputFolder) - logging.info("{} created.".format(outputFolder)) - else: - logging.info("{} already exists. Moving on.".format(outputFolder)) - # Write the demuxSheet in the outputfolder - demuxOut = os.path.join(outputFolder, "demuxSheet.csv") - # Don't remake if demuxSheet exist - if not os.path.exists(demuxOut): - logging.info("Writing demuxSheet for {}".format(outLane)) - writeDemuxSheet( - demuxOut, - sampleSheet.ssDic[outLane], - sampleSheet.laneSplitStatus - ) - else: - logging.warning( - "demuxSheet for {} already exists!".format(outLane) - ) - manual_mask, manual_df, manual_dualIx, man_mmdic = readDemuxSheet( - demuxOut + # Double check for run failure + if flowcell.succesfullrun != 'SuccessfullyCompleted': + for outLane in sampleSheet.ssDic: + outputFolder = os.path.join( + flowcell.outBaseDir, outLane ) - if ( - sampleSheet.ssDic[outLane]['mismatch'] != man_mmdic - ): - logging.info( - "mismatch dic is changed from {} into {}".format( - sampleSheet.ssDic[outLane]['mismatch'], - man_mmdic - ) - ) - sampleSheet.ssDic[outLane]['mismatch'] = man_mmdic - # if mask is changed, update: - # Mask - if ( - 'mask' in sampleSheet.ssDic[outLane] - and manual_mask != sampleSheet.ssDic[outLane]['mask'] - ): - logging.info( - "Mask is changed from {} into {}.".format( - sampleSheet.ssDic[outLane]['mask'], - manual_mask - ) - ) - sampleSheet.ssDic[outLane]['mask'] = manual_mask - # dualIx status - if ( - 'dualIx' in sampleSheet.ssDic[outLane] - and manual_dualIx != sampleSheet.ssDic[outLane]['dualIx'] - ): + if not os.path.exists(outputFolder): + os.mkdir(outputFolder) + Path( + os.path.join(outputFolder, 'run.failed') + ).touch() + return ('sequencingfailed') + else: + for outLane in sampleSheet.ssDic: + logging.info("Demuxing {}".format(outLane)) + # Check outDir + outputFolder = os.path.join(flowcell.outBaseDir, outLane) + if not os.path.exists(outputFolder): + os.mkdir(outputFolder) + logging.info("{} created.".format(outputFolder)) + else: logging.info( - "dualIx is changed from {} into {}.".format( - sampleSheet.ssDic[outLane]['dualIx'], - manual_dualIx - ) + "{} already exists. Moving on.".format(outputFolder) ) - sampleSheet.ssDic[outLane]['dualIx'] = manual_dualIx - - # sampleSheet - sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( - sampleSheet.ssDic[outLane]['sampleSheet'], - manual_df - ) - # Check for 'bak file' existence. - if os.path.exists(demuxOut + '.bak'): - sampleSheet.ssDic[outLane]['P5RC'] = True - # Don't run bcl-convert if we have the touched flag. - if not os.path.exists( - os.path.join(outputFolder, 'bclconvert.done') - ): - # Run bcl-convert - bclOpts = [ - config['software']['bclconvert'], - '--output-directory', outputFolder, - '--force', - '--bcl-input-directory', flowcell.bclPath, - '--sample-sheet', demuxOut, - '--bcl-num-conversion-threads', "20", - '--bcl-num-compression-threads', "20", - "--bcl-sampleproject-subdirectories", "true", - ] - if not sampleSheet.laneSplitStatus: - bclOpts.append('--no-lane-splitting') - bclOpts.append('true') - logging.info("Starting BCLConvert") - logging.info(" ".join(bclOpts)) - bclRunner = Popen( - bclOpts, - stdout=PIPE - ) - exitcode = bclRunner.wait() - if exitcode == 0: - logging.info("bclConvert exit {}".format(exitcode)) + if flowcell.succesfullrun != 'SuccessfullyCompleted': + print("In failure if.") Path( - os.path.join(outputFolder, 'bclconvert.done') + os.path.join(outputFolder, 'run.failed') ).touch() - if flowcell.sequencer == 'MiSeq': - if differentialDiagnosis( - outputFolder, - sampleSheet.ssDic[outLane]['dualIx'], - ): - logging.info("P5 RC triggered.") - # Purge existing reports. - logging.info("Purge existing Reports folder") - shutil.rmtree( - os.path.join(outputFolder, 'Reports') - ) - bclRunner = Popen( - bclOpts, - stdout=PIPE + mailHome( + "{} ignored".format(flowcell.name), + "RunCompletionStatus is not successfullycompleted.\n" + + "Marked for failure and ignored for the future.", + config, + toCore=True + ) + break + # Write the demuxSheet in the outputfolder + demuxOut = os.path.join(outputFolder, "demuxSheet.csv") + # Don't remake if demuxSheet exist + if not os.path.exists(demuxOut): + logging.info("Writing demuxSheet for {}".format(outLane)) + writeDemuxSheet( + demuxOut, + sampleSheet.ssDic[outLane], + sampleSheet.laneSplitStatus + ) + else: + logging.warning( + "demuxSheet for {} already exists!".format(outLane) + ) + man_mask, man_df, man_dualIx, man_mmdic = readDemuxSheet( + demuxOut + ) + if ( + sampleSheet.ssDic[outLane]['mismatch'] != man_mmdic + ): + logging.info( + "mismatch dic is changed from {} into {}".format( + sampleSheet.ssDic[outLane]['mismatch'], + man_mmdic ) - exitcode = bclRunner.wait() - logging.info( - "bclConvert P5fix exit {}".format(exitcode) + ) + sampleSheet.ssDic[outLane]['mismatch'] = man_mmdic + # if mask is changed, update: + # Mask + if ( + 'mask' in sampleSheet.ssDic[outLane] + and man_mask != sampleSheet.ssDic[outLane]['mask'] + ): + logging.info( + "Mask is changed from {} into {}.".format( + sampleSheet.ssDic[outLane]['mask'], + man_mask ) - # Update the sampleSheet with proper RC'ed indices. - sampleSheet.ssDic[outLane][ - 'sampleSheet' - ] = matchingSheets( - sampleSheet.ssDic[outLane]['sampleSheet'], - readDemuxSheet(demuxOut, what='df') + ) + sampleSheet.ssDic[outLane]['mask'] = man_mask + # dualIx status + if ( + 'dualIx' in sampleSheet.ssDic[outLane] + and man_dualIx != sampleSheet.ssDic[outLane]['dualIx'] + ): + logging.info( + "dualIx is changed from {} into {}.".format( + sampleSheet.ssDic[outLane]['dualIx'], + man_dualIx ) - sampleSheet.ssDic[outLane]['P5RC'] = True + ) + sampleSheet.ssDic[outLane]['dualIx'] = man_dualIx + + # sampleSheet + sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( + sampleSheet.ssDic[outLane]['sampleSheet'], + man_df + ) + # Check for 'bak file' existence. + if os.path.exists(demuxOut + '.bak'): + sampleSheet.ssDic[outLane]['P5RC'] = True else: sampleSheet.ssDic[outLane]['P5RC'] = False - else: - logging.critical("bclConvert exit {}".format(exitcode)) - mailHome( - outLane, - 'BCL-convert exit {}. Investigate.'.format( - exitcode - ), - config, - toCore=True - ) - sys.exit(1) + # Don't run bcl-convert if we have the touched flag. + if not os.path.exists( + os.path.join(outputFolder, 'bclconvert.done') + ): + # Run bcl-convert + bclOpts = [ + config['software']['bclconvert'], + '--output-directory', outputFolder, + '--force', + '--bcl-input-directory', flowcell.bclPath, + '--sample-sheet', demuxOut, + '--bcl-num-conversion-threads', "20", + '--bcl-num-compression-threads', "20", + "--bcl-sampleproject-subdirectories", "true", + ] + if not sampleSheet.laneSplitStatus: + bclOpts.append('--no-lane-splitting') + bclOpts.append('true') + logging.info("Starting BCLConvert") + logging.info(" ".join(bclOpts)) + bclRunner = Popen( + bclOpts, + stdout=PIPE + ) + exitcode = bclRunner.wait() + if exitcode == 0: + logging.info("bclConvert exit {}".format(exitcode)) + Path( + os.path.join(outputFolder, 'bclconvert.done') + ).touch() + if flowcell.sequencer == 'MiSeq': + if differentialDiagnosis( + outputFolder, + sampleSheet.ssDic[outLane]['dualIx'], + ): + logging.info("P5 RC triggered.") + # Purge existing reports. + logging.info("Purge existing Reports folder") + shutil.rmtree( + os.path.join(outputFolder, 'Reports') + ) + bclRunner = Popen( + bclOpts, + stdout=PIPE + ) + exitcode = bclRunner.wait() + logging.info( + "bclConvert P5fix exit {}".format(exitcode) + ) + # Update the sampleSheet with proper RC'ed indices. + sampleSheet.ssDic[outLane][ + 'sampleSheet' + ] = matchingSheets( + sampleSheet.ssDic[outLane]['sampleSheet'], + readDemuxSheet(demuxOut, what='df') + ) + sampleSheet.ssDic[outLane]['P5RC'] = True + else: + sampleSheet.ssDic[outLane]['P5RC'] = False + else: + logging.critical("bclConvert exit {}".format(exitcode)) + mailHome( + outLane, + 'BCL-convert exit {}. Investigate.'.format( + exitcode + ), + config, + toCore=True + ) + sys.exit(1) - logging.info("Parsing stats for {}".format(outLane)) - sampleSheet.ssDic[outLane]['sampleSheet'] = parseStats( - outputFolder, - sampleSheet.ssDic[outLane]['sampleSheet'] - ) - return (0) + logging.info("Parsing stats for {}".format(outLane)) + sampleSheet.ssDic[outLane]['sampleSheet'] = parseStats( + outputFolder, + sampleSheet.ssDic[outLane]['sampleSheet'] + ) + return (0) diff --git a/src/dissectBCL/dissect.py b/src/dissectBCL/dissect.py index 9a7e5e1..0754ae7 100644 --- a/src/dissectBCL/dissect.py +++ b/src/dissectBCL/dissect.py @@ -84,6 +84,9 @@ def main(config): outBaseDir=config['Dirs']['outputDir'], origSS=os.path.join(flowcellDir, 'SampleSheet.csv'), runInfo=os.path.join(flowcellDir, 'RunInfo.xml'), + runCompStat=os.path.join( + flowcellDir, 'RunCompletionStatus.xml' + ), logFile=logFile, config=config ) @@ -108,59 +111,60 @@ def main(config): config ) inspect(sampleSheet) - - # postmux - exitStats['postmux'] = postmux( - flowcell, - sampleSheet, - config - ) - - # transfer data - for outLane in sampleSheet.ssDic: - # Copy over files. - transferTime, shipDic = fakeNews.shipFiles( - os.path.join( - flowcell.outBaseDir, - outLane - ), - config - ) - exitStats[outLane] = shipDic - # Push stats to parkour. - exitStats[outLane]['pushParkour'] = fakeNews.pushParkour( - flowcell.flowcellID, + # Break if the sequencing failed. + if not exitStats['demux'] == 'sequencingfailed': + # postmux + exitStats['postmux'] = postmux( + flowcell, sampleSheet, - config, - flowcell.bclPath + config ) - # Create diagnosis + parse QC stats - drHouse = initClass( - os.path.join( - flowcell.outBaseDir, - outLane - ), - flowcell.startTime, - sampleSheet.flowcell, - sampleSheet.ssDic[outLane], - transferTime, - exitStats, - config['Dirs']['baseDir'] + + # transfer data + for outLane in sampleSheet.ssDic: + # Copy over files. + transferTime, shipDic = fakeNews.shipFiles( + os.path.join( + flowcell.outBaseDir, + outLane + ), + config + ) + exitStats[outLane] = shipDic + # Push stats to parkour. + exitStats[outLane]['pushParkour'] = fakeNews.pushParkour( + flowcell.flowcellID, + sampleSheet, + config, + flowcell.bclPath ) - inspect(drHouse) - # Create email. - subject, _html = drHouse.prepMail() - # Send it. - mailHome(subject, _html, config) - Path( + # Create diagnosis + parse QC stats + drHouse = initClass( os.path.join( flowcell.outBaseDir, - outLane, - 'communication.done' + outLane + ), + flowcell.startTime, + sampleSheet.flowcell, + sampleSheet.ssDic[outLane], + transferTime, + exitStats, + config['Dirs']['baseDir'] ) - ).touch() - # Fix logs. - fakeNews.organiseLogs(flowcell, sampleSheet) + inspect(drHouse) + # Create email. + subject, _html = drHouse.prepMail() + # Send it. + mailHome(subject, _html, config) + Path( + os.path.join( + flowcell.outBaseDir, + outLane, + 'communication.done' + ) + ).touch() + # Fix logs. + fakeNews.organiseLogs(flowcell, sampleSheet) else: print("No flowcells found. Go back to sleep.") sleep(60*60) diff --git a/src/dissectBCL/fakeNews.py b/src/dissectBCL/fakeNews.py index 92e33ef..46156fc 100644 --- a/src/dissectBCL/fakeNews.py +++ b/src/dissectBCL/fakeNews.py @@ -18,6 +18,7 @@ import numpy as np import interop import logging +import pathlib def getDiskSpace(outputDir): @@ -312,6 +313,7 @@ def multiQC_yaml(config, flowcell, ssDic, project, laneFolder): ) except TypeError: sumReqRound = 'NA' + mqcyml = { "title": project, "custom_logo": config["misc"]["mpiImg"], @@ -343,7 +345,11 @@ def multiQC_yaml(config, flowcell, ssDic, project, laneFolder): 0 ) )} - ] + ], + "section_comments": { + "kraken": config["misc"]['krakenExpl'] + } + } return (mqcyml, mqcData, seqreportData, indexreportData) @@ -506,12 +512,20 @@ def shipFiles(outPath, config): os.system(fexer) shipDic[project] = shipDicStat # Ship multiQC reports. + ''' + seqFacdir/Sequence_Quality_yyyy/Illumina_yyyy/outlane + ''' + yrstr = '20' + outLane[:2] seqFacDir = os.path.join( config['Dirs']['seqFacDir'], + 'Sequence_Quality_{}'.format(yrstr), + 'Illumina_{}'.format(yrstr), outLane ) if not os.path.exists(seqFacDir): - os.mkdir(seqFacDir) + pathlib.Path( + seqFacDir + ).mkdir(parents=True, exist_ok=True) for qcRepo in glob.glob( os.path.join(outPath, 'Project_*', 'multiqc_report.html') ): diff --git a/src/dissectBCL/misc.py b/src/dissectBCL/misc.py index d137daa..67a99a8 100644 --- a/src/dissectBCL/misc.py +++ b/src/dissectBCL/misc.py @@ -119,6 +119,10 @@ def getNewFlowCell(config, fPath=None): os.path.join( outBaseDir, flowcellName + '*', 'fastq.made' ) + ) and not glob.glob( + os.path.join( + outBaseDir, flowcellName + '*', 'run.failed' + ) ): return (flowcellName, flowcellDir) return (None, None) diff --git a/src/tools/prep_contaminome.py b/src/tools/prep_contaminome.py index db103dc..5aa844c 100644 --- a/src/tools/prep_contaminome.py +++ b/src/tools/prep_contaminome.py @@ -11,14 +11,14 @@ ignore_chrs = { 'human': ['NC_012920.1'], # human mito 'mouse': ['NC_005089.1'], # mouse mito - 'drosophila': ['NC_024511.2'], # fly mito + 'fly': ['NC_024511.2'], # fly mito 'aedes-aegypti': ['NC_035159.1'] # aedes mito } rrna_mask = [ ('human', 'humanrrna'), ('mouse', 'mouserrna'), - ('drosophila', 'flyrrna'), + ('fly', 'flyrrna'), ('aedes-aegypti', 'aedesaegyptirrna') ] @@ -39,7 +39,7 @@ 'progrp': [14, 5, 'family'], 'human': [9606, 9, 'species'], 'mouse': [10090, 10, 'species'], - 'drosophila': [7227, 11, 'species'], + 'fly': [7227, 11, 'species'], 'aedes-aegypti': [7159, 13, 'species'], 'sea-lamprey': [7757, 13, 'species'], 'japanese-medaka': [8090, 13, 'species'],