From 471a89e3abd27fb50d6dd6538b0b5c8d380c1f08 Mon Sep 17 00:00:00 2001 From: Maximilian Haeussler Date: Mon, 10 Sep 2018 17:05:21 +0200 Subject: [PATCH] reviving old cpPrep temporarily --- old.cbPrep | 1986 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1986 insertions(+) create mode 100755 old.cbPrep diff --git a/old.cbPrep b/old.cbPrep new file mode 100755 index 00000000..a54acde5 --- /dev/null +++ b/old.cbPrep @@ -0,0 +1,1986 @@ +#!/usr/bin/env python + +# convert/process single-cell data into the format required by cellBrowser.js + +import logging, sys, optparse, re, unicodedata, string, marshal, socket +import json, math, shutil, os, platform, zlib, glob, operator +from time import gmtime, strftime +from collections import defaultdict, namedtuple, Counter +from os.path import join, basename, dirname, isfile, isdir, abspath, splitext +from os import makedirs +import gzip + +# command line options +options = None + +# directory to static data files, e.g. gencode tables +dataDir = join(dirname(__file__), "static") + +# geneId -> symbol +geneToSym = None + +# this is the default list of genes to load and show +# geneId -> annotation (some string) +geneIdToAnnot = None + +# metaVal -> color dict +#colors = None + +# name of expression matrix file +exprFname = None +# dict: geneId -> cellId -> float +exprMatrix = None + +# reduce precision of JSON floating number encoding +# http://stackoverflow.com/questions/1447287/format-floats-with-standard-json-module +from json import encoder +encoder.FLOAT_REPR = lambda o: format(o, '.3f') + +# name of dataset, specified with options.datasetName +datasetName = None + +DATASETDESCNAME = "description.json" + + +# try to load T-ETE +teteLoaded = False +try: + import numpy as np + import tete + teteLoaded = True +except: + logging.error("Could not load T-ETE. T-ETE will be disabled. Missing dependencies? Are numpy and sklearn installed?") + +# ==== functions ===== +def parseArgs(): + " setup logging, parse command line arguments and options. -h shows auto-generated help page " + parser = optparse.OptionParser("""usage: %prog [options] command -m metaData -e exprMatrix -o outDir - " + "process RNA-seq (single-cell or bulk) expression matrices into a directory of html files to view them + + commands: + - 'coords': import existing coordinates, if you already have (x,y)-coordinates for each cell + and also expression values. + Example: %prog coords -m meta.tsv -c seuratCoords.tsv -e geneExpression.tsv -g defaultGenes.txt -l "Cluster" -o ~/public_html/cellBrowser/ + - coords.tsv has to be in the format (cellId,x,y). + - If you do not have a meta table, specify "none" instead. + + - 'matrix': read existing expression matrix and run through Seurat. + Example: %prog matrix -m meta.tsv -e exprMatrix.tsv -o ~/public_html/cellBrowser/ + - Matrix has one row per gene and one column per sample. + - Use --log if the matrix is not in log2 format yet. + - If you do not have a meta table, specify "none" instead. + - if you re-run matrix again, it will read intermediate results from the /build directory + - this means that you need to clear the output directory if you want a full + re-run. + - the clusters will be colored and labeled by the Seurat cluster + + - '10x': like 'matrix' but the input is a directory in 10x CellRanger format + Example: %prog 10x -m meta.tsv -e cellRangerOutputDir/ -o ~/public_html/cellBrowser/10Sample/ + - If you do not have a meta table, specify "none" instead. + + - 'html': copy the js/htm/css files to the output directory + Example: %prog html -o /usr/local/apache/htdocs-max/cellBrowser/dev2 + + To install Seurat 1.4: + echo 'export R_LIBS_USER=$HOME/R' >> ~/.bashrc + source ~/.bashrc + mkdir -p $R_LIBS_USER + wget https://github.com/satijalab/seurat/archive/v1.4.0.tar.gz + R CMD INSTALL v1.4.0.tar.gz -l $R_LIBS_USER + """) + + parser.add_option("-e", "--exprMatrix", dest="exprMatrix", action="store", + help="gene-cell expression matrix file, .gz is OK") + parser.add_option("-m", "--metaData", dest="metaData", action="store", + help="meta data table file, .gz is OK") + + parser.add_option("-o", "--outDir", dest="outDir", action="store", + help="output dir to write .json files to. Note that under this directory a subdirectory 'build' will be created. It contains temporary files of the pipelines. You can delete it once you do not plan to re-run the json build again. ") + + parser.add_option("-c", "--coords", dest="coords", action="append", help="tab-sep table with cell coordinates, format: metaId, x, y. Can be specified multiple times, if you have multiple coordinate files. Default label is filename, change config.json to change label.") + + parser.add_option("-g", "--genes", dest="genes", action="store", + help="default preloaded genes file. Can be a simple list of gene symbols, one per line. Or ''. Annotation is used for mouse-overs on gene symbols.") + + parser.add_option("-p", "--clusterSpec", dest="clusterSpec", action="append", + help="Tab-sep table with cluster specific genes. Format is . Can be specified multiple times. Default label is part of filename before first dot. Modify config.json to change labels.") + + parser.add_option("-l", "--clusterLabel", dest="clusterLabel", action="store", help="default field with the cluster label, from the meta data table.") + + parser.add_option("-n", "--name", + dest="datasetName", + action="store", help="Human-readable label for dataset. The default is the parent directory name of the matrix file path.") + + parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") + + parser.add_option("", "--metaColumn", + dest="metaColumn", action="store", + help="the name of the column in meta data table that connects the meta data to the gene expression table. Common values are 'id', 'sampleId', 'cellId', etc. Default is the first column of the meta data table.") + + parser.add_option("", "--colors", + dest="colors", action="store", + help="a tab-sep file with two columns, a meta value and a color. Can be used e.g. for color assignments to clusters") + + parser.add_option("-s", "--symTable", + dest="symTable", + action="store", help="use a table (transId, geneId, symbol) to convert transcript or gene identifiers in the expression matrix to symbols, default %default. Specify '-s none' if the input expression file is already using gene symbols.", + default=join(dataDir, "gencode22.ens79-80.tab")) + + parser.add_option("", "--skip", + dest="skip", + action="store_true", help="just skip invalid gene identifiers in matrix, do not stop", + default=join(dataDir, "gencode22.ens79-80.tab")) + + parser.add_option("", "--log2", + dest="doLog", + action="store_true", help="only when 'matrix': log2 the matrix before processing it") + + parser.add_option("", "--test", + dest="test", + action="store_true", help="run doctests") + + (options, args) = parser.parse_args() + + if options.test: + import doctest + doctest.testmod() + sys.exit(0) + + if args==[]: + parser.print_help() + exit(1) + + if options.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + return args, options + +def makeTmpDirFor(outDir): + " make outDir/build and return " + tmpDir = join(outDir, "build") + if not isdir(tmpDir): + os.makedirs(tmpDir) + logging.info('Created %s' % tmpDir) + return tmpDir + +def errAbort(msg): + logging.error(msg) + sys.exit(1) + +#class Object: + # strange Python trickery to allow something that looks like a struct + #pass + +def writeList(list, fname): + " write list to file, one value per line " + logging.debug("Writing %d elements to %s, first element is %s" % (len(list), fname, list[0])) + ofh = open(fname, "w") + for l in list: + ofh.write("%s\n"%l) + ofh.close() + +def parseList(fname): + " write list to file, one value per line " + ifh = open(fname) + l = ifh.read().splitlines() + logging.debug("Read %d elements from %s" % (len(l), fname)) + ofh = open(fname, "w") + return l + +def openFile(fname): + if fname.endswith(".gz"): + fh = gzip.open(fname) + else: + fh = open(fname) + return fh + +def parseDict(fname): + """ parse text file in format keyvalue and return as dict key->val """ + d = {} + + fh = openFile(fname) + + for line in fh: + key, val = line.rstrip("\n").split("\t") + d[key] = val + return d + +def readGeneToSym(fname): + " given a file with geneId,symbol return a dict geneId -> symbol. Strips anything after . in the geneId " + if fname.lower()=="none": + return None + + logging.info("Reading gene,symbol mapping from %s" % fname) + + # Jim's files and CellRanger files have no headers, they are just key-value + line1 = open(fname).readline() + if "geneId" not in line1: + d = parseDict(fname) + # my new gencode tables contain a symbol for ALL genes + elif line1=="transcriptId\tgeneId\tsymbol": + for row in lineFileNextRow(fname): + if row.symbol=="": + continue + d[row.geneId.split(".")[0]]=row.symbol + # my own files have headers + else: + d = {} + for row in lineFileNextRow(fname): + if row.symbol=="": + continue + d[row.geneId.split(".")[0]]=row.symbol + return d + +def convIdToSym(geneId): + if geneToSym is None: + return geneId + else: + return geneToSym[geneId] + +def readGeneIds(fname, geneToSym): + """ input tab-sep file has format symbol, annotationString + returns geneId -> annotation + Checks if symbol is really valid and maps sym -> geneId using geneToSym. + deprecated: Does not do any file parsing if fname is 'none' and returns all valid geneIds instead. + """ + if fname is None: + logging.warn("You have not supplied a marker gene list - all genes will be used") + return None + + if fname and fname.lower()=="none": + logging.info("No marker gene list, using all genes. No gene annotations available.") + if geneToSym==None: + logging.info("Using None for geneIds, as no gene symbol mapping available either") + return None + + ret = {} + for geneId, sym in geneToSym.iteritems(): + ret[sym] = "" + return ret + + symToGenes = None + if geneToSym is not None: + symToGenes = defaultdict(list) + for geneId, sym in geneToSym.iteritems(): + symToGenes[sym].append(geneId) + + logging.info("Reading marker gene list from %s" % fname) + ifh = open(fname) + line1 = ifh.readline() + fs = line1.rstrip("\n").split("\t") + if not "symbol" in line1: + # seek to 0 again if line1 doesn't like like a header line + ifh = open(fname) + + geneToAnnot = {} + showNoAnnotWarn = False + for line in ifh: + if line.startswith("#") or line.startswith(" "): + continue + fs = line.rstrip("\n").split("\t") + if len(fs)==1: + sym = fs[0] + annot = "" + showNoAnnotWarn = True + else: + sym, annot = fs + + if geneToAnnot is not None and sym in geneToAnnot: + logging.warn("%s appears twice in marker list: %s" % (sym, line)) + continue + if symToGenes is not None: + if sym not in symToGenes: + logging.warn("The marker symbol %s cannot be mapped to a geneID. Ignoring this gene." % sym) + continue + if len(symToGenes[sym])!=1: + logging.warn("The gene symbol %s from the default marker file is associated with more than one gene identifier: %s" % \ + (sym, " ".join(symToGenes[sym]))) + geneId = symToGenes[sym][0] + logging.warn("Using only the first gene: %s" % geneId) + else: + geneId = symToGenes[sym][0] + else: + geneId = sym + geneToAnnot[geneId] = annot + + if showNoAnnotWarn: + logging.info("The gene info file %s contains only a single column. " + "This is OK, but note that you can have optional annotation " + "text in this file, as a second column. Annotation text can be" + "something like 'known cancer marker' or anything else you may" + "want to show in the viewer." % fname) + + return geneToAnnot + +def getCirmStaticFile(relPath): + """ make relPath relative to CIRM static data directory. + The base path can be set with the 'CIRM' environment variable. + """ + if platform.node()=="hgwdev": + CIRMPATH = "/hive/groups/cirm/" + else: + CIRMPATH = "/pod/pstore/" + return join(os.getenv("CIRM", CIRMPATH), relPath) + +def lineFileNextRow(inFile): + """ + parses tab-sep file with headers in first line + yields collection.namedtuples + strips "#"-prefix from header line + """ + + if isinstance(inFile, str): + fh = openFile(inFile) + else: + fh = inFile + + line1 = fh.readline() + line1 = line1.strip("\n").lstrip("#") + headers = line1.split("\t") + headers = [re.sub("[^a-zA-Z0-9_]","_", h) for h in headers] + headers = [re.sub("^_","", h) for h in headers] # remove _ prefix + #headers = [x if x!="" else "noName" for x in headers] + if headers[0]=="": # R does not name the first column by default + headers[0]="rowName" + + if "" in headers: + logging.error("Found empty cells in header line of %s" % inFile) + logging.error("This often happens with Excel files. Make sure that the conversion from Excel was done correctly. Use cut -f-lastColumn to fix it.") + assert(False) + + filtHeads = [] + for h in headers: + if h[0].isdigit(): + filtHeads.append("x"+h) + else: + filtHeads.append(h) + headers = filtHeads + + + Record = namedtuple('tsvRec', headers) + for line in fh: + if line.startswith("#"): + continue + line = line.decode("latin1") + # skip special chars in meta data and keep only ASCII + line = unicodedata.normalize('NFKD', line).encode('ascii','ignore') + line = line.rstrip("\n").rstrip("\r") + fields = string.split(line, "\t", maxsplit=len(headers)-1) + try: + rec = Record(*fields) + except Exception, msg: + logging.error("Exception occured while parsing line, %s" % msg) + logging.error("Filename %s" % fh.name) + logging.error("Line was: %s" % line) + logging.error("Does number of fields match headers?") + logging.error("Headers are: %s" % headers) + raise Exception("header count: %d != field count: %d wrong field count in line %s" % (len(headers), len(fields), line)) + yield rec + +def convTabToMarshal(matrixFname, marshFname, doLog): + " convert a genes-down-cells-right matrix to a .marshal file. The version number is stripped from the geneId. " + #marshFname = matrixFname+".marshal" + cellTransCounts = defaultdict(dict) + logging.info("Converting %s to %s" % (matrixFname, marshFname)) + logging.info("Reading %s" % matrixFname) + + ifh = openFile(matrixFname) + + cellNames = ifh.readline().rstrip("\n").rstrip("\r").split("\t")[1:] + lNo = 0 + skippedIds = [] + for line in ifh: + lNo += 1 + fs = line.rstrip("\n").rstrip("\r").split("\t") + + if(len(fs)!=len(cellNames)+1): + print ("Error - incorrect column count: line %d, header columns %d, line columns %d, first three fields: %s" % (lNo, len(cellNames)+1, len(fs), fs[:3])) + assert(False) # have to abort when the input file format is wrong + + assert(len(fs)==len(cellNames)+1) + geneId = fs[0] + if "." in geneId: + geneId = geneId.split(".")[0] + if geneToSym is not None and geneId not in geneToSym: + if options.skip: + skippedIds.append(geneId) + continue + if not geneId.startswith("ENS"): + logging.warn("when reading expression matrix, the gene ID '%s' does not start with 'ENS'. Looks like the gene matrix does not use Ensembl gene identifiers" % geneId) + logging.error("Could not find Ensembl geneId '%s' in file %s. Invalid Gencode/Ensembl version? Supply the right file for -s. If this is not an Ensembl Gene ID, but already a symbol, run the script with '-s none'. If you know that a few genes are invalid (e.g. not primary assembly genes), run the script with --skip" % (geneId, options.symTable)) + sys.exit(1) + + for cellName, count in zip(cellNames, fs[1:]): + if count=="NA": + continue + count = float(count) + if count!=0.0: + val = float(count) + if doLog: + val = math.log(val+1, 2) + if val > 500: + logging.error("Found very high value in expression matrix: %f" % val) + logging.error("This is unlikely to be a matrix of log'ed values. Rerun with --log.") + sys.exit(1) + + cellTransCounts[cellName][geneId] = val + + if len(skippedIds) != 0: + logging.warn("In matrix file %s, in the first column, %d geneIds were not found in file %s. They were removed." % (matrixFname, len(skippedIds), options.symTable)) + logging.warn("Ten examples of the skipped IDs are: %s" % skippedIds[:10]) + + cellTransCounts = dict(cellTransCounts) + + logging.info("Writing %s" % marshFname) + exprMat = marshal.dump(cellTransCounts, open(marshFname, "w")) + +def parseMatrix(matrixFname, buildDir, doLog): + """ + input: tab-sep file with one gene per row and one cellId per column + returns: matrix as dict cellName -> transcriptId or geneId -> count + side effect: creates .marshal which is faster to load when called next time. + """ + if doLog: + marshFname = join(buildDir,"matrix.log2.marshal") + else: + marshFname = join(buildDir, "matrix.marshal") + + if not isfile(marshFname): + convTabToMarshal(matrixFname, marshFname, doLog) + + logging.info("Reading %s" % marshFname) + exprMat = marshal.load(open(marshFname)) + return exprMat + +def parseCoords(fname, validIds=None): + """ parse tsv file in format cellId, x, y and return as list of (cellId, x, y) + Flip the y coordinates to make it more look like in R. + """ + logging.info("Parsing %s" % fname) + coords = [] + maxY = 0 + skipCount = 0 + for row in lineFileNextRow(fname): + assert(len(row)==3) # coord file has to have three rows (cellId, x, y), we just ignore the headers + cellId = row[0] + if validIds: + if cellId not in validIds: + skipCount +=1 + continue + x = float(row[1]) + y = float(row[2]) + maxY = max(maxY, y) + coords.append( (cellId, x, y) ) + + newCoords = [] + for cellId, x, y in coords: + newY = maxY - y + newCoords.append( (cellId, x, newY) ) + + if skipCount != 0: + logging.warn("Ignored %d coordinates, no meta data for these" % (skipCount)) + return newCoords + +def parsePca(pcaFname): + """ parse the scores from a R PCA file and return as a list of tuples (cellId, pc1, pc2, pc3, pc4 ... ) + """ + logging.info("Parsing %s for Principal Components plot" % pcaFname) + data = [] + for row in lineFileNextRow(pcaFname): + data.append( (row.rowName, float(row.PC1), float(row.PC2), float(row.PC3), float(row.PC4), float(row.PC5) ) ) + return data + +def mostRelevantFields(meta): + " remove meta fields that have too few or too many different values " + fields = list(sorted(meta.values()[0].keys())) + + # meta data - make a list + colVals = defaultdict(list) + for cellId, infoDict in meta.iteritems(): + for field in fields: + if field==options.meta: + continue + val = infoDict[field] + colVals[field].append(val) + + # filter down the fields to things that make sense + filtFields = [] + for field, vals in colVals.iteritems(): + valCount = len(set(vals)) + if valCount > 3 and not valCount==len(vals): + filtFields.append(field) + + return filtFields + +def saveJson(data, fname, pretty=False): + if pretty: + json.dump(data, open(fname, "w"), sort_keys=True, indent=4, separators=(', ', ': ')) + else: + json.dump(data, open(fname, "w"), separators=(',', ":")) + +def getDecilesList(values): + """ given a list of values, return the 11 values that define the 10 ranges for the deciles + >>> l = [-4.059,-3.644,-3.184,-3.184,-3.184,-3.059,-2.943,-2.396,-2.322,-2.252,-2.252,-2.252,-2.12,-2.12,-1.943,-1.943,-1.69,-1.556,-1.322,-1.184,-1.12,-0.862,-0.862,-0.837,-0.837,-0.667,-0.644,-0.535,-0.454,-0.234,-0.184,0.084,0.084,0.151,0.299,0.31,0.444,0.485,0.575,0.632,0.66,0.748,0.824,1.043,1.098,1.176,1.239,1.356,1.411,1.521,1.526,1.609,1.748,1.77,1.832,1.864,1.88,2.081,2.094,2.242,2.251,2.331,2.376,2.664,2.718,2.787,2.858,2.928,2.978,3.011,3.093,3.144,3.157,3.245,3.352,3.444,3.462,3.479,3.609,3.699,3.714,3.795,3.811,3.857,3.871,3.903,3.914,3.983,3.985,3.986,4.006,4.056,4.063,4.156,4.179,4.221,4.35,4.352,4.361,4.372,4.38,4.427,4.432,4.447,4.45,4.459,4.516,4.521,4.527,4.611,4.636,4.644,4.659,4.662,4.81,4.866,4.882,4.902,4.916,4.918,5.01,5.018,5.02,5.133,5.179,5.186,5.205,5.218,5.245,5.25,5.262,5.263,5.365,5.374,5.412,5.421,5.437,5.45,5.453,5.484,5.501,5.527,5.527,5.534,5.561,5.566,5.567,5.624,5.646,5.662,5.691,5.705,5.706,5.712,5.87,5.909,5.912,5.92,5.978,5.978,6.012,6.086,6.086,6.106,6.114,6.155,6.168,6.171,6.179,6.221,6.287,6.317,6.324,6.354,6.364,6.385,6.389,6.397,6.427,6.439,6.49,6.513,6.517,6.521,6.557,6.578,6.579,6.648,6.703,6.756,6.887,6.953,7.042,7.155,7.194,7.21,7.225,7.249,7.254,7.291,7.382,7.397,7.504,7.603,7.65,7.861,8.524] + >>> getDecilesList(l) + [-4.059, -1.12, 0.748, 2.331, 3.811, 4.447, 5.133, 5.561, 6.114, 6.578, 8.524] + """ + if len(values)==0: + return None + + values.sort() + binSize = float(len(values)-1) / 10.0; + + # get deciles from the list of sorted values + deciles = [] + pos = 0 + for i in range(11): # 10 bins means that we need 11 ranges + pos = int (binSize * i)# ? could this ever exceed len(values) due to floating point issues? + if pos > len(values): + logging.warn("decile exceeds 10, binSize %d, i %d, len(values) %d" % (binSize, i, len(values))) + pos = len(values) + deciles.append ( values[pos] ) + return deciles + +def getDecilesMatrix(exprMatrix, geneIds): + """ given a dict of geneId -> list of expression numbers, return a dict + geneId -> list of the 10 deciles of these numbers """ + + if exprMatrix is None: + return None + + logging.info("Getting deciles for gene expression") + + ret = {} + noneCount = 0 + if geneIds is None: + logging.info("Got no gene/symbol mapping, using all input genes from matrix") + geneIds = getNonZeroGenes(exprMatrix) + + for geneId in geneIds: + cleanVals = [] + for cellId, vals in exprMatrix.iteritems(): + # remove the null values + val = vals.get(geneId) + if val != None: + cleanVals.append(val) + + decList = getDecilesList(cleanVals) + if decList is None: + noneCount += 1 + + if geneToSym is not None: + sym = geneToSym[geneId] + else: + sym = geneId + + ret[sym] = decList + + if noneCount == len(geneIds): + logging.error("No single geneId in the gene->symbol mapping file could be found in the expression matrix.") + logging.error("If you want to use the IDs as they are in the expression matrix, specify '-s none'") + sys.exit(1) + return ret + +def matrixSlice_sparse(exprMatrix, geneIds): + " get only the expression values of some genes out of the matrix, return dict cellId -> index of geneId -> val " + geneExpr = {} + geneIdToIdx = {} + for i, geneId in enumerate(geneIds): + geneIdToIdx[geneId] = i + + for cellId, cellExpr in exprMatrix.iteritems(): + exprDict = {} + for geneId in geneIds: + val = cellExpr.get(geneId, 0) + if val!=None and val!=0: + exprDict[geneIdToIdx[geneId]] = val + #nonNullCount += 1 + geneExpr[cellId] = exprDict + return geneExpr + +def matrixSlice(exprMatrix, geneIds): + geneExpr = {} + " get only the expression values of some genes out of the matrix, return dict cellId -> list of values " + for cellId, cellExpr in exprMatrix.iteritems(): + exprList =[] + for geneId in geneIds: + val = cellExpr.get(geneId, 0) + exprList.append(val) + geneExpr[cellId] = exprList + return geneExpr + +def iterLineOffsets(ifh): + """ parse a text file and yield tuples of (line, startOffset, endOffset). + endOffset does not include the newline, but the newline is not stripped from line. + """ + line = True + start = 0 + while line!='': + line = ifh.readline() + end = ifh.tell()-1 + if line!="": + yield line, start, end + start = ifh.tell() + +def indexMatrix(fname): + """ index a matrix with one gene per line and return dict with + gene symbol -> (file offset, line length) + """ + logging.info("Indexing line offsets of %s" % fname) + ifh = open(fname) + geneToOffset = {} + skipIds = 0 + lineNo = 0 + for line, start, end in iterLineOffsets(ifh): + if start == 0: + symbol = "_header" + else: + geneId, _ = string.split(line, "\t", 1) + if geneToSym is not None: + symbol = geneToSym.get(geneId) + if symbol is None: + skipIds += 1 + continue + else: + symbol = geneId + lineLen = end - start + assert(lineLen!=0) + geneToOffset[symbol] = (start, lineLen) + lineNo += 1 + + logging.info("Skipped %d genes, as they had no gene symbol" % skipIds) + return geneToOffset + +def getGeneExpr(exprMatrix, geneIds): + " prepare the geneExpr object from the expression matrix and the target gene IDs " + geneExpr = {} + + # use sparse encoding of expression vals if it is smaller + logging.info("Reducing gene matrix") + geneExpr = matrixSlice(exprMatrix, geneIds) + + #logging.info("Trying sparse and full gene expression encoding") + #geneExprSparse = matrixSlice_sparse(exprMatrix, geneIds) + + #fullStr = json.dumps(geneExpr) + #sparseStr = json.dumps(geneExprSparse) + + #fullLen = len(fullStr) + #sparseLen = len(sparseStr) + + #fullZLen = len(zlib.compress(fullStr)) + #sparseZLen = len(zlib.compress(sparseStr)) + + #logging.info("Gene expression sparse encoding size: %d, full encoding size: %d" % (sparseLen, fullLen)) + #logging.info("Compressed: sparse encoding: %d, full encoding: %d" % (sparseZLen, fullZLen)) + #if sparseLen < fullLen: + #logging.info("Using sparse") + #geneExpr = geneExprSparse + + return geneExpr + +def saveCoords(coords, coordFname): + " save a list of (id, x, y) tuples to a filename. Only save 3 digits " + logging.debug("writing %d cell coordinates to %s" % (len(coords), coordFname)) + ofh = open(coordFname, "w") + ofh.write("cellId\tx\ty\n") + for cellId, x, y in coords: + ofh.write("%s\t%0.3f\t%0.3f\n" % (cellId, x, y)) + ofh.close() + +def writeTsvCoords(coordDict, pcaCoords, jsonDir): + " write the coordinates to tsv files, one per coordinate algorithm " + coordFiles = [] + + # write all other coords + for (coordType, coordDesc), coordList in coordDict.iteritems(): + coordFname = join(jsonDir, "%s.coords.tsv" % coordType) # "coords" json -> list of x,y,cellId + coordFiles.append( {"shortLabel":coordDesc, "url":basename(coordFname)} ) + if isfile(coordFname): + logging.info("Not writing %s, already exists" % coordFname) + continue + + logging.info("Writing coord data in JSON format to %s" % coordFname) + coords = [] + for cellId, x, y in coordList: + coords.append((cellId, x, y)) + #saveJson(coords, coordFname) + saveCoords(coords, coordFname) + + if pcaCoords is not None: + # write PCA coords + for pcaFname, i, j in ( ("pc12",1,2), ("pc23",2,3), ("pc34",3,4), ("pc45",4,5)): + coordFname = join(jsonDir, "%s.coords.tsv" % pcaFname) # "coords" json -> list of x,y,cellId + if isfile(coordFname): + logging.info("Not writing PCs, %s already exists" % coordFname) + else: + logging.info("Writing PCs to %s" % coordFname) + coords = [] + for row in pcaCoords: + cellId = row[0] + x = row[i] + y = row[j] + coords.append( (cellId, x, y) ) + saveCoords(coords, coordFname) + coordFiles.append( {"shortLabel":"PCA: PC%d/PC%d" % (i, j ), "url":basename(coordFname)} ) + + return coordFiles + +def writeJsonConfig(cellInfo, coordFiles, geneIdToAnnot, fieldAnnots, configFname): + data = {} + data["coordFiles"] = coordFiles + + metaFields = sorted(cellInfo.values()[0].keys()) + if "Seurat Cluster" in metaFields: + del metaFields[metaFields.index("Seurat Cluster")] + metaFields.insert(0, "Seurat Cluster") + else: + logging.info("Could not find a 'Seurat Cluster' field in the meta data") + + data["metaFields"] = metaFields + data["fieldAnnots"] = fieldAnnots + + #geneFields = [] + #if geneIdToAnnot is not None: + # for geneId, annot in geneIdToAnnot.iteritems(): + # if geneId not in matrixGeneIds: + # logging.warn("gene %s in the default gene list (option -g) is not in the expression matrix. " + # "It will be skipped now and will not appear in the gene list" % geneId) + # continue + + # if geneToSym: + # sym = geneToSym[geneId] + # else: + # sym = geneId + # geneFields.append( (sym, annot, geneId) ) + #else: + # for geneId, symbol in geneToSym.iteritems(): + # geneFields.append( (symbol, "", geneId) ) + + #if len(geneFields)>500: + # logging.error("More than 500 marker genes have been selected to be shown by default - are you sure this is intentional? Consider using the -g option") + # sys.exit(1) + + #geneFields.sort() # sort by gene symbol + #data["preloadGenes"] = geneFields + #geneIds = [x[2] for x in geneFields] + + #logging.info("Writing %d gene names: %s ,..." % (len(geneFields), geneFields[:3])) + saveJson(data, configFname, pretty=True) + return metaFields + +def writeJsonMeta(cellInfo, metaFields, metaFname): + # save the meta info + data = {} + + meta = {} + for cellId, infoDict in cellInfo.iteritems(): + row = [] + for field in metaFields: + #row.append(infoDict.get(field, "")) + row.append(infoDict[field]) + meta[cellId] = row + data["meta"] = meta + logging.info("Exporting sample meta data to %s" % metaFname) + saveJson(data, metaFname) + +def writeTsvMeta(cellInfo, metaFields, metaFname): + " save the meta info to a .tsv file " + logging.info("Writing sample meta data to %s" % metaFname) + ofh = open(metaFname, "w") + ofh.write("_id\t") + ofh.write("\t".join(metaFields)) + ofh.write("\n") + + for cellId, infoDict in cellInfo.iteritems(): + row = [cellId] + for field in metaFields: + row.append(infoDict[field]) + ofh.write("\t".join(row)) + ofh.write("\n") + ofh.close() + +def writeJsonGenes(exprMatrix, geneFields, geneIds, geneFname): + if isfile(geneFname): + logging.info("Not writing %s, already exists" % geneFname) + else: + decileData = getDecilesMatrix(exprMatrix, None) + geneExpr = getGeneExpr(exprMatrix, geneIds) + + logging.info("Writing gene expression to %s" % (geneFname)) + data = {"genes": geneFields, "cellExpr": geneExpr, "deciles":decileData} + saveJson(data, geneFname) + +def writeJsonDescription(desc, descFname, coordFiles, markerInfo): + # save the dataset description + if datasetName is not None: + desc["shortLabel"] = datasetName + + desc["coordFiles"] = coordFiles + desc["markers"] = markerInfo + + if isfile(descFname): + logging.info("Not writing dataset description to %s, file already exists" % descFname) + else: + logging.info("Writing new dataset description to %s" % descFname) + saveJson(desc, descFname, pretty=True) + +def writeTsvAndJson(exprMatrix, coordDict, pcaCoords, cellInfo, geneToSym, geneIdToAnnot, desc, fieldAnnots, markerInfo, jsonDir): + " write coordinates, expression and meta data to jsonDir " + #pcaFname = join(jsonDir, "pcaCoords.json") # 'pcaCoords' -> list of (cellId, (pc1, pc2, pc3, pc4, pc5)) + #geneFname = join(jsonDir, "preloadGenes.json") # "geneExpr" -> cellId -> list of floats + configFname = join(jsonDir, "config.json") # "metaFields" and "exprFields" + descFname = join(jsonDir, "description.json") # three fields: label, URL and description + + coordFiles = writeTsvCoords(coordDict, pcaCoords, jsonDir) + metaFields = writeJsonConfig(cellInfo, coordFiles, geneIdToAnnot, fieldAnnots, configFname) + + #writeJsonGenes(exprMatrix, geneFields, geneIdToAnnot, geneFname) + #writeJsonMeta(cellInfo, metaFields, metaFname) + metaFname = join(jsonDir, "meta.tsv") + writeTsvMeta(cellInfo, metaFields, metaFname) + writeJsonDescription(desc, descFname, coordFiles, markerInfo) + +def parseMeta(fname, cellIds, mustHaveField=None): + """ parse a tab-sep file where one column is a cellId and the others are attributes for it. + returns a dict cellId -> dict with attributes. + if fname is 'none': create fake meta data by using the headers of the file matrixFname + also returns fieldAnnots a dict of fieldName -> dict , e.g. "tooMany":1 or "range":decileList + """ + if fname.lower()=="none": + #cellIds = exprMatrix.keys() + meta = {} + for cellId in cellIds: + meta[cellId] = {"Sample ID" : cellId} + return meta + + idCol = options.metaColumn + meta = {} + fieldVals = defaultdict(Counter) # counts of all distinct values, one per field. + for row in lineFileNextRow(fname): + if idCol==None: + idCol = row._fields[0] + logging.info("--metaColumn not specified. Using the first field, '%s', as the sample identifier field" % idCol) + + + rowDict = row._asdict() + if idCol not in rowDict: + errAbort("The meta data file %s does not have a column named '%s'. First 50 available columns: %s. Use --metaColumn to specify a different name" % (fname, idCol, ",".join(row._fields[:50]))) + + if mustHaveField and mustHaveField not in rowDict: + errAbort("The meta data file %s does not contain a column named '%s'. First 50 available columns: %s. Specify a different cluster label field and run again." % (fname, mustHaveField, ",".join(row._fields[:50]))) + + for key, val in rowDict.iteritems(): + fieldVals[key][val]+=1 + + cellId = rowDict[idCol] + assert(cellId not in rowDict) # cellId must not appear twice in meta file + del rowDict[idCol] # remove the cell id itself + meta[cellId] = rowDict + + # now create the fieldAnnots for all fields + fieldAnnots = defaultdict(dict) + for fieldName, valCounts in fieldVals.iteritems(): + # get the deciles for all-numeric fields + allNumeric = True + vals = [] + for val in valCounts: + try: + floatVal = float(val) + vals.append(val) + except: + allNumeric = False + break + if allNumeric and len(valCounts)>30: + deciles = getDecilesList(vals) + fieldAnnots[fieldName]["deciles"] = deciles + logging.info("field '%s' has only numeric values, deciles: %s" % (fieldName, deciles)) + + # flag fields that have too many different values + if len(valCounts)>=100 and not allNumeric: + fieldAnnots[fieldName]["tooMany"] = True + logging.info("field '%s' has %d different values, too many to color on" % (fieldName, len(valCounts))) + + return meta, fieldAnnots + +def fakeGeneMap(exprMatrix): + " collect a list of all genes in the expression matrix and return dict geneId -> geneId "") " + if exprMatrix is None: + return None + + geneMap = {} + for cellId, cellExpr in exprMatrix.iteritems(): + for geneId, val in cellExpr.iteritems(): + geneMap[geneId] = geneId + return geneMap + +def useOnlyCommonCellIds(meta, exprMatrix): + """ make sure that the cellIds match up between meta, exprMatrix and coords. + Return the arguments as they are, if their cellIds match up. + Otherwise restrict all data to only the common cellIDs and return all arguments. + """ + metaIds = set(meta) + exprIds = set(exprMatrix) + logging.info("CellID-count: meta data=%d, expression matrix=%d" % (len(metaIds), len(exprIds))) + if len(metaIds)==len(exprIds): + return meta, exprMatrix + commonIds = metaIds.intersection(exprIds).intersection(metaIds) + logging.warn("Expression and meta data are not in sync. Restricting analysis to %d cellIDs common to both meta and expression files." % len(commonIds)) + if len(commonIds)==0: + errAbort("There is no overlap between the sample IDs (meta) in the meta data and the sampel Ids in the expression matrix. Cannot continue.") + newMeta = { k: meta[k] for k in commonIds } + newExpr = { k: exprMatrix[k] for k in commonIds } + return newMeta, newExpr + +def useOnlyCommonCellIdsCoords(meta, exprMatrix, coords): + """ make sure that the cellIds match up between meta, exprMatrix and coords. + Return the arguments as they are, if their cellIds match up. + Otherwise restrict all data to only the common cellIDs and return all arguments. + Like function above, but with coords. Hard to make this generic. + """ + metaIds = set(meta) + exprIds = set(exprMatrix) + coordIds = set([x[0] for x in coords]) + logging.info("CellID-count: meta=%d, expression=%d, coords=%s" % (len(metaIds), len(exprIds), len(coordIds))) + if len(metaIds)==len(exprIds)==len(coordIds): + return meta, exprMatrix, coords + + commonIds = metaIds.intersection(exprIds).intersection(coordIds) + logging.warn("input files are not in sync, restricting all files to %d common cellIDs" % len(commonIds)) + if len(commonIds)==0: + logging.error("meta-identifiers between coordinates and meta data do not match at all. Cannot continue.") + logging.error("Example meta IDs: %s" % list(metaIds)[:10]) + logging.error("Example coordinate IDs: %s" % list(coordIds)[:10]) + logging.error("Example expression IDs: %s" % list(exprIds)[:10]) + sys.exit(1) + newMeta = { k: meta[k] for k in commonIds } + newExpr = { k: exprMatrix[k] for k in commonIds } + newCoords = [ row for row in coords if row[0] in commonIds ] + return newMeta, newExpr, newCoords + +def checkDefaultGenes(): + " make sure that all default genes are valid " + global geneIdToAnnot + if geneIdToAnnot is None: + logging.warn("No default gene annotation file specified") + return + errCount = 0 + remGenes = set() + for geneId in geneIdToAnnot: + if not geneId in geneToSym: + logging.warn("Default gene %s does not look like a valid gene symbol" % geneId) + errCount +=1 + remGenes.add(geneId) + + for g in remGenes: + del geneIdToAnnot[g] + + #if errCount > 10 and options.skip: + #raise Exception("too many invalid genes found, use --skip to ignore this") + +def isValidField(meta, fieldName): + " make sure that fieldName really exists, stop if not " + for cellId, cellMeta in meta.iteritems(): + if fieldName not in cellMeta: + raise Exception("the field %s is not a valid meta data field" % fieldName) + +def importCoords(command, metaFname, matrixFname, coordFnames, clusterLabelField, markerInfo, outDir): + " parse coordinates and write all json files to outDir " + buildDir = makeTmpDirFor(outDir) + + exprMatrix = None + cellIds = None + if matrixFname!=None: + exprMatrix = parseMatrix(matrixFname, buildDir, options.doLog) + cellIds = exprMatrix.keys() + + meta, fieldAnnots = parseMeta(metaFname, cellIds) + + validIds = meta + + coordDict = {} + for coordFname in coordFnames: + coords = parseCoords(coordFname, validIds) + coordName = splitext(basename(coordFname))[0] + coordDict[(coordName, coordName)] = coords + + if clusterLabelField: + isValidField(meta, clusterLabelField) + + global geneToSym + if geneToSym is None: + geneToSym = fakeGeneMap(exprMatrix) + + checkDefaultGenes() + + #meta, exprMatrix, coords = useOnlyCommonCellIdsCoords(meta, exprMatrix, coords) + + lines = [] + lines.append("Processing: coordinates supplied by user") + lines.append("Coordinate file: %s" % coordFname) + lines.append("Meta data file: %s" % metaFname) + lines.append("Expression data file: %s" % matrixFname) + lines.append("Directory: %s" % os.getcwd()) + lines.append("Command: %s" % " ".join(sys.argv)) + lines.append("By: %s@%s %s" % (os.getenv('USER'), socket.gethostname(), strftime("%Y-%m-%d %H:%M:%S", gmtime()))) + lines.append("%s" % matrixFname) + longLabel = "
".join(lines) + + shortLabel = datasetName + if shortLabel is None and matrixFname is not None: + shortLabel = basename(dirname(abspath(matrixFname))) + assert(shortLabel is not None) + + desc = { + "shortLabel" : shortLabel, + "longLabel" : longLabel, + "sampleCount" : len(meta) + } + + if clusterLabelField: + desc["clusterLabelField"] = clusterLabelField + desc["clusterField"] = clusterLabelField + + writeTsvAndJson(exprMatrix, coordDict, None, meta, geneToSym, geneIdToAnnot, desc, fieldAnnots, markerInfo, outDir) + + copyMatrixAndIndex(exprMatrix, outDir) + + preloadMatrixFname = join(outDir, "preloadMatrix.tsv") + +def writeMatrix(rows, fname, headers=None): + """ write a list of rows to tab sep file """ + ofh = open(fname, "w") + if headers: + ofh.write("\t".join(headers)) + ofh.write("\n") + + for row in rows: + row = [str(x) for x in row] + ofh.write("\t".join(row)) + ofh.write("\n") + ofh.close() + logging.info("Wrote %s" % fname) + +def encodeCellId(cellId): + " replace invalid chars (for seurat) in cellId " + assert("|" not in cellId) + assert(" " not in cellId) + #cellId = filter(str.isalnum, cellId) # keep only alphanum chhars + #randomStr = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6)) + #cellId = cellId+"_"+randomStr + cellId = cellId.replace("_",".") + cellId = cellId.replace("-",".") + cellId = cellId.replace("+",".") + cellId = cellId.replace("=",".") + cellId = cellId.replace("*",".") + cellId = cellId.replace("/",".") + return cellId + +def sanitizeForR(cellIds): + " R does not accept many identifiers. Sanitize a list of identifiers and return a map saneId -> original ID " + newToOld = {} + for origId in cellIds: + newId = encodeCellId(origId) + assert(newId not in newToOld) # ID must not appear twice after mapping + newToOld[newId] = origId + return newToOld + +def getNonZeroGenes(exprMatrix, buildDir=None, onlyGenes=None): + " given a matrix with cellId -> geneId -> float, return the sorted list of geneIds that have at least one non-0 data point " + # first make a list of all genes, remove those that are always 0 + geneFname = None + if buildDir is not None: + geneFname = join(buildDir, "nonZeroGenes.txt") + if isfile(geneFname): + logging.debug("Getting all non-zero genes from file %s" % geneFname) + return parseList(geneFname) + + logging.debug("Getting all non-zero genes from matrix") + allGenes = set() + notZeroGenes = set() + assert(len(exprMatrix)!=0) # something went seriously wrong with the matrix import + for cellName, geneDict in exprMatrix.iteritems(): + allGenes.update(geneDict) + for gene, val in geneDict.iteritems(): + if val!=0.0: + notZeroGenes.add(gene) + + skipGenes = allGenes - notZeroGenes + if len(skipGenes)!=0: + logging.info("%d genes have expression=0 across in all cells" % len(skipGenes)) + logging.info("sample gene IDs with all-0 values: %s" % list(skipGenes)[:10]) + allGenes = allGenes - skipGenes + + if onlyGenes is not None: + logging.info("Keeping only %d genes: e.g. %s..." % (len(list(onlyGenes)), list(onlyGenes)[:5])) + allGenes = allGenes.intersection(onlyGenes) + + allGenes = list(allGenes) + allGenes.sort() + logging.debug("Found %d non-zero genes in matrix" % len(allGenes)) + assert(len(allGenes)!=0) # no genes left to write. Probably wrong gene Ids in gene ID table + + if geneFname is not None: + writeList(allGenes, geneFname) + + return allGenes + +def dictToGeneMatrix(exprMatrix, onlyGenes=None): + """ transform exprMatrix back to its input format. + + exprMatrix is a matrix sorted by cellId: dict cellId -> transcriptId -> count + this function returns: a matrix sorted by geneId: + list of rows, 1st row has headers, 1st col has geneId + Genes that are always 0 are removed. + """ + allGenes = getNonZeroGenes(exprMatrix, onlyGenes=onlyGenes) + + cellIds = sorted(exprMatrix) + # create a matrix where each line is a cell, each column is a gene + headRow = ["geneId"] + headRow.extend(cellIds) + + rows = [] + rows.append(headRow) + + i=0 + for geneId in allGenes: + newRow = [geneId] + for cellId in cellIds: + val = exprMatrix[cellId].get(geneId, 0) + newRow.append(val) + rows.append(newRow) + i += 1 + + return rows + +def writeMatrixForSeurat(exprMatrix, geneToSym, seuratMatFname): + " rewrite the headers for seurat, it accepts only one format. Export only genes with a symbol. " + logging.info("Writing matrix for Seurat to %s" % seuratMatFname) + seuratMatrix = dictToGeneMatrix(exprMatrix, onlyGenes=geneToSym) + newMatrix = [] + # make the headers conform to the seurat input format Hi__ + newHeaders = ["Gene_Symbol"] + for cellId in seuratMatrix[0][1:]: + cellId = encodeCellId(cellId) + newHeaders.append("Hi_NA_%s" % (cellId)) + + writeMatrix(seuratMatrix[1:], seuratMatFname, headers = newHeaders) + +def makeSeuratScript(seuratMatFname, tsnePath, clusterPath, clusterGenesPath): + " returns a string with the seurat build script " + seuratScript = """ +library(methods) +suppressWarnings(suppressMessages(library(Seurat))) +print("Seurat: Reading data") +nbt.data=read.table("%(seuratMatFname)s",sep="\t",header=TRUE,row.names=1) +nbt=new("seurat",raw.data=nbt.data) +print("Seurat: Setup") +nbt=Setup(nbt,project = "NBT",min.cells = 3,names.field = 2,names.delim = "_",min.genes = 500, do.logNormalize = F, total.expr = 1e4) +print("Seurat: Mean Variant") +print("CIRM Warning: We are not regressing out cell cycle or mitochondrial signal yet, see http://satijalab.org/seurat/pbmc-tutorial.html") +nbt=MeanVarPlot(nbt,y.cutoff = 2,x.low.cutoff = 2,fxn.x = expMean,fxn.y = logVarDivMean) +print("Seurat: PCA") +nbt=PCA(nbt,do.print=FALSE) +print("Seurat: JackStraw") +# MaxH: changed num.replicate from 200 -> 100 +nbt=JackStraw(nbt,num.replicate = 100,do.print = FALSE) + +print("Seurat: PCA Projection") +nbt=ProjectPCA(nbt,do.print=FALSE) + +print("Seurat: Significant genes from PCA") +pcCount=min(ncol(nbt@pca.rot), 30) +nbt.sig.genes=PCASigGenes(nbt,1:pcCount,pval.cut = 1e-5,max.per.pc = 200) +print("Seurat: PCA 2 using sign. genes from PCA 1") +nbt=PCA(nbt,pc.genes=nbt.sig.genes,do.print = FALSE) + +print("Seurat: JackStraw 2 using all sign. genes") +# max: changed num.replicate from 200 -> 100 +nbt=JackStraw(nbt,num.replicate = 100,do.print = FALSE) +print("Seurat: Dimensionality Reduction") +# max: I have decreased the number of iterations from 2000 in the example to 800 +nbt=RunTSNE(nbt,dims.use = 1:11,max_iter=800) + +tsne12 <- FetchData(nbt, c("tSNE_1", "tSNE_2")) +write.table(tsne12, "%(tsnePath)s", quote=FALSE, sep="\t", col.names=NA) + +print("Seurat: Cluster the tSNE data") +clusterCount=8 +nbt=DBClustDimension(nbt,1,2,reduction.use = "tsne",G.use = clusterCount,set.ident = TRUE) +write.table(FetchData(nbt,c("ident")), "%(clusterPath)s", quote=FALSE, sep="\t", col.names=NA) + +# find_all_markers crashes if there is only a single cluster +if (length(levels(nbt@ident))!=1) { + markers.all=FindAllMarkers(nbt,test.use = "roc", do.print = TRUE) + write.table(markers.all, "%(clusterGenesPath)s", quote=FALSE, sep="\t", col.names=NA) +} else { + file.create("%(clusterGenesPath)s"); # create empty file to signal that all is OK. +} + +""" % locals() + return seuratScript + +def runRscript(scriptStr, scriptFname): + """ run an R script given as a string through Rscript """ + scriptFh = open(scriptFname, "w") + scriptFh.write(scriptStr) + scriptFh.close() + logging.info("Wrote %s, running through Rscript" % scriptFh.name) + + cmd = "time Rscript %s" % scriptFh.name + assert(os.system(cmd)==0) + +def parseTsneToJson(seuratToOrig, tsnePath): + """ read the seurat tsne output and return a list of tuple (cellId, x, y) + """ + coordList = [] + + tsneData = [] + notFoundIds = set() + for row in lineFileNextRow(tsnePath): + seuratId = row[0].split("_")[-1] + if seuratId not in seuratToOrig: + notFoundIds.add(seuratId) + continue + cellId = seuratToOrig[seuratId] + x = float(row.tSNE_1) + y = float(row.tSNE_2) + tsneData.append( {"x" : x, "y" : y, "cellId" : cellId } ) + + coordList.append( (cellId, x, y) ) + + if len(notFoundIds)!=0: + logging.warn("%d cellIds were not found after seurat mapping: %s" % (len(notFoundIds), ",".join(notFoundIds))) + + return coordList + +def parseClusters(newToOrigId, clusterPath): + """ convert seurat tsne cluster assignment to dict cellId -> cluster (as a two-digit string) """ + cellToCluster = {} + for row in lineFileNextRow(clusterPath): + cellId = row[0].split("_")[-1] + if cellId not in newToOrigId: + continue + origId = newToOrigId[cellId] + cellToCluster[origId] = "%02d" % (int(row.ident)) + return cellToCluster + +def addToMeta(meta, metaName, newMeta, prefix=""): + " add a dict with cellId -> string to the meta dict " + allCellIds = set(meta.keys()) + doneCellIds = set() + for cellId, val in newMeta.iteritems(): + assert(metaName not in meta[cellId]) + meta[cellId][metaName] = prefix+str(newMeta[cellId]) + doneCellIds.add(cellId) + + missingCellIds = allCellIds - doneCellIds + if len(missingCellIds)!=0: + logging.warn("%d cells/samples have no value for the attribute '%s', setting it to '' now to avoid errors" % (len(missingCellIds), metaName)) + for cellId in missingCellIds: + meta[cellId][metaName] = "" + return meta + +def runSeurat(meta, exprMatrix, tmpDir): + " run seurat over an expression matrix, write intermediate results to tmpDir, return coords and updated meta " + seuratMatFname = join(tmpDir, "seuratInMatrix.tab") + seuratScriptPath = join(tmpDir, "runSeurat.R") + tsnePath = join(tmpDir, "seuratTsne.tab") + clusterPath = join(tmpDir, "seuratClusters.tab") + clusterGenesPath = join(tmpDir, "seuratClusterGenes.tab") # this is the last file written by seurat + + #exprMatrix = parseMatrix(exprFname, options.doLog) + + if isfile(seuratMatFname) and isfile(clusterGenesPath): + logging.info("Not exporting matrix for Seurat, %s already exists" % seuratMatFname) + else: + meta, exprMatrix = useOnlyCommonCellIds(meta, exprMatrix) + writeMatrixForSeurat(exprMatrix, geneToSym, seuratMatFname) + + if not isfile(clusterGenesPath) or not isfile(seuratScriptPath): + seuratScript = makeSeuratScript(seuratMatFname, tsnePath, clusterPath, clusterGenesPath) + runRscript(seuratScript, seuratScriptPath) + else: + logging.info("Not running Seurat, %s already exists" % clusterGenesPath) + + cellIdMap = sanitizeForR(meta.keys()) + + seuratCoords = parseTsneToJson(cellIdMap, tsnePath) + seuratClusters = parseClusters(cellIdMap, clusterPath) + + meta = addToMeta(meta, "Seurat Cluster", seuratClusters, prefix="cluster ") + + return meta, seuratCoords + +def makePcaScript(matrixFname, outFname): + #data = log(data+1) + pcaScript = """data = read.table("%s", header=TRUE, row.names=1) +pca = prcomp(data, scale.=TRUE, center=TRUE) +write.table(pca$x, "%s", sep="\t", quote=FALSE, col.names=NA) +""" % (matrixFname, outFname) + return pcaScript + +def dictToCellMatrix(exprMatrix, onlyGenes): + """ input: dict cellId -> transcriptId -> count + ouput: list of rows, 1st row has headers, 1st col has cellId + The output is a matrix with one cell per row. + """ + allGenes = getNonZeroGenes(exprMatrix, onlyGenes=onlyGenes) + # now create a matrix where each line is a cell, each column is a gene + rows = [] + row1 = ["cellId"] + row1.extend(allGenes) + rows.append(row1) + + i=0 + for cellName, geneDict in exprMatrix.iteritems(): + newRow = [cellName] + for gene in allGenes: + val = geneDict.get(gene, 0) + newRow.append(val) + assert(len(newRow)!=1) # no marker gene found at all + rows.append(newRow) + i += 1 + + return rows + +def runPca(meta, exprMatrix, tmpDir): + " run a matrix through the R PCA function prcomp and write the princ. components to outFname " + pcaFname = join(tmpDir, "pcaResult.tab") + if isfile(pcaFname): + logging.info("PCA: not re-running R, cache file %s exists" % pcaFname) + else: + pcaMatrix = dictToCellMatrix(exprMatrix, geneToSym) + pcaMatrixFname = join(tmpDir, "pcaInMatrix.tab") + writeMatrix(pcaMatrix, pcaMatrixFname) + scriptPath = join(tmpDir, "runPca.R") + pcaScript = makePcaScript(pcaMatrixFname, pcaFname) + runRscript(pcaScript, scriptPath) + + pcaCoords = parsePca(pcaFname) + return pcaCoords + +def makeDescLines(metaFname, matrixFname): + " return a list of html lines describing a Seurat run " + lines = [] + lines.append("Processing: Default Seurat/PCA Pipeline") + lines.append("Input meta data file: %s" % metaFname) + lines.append("Input expression data file: %s" % matrixFname) + lines.append("Directory: %s" % os.getcwd()) + lines.append("Command: %s" % " ".join(sys.argv)) + lines.append("By: %s@%s %s" % (os.getenv('USER'), socket.gethostname(), strftime("%Y-%m-%d %H:%M:%S", gmtime()))) + return lines + +def getCellIdsFromMatrix(matrixFname): + #cellIdFname = join(buildDir, "cellIds.txt") + #if isfile(cellIdFname): + #cellIds = parseList(cellIdFname) + #else: + #cellIds = exprMatrix.keys() + #writeList(cellIds, cellIdFname) + headers = openFile(matrixFname).readline().rstrip("\n").split("\t") + return headers[1:] + +def runSeuratPcaPipeline(meta, exprMatrix, exprCellIds, buildDir): + " run Seurat and PCA and return new meta data " + meta, seuratCoords = runSeurat(meta, exprMatrix, buildDir) + pcaCoords = runPca(meta, exprMatrix, buildDir) + return meta, seuratCoords, pcaCoords + +def convert10X(inDir, outDir): + " convert 10X format to a normal one-line-per-gene, one-column-per-sample matrix " + + matrixFname = join(outDir, "geneMatrix.tsv.gz") + if isfile(matrixFname): + logging.info("Not creating %s, already exists" % matrixFname) + return matrixFname + + # copied from https://support.10xgenomics.com/single-cell/software/pipelines/latest/output/matrices + import csv + import scipy.io + genome = "hg19" + matrices_dir = "filtered_gene_bc_matrices" + human_matrix_dir = join(inDir, matrices_dir, genome) + matPath = os.path.join(human_matrix_dir, "matrix.mtx") + genes_path = os.path.join(human_matrix_dir, "genes.tsv") + barcodes_path = os.path.join(human_matrix_dir, "barcodes.tsv") + logging.info("Reading %s, %s and %s" % (matPath, genes_path, barcodes_path)) + mat = scipy.io.mmread(matPath) + gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")] + gene_names = [row[1] for row in csv.reader(open(genes_path), delimiter="\t")] + barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")] + # END COPY + + mat = mat.todense() + + ofh = gzip.open(matrixFname, "w") + logging.info("Writing %s" % ofh.name) + barcodeStr = "\t".join(barcodes) + ofh.write("GeneId\t%s\n" % barcodeStr) + + rows, cols = mat.shape + assert(len(gene_ids)==rows) + assert(len(barcodes)==cols) + + for i in range(0, rows): + if i % 2000 == 0 and i!=0: + logging.info("Wrote %d lines" % i) + geneId = gene_ids[i] + row = mat[i,:] + row = row.tolist()[0] + row = [str(x) for x in row] + ofh.write("%s\t" % geneId) + ofh.write("\t".join(row)) + ofh.write("\n") + ofh.close() + + logging.info("Wrote %s" % matrixFname) + return matrixFname + +def copyMatrixAndIndex(exprMatrix, outDir): + " copy the matrix to outDir/geneMatrix.tsv and add a file geneMatrixOffsets.json " + if exprMatrix is None: + return + + mtxFname = join(outDir, "geneMatrix.tsv") + if isfile(mtxFname): + logging.info("%s already exists, not writing matrix" % mtxFname) + else: + #shutil.copy(matrixFname, mtxFname) + #writeMatrix(pcaMatrix, pcaMatrixFname) + logging.info("Writing matrix to %s" % mtxFname) + geneMatrix = dictToGeneMatrix(exprMatrix) + writeMatrix(geneMatrix, mtxFname) + + indexFname = join(outDir, "geneMatrixOffsets.json") + if isfile(indexFname): + logging.info("%s already exists, not copying" % indexFname) + else: + offsets = indexMatrix(mtxFname) + saveJson(offsets, indexFname) + +def runOn10X(command, metaFname, cellRangerDir, outDir): + " run Seurat and PCA over a CellRanger input directory " + global geneToSym + genome = "hg19" + geneSymPath = join(cellRangerDir, "filtered_gene_bc_matrices", genome, "genes.tsv") + geneToSym = readGeneToSym(geneSymPath) + + tmpDir = makeTmpDirFor(outDir) + matrixFname = convert10X(cellRangerDir, tmpDir) + buildDir = makeTmpDirFor(outDir) + exprMatrix = parseMatrix(matrixFname, buildDir, True) # 10X matrices are not log'ed. + + exprCellIds = getCellIdsFromMatrix(matrixFname) + meta, fieldAnnots = parseMeta(metaFname, exprCellIds) + + meta, seuratCoords, pcaCoords = runSeuratPcaPipeline(meta, exprMatrix, exprCellIds, buildDir) + lines = makeDescLines(metaFname, matrixFname) + + longLabel = "
".join(lines) + desc = {"shortLabel" : basename(dirname(abspath(matrixFname))), "longLabel" : longLabel, "clusterField":"Seurat Cluster"} + + matrixGeneIds = getNonZeroGenes(exprMatrix) + writeTsvAndJson(exprMatrix, seuratCoords, pcaCoords, meta, geneToSym, geneIdToAnnot, desc, fieldAnnots, [], outDir) + +#def writeDownloadFiles(metaFname, outDir): + #" we keep a copy of the meta data in the directory so people can download them " + #metaOutPath = join(outDir, "meta.tsv") + #logging.info("Copying %s to %s" % (metaFname, metaOutPath)) + #shutil.copy(metaFname, metaOutPath) + +def matrixSplitOffGenesAndCells(matrixFname, onlyGenes, outFname, outGeneFname, cellIdFname): + " given a gene-on-rows file, keep only some genes, and write to file " + if onlyGenes is not None: + logging.info("filtering %s to %s, keeping only %d genes" % (matrixFname, outFname, len(onlyGenes))) + + ofh = open(outFname, "w") + genes = [] + cellIds = False + for line in open(matrixFname): + if not cellIds: # skip the first line + cellIds = line.rstrip("\n").split("\t")[1:] + continue + geneId, exprStr = string.split(line, "\t", maxsplit=1) + if onlyGenes is not None and not geneId in onlyGenes: + continue + #assert(exprStr.endswith("\n")) + ofh.write(exprStr) + genes.append(geneId) + + ofh.close() + + ofh = open(outGeneFname, "w") + ofh.write("\n".join(genes)) + ofh.close() + + ofh = open(cellIdFname, "w") + ofh.write("\n".join(cellIds)) + ofh.close() + +def runTrimap(matrixFname, buildDir): + " run Trimap on reduced matrix (for speed!) and return the coordinates " + if not teteLoaded: + logging.info("Cannot run T-ETE, library was not loaded") + return None + + logging.info("T-ETE dimensionalty reduction") + + teteMatrixFname = join(buildDir, "tete.matrix.txt") + teteGeneFname = join(buildDir, "tete.genes.txt") + teteCellIdFname = join(buildDir, "tete.labels.txt") + teteCoordFname = join(buildDir, "tete.coords.txt") + + if isfile(teteCellIdFname): + logging.info("not doing matrix output / gene filtering, found %s" % teteCellIdFname) + else: + matrixSplitOffGenesAndCells(matrixFname, geneIdToAnnot, teteMatrixFname, teteGeneFname, teteCellIdFname) + + if isfile(teteCoordFname): + coords = parseCoords(teteCoordFname) + else: + X = np.loadtxt(teteMatrixFname) + # XX + print(X.shape) + X = np.transpose(X) + print(X.shape) + + Y = tete.tete(X, 2, 6) + xCoords = Y[:,0] + yCoords = Y[:,1] + cellIds = open(teteCellIdFname).read().splitlines() + print "x", xCoords + print "y", yCoords + print len(xCoords) + print len(yCoords) + assert(len(cellIds)==len(xCoords)==len(yCoords)) + + logging.info("Writing T-ETE coordinates to %s" % teteCoordFname) + ofh = open(teteCoordFname, "w") + ofh.write("cellId\tx\ty\n") + for cellId, x, y in zip(cellIds, xCoords, yCoords): + ofh.write("%s\t%0.3f\t%0.3f\n" % (cellId, x, y)) + ofh.close() + + coords = parseCoords(teteCoordFname) + + return coords + +#def filterMatrixOnGenesAndTranspose(matrixFname, geneIdToAnnot, preloadMatrixFname): +# """ create a matrix with only selected geneIdToAnnot genes from matrixFname and store as one cell per row. +# (not one gene per row). Header fields have format geneId|geneSym|description or just geneId. +# """ +# if geneIdToAnnot is None or len(geneIdToAnnot)==0: +# logging.info("No gene list provided, not preloading any genes, not creating %s" % preloadMatrixFname) +# return +# +# if isfile(preloadMatrixFname): +# logging.info("%s already exists, not re-exporting it" % preloadMatrixFname) +# return +# +# logging.info("Rewriting %s to %s, keeping only %d genes" % (matrixFname, preloadMatrixFname, len(geneIdToAnnot))) +# +# ifh = open(matrixFname) +# +# cellIds = ifh.readline().rstrip("\n").split("\t")[1:] +# geneLabels = [] +# +# cellIdToGene = defaultdict(list) # cellId(int) -> geneId(int) -> str +# for line in ifh: +# geneId, _ = string.split(line[:100], "\t", 1) +# if geneId not in geneIdToAnnot: +# continue +# +# if geneToSym is not None: +# sym = geneToSym[geneId] +# else: +# sym = "" +# +# desc = geneIdToAnnot[geneId] +# assert("|" not in desc) +# assert("|" not in sym) +# assert("|" not in geneId) +# geneLabel = "|".join([geneId, sym, desc]) +# +# geneLabels.append(geneLabel) +# +# exprVec = line.rstrip("\n").split("\t")[1:] +# +# for cellIdx, val in enumerate(exprVec): +# cellIdToGene[cellIdx].append(val) +# +# # write header +# ofh = open(preloadMatrixFname, "w") +# ofh.write("gene\t") +# ofh.write("\t".join(geneLabels)) +# ofh.write("\n") +# +# # write the cell values +# for cellIdx, exprVec in cellIdToGene.iteritems(): +# ofh.write(cellIds[cellIdx]) +# ofh.write("\t") +# ofh.write("\t".join(exprVec)) +# ofh.write("\n") +# +# ofh.close() + +def parseGeneMatrix(fname, onlyGeneIds): + " parse a gene matrix for only a few genes, return two things: cellIds, and dict geneId -> list of floats" + ifh = open(fname) + + cellIds = ifh.readline().rstrip("\n").split("\t")[1:] + + data = {} + #geneIdx = 0 + #geneIds = [] + for line in ifh: + geneId, _ = string.split(line[:100], "\t", 1) + if geneId not in onlyGeneIds: + continue + + exprVec = line.rstrip("\n").split("\t")[1:] + exprVec = [float(x) for x in exprVec] + + #data[geneIdx] = exprVec + data[geneId] = exprVec + #geneIdx += 1 + #geneIds.append(geneId) + return cellIds, data + +def writePreloadGenes(matrixFname, geneIdToAnnot, outFname): + " write a json object with geneFields (list of (geneId, sym, desc)), cellExpr (cellId -> array of float) and deciles (dict gene->11 floats) " + if matrixFname is None: + return + + if geneIdToAnnot is None or len(geneIdToAnnot)==0: + logging.info("No gene list provided, not preloading any genes, not creating %s" % outFname) + return + + if isfile(outFname): + logging.info("%s already exists, not re-exporting it" % outFname) + return + + data = {} + + cellIds, geneToVec = parseGeneMatrix(matrixFname, geneIdToAnnot) + + cellToVec = defaultdict(list) + + deciles = {} + for geneId, geneVec in geneToVec.iteritems(): + for cellId, val in zip(cellIds, geneVec): + cellToVec[cellId].append(val) + + deciles[geneId] = getDecilesList(geneVec) + + geneLabels = [] + for geneId, desc in geneIdToAnnot.iteritems(): + sym = convIdToSym(geneId) + geneLabels.append( (geneId, sym, desc) ) + + data["genes"] = geneLabels + data["cellExpr"] = cellToVec + data["deciles"] = deciles + saveJson(data, outFname) + +def runOnMatrix(command, metaFname, matrixFname, clusterLabelField, outDir): + " run Seurat and PCA over matrix, write all json to outDir " + buildDir = makeTmpDirFor(outDir) + exprMatrix = parseMatrix(matrixFname, buildDir, options.doLog) + + # get cellIds in the matrix and read meta data + exprCellIds = getCellIdsFromMatrix(matrixFname) + matrixGeneIds = getNonZeroGenes(exprMatrix, buildDir) + meta, fieldAnnots = parseMeta(metaFname, exprCellIds) + + meta, seuratCoords, pcaCoords = runSeuratPcaPipeline(meta, exprMatrix, exprCellIds, buildDir) + + # XXX !!! + #teteCoords = runTrimap(matrixFname, buildDir) + teteCoords = None + # XXX !!! + + if clusterLabelField is None: + clusterLabelField = "Seurat Cluster" + else: + isValidField(meta, clusterLabelField) + + lines = makeDescLines(metaFname, matrixFname) + longLabel = "
".join(lines) + shortLabel = basename(dirname(abspath(matrixFname))) + desc = {"shortLabel" : shortLabel, "longLabel" : longLabel, "clusterField":"Seurat Cluster"} + + coordDict = { ("seurat", "T-SNE Seurat 1.4") : seuratCoords } + + if teteCoords is not None: + coordDict[("tete", "T-ETE")] = teteCoords + + copyMatrixAndIndex(exprMatrix, outDir) + + preloadMatrixFname = join(outDir, "preloadGenes.json") + #filterMatrixOnGenesAndTranspose(matrixFname, geneIdToAnnot, preloadMatrixFname) + writePreloadGenes(matrixFname, geneIdToAnnot, preloadMatrixFname) + + writeTsvAndJson(exprMatrix, coordDict, pcaCoords, meta, geneToSym, geneIdToAnnot, desc, fieldAnnots, [], outDir) + #writeDownloadFiles(metaFname, outDir) + +def findDatasets(outDir): + """ search all subdirs of outDir for description.json files and return their contents as a list + A dataset description is a list with three members: A label, the base URL and a longer description + that can contain html. + """ + datasets = [] + for subDir in os.listdir(outDir): + if not isdir(join(outDir, subDir)): + continue + fname = join(outDir, subDir, "description.json") + if not isfile(fname): + continue + + logging.info("Reading %s" % fname) + datasetDesc = json.load(open(fname)) + + fname = join(outDir, subDir, "colors.tsv") + if isfile(fname): + colors = parseColors(fname) + #datasetDesc["metaColors"] = colors + + assert("shortLabel" in datasetDesc) + #assert("longLabel" in datasetDesc) + #if "longLabel" in datasetDesc: + #del datasetDesc["longLabel"] + datasetDesc["baseUrl"] = subDir+"/" + datasets.append(datasetDesc) + logging.info("Found %d datasets" % len(datasets)) + return datasets + +def copyDescPages(inDir, outDir): + " copy description.html and makeDoc.html to output directory " + desc = join(inDir, "description.html") + makeDoc = join(inDir, "makeDoc.html") + acroTsv = join(inDir, "acronyms.tsv") + logging.info("Copying %s, %s, %s (if present) to %s" % (desc, makeDoc, acroTsv, outDir)) + if isfile(desc): + shutil.copy(desc, outDir) + if isfile(makeDoc): + shutil.copy(makeDoc, outDir) + if isfile(acroTsv): + shutil.copy(acroTsv, outDir) + +def copyHtml(outDir): + " copy html/js/css to outdir " + + imgDir = join(outDir, "img") + makeDir(imgDir) + + baseDir = dirname(__file__) + fnames = [join(baseDir, "cellBrowser.js"), join(baseDir, "cellBrowser.css")] + fnames.extend( glob.glob(join(baseDir, "img", "*.png")) ) + + for inFname in fnames: + logging.info("Copying %s to %s" % (inFname, outDir)) + if "img" in inFname: + shutil.copy(inFname, outDir+"/img") + else: + shutil.copy(inFname, outDir) + + datasetDescs = findDatasets(outDir) + datasetDescs = list(sorted(datasetDescs, key=lambda k: k.get('priority', 0), reverse=True)) + + #datasetLabels = [] + #for dd in datasetDescs: + #datasetLabels.append(dd["label"]) + datasetLabels = [x["shortLabel"] for x in datasetDescs] + + # generate the index.html file + indexFname = join(baseDir, "index.html") + indexStr = open(indexFname).read() + old = "datasetList = null" + new = "datasetList = "+json.dumps(datasetDescs, sort_keys=True, indent=4, separators=(',', ': ')) + + newIndexStr = indexStr.replace(old, new) + assert(newIndexStr!=indexStr) + + newFname = join(outDir, "index.html") + ofh = open(newFname, "w") + ofh.write(newIndexStr) + ofh.close() + logging.info("Wrote %s, added datasets: %s" % (newFname, " - ".join(datasetLabels))) + +def readGeneInfo(defaultGeneFname): + " fill global variables with gene annotations: default genes and symbols " + global geneToSym + geneToSym = readGeneToSym(options.symTable) + global geneIdToAnnot + geneIdToAnnot = readGeneIds(defaultGeneFname, geneToSym) + +def makeDir(outDir): + if not isdir(outDir): + makedirs(outDir) + logging.info("Created directory %s" % outDir) + +def mustNotBeNull(var, msg): + " stop if var is null, output msg " + if var is None: + logging.error(msg) + sys.exit(1) + +def findDataDir(outDir, dataName, exprFname): + " return the path of the dataset dir: if dataName is None, use the parent dir of exprFname " + if dataName is not None: + return join(outDir, dataName) + + return join(outDir, basename(dirname(abspath(exprFname)))) + +def parseColors(fname): + " parse color table and return as dict value -> color " + colDict = parseDict(fname) + for metaVal, color in colDict.iteritems(): + color = color.strip() # hbeale had a file with trailing spaces + assert(len(color)<=6) # colors can be no more than six hex digits + for c in color: + assert(c in "0123456789ABCDEFabcdef") # color must be a hex number + return colDict + +def copyColors(fname, dataOutDir): + " copy colors to colors.tsv in dataOutDir " + if fname==None: + return + logging.debug("Copying %s to %s" % (fname, dataOutDir)) + parseColors(fname) # check if colors are hex values + shutil.copy(fname, join(dataOutDir, "colors.tsv")) + +def splitAddSym(filename, outDir): + " split .tsv on first field and create many files in outDir with the non-first columns. Also map geneIds to symbols. " + if filename is None: + return + logging.debug("Splitting %s on first field" % filename) + ifh = open(filename) + + headers = ifh.readline().rstrip("\n").split('\t') + otherHeaders = headers[2:] + + data = defaultdict(list) + for line in ifh: + row = line.rstrip("\n").split('\t') + markerName = row[0] + geneId = row[1] + scoreVal = float(row[2]) + otherFields = row[3:] + + geneSym = convIdToSym(geneId) + + newRow = [] + newRow.append(geneId) + newRow.append(geneSym) + newRow.append(scoreVal) + newRow.extend(otherFields) + + data[markerName].append(newRow) + + newHeaders = ["id", "symbol"] + newHeaders.extend(otherHeaders) + + for markerName, rows in data.iteritems(): + #rows.sort(key=operator.itemgetter(2), reverse=True) # rev-sort by score (fold change) + markerName = markerName.replace("/","_") + outFname = join(outDir, markerName+".tsv") + logging.info("Writing %s" % outFname) + ofh = open(outFname, "w") + ofh.write("\t".join(newHeaders)) + ofh.write("\n") + for row in rows: + row[2] = "%0.3f" % row[2] # limit to 3 digits + ofh.write("\t".join(row)) + ofh.write("\n") + ofh.close() + + +def runCommands(args): + " basic cmd line parsing " + + command = args[0] + outDir = options.outDir + + if outDir is None: + logging.error("You did not specify the -o option, but it is mandatory") + sys.exit(1) + + if command=="html": + if options.colors is not None: + errAbort("There is no need to specify colors with 'html'. Colors are defined for datasets.") + copyHtml(outDir) + sys.exit(0) + + metaFname = options.metaData + exprFname = options.exprMatrix + + mustNotBeNull(outDir, "You need to provide an output directory") + mustNotBeNull(metaFname, "You need to provide a meta data file") + + if command=="coords": + if options.datasetName is None and exprFname is None: + errAbort("With 'coords', you need to provde either an expression matrix or at least a name for the dataset with -n") + if options.coords is None: + errAbort("With 'coords', you need to specify at least one coordinate file with the -c option") + else: + mustNotBeNull(exprFname, "You need to provide an expression matrix") + + makeDir(outDir) + + dataOutDir = findDataDir(outDir, options.datasetName, exprFname) + + #global colors + #colors = parseColors(options.colors) + + global datasetName + if options.datasetName is not None: + datasetName = options.datasetName + + if command=="coords": + coordFnames = options.coords # a list of filenames + defaultGeneFname = options.genes + clusterLabelField = options.clusterLabel + specGenesFnames = options.clusterSpec + + markerInfo = [] + for specGenesFname in specGenesFnames: + logging.debug("Processing cluster-specific genes from %s" % specGenesFname) + markerType = splitext(basename(specGenesFname))[0].split(".")[0] + markerDir = join(dataOutDir, "markers", markerType) + makeDir(markerDir) + splitAddSym(specGenesFname, markerDir) + markerInfo.append((markerDir, markerType)) + + readGeneInfo(defaultGeneFname) + + makeDir(dataOutDir) + importCoords(command, metaFname, exprFname, coordFnames, clusterLabelField, markerInfo, dataOutDir) + copyDescPages(dirname(exprFname), dataOutDir) + + if command=="matrix": + defaultGeneFname = options.genes + readGeneInfo(defaultGeneFname) + clusterLabelField = options.clusterLabel + + makeDir(dataOutDir) + copyColors(options.colors, dataOutDir) + runOnMatrix(command, metaFname, exprFname, clusterLabelField, dataOutDir) + + if command=="10x": + readGeneInfo(defaultGeneFname) + #assert(len(inFnames)==1) + cellRangerDir = metaFname + runOn10X(command, metaFname, cellRangerDir, outDir) + + +# ----------- main -------------- +def main(): + global options + args, options = parseArgs() + + runCommands(args) + +main()