generateTFPN-cedric.py

#!/usr/bin/env python3

import openpyxl
import sys
from CNAdefs import *
from weightedmeanvalue import weightedMeanValues

# Run configuration.
# When False, updateSheets does not read a reference status and treats every
# lesion as present (fake value 1).
weHaveGoldStd = False
# Number of patient rows processed; patients occupy sheet rows 2 .. N+1.
numberOfPatientsFromTable = 129

# Outcome labels written to the TFPN sheet: the CNV call compared with the
# clinical result (karyotype or FISH). NA marks a row that cannot be classified.
TP, FP, FN, TN, NA = 'TP', 'FP', 'FN', 'TN', 'NA'
def _tfpnLabel(truth, value, threshold, isGain):
  """Return TP/FP/FN/TN for one call, or NA when it cannot be classified.

  truth is the reference status (1 = lesion present, 0 = absent); any other
  status gives NA. A gain is called when value > threshold, a deletion when
  value < threshold. Values that cannot be ordered against the threshold
  (for example the 'NA' placeholder string or None) also give NA.
  """
  if truth == 1:
    calledLabel, notCalledLabel = TP, FN
  elif truth == 0:
    calledLabel, notCalledLabel = FP, TN
  else:
    return NA
  try:
    if isGain:
      called, notCalled = value > threshold, value <= threshold
    else:
      called, notCalled = value < threshold, value >= threshold
  except TypeError:
    # Non-numeric value: nothing to compare, so the call is not classifiable
    return NA
  if called:
    return calledLabel
  if notCalled:
    return notCalledLabel
  # Neither comparison holds (e.g. NaN)
  return NA

def updateSheets(row, colNames, w_values, value_sheet, TFPN_sheet, gainThreshold, deletionThreshold):
  """Write one patient's weighted values and their TP/FP/FN/TN labels.

  Parameters:
    row               sheet row of the patient
    colNames          dictionary mapping column header -> column index
    w_values          dictionary of weighted values keyed by cna name
    value_sheet       sheet receiving the weighted values
    TFPN_sheet        sheet receiving the TP/FP/FN/TN/NA labels
    gainThreshold     a gain is called when value > gainThreshold
    deletionThreshold a deletion is called when value < deletionThreshold

  The user provides the gain/deletion thresholds based on the data.
  With weHaveGoldStd set, the value is stored in the k<cna>, f<cna> and
  t<cna> columns and the reference status is read from the module-level
  ref_sheet. Otherwise only t<cna> is filled and the status is faked as 1.
  """
  # Lesions scored as gains (1q, trisomy8, trisomy12, 9p); all others are scored as deletions
  gainLesions = ('1q', 'trisomy8', 'trisomy12', '9p')
  for cna in w_values:
    value = w_values[cna]
    kcna = f'k{cna}'
    fcna = f'f{cna}'
    tcna = f't{cna}'
    if (weHaveGoldStd):
      # Use proper value of Gold standard
      value_sheet.cell(row, colNames[kcna]).value = value
      value_sheet.cell(row, colNames[fcna]).value = value
      value_sheet.cell(row, colNames[tcna]).value = value
      tcna_value = ref_sheet.cell(row, colNames[tcna]).value
    else:
      # Set fake value if no Gold standard (for Cedric)
      tcna_value = 1
      value_sheet.cell(row, colNames[tcna]).value = value

    # The lesion name is tcna without its leading 't'
    if tcna[1:] in gainLesions:
      label = _tfpnLabel(tcna_value, value, gainThreshold, True)
    else:
      label = _tfpnLabel(tcna_value, value, deletionThreshold, False)
    TFPN_sheet.cell(row, colNames[tcna]).value = label

def getSEtable(diagnosis, cnas, TFPN_sheet):
  """Build the per-lesion count and performance table for one diagnosis.

  For every cna in cnas, the t<cna> column of TFPN_sheet is scanned over the
  first numberOfPatientsFromTable patient rows (starting at sheet row 2).
  Only rows whose 'Diagnosis' cell equals diagnosis are counted.

  Returns a dictionary: 'names' holds the column titles and each t<cna> key
  holds [cna, FN, FP, TN, TP, sensitivity, specificity, PPV, NPV, accuracy,
  F1-score].
  """
  # Header text -> column index, read from the first row of the TFPN sheet
  headerToCol = {}
  for column in range(1, TFPN_sheet.max_column + 1):
    headerToCol[TFPN_sheet.cell(1, column).value] = column

  labels = (FN, FP, TN, TP)
  # Tiny offset so a ratio with an empty denominator evaluates to 0 instead of raising
  eps = 1e-32
  table = {'names': ['Lesion', 'FN', 'FP', 'TN', 'TP', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'Accuracy', 'F1-score']}
  for cna in cnas:
    tcna = f't{cna}'
    lesionCol = headerToCol[tcna]
    tally = [0, 0, 0, 0]
    for row in range(2, numberOfPatientsFromTable + 2):
      outcome = TFPN_sheet.cell(row, lesionCol).value
      # The diagnosis cell is only consulted for rows carrying one of the four labels
      if outcome in labels and TFPN_sheet.cell(row, headerToCol['Diagnosis']).value==diagnosis:
        tally[labels.index(outcome)] += 1
    FNcount, FPcount, TNcount, TPcount = tally
    sensitivity = TPcount / (TPcount + FNcount + eps)
    specificity = TNcount / (TNcount + FPcount + eps)
    ppv = TPcount / (TPcount + FPcount + eps)
    npv = TNcount / (TNcount + FNcount + eps)
    accuracy = (TPcount + TNcount) / (TPcount + FPcount + TNcount + FNcount + eps)
    f1score = TPcount/(TPcount+(0.5*(FPcount+FNcount)) + eps)
    table[tcna] = [cna, FNcount, FPcount, TNcount, TPcount, sensitivity, specificity, ppv, npv, accuracy, f1score]
  return table

# Cedric's variant: used when there is no gold-standard cytogenetic test
def getSEtableNoGoldStd(diagnosis, cnas, TFPN_sheet):
  """Build the per-lesion count table for one diagnosis (counts only).

  For every cna in cnas, the t<cna> column of TFPN_sheet is scanned over the
  first numberOfPatientsFromTable patient rows (starting at sheet row 2).
  Only rows whose 'Diagnosis' cell equals diagnosis are counted.

  Returns a dictionary: 'names' holds the column titles and each t<cna> key
  holds [cna, FN, FP, TN, TP]. No derived metrics are computed.
  """
  # Header text -> column index, read from the first row of the TFPN sheet
  headerToCol = {}
  for column in range(1, TFPN_sheet.max_column + 1):
    headerToCol[TFPN_sheet.cell(1, column).value] = column

  counted = (FN, FP, TN, TP)
  table = {'names': ['Lesion', 'FN', 'FP', 'TN', 'TP']}
  for cna in cnas:
    tcna = f't{cna}'
    lesionCol = headerToCol[tcna]
    counts = dict.fromkeys(counted, 0)
    for row in range(2, numberOfPatientsFromTable + 2):
      label = TFPN_sheet.cell(row, lesionCol).value
      # The diagnosis cell is only consulted for rows carrying one of the four labels
      if label in counted and TFPN_sheet.cell(row, headerToCol['Diagnosis']).value==diagnosis:
        counts[label] += 1
    table[tcna] = [cna, counts[FN], counts[FP], counts[TN], counts[TP]]
  return table

def writeSETable(diagnosis, cnas, TFPN_sheet, tableFileName):
  """Write the summary table of one diagnosis to a text file.

  The table comes from getSEtable when weHaveGoldStd is set, otherwise from
  getSEtableNoGoldStd. The file starts with the title row, followed by one
  row per cna in the order of cnas. Every item is followed by a single
  space, so each line ends with a trailing space.
  """
  buildTable = getSEtable if weHaveGoldStd else getSEtableNoGoldStd
  tableSE = buildTable(diagnosis, cnas, TFPN_sheet)

  def formatRow(items):
    # One output line: each item followed by a space, then a newline
    return ''.join(str(item) + ' ' for item in items) + '\n'

  print(f'Writing table {tableFileName}')
  with open(tableFileName, 'w') as out:
    out.write(formatRow(tableSE['names']))
    for cna in cnas:
      out.write(formatRow(tableSE['t'+cna]))

# Reference table: supplies the patient labels (and the reference status when a
# gold standard is used). Opened read-only; it is never saved by this script.
# Alternative reference table: 'tables/CNV_allcases_r.xlsx'
ref_table_name = 'tables/CNV_cedric.xlsx'
ref_book = openpyxl.load_workbook(ref_table_name, read_only=True)
ref_sheet = ref_book.active

# Workbook receiving the weighted values of every lesion
z_table_name = 'tables/CNV_zscore.xlsx'
z_book = openpyxl.load_workbook(z_table_name)
z_sheet = z_book.active

# Workbook receiving the TP/FP/FN/TN/NA labels
TFPN_table_name = 'tables/CNV_TFPN.xlsx'
TFPN_book = openpyxl.load_workbook(TFPN_table_name)
TFPN_sheet = TFPN_book.active

# Dimensions of the reference sheet
Nrows = ref_sheet.max_row
Ncolumns = ref_sheet.max_column

# Header text -> column index, taken from the first row of the reference sheet
colNames = {}
for col in range(1, Ncolumns + 1):
  colName = ref_sheet.cell(1, col).value
  colNames[colName] = col

# Input parameters (command line)
#   argv[1] (optional) cnv method to be used; when the tables are updated it
#                      must be one of the keys of methodSettings below
#   argv[2] (optional) 'yes' recomputes the value and TFPN workbooks from the
#                      per-sample output of the method before the summary
#                      tables are written; any other value leaves them as loaded
cnvMethod = 'canary-kurtz-arms'
if (len(sys.argv)>=2):
  cnvMethod = sys.argv[1]
updateTables = (len(sys.argv)>=3 and sys.argv[2]=='yes')

# Column layouts shared by several methods:
# (chrColName, intChromValue, startColName, endColName, valueColName template)
kurtzColumns = ("chrNum", True, "Start", "End", "combinedStoufferZL2CNR")
mseColumns = ("#chr", False, "start", "end", "gc.corrected.norm.log.std.index.zWeighted.Final")
cnvkitColumns = ("chromosome", False, "start", "end", "cn")
# (gainThreshold, deletionThreshold). Note that cnvkit and ichorcna use
# copynumber rather than z-score, hence the different thresholds.
zscoreThresholds = (1.96, -1.96)
copyNumberThresholds = (2.0, 2.0)

# method -> (file name template, column layout, thresholds, default weighted mean value)
# '{HLabel}' in a template is replaced by the patient's HSTAMP_Label.
methodSettings = {
  # canary-kurtz output cytobands
  'canary-kurtz-cytobands': ('/drive3/dkurtz/HEMESTAMP/CANARy/samples/output/Sample_{HLabel}-T1_Tumor.SegmentedGenome.cytobands-noXY.on-off-combined.txt', kurtzColumns, zscoreThresholds, 'NA'),
  # canary-kurtz output arms
  'canary-kurtz-arms': ('/drive3/dkurtz/HEMESTAMP/CANARy/samples/output/Sample_{HLabel}-T1_Tumor.SegmentedGenome.arms.on-off-combined.txt', kurtzColumns, zscoreThresholds, 'NA'),
  # canary-kurtz-offtarget output
  'canary-kurtz-offtarget': ('/drive3/dkurtz/HEMESTAMP/CANARy/samples/output/Sample_{HLabel}-T1_Tumor.NormalizedGenome.cnr', ("V1", False, "V2", "V3", "ZLog2CNR"), zscoreThresholds, 'NA'),
  # canary-mse output cytobands
  'canary-mse-cytobands': ('canary-python/results-canary/results-canary-mse/Sample_{HLabel}-T1_Tumor.cnvZscores', mseColumns, zscoreThresholds, 'NA'),
  # canary-mse new output cytobands
  'canary-mse-newcytobands': ('/drive3/mse/CNV/Alicia/results-canary2_new/Sample_{HLabel}-T1_Tumor.cnvZscores', mseColumns, zscoreThresholds, 'NA'),
  # canary-mse output arms
  'canary-mse-arms': ('/drive3/mse/CNV/Alicia/results-canary5/Sample_{HLabel}-T1_Tumor.cnvZscores', mseColumns, zscoreThresholds, 'NA'),
  # wisecondor output (default weighted mean value redefined to 0)
  'wisecondor': ('wisecondor/testSamples/Sample_{HLabel}-T1_Tumor.sorted.samtools-deduped.sorted.offtarget.std.txt', ("chrNum", True, "Start", "End", "z-score"), zscoreThresholds, 0),
  # cnvkit cns / cnr output
  'cnvkit-cns': ('cnvkit/results-cnn-tumor/Sample_{HLabel}-T1_Tumor.samtools.call.cns', cnvkitColumns, copyNumberThresholds, 'NA'),
  'cnvkit-cnr': ('cnvkit/results-cnn-tumor/Sample_{HLabel}-T1_Tumor.samtools.call.cnr', cnvkitColumns, copyNumberThresholds, 'NA'),
  # ichorcna cns / cnr output (the cnr value column is named after the sample)
  'ichorcna-cns': ('ichorcna/results-ichorcna/{HLabel}.seg', ("chr", True, "start", "end", "copy.number"), copyNumberThresholds, 'NA'),
  'ichorcna-cnr': ('ichorcna/results-ichorcna/{HLabel}.cna.seg', ("chr", True, "start", "end", "{HLabel}.copy.number"), copyNumberThresholds, 'NA'),
  # Cedric's files.
  # NOTE(review): the directory part contains literal '*' characters; confirm
  # that weightedMeanValues (or the file system) resolves this path.
  'cedric': ('/drive3/crossi/Hodgkin_project/PVAB_archive/cna/*mynew*txt/{HLabel}.SegmentedGenome.mynewbed.on-off-combined.txt', kurtzColumns, zscoreThresholds, 'NA'),
}

if (updateTables):
  # Stop before touching any sheet: an unknown method has no file name or
  # column settings to work with.
  if cnvMethod not in methodSettings:
    supportedMethods = ', '.join(methodSettings)
    sys.exit(f'Provided cnv method {cnvMethod} not in the supported list: {supportedMethods}')
  fileNameTemplate, methodColumns, methodThresholds, defaultValue = methodSettings[cnvMethod]
  chrColName, intChromValue, startColName, endColName, valueColNameTemplate = methodColumns
  gainThreshold, deletionThreshold = methodThresholds

  # Only the first numberOfPatientsFromTable patient rows are processed
  # (an earlier version used range(Nrows-1)); row 1 holds the headers.
  for i in range(numberOfPatientsFromTable):
    row = i + 2
    HLabel = ref_sheet.cell(row, colNames['HSTAMP_Label']).value
    fileName = fileNameTemplate.format(HLabel=HLabel)
    valueColName = valueColNameTemplate.format(HLabel=HLabel)

    # Obtain the weighted value for every cna from CNAdefs
    print(f'fileName: {fileName}')
    w_zscores = weightedMeanValues(CNA, cnas, fileName, chrColName, startColName, endColName, valueColName, intChromValue, defaultValue)
    updateSheets(row, colNames, w_zscores, z_sheet, TFPN_sheet, gainThreshold, deletionThreshold)
    print(f'{cnvMethod} {HLabel} {w_zscores}')
  # Save the updated excel tables: each working workbook is overwritten in
  # place, and a copy named after the method is written next to it.
  z_book.save(z_table_name)
  z_book.save(f'tables/{cnvMethod}-Zscore_table.xlsx')
  TFPN_book.save(TFPN_table_name)
  TFPN_book.save(f'tables/TFPN-{cnvMethod}-table.xlsx')

# Writing the resulting tables
# One TFPN sheet per method; only the method selected above is registered.
TFPN_sheets = {cnvMethod: TFPN_sheet}

# Diagnosis panels used with this script in the past (diagnosis: list of its CNAs):
#   CLL: ['11q', '13q', '17p', 'trisomy12']
#   MDS: ['5q', '7q', '20q', 'trisomy8']
#   MM:  ['1p', '1q', '13q', '17p']
# Current panel: Cedric's data (HL, 9p...)
diagnosis = 'HL'; cnas = ['9p', '6p']
for method, methodSheet in TFPN_sheets.items():
  # The written table is named after the method and the diagnosis
  tableFileName = f'tables/{method}-{diagnosis}-tableSE.txt'
  writeSETable(diagnosis, cnas, methodSheet, tableFileName)