Skip to content

Commit

Permalink
Anon data code.
Browse files Browse the repository at this point in the history
  • Loading branch information
dereckmezquita committed Jul 13, 2024
1 parent 2f8a2d4 commit 69c1a85
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 2 deletions.
19 changes: 18 additions & 1 deletion dev/anonymise-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,37 @@ change_gene_id <- function(id) {
#' Produce an anonymised copy of a feature-counts table
#'
#' Works on a copy, so the caller's table is not modified. Three things are
#' changed in the copy:
#' 1. every `GeneID` is passed through `change_gene_id()`;
#' 2. each cell of every numeric non-metadata column has 0, 1 or 2 added to it
#'    (with probabilities 0.85, 0.10 and 0.05 respectively);
#' 3. the first two characters of every non-metadata column name are replaced
#'    by one random capital letter followed by one random digit from 1 to 9
#'    (the same letter/digit pair is used for all columns).
#'
#' @param data A data.table containing a `GeneID` column. Columns named
#'   `GeneID`, `GeneSymbol` or `GeneBiotype` are treated as metadata; every
#'   other column is treated as a sample column.
#' @return The anonymised copy, with columns in the same order as `data`.
anonymise_data <- function(data) {
    meta_cols <- c("GeneID", "GeneSymbol", "GeneBiotype")

    # Create a copy of the original data so `:=` never touches the input
    dt_anon <- dt$copy(data)

    # Replace every GeneID with its scrambled counterpart
    dt_anon[, GeneID := sapply(GeneID, change_gene_id)]

    # Get the names of numeric columns (excluding the metadata columns).
    # vapply() guarantees a logical vector even for a zero-column table.
    is_num <- vapply(dt_anon, is.numeric, logical(1))
    numeric_cols <- setdiff(names(dt_anon)[is_num], meta_cols)

    # Randomly add 1 or 2 to some cells in numeric columns
    for (col in numeric_cols) {
        dt_anon[, (col) := get(col) + sample(c(0, 1, 2), .N, replace = TRUE, prob = c(0.85, 0.1, 0.05))]
    }

    # Rename the sample columns.
    # select random capital letter from alphabet
    letter <- sample(LETTERS, 1)
    # select random digit from 1 to 9
    num <- sample(1:9, 1)

    old_names <- names(dt_anon)
    old_names <- old_names[!old_names %in% meta_cols]

    # paste0() would turn a zero-length input into the single string
    # "<letter><num>", so only rename when there is something to rename.
    if (length(old_names) > 0) {
        # drop the first two characters of each sample name and put the
        # random letter and digit in their place
        new_names <- paste0(
            letter, num, substr(old_names, 3, nchar(old_names))
        )

        # Rename by name, not by position: this stays correct even when the
        # metadata columns are not the first three columns of the table.
        dt$setnames(dt_anon, old = old_names, new = new_names)
    }

    # `[]` makes the result print normally after the `:=` / set* calls
    dt_anon[]
}
Expand Down
13 changes: 12 additions & 1 deletion dev/save-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,21 @@ box::use(dt = data.table)
box::use(dev/`anonymise-data`[ anonymise_data ])

# 2. Feature counts genes
# The raw counts are read from dev/data/original.ignore/; the anonymised table
# is written to dev/data/ and the original -> new sample-name mapping is kept
# next to the raw input.
feature_counts0 <- read.csv("dev/data/original.ignore/feature-counts-genes.csv")
dt$setDT(feature_counts0)

# Columns that describe the gene rather than a sample
fc_meta_cols <- c("GeneID", "GeneSymbol", "GeneBiotype")

# save original colnames
sample_names <- colnames(feature_counts0)
sample_names <- sample_names[!sample_names %in% fc_meta_cols]
sample_names <- dt$data.table(original = sample_names)

feature_counts <- anonymise_data(feature_counts0)

# Pick the anonymised sample names by name rather than by position (4:ncol),
# so the mapping does not depend on the metadata columns being columns 1-3
# and does not break when there are fewer than four columns.
new_sample_names <- colnames(feature_counts)
new_sample_names <- new_sample_names[!new_sample_names %in% fc_meta_cols]

# Fail loudly rather than write a misaligned mapping table
stopifnot(length(new_sample_names) == nrow(sample_names))
sample_names[, new_name := new_sample_names]

dt$fwrite(sample_names, "dev/data/original.ignore/new-names-table.csv")
dt$fwrite(feature_counts, "dev/data/feature-counts-genes.csv")

# NOTE(review): `use_data` is not namespaced here; presumably it is
# usethis::use_data brought into scope earlier in this file - confirm.
use_data(feature_counts, overwrite = TRUE)

# 3. Volcano differential expression
Expand Down

0 comments on commit 69c1a85

Please sign in to comment.