|
#' Read a NanoString RCC summary that was exported as a CSV file
#'
#' The file is expected to contain a block of header rows (one attribute per
#' row, label in the first column), followed by a row whose first cell is
#' 'Reporter Counts', a row of column titles, and then one row per probe.
#' The first three columns of the count table are annotation columns; the
#' remaining columns are samples.
#'
#' @param path Path to the CSV file.
#' @param sample.id.row Label of the header row whose values are used as
#'   sample names. Matching ignores case and treats spaces as dots, so
#'   'File.Name', 'file.name' and 'File Name' are equivalent.
#' @return A list of class 'NanoString' with two data frames: `header`
#'   (header attributes in rows, samples in columns) and `x` (three
#'   annotation columns followed by one column per sample). Values are left
#'   as `read.csv` parsed them; no numeric conversion is done here.
#' @details Stops with an error when the file does not exist, when the
#'   'Reporter Counts' marker or required header rows are missing, when
#'   `sample.id.row` does not identify exactly one header row, or when no
#'   sample columns are present. Prints a short import summary with `cat`.
read.csv.RCC <- function(path, sample.id.row = 'File.Name') {
  if (!file.exists(path)) {
    stop(paste('File not found:', path));
  }

  # Split the raw CSV into the header rows and the count table.
  prep.rcc <- function(path) {
    data <- read.csv(
      path,
      header = FALSE,
      strip.white = TRUE
    );
    # Drop columns that are entirely NA.
    data <- data[!vapply(data, function(x) all(is.na(x)), logical(1))];

    # The count table starts at the first row labelled 'Reporter Counts'.
    marker.rows <- which(data[, 1] == 'Reporter Counts');
    if (length(marker.rows) == 0) {
      stop('There appears to be a problem with the RCC CSV file. No "Reporter Counts" row found in the first column.');
    }
    data.start.index <- min(marker.rows);

    return(list(
      # seq_len() yields an empty header when the marker is on the first row.
      header = data[seq_len(data.start.index - 1), , drop = FALSE],
      x = data[data.start.index:nrow(data), , drop = FALSE]
    ));
  }
  rcc <- prep.rcc(path);

  if (is.null(rcc$header) || nrow(rcc$header) == 0) {
    stop('There appears to be a problem with the RCC CSV file. No header information found.');
  }

  # Use the first column as row labels; rows without a label cannot be named
  # and are dropped.
  header.labels <- as.character(rcc$header[, 1]);
  labelled <- !is.na(header.labels) & header.labels != '';
  rcc$header <- rcc$header[labelled, -1, drop = FALSE];

  # Normalise labels: strip one trailing space, spaces to dots, lower case.
  header.labels <- gsub(' $', '', header.labels[labelled]);
  header.labels <- tolower(gsub(' ', '.', header.labels));
  header.labels[header.labels == 'id'] <- 'sample.id';
  rownames(rcc$header) <- header.labels;

  if (!all(c('file.name', 'sample.id', 'binding.density') %in% rownames(rcc$header))) {
    stop('There appears to be a problem with the RCC CSV file. Rownames in header are missing "File name", "Sample id", "Binding density"');
  }

  # Remove section-title rows and the two columns that sit above the
  # annotation columns of the count table, leaving one column per sample.
  is.section.row <- rownames(rcc$header) %in% c('file.attributes', 'lane.attributes');
  rcc$header <- rcc$header[!is.section.row, -c(1, 2), drop = FALSE];

  # Locate the header row that provides the sample names.
  wanted.row <- tolower(gsub(' ', '.', sample.id.row));
  id.row <- which(rownames(rcc$header) %in% wanted.row);
  if (length(id.row) != 1) {
    stop(paste0(
      'sample.id.row "', paste(sample.id.row, collapse = ', '),
      '" does not match exactly one header row.'
    ));
  }
  sample.ids <- as.character(unlist(rcc$header[id.row, , drop = FALSE], use.names = FALSE));
  if (length(sample.ids) == 0) {
    stop('There appears to be a problem with the RCC CSV file. No sample columns found.');
  }
  sample.ids <- gsub(' ', '.', sample.ids);
  # Prefix names that start with a digit with 'X'.
  sample.ids <- gsub('^([0-9])', 'X\\1', sample.ids);
  colnames(rcc$header) <- sample.ids;

  # The count table needs at least the marker row and the column-title row.
  if (is.null(rcc$x) || nrow(rcc$x) < 2) {
    stop('There appears to be a problem with the RCC CSV file. Likely couldnt find the count specifically "Code Class" in header information.');
  }

  # Row 2 of the count block holds the titles of the annotation columns.
  anno.names <- as.character(unlist(rcc$x[2, 1:3], use.names = FALSE));
  anno.names <- gsub(' ', '.', anno.names);
  if (!('Code.Class' %in% anno.names)) {
    stop('There appears to be a problem with the RCC CSV file. Likely couldnt find the count specifically "Code Class" in header information.');
  }

  # Drop the marker and title rows; keep annotation plus sample columns.
  rcc$x <- rcc$x[-c(1:2), seq_len(3 + length(sample.ids)), drop = FALSE];

  # Rows lacking either of the first two annotation values are discarded.
  rows.with.missing.anno <- is.na(rcc$x[, 1]) | is.na(rcc$x[, 2]) |
    rcc$x[, 1] == '' | rcc$x[, 2] == '';
  if (any(rows.with.missing.anno)) {
    rcc$x <- rcc$x[!rows.with.missing.anno, , drop = FALSE];
    cat(paste('The following row(s)', paste(which(rows.with.missing.anno), collapse = ', '), 'have been dropped due to missing annotation.\n\t You may want to double check the excel file.\n\n'));
  }

  colnames(rcc$x) <- c(anno.names, sample.ids);

  cat(paste('There were', length(sample.ids), 'samples imported. \nNote that spaces in sample names will be replaced by dots. \n'));

  if (length(sample.ids) > 5) {
    cat('The first and last 3 sample names found in the dataset are:\n');
    cat(paste(c(sample.ids[1:3], rev(sample.ids)[1:3])));
  } else {
    cat('The sample names found in the dataset are:\n');
    cat(paste(sample.ids));
  }

  cat(paste('\n\nThere were', nrow(rcc$x), 'genes imported with the following Code Class breakdown:'));
  print(table(rcc$x[, 'Code.Class']));

  class(rcc) <- 'NanoString';
  return(rcc);
}
0 commit comments