## Processing of scRNA-seq data.

## Sample data.

# Read only the first line of the expression file; every field is kept as a
# character string.
data_expression <- read.table(
  "GSE104276_all_pfc_2394_UMI_TPM_NOERCC.txt",
  sep = "\t", colClasses = "character", nrows = 1
)

# Keep every field of that line except the last one, as a one-column matrix.
last_field <- ncol(data_expression) - 1
data_sample <- t(data_expression[1, 1:last_field])

# Pair each retained field with its 1-based position within the line.
data_position <- seq_len(nrow(data_sample))
data_sample_1 <- data.frame(data_sample, data_position)
write.table(
  data_sample_1,
  file = "sample_id.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Load the sample annotation table, skipping its first line.
data_sample <- read.table(
  "SampleInfo.txt",
  sep = "\t", colClasses = "character", skip = 1
)

# Rename the labels in column 2 (presumably cell-type names -- confirm against
# SampleInfo.txt). The substitutions are applied in the listed order; gsub()
# is case-sensitive, so "Neurons" does not match the lower-case "neurons"
# inside "Interneurons".
label_changes <- rbind(
  c("GABAergic neurons", "Interneurons"),
  c("Neurons", "Excitatory neurons"),
  c("OPC", "OPCs"),
  c("Stem cells", "NPCs")
)
for (k in seq_len(nrow(label_changes))) {
  data_sample[, 2] <- gsub(
    label_changes[k, 1], label_changes[k, 2], data_sample[, 2]
  )
}
write.table(
  data_sample[, 1:3],
  file = "sample_id_1.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Re-read both tables as character data and join them on their first column.
data_sample <- read.table(
  "sample_id_1.txt",
  sep = "\t", colClasses = "character"
)
data_sample_1 <- read.table(
  "sample_id.txt",
  sep = "\t", colClasses = "character"
)
data_sample_2 <- merge(data_sample, data_sample_1, by = 1)

# Move the join key behind the two annotation columns:
# annotation columns 2 and 3, join key, position.
data_sample_3 <- data_sample_2[, c(2, 3, 1, 4)]
write.table(
  data_sample_3,
  file = "sample_id_2.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Sort the rows by all four columns in turn. Every column is character here,
# so the comparison is string-based, including for the position column.
data_sample <- read.table(
  "sample_id_2.txt",
  sep = "\t", colClasses = "character"
)
row_order <- do.call(order, data_sample)
data_sample <- data_sample[row_order, ]
write.table(
  data_sample,
  file = "sample_id_3.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Column 4 alone: the positions, in sorted row order.
write.table(
  data_sample[, 4],
  file = "sample_id_4.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Column 1 alone: the relabelled annotation column, in sorted row order.
write.table(
  data_sample[, 1],
  file = "sample_id_5.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Columns 1 and 4 side by side.
data_sample_1 <- data_sample[, c(1, 4)]
write.table(
  data_sample_1,
  file = "sample_id_6.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

## scRNA-seq data.

# Load everything after the first line of the expression file as character
# data.
data_expression <- read.table(
  "GSE104276_all_pfc_2394_UMI_TPM_NOERCC.txt",
  sep = "\t", colClasses = "character", skip = 1
)

# sample_id_4.txt lists the sample positions in sorted order. Each position is
# shifted by one before being used as a column index -- presumably because
# column 1 of the data rows holds the gene identifier (it is written to
# probe.txt below); confirm against the input file.
data_sample <- scan("sample_id_4.txt", sep = "\t") + 1

# Pick the expression columns in that order.
data_expression_1 <- data_expression[, data_sample]

# Column 1 goes to probe.txt, the reordered expression columns to GSE.txt.
write.table(
  data_expression[, 1],
  file = "probe.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)
write.table(
  data_expression_1,
  file = "GSE.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

## Gene data.

data_probe <- read.table("probe.txt", sep = "\t", colClasses = "character")

# Some identifiers appear in a date-like form such as "1-Mar" or "11-Sep"
# (presumably introduced by spreadsheet auto-formatting -- confirm). Map them
# back to gene symbols: "<n>-Mar" -> "MARCH<n>", "<n>-Sep" -> "SEPT<n>" and
# "1-Dec" -> "DEC1". The table covers the same identifiers as before:
# 1-11 for Mar, 1-12, 14 and 15 for Sep, and 1 for Dec.
march_numbers <- 1:11
sept_numbers <- c(1:12, 14, 15)
date_to_symbol <- c(
  setNames(paste0("MARCH", march_numbers), paste0(march_numbers, "-Mar")),
  setNames(paste0("SEPT", sept_numbers), paste0(sept_numbers, "-Sep")),
  "1-Dec" = "DEC1"
)

# Replace only identifiers that are exactly equal to a table key. Whole-string
# matching avoids rewriting a substring of some other identifier (for example
# turning "13-Sep" into "1SEPT3") and makes the result independent of the
# order in which the rules are listed.
is_mangled <- data_probe[, 1] %in% names(date_to_symbol)
data_probe[is_mangled, 1] <- unname(date_to_symbol[data_probe[is_mangled, 1]])

write.table(
  data_probe,
  file = "probe_1.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Re-read the corrected identifiers and put them in front of the expression
# columns from GSE.txt; rows are paired by position.
data_probe <- read.table("probe_1.txt", sep = "\t", colClasses = "character")

data_expression <- read.table("GSE.txt", sep = "\t", colClasses = "character")

data_expression_1 <- data.frame(data_probe, data_expression)

write.table(
  data_expression_1,
  file = "GSE_1.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

## Gene ID mapping.

# Symbol-to-ID table; quoting and comment handling are switched off so that
# every character in the file is read literally.
data_gene <- read.table(
  "../gene_set/Symbol2ID.txt",
  sep = "\t", quote = "", colClasses = "character", comment.char = ""
)

# Expression table whose first column holds the gene symbol.
data_expression <- read.table(
  "GSE_1.txt",
  sep = "\t", colClasses = "character"
)

# Join the two tables on their first columns; only rows present in both are
# kept (merge() default).
data_expression_1 <- merge(data_gene, data_expression, by = 1)

# Drop the join key (column 1), leaving the remaining mapping column(s)
# followed by the expression columns.
data_expression_1 <- data_expression_1[, -1]

# Sort the rows using every column as a successive key. All columns are
# character, so the ordering is string-based.
row_order <- do.call(order, data_expression_1)
data_expression_1 <- data_expression_1[row_order, ]

write.table(
  data_expression_1,
  file = "GSE_2.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

## Average expression for duplicated genes.

# Column 1 is the gene ID; the remaining columns are expression values.
data_expression <- read.table("GSE_2.txt", sep = "\t")

data_gene <- data_expression[, 1]

data_gene_1 <- unique(data_gene)

# IDs that occur in more than one row, sorted.
data_gene_2 <- sort(unique(data_gene[duplicated(data_gene)]))

# IDs that occur in exactly one row, sorted.
data_gene_3 <- sort(setdiff(data_gene_1, data_gene_2))

# Split the table into rows of duplicated IDs and rows of unique IDs.
data_expression_1 <- merge(data_expression, data_gene_2, by = 1)

data_expression_2 <- merge(data_expression, data_gene_3, by = 1)

# Rows of unique IDs are written unchanged; this call creates (or overwrites)
# GSE_3.txt.
write.table(
  data_expression_2,
  file = "GSE_3.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# For every duplicated ID, append one row holding the column-wise mean of its
# rows. seq_along() makes the loop a no-op when there are no duplicated IDs;
# 1:length(x) would iterate over c(1, 0) and append malformed rows.
for (i in seq_along(data_gene_2)) {
  data_expression_3 <- data_expression_1[data_expression_1[, 1] == data_gene_2[i], ]

  # drop = FALSE keeps a data frame even with a single expression column,
  # which colMeans() requires.
  data_expression_4 <- colMeans(data_expression_3[, -1, drop = FALSE])

  data_expression_5 <- c(data_gene_2[i], data_expression_4)

  write.table(
    t(data_expression_5),
    file = "GSE_3.txt", append = TRUE,
    quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
  )
}

## log2 transformation and filtering.

# Column 1 is the gene ID; every other column is transformed as log2(x + 1).
data_expression <- read.table("GSE_3.txt", sep = "\t")

value_columns <- 2:ncol(data_expression)
data_expression[, value_columns] <- log2(data_expression[, value_columns] + 1)

write.table(
  data_expression,
  file = "GSE_4.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Re-read the transformed table and count, per row, how many values are
# strictly positive.
data_expression <- read.table("GSE_4.txt", sep = "\t")

is_positive <- data_expression[, 2:ncol(data_expression)] > 0
data_row <- rowSums(is_positive)

# Keep the rows that have at least one positive value.
data_expression_1 <- data_expression[data_row >= 1, ]

write.table(
  data_expression_1,
  file = "GSE_5.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)

# Gene IDs of the retained rows.
data_gene <- data_expression_1[, 1]

write.table(
  data_gene,
  file = "gene_1.txt",
  quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE
)
