# data_mining_gan/mkdataset/prep_thams_data.r
library(dplyr)
library(data.table)
# Constants
base_path <- './'
# Functions
# Load a gene list and return its lower-cased symbols plus the label names.
#
# Reads the tab-separated file
# `<base_path>human_mouse_gene_lists/<gene_file_name>` (first line is the
# header) and lower-cases its `Symbol` column.
#
# Args:
#   gene_file_name: File name inside the `human_mouse_gene_lists/` directory.
#   base_path: Directory prefix, including the trailing slash, that contains
#     `human_mouse_gene_lists/`. Defaults to './', the value that used to be
#     hard-coded inside the body, so existing one-argument calls behave the
#     same.
#
# Returns:
#   A character vector: the lower-cased symbols in file order, followed by
#   'control' and 'class'.
#
# Errors:
#   Stops when the file has no `Symbol` column; `read.table` itself signals
#   an error when the file cannot be read.
get_gene_list <- function(gene_file_name, base_path = './') {
  genes_list_file_path <- paste0(
    base_path, 'human_mouse_gene_lists/', gene_file_name
  )
  genes_list_data <- read.table(genes_list_file_path, sep = '\t', header = TRUE)

  # Fail with a readable message instead of the opaque "replacement has 0 rows"
  # error the column assignment used to raise when the header was wrong.
  if (!('Symbol' %in% names(genes_list_data))) {
    stop(
      "Gene list file '", genes_list_file_path, "' has no 'Symbol' column",
      call. = FALSE
    )
  }

  # 'control' and 'class' are appended to every list: the script filters the
  # expression rows by this vector and later expects columns with those names.
  c(tolower(genes_list_data[['Symbol']]), 'control', 'class')
}
# Parameters
output_file_name <- 'tham_lasso_dataset.csv'

# Create gene_list. Alternative selections are kept for reference:
# genes_list <- get_gene_list('human_genes.csv') # Just human data
# genes_list <- get_gene_list('new_mouse_clean.csv') # Just mouse data
# For the intersection of both mouse and human:
# genes_list_human <- get_gene_list('human_genes.csv')
# genes_list_mouse <- get_gene_list('new_mouse_clean.csv')
# genes_list <- intersect(genes_list_human, genes_list_mouse)
genes_list <- get_gene_list('lasso_gene_list.csv')

# Get Tham's data: a tab-separated table whose TEST column holds the row
# identifiers (gene symbols plus the 'control' and 'class' rows used below).
gse_file <- paste0(base_path, 'class_data/gse70559_log2ratio_data.csv')
gse_data <- read.table(gse_file, sep = '\t', header = TRUE)
if (!('TEST' %in% names(gse_data))) {
  stop("'", gse_file, "' has no 'TEST' column", call. = FALSE)
}

# Lower-case the symbols BEFORE removing duplicates. Doing it the other way
# round lets symbols that differ only in case survive and later collide as
# duplicate column names.
gse_data$TEST <- tolower(gse_data$TEST)
gse_data <- gse_data[!duplicated(gse_data$TEST), ]

# Select the genes (genes_list also carries 'control' and 'class').
selected_genes <- gse_data[gse_data$TEST %in% genes_list, , drop = FALSE]

missing_rows <- setdiff(c('control', 'class'), selected_genes$TEST)
if (length(missing_rows) > 0) {
  stop(
    "Required rows missing from the TEST column: ",
    paste(missing_rows, collapse = ', '),
    call. = FALSE
  )
}

# Create the filtered dataset: one row per original data column, one column
# per selected TEST entry. Only the value columns are transposed; transposing
# the character TEST column along with them would turn every number into
# format()-ed text (rounded to 7 significant digits and space-padded).
value_columns <- setdiff(names(selected_genes), 'TEST')
final_data_set <- as.data.frame(t(selected_genes[, value_columns, drop = FALSE]))
colnames(final_data_set) <- selected_genes$TEST

# The original column names become the 'test' identifier column, placed first
# together with 'control' and 'class'; the gene columns follow in file order.
final_data_set$test <- rownames(final_data_set)
leading_columns <- c('test', 'control', 'class')
gene_columns <- setdiff(colnames(final_data_set), leading_columns)
final_data_set <- final_data_set[, c(leading_columns, gene_columns)]

output_file_path <- paste0(base_path, output_file_name)
write.table(
  final_data_set,
  file = output_file_path,
  row.names = FALSE,
  sep = ',',
  quote = FALSE
)