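# NOTE: web-page navigation text captured along with this file ("Skip to content",
# "Permalink", "master", "Go to file", "Cannot retrieve contributors at this time",
# "104 lines (88 sloc) 4.29 KB") is not part of the R script.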
library(dplyr)
library(data.table)
# Constants
# Directory prefix for the input/output paths built by the top-level script.
# Paths are formed by plain string concatenation (paste with sep=''), so this
# value must end with a '/'.
base_path <- './'
# Functions
#' Load a gene-symbol list and append the metadata row labels.
#'
#' Reads the tab-separated file
#' `<base_path>human_mouse_gene_lists/<gene_file_name>` (with a header row),
#' lower-cases its `Symbol` column and appends the two labels 'species' and
#' 'study_type', which the script stores as extra rows next to the genes.
#'
#' @param gene_file_name File name inside the 'human_mouse_gene_lists/' folder.
#' @param base_path Directory prefix, concatenated as-is (so it must end with
#'   '/'). Defaults to './', the value that used to be hard-coded here.
#' @return Character vector: the lower-cased symbols followed by 'species'
#'   and 'study_type'.
get_gene_list <- function(gene_file_name, base_path = './') {
  genes_list_file_path <- paste0(base_path, 'human_mouse_gene_lists/', gene_file_name)
  genes_list_data <- read.table(genes_list_file_path, sep = '\t', header = TRUE,
                                stringsAsFactors = FALSE)
  # Fail loudly instead of silently returning only the two labels.
  if (!('Symbol' %in% colnames(genes_list_data))) {
    stop("Gene list file '", genes_list_file_path, "' has no 'Symbol' column",
         call. = FALSE)
  }
  c(tolower(genes_list_data[['Symbol']]), 'species', 'study_type')
}
#' Transpose a gene-by-sample table into a sample-by-gene data frame.
#'
#' The first column of `all_genes` supplies the column names of the result;
#' every remaining column becomes one row. The former column names are kept
#' as row names and also copied into a `gsm` column.
#'
#' @param all_genes Data frame whose first column holds the row labels
#'   (symbols) and whose other columns hold one sample each.
#' @return Data frame with one row per non-label column of `all_genes`, one
#'   column per label, plus a trailing `gsm` column. Values are character
#'   because `t()` goes through a character matrix when the label column is
#'   not numeric.
transpose_data <- function(all_genes) {
  t_all_genes <- t(all_genes)
  colnames(t_all_genes) <- t_all_genes[1, ]
  # drop = FALSE keeps the matrix shape when only one sample row remains;
  # without it a single sample collapses to a vector and ends up as a column.
  t_all_genes <- as.data.frame(t_all_genes[-1, , drop = FALSE],
                               stringsAsFactors = FALSE)
  t_all_genes$gsm <- rownames(t_all_genes)
  t_all_genes
}
#' Replace selected columns with their base-2 logarithm.
#'
#' @param dataset Table whose columns are addressed by name.
#' @param columns Index into `colnames(dataset)` choosing the columns to
#'   transform.
#' @return `dataset` with each chosen column replaced by `log(column, 2)`;
#'   all other columns are returned untouched.
log_columns <- function(dataset, columns) {
  target_names <- colnames(dataset)[columns]
  for (column_name in target_names) {
    raw_values <- dataset[ , column_name]
    dataset[ , column_name] <- log(raw_values, 2)
  }
  dataset
}
#' Convert selected columns to numeric.
#'
#' @param dataset Table whose columns are addressed by name.
#' @param columns Index into `colnames(dataset)` choosing the columns to
#'   convert.
#' @return `dataset` with each chosen column passed through `as.numeric()`.
#'   Values that cannot be parsed become `NA` (with R's usual warning).
make_numeric <- function(dataset, columns) {
  for (gene in colnames(dataset)[columns]) {
    values <- dataset[ , gene]
    # as.numeric() on a factor yields the internal level codes, not the
    # numbers printed in the labels, so go through character first.
    if (is.factor(values)) {
      values <- as.character(values)
    }
    dataset[ , gene] <- as.numeric(values)
  }
  dataset
}
# Parameters
# Only GSE studies whose metadata species is listed here are merged.
selected_species <- c('human', 'mouse')
# selected_species <- c('mouse')
output_file_name <- 'human_mouse_dataset.csv'

# Create gene_list
# Single-species alternatives (kept for reference):
# NOTE(review): the first alternative was labelled "Just Human data" although
# the file is named mouse_genes.csv -- confirm which one is intended.
# genes_list <- get_gene_list('mouse_genes.csv') # Just Human data
# genes_list <- get_gene_list('new_mouse_clean.csv') # Just mouse Data
# For intersection of both mouse and human
genes_list_human <- get_gene_list('human_genes.csv')
genes_list_mouse <- get_gene_list('new_mouse_clean.csv')
# Both lists end with 'species' and 'study_type', so those labels survive the
# intersection and become rows of all_genes below.
genes_list <- intersect(genes_list_human, genes_list_mouse)

# Read in the files
# Get master metadata file
# metadata_file_path <- '/home/reynaldo/Documents/School/Fall2017/DataMining/grp_proj/mkdataset/class_data/metadata/gse_metadata.csv'
metadata_file_path <- paste0(base_path, 'class_data/metadata/gse_metadata.csv')
metadata <- read.table(metadata_file_path, sep = '\t', header = TRUE,
                       stringsAsFactors = FALSE)

# Collect the per-GSE expression files
gse_file_path <- paste0(base_path, 'class_data/csv/clean_csv/without_log2/')
# list.files() takes a regular expression, not a glob: anchor on the
# literal '.csv' extension.
gse_file_list <- list.files(gse_file_path, pattern = '\\.csv$', full.names = TRUE,
                            recursive = FALSE)

# Accumulator: one row per symbol/label; sample columns are merged in later.
all_genes <- data.frame(Symbol = genes_list, stringsAsFactors = FALSE)
# Merge every GSE file of a selected species into all_genes: each file adds
# one column per sample, holding the per-symbol mean expression plus the
# 'study_type' and 'species' label rows.
for (gse_file in gse_file_list) {
  # The GSE id is the file name up to its first '.'
  gse_name <- strsplit(basename(gse_file), '[.]')[[1]][1]

  # Look up the study in the metadata; which() ignores NA ids.
  metadata_rows <- which(metadata$gse_id == gse_name)
  species <- unique(as.character(metadata[metadata_rows, 'species']))
  study_type <- unique(as.character(metadata[metadata_rows, 'study_type']))
  # A missing or ambiguous metadata entry cannot be used as a scalar
  # condition or label, so skip the file with a visible warning.
  if (length(species) != 1 || length(study_type) != 1) {
    warning('Skipping ', gse_file, ': expected exactly one species/study_type for ',
            gse_name, ' in the metadata', call. = FALSE)
    next
  }
  if (!(species %in% selected_species)) {
    next
  }

  # Read in the file
  gse_data <- read.table(gse_file, sep = '\t', header = TRUE)
  # Remove the probe ID columns
  gse_data <- gse_data[ , c(-2, -3), drop = FALSE]
  # Convert the symbols to all lowercase
  gse_data$Symbol <- tolower(gse_data$Symbol)
  selected_genes <- gse_data[gse_data$Symbol %in% genes_list, , drop = FALSE]
  non_symbol_columns <- setdiff(colnames(selected_genes), 'Symbol')
  if (nrow(selected_genes) == 0 || length(non_symbol_columns) == 0) {
    next
  }

  # Average selected genes: several rows may share one symbol.
  # drop = FALSE keeps a data frame even when the file has a single sample.
  selected_avg_genes <- aggregate(selected_genes[ , non_symbol_columns, drop = FALSE],
                                  by = list(Symbol = selected_genes$Symbol),
                                  FUN = mean)
  colnames(selected_avg_genes) <- c('Symbol', non_symbol_columns)

  # Two label rows carrying the study's study_type and species for every
  # sample; appending them turns the sample columns into character columns.
  label_rows <- data.frame(Symbol = c('study_type', 'species'),
                           stringsAsFactors = FALSE)
  label_rows[non_symbol_columns] <- rep(list(c(study_type, species)),
                                        length(non_symbol_columns))
  selected_avg_genes <- rbind(selected_avg_genes, label_rows)

  # Left join keeps every symbol of genes_list; absent genes become NA.
  all_genes <- merge(x = all_genes, y = selected_avg_genes, by = 'Symbol', all.x = TRUE)
}
# Transpose so that each row is one sample (gsm) and each column one symbol.
t_all_genes <- transpose_data(all_genes)

# Rearrange the columns: identifier columns first, then the gene columns.
id_columns <- c('gsm', 'species', 'study_type')
gene_columns <- colnames(t_all_genes)[!(colnames(t_all_genes) %in% id_columns)]
t_all_genes <- t_all_genes[ , c(id_columns, gene_columns), drop = FALSE]

# Positions of the gene columns. Built with seq_along() so that an empty
# gene set gives an empty index instead of the backwards range 4:3.
gene_column_positions <- length(id_columns) + seq_along(gene_columns)

# The transposed values are text; convert to numbers, then take log base 2.
t_all_genes <- make_numeric(t_all_genes, gene_column_positions)
t_all_genes_log_2 <- log_columns(t_all_genes, gene_column_positions)

output_file_path <- paste0(base_path, output_file_name)
write.table(t_all_genes_log_2, file = output_file_path, row.names = FALSE, sep = ',',
            quote = FALSE)
# NOTE: web-page footer text ("You can't perform that action at this time.") captured with this file; not part of the R script.