Skip to content
Permalink
334d4687ee
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
104 lines (88 sloc) 4.29 KB
library(dplyr)
library(data.table)
# Constants
# Root directory that every input and output path in this script is built
# from by plain string concatenation, so it must end with a slash.
base_path <- './'
# Functions
# Load a gene-symbol list and append the two annotation labels.
#
# Reads the tab-separated file
# <base_path>/human_mouse_gene_lists/<gene_file_name>, lower-cases its
# `Symbol` column and returns those symbols followed by 'species' and
# 'study_type'.
#
# Args:
#   gene_file_name: name of a file inside the human_mouse_gene_lists/ directory.
#   base_path: directory that contains human_mouse_gene_lists/. Defaults to
#     './', the value previously hard-coded inside the function.
#
# Returns:
#   A character vector: the lower-cased symbols, then 'species', 'study_type'.
#
# Stops with an error when the file has no `Symbol` column.
get_gene_list <- function(gene_file_name, base_path = './') {
  genes_list_file_path <- file.path(base_path, 'human_mouse_gene_lists',
                                    gene_file_name)
  genes_list_data <- read.table(genes_list_file_path, sep = '\t',
                                header = TRUE, stringsAsFactors = FALSE)
  if (!('Symbol' %in% colnames(genes_list_data))) {
    stop("No 'Symbol' column found in ", genes_list_file_path, call. = FALSE)
  }
  c(tolower(genes_list_data$Symbol), 'species', 'study_type')
}
# Transpose a gene-by-sample table into a sample-by-gene data frame.
#
# Args:
#   all_genes: data frame whose first column holds the row labels (gene
#     symbols) and whose remaining columns are samples.
#
# Returns:
#   A data frame with one row per original sample column, one column per
#   value of the first input column, plus a `gsm` column holding the original
#   sample column names. t() turns the table into a character matrix, so all
#   value columns come back as character.
transpose_data <- function(all_genes) {
  t_all_genes <- t(all_genes)
  # The first row of the transposed matrix is the original label column
  colnames(t_all_genes) <- t_all_genes[1, ]
  # drop = FALSE keeps a matrix when a single sample row is left;
  # stringsAsFactors = FALSE keeps the values as strings on R < 4.0, where
  # factors would later be converted to their level codes by as.numeric()
  t_all_genes <- as.data.frame(t_all_genes[-1, , drop = FALSE],
                               stringsAsFactors = FALSE)
  t_all_genes$gsm <- rownames(t_all_genes)
  t_all_genes
}
# Replace the selected columns of `dataset` with their base-2 logarithm.
#
# Args:
#   dataset: table with column names.
#   columns: index into colnames(dataset) choosing the columns to transform.
#
# Returns:
#   `dataset` with every selected column log2-transformed; other columns are
#   left untouched.
log_columns <- function(dataset, columns) {
  target_names <- colnames(dataset)[columns]
  for (idx in seq_along(target_names)) {
    col_name <- target_names[[idx]]
    dataset[, col_name] <- log(dataset[, col_name], base = 2)
  }
  dataset
}
# Convert the selected columns of `dataset` to numeric.
#
# Args:
#   dataset: table with column names.
#   columns: index into colnames(dataset) choosing the columns to convert.
#
# Returns:
#   `dataset` with every selected column converted by as.numeric(); values
#   that cannot be parsed become NA (with R's usual coercion warning).
make_numeric <- function(dataset, columns) {
  for (gene in colnames(dataset)[columns]) {
    values <- dataset[, gene]
    # as.numeric() on a factor yields the internal level codes, not the
    # numbers the labels spell out, so go through the labels first
    if (is.factor(values)) {
      values <- as.character(values)
    }
    dataset[, gene] <- as.numeric(values)
  }
  dataset
}
# Parameters
# Species whose series are merged into the output dataset
selected_species <- c('human', 'mouse')
# selected_species <- c('mouse')
# Name of the output file, written under base_path
output_file_name <- 'human_mouse_dataset.csv'
# Create gene_list
# Single-list alternatives (disabled).
# NOTE(review): the first alternative was labelled "Just Human data" although
# it loads 'mouse_genes.csv' - confirm which file was intended.
# genes_list <- get_gene_list('mouse_genes.csv') # Just Human data
# genes_list <- get_gene_list('new_mouse_clean.csv') # Just mouse Data
# Active choice: symbols present in both the human and the mouse list
genes_list_human <- get_gene_list('human_genes.csv')
genes_list_mouse <- get_gene_list('new_mouse_clean.csv')
genes_list <- intersect(genes_list_human, genes_list_mouse)
# Read in the files
# Master metadata table, read as tab-separated text; its gse_id, species and
# study_type columns are looked up for every series file
metadata_file_path <- paste0(base_path, 'class_data/metadata/gse_metadata.csv')
metadata <- read.table(metadata_file_path, sep = '\t', header = TRUE,
                       stringsAsFactors = FALSE)
# Directory holding one expression table per series
gse_file_path <- paste0(base_path, 'class_data/csv/clean_csv/without_log2/')
# `pattern` is a regular expression, not a shell glob: anchor on a literal
# '.csv' suffix so that only files ending in .csv are picked up
gse_file_list <- list.files(gse_file_path, pattern = '\\.csv$',
                            full.names = TRUE, recursive = FALSE)
# Accumulator: one row per gene symbol; sample columns are merged in later
all_genes <- data.frame(Symbol = genes_list, stringsAsFactors = FALSE)
# Merge every series file of a selected species into all_genes, adding one
# column per sample plus 'study_type' and 'species' annotation rows.
for (gse_file in gse_file_list) {
  # Series id = file name up to its first dot, e.g. 'GSE123.csv' -> 'GSE123'
  gse_name <- sub('[.].*$', '', basename(gse_file))

  # Look the series up in the metadata; exactly one row is expected
  gse_rows <- which(metadata$gse_id == gse_name)
  if (length(gse_rows) == 0) {
    warning('No metadata entry for ', gse_name, '; skipping ', gse_file,
            call. = FALSE)
    next
  }
  if (length(gse_rows) > 1) {
    warning('Multiple metadata entries for ', gse_name, '; using the first',
            call. = FALSE)
    gse_rows <- gse_rows[1]
  }
  species <- as.character(metadata[gse_rows, 'species'])
  study_type <- as.character(metadata[gse_rows, 'study_type'])
  if (!(species %in% selected_species)) {
    next
  }

  # Read in the file
  gse_data <- read.table(gse_file, sep = '\t', header = TRUE)
  gse_data <- gse_data[, c(-2, -3)] # Remove the probe ID columns
  # Convert the symbols to all lowercase so they match genes_list
  gse_data$Symbol <- tolower(gse_data$Symbol)
  selected_genes <- as.data.frame(filter(gse_data, Symbol %in% genes_list))
  if (nrow(selected_genes) == 0) {
    next
  }

  # Average the rows that share a symbol; drop = FALSE keeps a data frame
  # (and the sample's column name) when the series has a single sample
  non_symbol_columns <- colnames(selected_genes)[-1]
  selected_avg_genes <- aggregate(
    selected_genes[, non_symbol_columns, drop = FALSE],
    by = list(Symbol = selected_genes$Symbol),
    FUN = mean
  )

  # Two extra rows carrying the series' study type and species for every
  # sample column; they line up with the 'study_type' and 'species' entries
  # that genes_list contributes to all_genes$Symbol
  n_samples <- length(non_symbol_columns)
  annotation_rows <- data.frame(
    Symbol = c('study_type', 'species'),
    rbind(rep(study_type, n_samples), rep(species, n_samples)),
    stringsAsFactors = FALSE
  )
  names(annotation_rows) <- names(selected_avg_genes)
  selected_avg_genes <- rbind(selected_avg_genes, annotation_rows)

  # Left join keeps every symbol of all_genes; genes missing from this
  # series become NA in its sample columns
  all_genes <- merge(x = all_genes, y = selected_avg_genes, by = 'Symbol',
                     all.x = TRUE)
}
# Transpose so that each row is one sample and each column one gene symbol
t_all_genes <- t(all_genes)
colnames(t_all_genes) <- t_all_genes[1, ]
# drop = FALSE keeps the matrix shape when only one sample row remains;
# stringsAsFactors = FALSE keeps the values as strings so that the numeric
# conversion below parses them instead of returning factor level codes
t_all_genes <- as.data.frame(t_all_genes[-1, , drop = FALSE],
                             stringsAsFactors = FALSE)
t_all_genes$gsm <- rownames(t_all_genes)
# Rearrange the columns: identifier columns first, then the genes
id_columns <- c('gsm', 'species', 'study_type')
t_all_genes <- t_all_genes[, c(id_columns, colnames(t_all_genes)[!(colnames(t_all_genes) %in% id_columns)]), drop = FALSE]
# Everything after the identifier columns is a gene; setdiff() yields an empty
# index (rather than counting backwards) when there are no gene columns
gene_columns <- setdiff(seq_along(t_all_genes), seq_along(id_columns))
t_all_genes <- make_numeric(t_all_genes, gene_columns)
t_all_genes_log_2 <- log_columns(t_all_genes, gene_columns)
# Write the log2-transformed table as comma-separated text without quoting
output_file_path <- paste0(base_path, output_file_name)
write.table(t_all_genes_log_2, file = output_file_path, row.names = FALSE,
            sep = ',', quote = FALSE)