Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
library(dplyr)
library(data.table)
# Constants
# Root directory that the script's relative input and output paths are built
# from. Paths are formed with paste(..., sep=''), and some of them add no
# separator of their own, so this value must end with a '/'.
base_path <- './'
# Functions
#' Load the gene symbols of interest from a tab-separated gene list file.
#'
#' The file is read from `<base_path>human_mouse_gene_lists/<gene_file_name>`
#' and must have a header row containing a `Symbol` column.
#'
#' @param gene_file_name File name of the gene list inside the
#'   `human_mouse_gene_lists/` directory.
#' @param base_path Directory prefix, ending with a `/`, that contains
#'   `human_mouse_gene_lists/`. Defaults to `"./"`, the value that used to be
#'   hard-coded inside the function.
#' @return A character vector with the lowercased symbols in file order,
#'   followed by the two literal entries `"species"` and `"study_type"`.
get_gene_list <- function(gene_file_name, base_path = "./") {
  gene_list_path <- paste0(base_path, "human_mouse_gene_lists/", gene_file_name)
  if (!file.exists(gene_list_path)) {
    stop("Gene list file not found: ", gene_list_path, call. = FALSE)
  }

  gene_table <- read.table(gene_list_path, sep = "\t", header = TRUE)
  if (!("Symbol" %in% colnames(gene_table))) {
    stop("Gene list file has no 'Symbol' column: ", gene_list_path,
         call. = FALSE)
  }

  # Symbols are lowercased so that they can be matched case-insensitively.
  # The two trailing entries are not genes: they are appended so that rows
  # with those names are kept wherever this list is used as a filter.
  c(tolower(gene_table[["Symbol"]]), "species", "study_type")
}
# Parameters
# Species whose studies are included in the output dataset.
# selected_species <- c('human', 'mouse')
selected_species <- c("mouse")
output_file_name <- "mouse_dataset_log2_ratio.csv"

# Create gene_list
genes_list <- get_gene_list("new_mouse_clean.csv") # Just mouse data
# genes_list <- get_gene_list('human_genes.csv') # Just human data
# For intersection of both mouse and human
# genes_list_human <- get_gene_list('human_genes.csv')
# genes_list_mouse <- get_gene_list('new_mouse_clean.csv')
# genes_list <- intersect(genes_list_human, genes_list_mouse)
# genes_list = c('acta2', 'rgs16') # just for testing

# Read in the files
# Get master metadata file (tab-separated, with a header row). Later code reads
# its gse_id, species and study_type columns.
# metadata_file_path <- '/home/reynaldo/Documents/School/Fall2017/DataMining/grp_proj/mkdataset/class_data/metadata/gse_metadata.csv'
metadata_file_path <- paste0(base_path, "class_data/metadata/gse_metadata.csv")
metadata <- read.table(metadata_file_path, sep = "\t", header = TRUE)

# Directory holding the per-study expression tables
gse_file_path <- paste0(base_path, "/class_data/csv/clean_csv/")
# gse_file_path <- paste(base_path, '/class_data/csv/clean_csv/without_log2/', sep='')
# `pattern` is a regular expression, not a shell glob, so it is anchored to
# the literal ".csv" extension.
gse_file_list <- list.files(
  gse_file_path,
  pattern = "\\.csv$",
  full.names = TRUE,
  recursive = FALSE
)

# Accumulator: one row per requested symbol; each selected study later adds
# its sample columns to it.
all_genes <- data.frame(Symbol = genes_list)
# For every study file of a selected species: keep the requested genes,
# average duplicate symbols per sample column, tag the columns with the
# study's species and study type, and merge the result into all_genes.
for (gse_file in gse_file_list) {
  # The study id is the file name up to its first '.'
  # (e.g. "dir/study1.csv" -> "study1"); it is the key into the metadata table.
  gse_name <- sub("[.].*$", "", basename(gse_file))

  # which() ignores NA ids; an absent or duplicated id would otherwise make
  # the species check below fail with a zero-length or length > 1 condition.
  metadata_rows <- which(metadata$gse_id == gse_name)
  if (length(metadata_rows) == 0) {
    warning("No metadata row for '", gse_name, "'; skipping ", gse_file,
            call. = FALSE)
    next
  }
  if (length(metadata_rows) > 1) {
    warning("Multiple metadata rows for '", gse_name, "'; using the first",
            call. = FALSE)
  }
  species <- as.character(metadata$species[metadata_rows[1]])
  study_type <- as.character(metadata$study_type[metadata_rows[1]])
  if (!(species %in% selected_species)) {
    next
  }

  # Read in the file
  gse_data <- read.table(gse_file, sep = "\t", header = TRUE)
  gse_data <- gse_data[, c(-2, -3), drop = FALSE] # Remove the probe ID columns
  # Keep the Symbol column plus every column whose name contains '_'.
  keep_column <- grepl("_", colnames(gse_data), fixed = TRUE) |
    grepl("Symbol", colnames(gse_data), fixed = TRUE)
  gse_data <- gse_data[, keep_column, drop = FALSE]
  if (ncol(gse_data) < 2) {
    warning("No sample columns found in ", gse_file, "; skipping",
            call. = FALSE)
    next
  }
  # Prefix the sample columns with the study id so names stay unique across
  # studies. The first remaining column is taken to be the gene symbol.
  colnames(gse_data) <- c(
    "Symbol",
    paste0(gse_name, "_", colnames(gse_data)[-1])
  )

  # Convert the symbols to all lowercase
  gse_data$Symbol <- tolower(gse_data$Symbol)
  selected_genes <- gse_data[gse_data$Symbol %in% genes_list, , drop = FALSE]
  if (nrow(selected_genes) == 0) {
    next
  }

  # Average selected genes: rows sharing a symbol collapse to one mean row.
  non_symbol_columns <- colnames(selected_genes)[-1]
  selected_avg_genes <- aggregate(
    selected_genes[, non_symbol_columns, drop = FALSE],
    by = list(Symbol = selected_genes$Symbol),
    FUN = mean
  )

  # Species and study type travel as two extra pseudo-gene rows named
  # 'study_type' and 'species'; genes_list contains both names, so the
  # Symbol-keyed merge below keeps them. Appending these character rows turns
  # every sample column into character.
  n_samples <- length(non_symbol_columns)
  annotation_rows <- as.data.frame(
    rbind(
      c("study_type", rep(study_type, n_samples)),
      c("species", rep(species, n_samples))
    ),
    stringsAsFactors = FALSE
  )
  names(annotation_rows) <- names(selected_avg_genes)
  selected_avg_genes <- rbind(selected_avg_genes, annotation_rows)

  # Left join: symbols missing from this study get NA in its sample columns.
  all_genes <- merge(
    x = all_genes,
    y = selected_avg_genes,
    by = "Symbol",
    all.x = TRUE
  )
}
# Transpose so that each merged sample column becomes a row and each symbol
# (including the 'species' and 'study_type' pseudo-genes) becomes a column.
t_all_genes <- t(all_genes)
colnames(t_all_genes) <- t_all_genes[1, ]
# Drop the former Symbol row. drop = FALSE keeps the matrix shape when only
# one sample row remains; without it the result collapses to a vector and
# as.data.frame() builds a one-column table instead.
t_all_genes <- as.data.frame(t_all_genes[-1, , drop = FALSE])
# The row names are the merged column names (study id + '_' + sample column).
t_all_genes$gsm <- rownames(t_all_genes)

# Rearrange the columns: identifiers first, then the gene columns.
id_columns <- c("gsm", "species", "study_type")
gene_columns <- colnames(t_all_genes)[!(colnames(t_all_genes) %in% id_columns)]
t_all_genes <- t_all_genes[, c(id_columns, gene_columns)]

output_file_path <- paste0(base_path, output_file_name)
# NOTE(review): with quote = FALSE and sep = ',', a value that itself contains
# a comma would shift columns in the output -- confirm the metadata values
# never contain commas.
write.table(
  t_all_genes,
  file = output_file_path,
  row.names = FALSE,
  sep = ",",
  quote = FALSE
)