//package spliced;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Driver for the preprocessing stage: optionally runs bowtie to collect unmapped reads,
 * runs the external contig generator, splits every contig into two halves, maps the halves
 * with bowtie, hands consecutive map files to the external {@code Utilities} program and
 * finally de-duplicates the junction records that program is expected to leave in
 * {@code OUT/junctions_information.info}.
 *
 * <p>All settings live in public static fields. Their initial values are defaults that the
 * constructor overwrites from a line-oriented configuration file.
 */
public class Preprocess {

    /** Contig file written by the contig generator and read back by {@link #run}. */
    public static String contigs_file_name = "contigs.fa";
    /** Chromosome name forwarded verbatim to the external {@code Utilities} program. */
    public static String chromosome_name = "chr1";
    /** When equal to "on" (case-insensitive) {@link #main} runs the bowtie/contig steps first. */
    public static String pre_process = "off";
    /** Bowtie index argument. */
    public static String index_file_name = "/home/sus11005/Data/CODE/SPLICED/DATA/NEW/MAIN_INDEX/chr";
    /** Output file of the full-read bowtie run. */
    public static String map_file_name = "/home/sus11005/Data/CODE/SPLICED/DATA/NEW/MAP/map_100.sam";
    /** File passed to bowtie's {@code --un} option; converted to FASTA afterwards. */
    public static String umrq_file_name = "/home/sus11005/Data/CODE/SPLICED/DATA/NEW/MAP/umrs_100.fq";
    /** FASTA file produced from {@link #umrq_file_name} by {@link #read_and_write}. */
    public static String umrs_file_name = "/home/sus11005/Data/CODE/SPLICED/DATA/NEW/MAP/umrs_100.fa";
    /** Read from the configuration file; not used by this class itself. */
    public static String coverage_file_name = "/home/sus11005/Data/CODE/SPLICED/DATA/NEW/MAP/coverage_100.cov";
    /** Reads file passed to bowtie's {@code -q} option. */
    public static String reads_file_name = "/home/sus11005/Data/CODE/SPLICED/DATA/NEW/READS/100.1.fastq";

    /** Value of bowtie's {@code -v} option for the full-read run. */
    public static int mismatch_for_full_alignment = 2;
    /** Value of bowtie's {@code -v} option for the half-contig runs. */
    public static int mismatch_for_half_alignment = 1;
    /** Value of bowtie's {@code -k} option. */
    public static int alignments = 100;
    /** Value of bowtie's {@code -p} option. */
    public static int threads = 10;
    /** Third argument of the external contig generator. */
    public static int consensus_length = 108;
    /** Read from the configuration file; not used by this class itself. */
    public static double hamming_dist_threshold = 0;
    /** Read from the configuration file; not used by this class itself. */
    public static int search_length = 0;
    /** Read from the configuration file; not used by this class itself. */
    public static int max_relative_alignment = 0;
    /** Fourth argument of the external contig generator. */
    public static int overlap_length = 0;
    /** Fifth argument of the external contig generator. */
    public static int overlap_hamming_distance = 0;

    /** Two junctions of the same contig whose boundaries both differ by at most this are duplicates. */
    private static final int DUPLICATE_BOUNDARY_TOLERANCE = 10;

    /**
     * Loads the configuration file and stores its values in the static fields of this class.
     *
     * <p>The file is positional: only specific (1-based) even line numbers carry values
     * (2, 4, ..., 32, 36, 38); every other line, including line 34, is ignored. Numeric values
     * may be surrounded by whitespace.
     *
     * @param file_name path of the configuration file
     * @throws IOException if the file cannot be opened or read
     * @throws NumberFormatException if a numeric line does not contain a valid number
     */
    public Preprocess(String file_name) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(file_name))) {
            String line;
            int line_number = 1;

            while ((line = reader.readLine()) != null) {
                switch (line_number) {
                    case 2:
                        index_file_name = line;
                        break;
                    case 4:
                        map_file_name = line;
                        break;
                    case 6:
                        umrq_file_name = line;
                        break;
                    case 8:
                        umrs_file_name = line;
                        break;
                    case 10:
                        reads_file_name = line;
                        break;
                    case 12:
                        coverage_file_name = line;
                        break;
                    case 14:
                        mismatch_for_full_alignment = parse_int(line);
                        break;
                    case 16:
                        mismatch_for_half_alignment = parse_int(line);
                        break;
                    case 18:
                        alignments = parse_int(line);
                        break;
                    case 20:
                        threads = parse_int(line);
                        break;
                    case 22:
                        consensus_length = parse_int(line);
                        break;
                    case 24:
                        chromosome_name = line;
                        break;
                    case 26:
                        pre_process = line;
                        break;
                    case 28:
                        hamming_dist_threshold = Double.parseDouble(line.trim());
                        break;
                    case 30:
                        search_length = parse_int(line);
                        break;
                    case 32:
                        max_relative_alignment = parse_int(line);
                        break;
                    case 36:
                        overlap_length = parse_int(line);
                        break;
                    case 38:
                        overlap_hamming_distance = parse_int(line);
                        break;
                    default:
                        // Lines without a value slot are skipped.
                        break;
                }
                line_number++;
            }
        }
    }

    /** Parses an integer configuration value, tolerating surrounding whitespace. */
    private static int parse_int(String value) {
        return Integer.parseInt(value.trim());
    }

    /** Renders a command as a single space-separated string for logging. */
    private static String join_command(List<String> command) {
        StringBuilder text = new StringBuilder();
        for (String argument : command) {
            if (text.length() > 0) {
                text.append(' ');
            }
            text.append(argument);
        }
        return text.toString();
    }

    /**
     * Runs an external command, echoes everything it prints to standard output and waits for
     * it to finish.
     *
     * <p>stderr is merged into stdout so that a single reader drains both pipes; draining them
     * one after the other can block forever once the child fills the pipe that is not being
     * read. Arguments are passed as a list, so paths containing spaces stay intact.
     *
     * @return the exit status of the child process
     */
    private static int execute(List<String> command) throws IOException, InterruptedException {
        ProcessBuilder builder = new ProcessBuilder(command);
        builder.redirectErrorStream(true);
        Process process = builder.start();

        try (BufferedReader output = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            String line;
            while ((line = output.readLine()) != null) {
                System.out.println(line);
            }
        }

        int exit_status = process.waitFor();
        if (exit_status != 0) {
            // Reported but not fatal: callers historically continued regardless of the status.
            System.out.println("WARNING: '" + command.get(0) + "' exited with status " + exit_status);
        }
        return exit_status;
    }

    /**
     * Splits the contigs into half-files and maps each half-file with bowtie.
     *
     * @param contigs_file_name file with one contig per line
     * @param index_file bowtie index argument
     * @param alignments value for bowtie's {@code -k}
     * @param mismatch value for bowtie's {@code -v}
     * @param threads value for bowtie's {@code -p}
     * @return the bowtie output file names, {@code map_file_0}, {@code map_file_1}, ... in the
     *         order of the fragment files
     * @throws IOException if a file cannot be read/written or bowtie cannot be started
     * @throws InterruptedException if interrupted while waiting for bowtie
     */
    public static List<String> run(String contigs_file_name, String index_file, int alignments,
                                   int mismatch, int threads) throws IOException, InterruptedException {

        List<String> fragment_files = read_and_write_fragmented_contigs(contigs_file_name);
        List<String> map_files = new ArrayList<>();

        for (int i = 0; i < fragment_files.size(); i++) {
            System.out.println("RUNNING BOWTIE...");

            String map_file = "map_file_" + i;
            map_files.add(map_file);

            List<String> command = Arrays.asList(
                    "./bowtie", "--quiet", "--best", "--suppress", "5,6,7",
                    "-k", String.valueOf(alignments),
                    "-v", String.valueOf(mismatch),
                    "-p", String.valueOf(threads),
                    index_file, "-f", fragment_files.get(i), map_file);
            System.out.println(join_command(command));

            execute(command);
            System.out.println("DONE...");
        }

        return map_files;
    }

    /**
     * Splits every contig (one per input line) into a left and a right half and writes the
     * halves as FASTA records to {@code 0.txt} (left) and {@code 1.txt} (right) in the working
     * directory. The left half has {@code length / 2} characters; the right half takes the rest.
     *
     * <p>The record name is the zero-based index of the input line, so the two halves of a
     * contig carry the same id. Lines shorter than two characters cannot be halved and are
     * skipped; their index is still consumed so ids of the remaining contigs do not shift.
     *
     * @param readFileName file with one contig per line
     * @return the two fragment file names, left half first
     * @throws IOException if the input cannot be read or an output cannot be written
     */
    public static List<String> read_and_write_fragmented_contigs(String readFileName) throws IOException {
        String left_file_name = "0.txt";
        String right_file_name = "1.txt";

        try (BufferedReader reader = new BufferedReader(new FileReader(readFileName));
             BufferedWriter left_out = new BufferedWriter(new FileWriter(left_file_name, false));
             BufferedWriter right_out = new BufferedWriter(new FileWriter(right_file_name, false))) {

            String line;
            int contig_id = 0;

            while ((line = reader.readLine()) != null) {
                if (line.length() >= 2) {
                    int half = line.length() / 2;
                    String header = ">" + contig_id + "\n";
                    left_out.write(header + line.substring(0, half) + "\n");
                    right_out.write(header + line.substring(half) + "\n");
                }
                contig_id++;
            }
        }

        List<String> files = new ArrayList<>();
        files.add(left_file_name);
        files.add(right_file_name);
        return files;
    }

    /**
     * Runs the external {@code Utilities} Java program on two map files.
     *
     * @param chromosome_name first program argument
     * @param map_file_one second program argument
     * @param map_file_two third program argument
     * @param point fourth program argument
     * @throws IOException if the JVM cannot be started
     * @throws InterruptedException if interrupted while waiting for the program
     */
    public static void run_utils(String chromosome_name, String map_file_one, String map_file_two, int point)
            throws IOException, InterruptedException {

        List<String> command = Arrays.asList(
                "java", "-Xmx10g", "-cp", ".", "Utilities",
                chromosome_name, map_file_one, map_file_two, String.valueOf(point));
        System.out.println(join_command(command));

        execute(command);
        System.out.println("DONE...");
    }

    /**
     * Maps the reads with bowtie, letting bowtie write the reads selected by {@code --un} to
     * {@code umrq_file_name}, then converts that FASTQ file to FASTA.
     *
     * @param reads_file_name reads passed to bowtie with {@code -q}
     * @param index_file bowtie index argument
     * @param map_file_name bowtie output file (SAM without header)
     * @param umrq_file_name file passed to {@code --un}
     * @param umrs_file_name FASTA file to create from {@code umrq_file_name}
     * @param mismatch value for bowtie's {@code -v}
     * @param alignments value for bowtie's {@code -k}
     * @param threads value for bowtie's {@code -p}
     * @return the number of FASTA records written
     * @throws IOException if bowtie cannot be started or a file cannot be read/written
     * @throws InterruptedException if interrupted while waiting for bowtie
     */
    public static int generate_non_uniquely_mapped_reads(String reads_file_name, String index_file, String map_file_name,
                                                         String umrq_file_name, String umrs_file_name, int mismatch,
                                                         int alignments, int threads)
            throws IOException, InterruptedException {

        System.out.println("RUNNING BOWTIE...");

        List<String> command = Arrays.asList(
                "./bowtie", "--quiet", "--sam", "--sam-nohead",
                "-k", String.valueOf(alignments),
                "-v", String.valueOf(mismatch),
                "-p", String.valueOf(threads),
                "--un", umrq_file_name,
                index_file, "-q", reads_file_name, map_file_name);
        System.out.println(join_command(command));

        execute(command);
        System.out.println("DONE...");

        return read_and_write(umrq_file_name, umrs_file_name);
    }

    /**
     * Runs the external contig generator ({@code ./a.out}).
     *
     * @param umrs_file_name first program argument (input reads)
     * @param contigs_file_name second program argument (contigs file)
     * @param consensus_length third program argument
     * @param overlap_length fourth program argument
     * @param ham_dist fifth program argument
     * @throws IOException if the program cannot be started
     * @throws InterruptedException if interrupted while waiting for the program
     */
    public static void run_contigs_generator(String umrs_file_name, String contigs_file_name,
                                             int consensus_length, int overlap_length, int ham_dist)
            throws IOException, InterruptedException {

        System.out.println("RUNNING CONTIGS_GENERATOR...");

        List<String> command = Arrays.asList(
                "./a.out", umrs_file_name, contigs_file_name,
                String.valueOf(consensus_length), String.valueOf(overlap_length), String.valueOf(ham_dist));

        execute(command);
        System.out.println("DONE...");
    }

    /**
     * Converts a FASTQ file to FASTA: the line following each line that starts with
     * {@code '@'} is written as a sequence named with a running zero-based counter.
     *
     * <p>A quality line that happens to start with {@code '@'} is harmless in four-line
     * records because the next line is a header, which also starts with {@code '@'}.
     *
     * @param read_file_name FASTQ input
     * @param out_file_name FASTA output (overwritten)
     * @return the number of sequences written
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static int read_and_write(String read_file_name, String out_file_name) throws IOException {
        int count = 0;

        try (BufferedReader reader = new BufferedReader(new FileReader(read_file_name));
             BufferedWriter out = new BufferedWriter(new FileWriter(out_file_name, false))) {

            String line;
            boolean expect_sequence = false;

            while ((line = reader.readLine()) != null) {
                if (line.startsWith("@")) {
                    expect_sequence = true;
                } else if (expect_sequence) {
                    out.write(">" + count + "\n" + line + "\n");
                    expect_sequence = false;
                    count++;
                }
            }
        }

        return count;
    }

    /**
     * Writes, for every input line, two output lines: the left half extended by
     * {@code overlap} characters to the right, and the right half extended by {@code overlap}
     * characters to the left. Extensions are clamped to the bounds of the line, so an overlap
     * larger than half the contig simply yields the whole contig.
     *
     * @param in_file_name file with one contig per line
     * @param out_file_name output file (overwritten)
     * @param overlap number of characters each half reaches across the midpoint
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void modify_contigs(String in_file_name, String out_file_name, int overlap) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(in_file_name));
             BufferedWriter out = new BufferedWriter(new FileWriter(out_file_name, false))) {

            String line;

            while ((line = reader.readLine()) != null) {
                int length = line.length();
                int middle = length / 2;

                // Clamp into [0, length] so short contigs or large overlaps cannot index out of range.
                int left_end = Math.max(0, Math.min(length, middle + overlap));
                int right_start = Math.max(0, Math.min(length, middle - overlap));

                out.write(line.substring(0, left_end) + "\n");
                out.write(line.substring(right_start) + "\n");
            }
        }
    }

    /**
     * Rewrites a junction file in place with near-duplicate junctions removed.
     *
     * <p>Each non-blank line holds nine tab-separated integers: contig id, left mismatch,
     * right mismatch, left boundary, right boundary, left length, right length, intron length
     * and coverage. For each contig id the first junction is kept and any later junction whose
     * left and right boundaries are both within 10 of an already kept one is dropped.
     * Coverage is written back as {@code 1} when the input value was positive and {@code 0}
     * otherwise. Records are grouped by contig id in the iteration order of a {@link HashMap}.
     *
     * @param file_name junction file to read and overwrite
     * @return always {@code true}
     * @throws IOException if the file cannot be read or written
     * @throws NumberFormatException if a field is not an integer
     */
    public boolean modify_junction_info_file(String file_name) throws IOException {

        HashMap<Integer, List<Information>> junctions_by_contig = new HashMap<>();

        try (BufferedReader reader = new BufferedReader(new FileReader(file_name))) {
            String line;

            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }

                String[] fields = line.split("\t");

                int read_id = Integer.parseInt(fields[0]);
                int left_boundary = Integer.parseInt(fields[3]);
                int right_boundary = Integer.parseInt(fields[4]);

                Information information = new Information();
                information.setContig_id(read_id);
                information.setLeft_mismatch(Integer.parseInt(fields[1]));
                information.setRight_mismatch(Integer.parseInt(fields[2]));
                information.setLeft_boundary(left_boundary);
                information.setRight_boundary(right_boundary);
                information.setLeft_length(Integer.parseInt(fields[5]));
                information.setRight_length(Integer.parseInt(fields[6]));
                information.setIntron_length(Integer.parseInt(fields[7]));
                information.setCoverage(Integer.parseInt(fields[8]) > 0);

                List<Information> kept = junctions_by_contig.get(read_id);
                if (kept == null) {
                    kept = new ArrayList<>();
                    junctions_by_contig.put(read_id, kept);
                }

                boolean duplicate = false;
                for (Information other : kept) {
                    if (Math.abs(left_boundary - other.getLeft_boundary()) <= DUPLICATE_BOUNDARY_TOLERANCE
                            && Math.abs(right_boundary - other.getRight_boundary()) <= DUPLICATE_BOUNDARY_TOLERANCE) {
                        duplicate = true;
                        break;
                    }
                }

                if (!duplicate) {
                    kept.add(information);
                }
            }
        }

        // The input is fully read and closed before the same path is truncated for writing.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(file_name, false))) {
            for (Map.Entry<Integer, List<Information>> entry : junctions_by_contig.entrySet()) {
                for (Information junction : entry.getValue()) {
                    String text = junction.getContig_id() + "\t"
                            + junction.getLeft_mismatch() + "\t"
                            + junction.getRight_mismatch() + "\t"
                            + junction.getLeft_boundary() + "\t"
                            + junction.getRight_boundary() + "\t"
                            + junction.getLeft_length() + "\t"
                            + junction.getRight_length() + "\t"
                            + junction.getIntron_length() + "\t"
                            + (junction.isCoverage() ? 1 : 0) + "\n";
                    out.write(text);
                }
            }
        }

        return true;
    }

    /**
     * Entry point. Reads {@code properties.prop}, deletes any previous junction file, runs the
     * optional preprocessing steps when {@link #pre_process} is "on", runs {@code Utilities}
     * on each pair of consecutive map files and de-duplicates the resulting junction file.
     *
     * @param args unused
     * @throws IOException if a file cannot be read/written or an external program cannot start
     * @throws InterruptedException if interrupted while waiting for an external program
     */
    public static void main(String[] args) throws IOException, InterruptedException {

        Preprocess preprocess = new Preprocess("properties.prop");

        String junctions_file_name = "OUT/junctions_information.info";

        // Remove the junction file of a previous run; a missing file is not an error.
        File file = new File(junctions_file_name);
        file.delete();

        // Map files expected from an earlier run when preprocessing is switched off.
        List<String> map_file_names = new ArrayList<>();
        map_file_names.add("map_file_0");
        map_file_names.add("map_file_1");

        if (pre_process.equalsIgnoreCase("on")) {

            // generate non-uniquely mapped reads

            System.out.println("GENERATING NON_UNIQUELY_MAPPED_READS....");

            int number_of_non_uniquely_mapped_reads = generate_non_uniquely_mapped_reads(reads_file_name, index_file_name,
                    map_file_name, umrq_file_name, umrs_file_name, mismatch_for_full_alignment, alignments, threads);

            System.out.println("IM_READS: " + number_of_non_uniquely_mapped_reads);

            System.out.println("FINISHED GENERATING NON_UNIQUELY_MAPPED_READS....");

            // generate contigs

            System.out.println("GENERATING CONTIGS....");

            run_contigs_generator(umrs_file_name, contigs_file_name, consensus_length, overlap_length, overlap_hamming_distance);

            System.out.println("FINISHING GENERATING CONTIGS....");

            // generate map file

            System.out.println("GENERATING MAP FILES....");

            map_file_names = run(contigs_file_name, index_file_name, alignments, mismatch_for_half_alignment, threads);

            System.out.println("FINISHING GENERATING MAP FILES....");
        }

        for (int i = 0; i < map_file_names.size() - 1; i++) {
            run_utils(chromosome_name, map_file_names.get(i), map_file_names.get(i + 1), i);
        }

        boolean completed = preprocess.modify_junction_info_file(junctions_file_name);

        System.out.println("COMPLETED... ... ... ...: " + completed);
    }


    /**
     * Mutable holder for one junction record. Only the fields set in
     * {@link Preprocess#modify_junction_info_file} are populated by this class; {@code part_no}
     * and the three junction-type flags are never assigned here and keep their defaults.
     */
    public class Information {
        private int contig_id;
        private int part_no;
        private int left_boundary;
        private int right_boundary;
        private int left_mismatch;
        private int right_mismatch;
        private int left_length;
        private int right_length;

        private boolean is_canonical_junction;
        private boolean is_semi_canonical_junction;
        private boolean is_non_canonical_junction;

        private int intron_length;
        private boolean coverage;

        public boolean isCoverage() {
            return coverage;
        }

        public void setCoverage(boolean coverage) {
            this.coverage = coverage;
        }

        public int getContig_id() {
            return contig_id;
        }

        public void setContig_id(int contig_id) {
            this.contig_id = contig_id;
        }

        public boolean isIs_canonical_junction() {
            return is_canonical_junction;
        }

        public void setIs_canonical_junction(boolean is_canonical_junction) {
            this.is_canonical_junction = is_canonical_junction;
        }

        public boolean isIs_semi_canonical_junction() {
            return is_semi_canonical_junction;
        }

        public void setIs_semi_canonical_junction(boolean is_semi_canonical_junction) {
            this.is_semi_canonical_junction = is_semi_canonical_junction;
        }

        public boolean isIs_non_canonical_junction() {
            return is_non_canonical_junction;
        }

        public void setIs_non_canonical_junction(boolean is_non_canonical_junction) {
            this.is_non_canonical_junction = is_non_canonical_junction;
        }

        public int getIntron_length() {
            return intron_length;
        }

        public void setIntron_length(int intron_length) {
            this.intron_length = intron_length;
        }

        public int getLeft_length() {
            return left_length;
        }

        public void setLeft_length(int left_length) {
            this.left_length = left_length;
        }

        public int getRight_length() {
            return right_length;
        }

        public void setRight_length(int right_length) {
            this.right_length = right_length;
        }

        public int getPart_no() {
            return part_no;
        }

        public void setPart_no(int part_no) {
            this.part_no = part_no;
        }

        public int getLeft_mismatch() {
            return left_mismatch;
        }

        public void setLeft_mismatch(int left_mismatch) {
            this.left_mismatch = left_mismatch;
        }

        public int getRight_mismatch() {
            return right_mismatch;
        }

        public void setRight_mismatch(int right_mismatch) {
            this.right_mismatch = right_mismatch;
        }

        public int getLeft_boundary() {
            return left_boundary;
        }

        public void setLeft_boundary(int left_boundary) {
            this.left_boundary = left_boundary;
        }

        public int getRight_boundary() {
            return right_boundary;
        }

        public void setRight_boundary(int right_boundary) {
            this.right_boundary = right_boundary;
        }
    }
}