Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
init
  • Loading branch information
mux13001 committed Apr 24, 2018
0 parents commit ee7a68b
Show file tree
Hide file tree
Showing 31 changed files with 1,237 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .idea/GenomeAssembly.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/libraries/R_User_Library.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 55 additions & 0 deletions MaSuRCA/Bacteria/config
@@ -0,0 +1,55 @@
# example configuration file

# DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields:
# 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads
# 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be
# innies, i.e. --->.<---, and JUMP are assumed to be outties
# <---.--->. If there are any jump libraries that are innies, such as
# longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads
# are optional for PE libraries and mandatory for JUMP libraries. Any
# OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
# converted into Celera Assembler compatible .frg files (see
# http://wgs-assembler.sourceforge.net)
DATA
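##Format:  PE= <prefix> <mean> <stdev> /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq
##Example: PE= pe 525 60 /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq
##Note: the MaSuRCA documentation describes <mean>/<stdev> as library insert-size statistics (not avg_read_length/std_dev of the reads) -- confirm the values used below.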
PE= pe 221 1952 /home/CAM/mxu/tutorial/Bacteria/Sample_1.fastq /home/CAM/mxu/tutorial/Bacteria/Sample_2.fastq
PE= se 176 4194 /home/CAM/mxu/tutorial/Bacteria/Sample_s.fastq
#JUMP= sh 3600 200 /FULL_PATH/short_1.fastq /FULL_PATH/short_2.fastq
#pacbio reads must be in a single fasta file! make sure you provide absolute path
#PACBIO=/FULL_PATH/pacbio.fa
#OTHER= /home/CAM/mxu/MaSuRCA/Sample_1.frg /home/CAM/mxu/MaSuRCA/Sample_2.frg
END

PARAMETERS
#set this to 1 if your Illumina jumping library reads are shorter than 100bp
EXTEND_JUMP_READS=0
#k-mer size for the de Bruijn graph; values between 25 and 127 are supported. "auto" computes the optimal size based on the read data and GC content
GRAPH_KMER_SIZE = auto
#set this to 1 for all Illumina-only assemblies
#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs
#otherwise keep at 0
USE_LINKING_MATES = 0
#specifies whether to run mega-reads correction on the grid
USE_GRID=0
#specifies queue to use when running on the grid MANDATORY
GRID_QUEUE=all.q
#batch size in the amount of long read sequence for each batch on the grid
GRID_BATCH_SIZE=300000000
#coverage by the longest Long reads to use
LHE_COVERAGE=30
#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms
LIMIT_JUMP_COVERAGE = 300
#these are the additional parameters to Celera Assembler. Do not worry about performance, number of processors or batch sizes -- these are computed automatically.
#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
CA_PARAMETERS = cgwErrorRate=0.15
#minimum count k-mers used in error correction 1 means all k-mers are used. one can increase to 2 if Illumina coverage >100
KMER_COUNT_THRESHOLD = 1
#whether to attempt to close gaps in scaffolds with Illumina data
CLOSE_GAPS=1
#auto-detected number of cpus to use
NUM_THREADS = 16
#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage
JF_SIZE = 200000000
#set this to 1 to use SOAPdenovo contigging/scaffolding module. Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes from Illumina-only data
SOAP_ASSEMBLY=0
END
16 changes: 16 additions & 0 deletions MaSuRCA/Bacteria/ma_assemble.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# Slurm batch job: run a MaSuRCA assembly from the directory holding `config`.
#
#   1. `masurca config` turns ./config into ./assemble.sh.
#   2. `bash assemble.sh` runs the generated assembly pipeline.
#
# Requests 16 CPUs (-c 16), the same figure as NUM_THREADS in ./config.
#SBATCH --job-name=masurca
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH --partition=himem1
#SBATCH --mail-type=END
#SBATCH --mem=128G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o masurca_%j.out
#SBATCH -e masurca_%j.err

module load MaSuRCA/3.2.4

# Stop here if script generation fails; otherwise a stale assemble.sh left
# over from an earlier run (or no script at all) would be executed below.
if ! masurca config; then
  echo "masurca could not generate assemble.sh from ./config" >&2
  exit 1
fi

bash assemble.sh
16 changes: 16 additions & 0 deletions MaSuRCA/Bacteria/sample_seq_stats.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# Slurm batch job: summarise read lengths of the Bacteria paired-end FASTQ
# files and write the one-line summary to Sample_stats.txt.
#
# NOTE(review): these are READ-LENGTH statistics. The MaSuRCA documentation
# describes the mean/stdev fields of a PE= line as library insert-size
# statistics -- confirm before copying these numbers into a config file.
#SBATCH --job-name=seqStats
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 4
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=8G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o stats_%j.out
#SBATCH -e stats_%j.err

#######################################
# Print read count, mean read length and read-length standard deviation for
# one or more uncompressed FASTQ files (4 lines per record, sequence on the
# 2nd line of each record).
# Arguments: FASTQ file paths (stdin is read when none are given)
# Outputs:   "total N avg=MEAN stddev=SD" on stdout
# Returns:   non-zero when no reads were found or awk could not read a file
#######################################
fastq_length_stats() {
  awk '
    # FNR restarts for every file, so record framing stays correct even if
    # an earlier file does not contain an exact multiple of 4 lines.
    FNR % 4 == 2 {
      n++
      len = length($0)
      total += len
      sumsq += len * len
    }
    END {
      if (n == 0) {
        print "fastq_length_stats: no reads found" > "/dev/stderr"
        exit 1
      }
      mean = total / n
      variance = sumsq / n - mean * mean
      if (variance < 0) variance = 0   # floating-point round-off guard
      # The value labelled stddev is the square root of the variance.
      printf("total %d avg=%f stddev=%f\n", n, mean, sqrt(variance))
    }' "$@"
}

fastq_length_stats ../../dataset/Bacteria/Sample_[12].fastq > Sample_stats.txt


56 changes: 56 additions & 0 deletions MaSuRCA/Butterfly/config
@@ -0,0 +1,56 @@
# example configuration file

# DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields:
# 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads
# 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be
# innies, i.e. --->.<---, and JUMP are assumed to be outties
# <---.--->. If there are any jump libraries that are innies, such as
# longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads
# are optional for PE libraries and mandatory for JUMP libraries. Any
# OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
# converted into Celera Assembler compatible .frg files (see
# http://wgs-assembler.sourceforge.net)
DATA
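##Format:  PE= <prefix> <mean> <stdev> /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq
##Example: PE= pe 525 60 /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq
##Note: the MaSuRCA documentation describes <mean>/<stdev> as library insert-size statistics (not avg_read_length/std_dev of the reads) -- confirm the values used below.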
PE= p1 147 160 /home/CAM/mxu/tutorial/p3/dataset/DRR021673_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021673_2.fastq
PE= p2 145 251 /home/CAM/mxu/tutorial/p3/dataset/DRR021674_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021674_2.fastq
JUMP= m1 124 1539 /home/CAM/mxu/tutorial/p3/dataset/DRR021675_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021675_2.fastq
JUMP= m2 125 1493 /home/CAM/mxu/tutorial/p3/dataset/DRR021677_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021677_2.fastq
#pacbio reads must be in a single fasta file! make sure you provide absolute path
#PACBIO=/FULL_PATH/pacbio.fa
#OTHER=/FULL_PATH/file.frg
END

PARAMETERS
#set this to 1 if your Illumina jumping library reads are shorter than 100bp
#EXTEND_JUMP_READS=0
#k-mer size for the de Bruijn graph; values between 25 and 127 are supported. "auto" computes the optimal size based on the read data and GC content
GRAPH_KMER_SIZE = auto
#set this to 1 for all Illumina-only assemblies
#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs
#otherwise keep at 0
USE_LINKING_MATES = 0
#specifies whether to run mega-reads correction on the grid
USE_GRID=0
#specifies queue to use when running on the grid MANDATORY
GRID_QUEUE=all.q
#batch size in the amount of long read sequence for each batch on the grid
GRID_BATCH_SIZE=300000000
#coverage by the longest Long reads to use
##LHE_COVERAGE=30
#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms
LIMIT_JUMP_COVERAGE = 300
#these are the additional parameters to Celera Assembler. Do not worry about performance, number of processors or batch sizes -- these are computed automatically.
#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
CA_PARAMETERS = cgwErrorRate=0.15
#minimum count k-mers used in error correction 1 means all k-mers are used. one can increase to 2 if Illumina coverage >100
KMER_COUNT_THRESHOLD = 1
#whether to attempt to close gaps in scaffolds with Illumina data
CLOSE_GAPS=1
#auto-detected number of cpus to use
NUM_THREADS = 16
#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage
JF_SIZE = 200000000
#set this to 1 to use SOAPdenovo contigging/scaffolding module. Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes from Illumina-only data
SOAP_ASSEMBLY=0
END
16 changes: 16 additions & 0 deletions MaSuRCA/Butterfly/ma_assemble.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# Slurm batch job: run a MaSuRCA assembly from the directory holding `config`.
#
#   1. `masurca config` turns ./config into ./assemble.sh.
#   2. `bash assemble.sh` runs the generated assembly pipeline.
#
# Requests 16 CPUs (-c 16), the same figure as NUM_THREADS in ./config.
#SBATCH --job-name=masurca
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH --partition=himem1
#SBATCH --mail-type=END
#SBATCH --mem=128G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o masurca_%j.out
#SBATCH -e masurca_%j.err

module load MaSuRCA/3.2.4

# Stop here if script generation fails; otherwise a stale assemble.sh left
# over from an earlier run (or no script at all) would be executed below.
if ! masurca config; then
  echo "masurca could not generate assemble.sh from ./config" >&2
  exit 1
fi

bash assemble.sh
20 changes: 20 additions & 0 deletions MaSuRCA/Butterfly/xut_seq_stats.sh
@@ -0,0 +1,20 @@
#!/bin/bash
# Slurm batch job: summarise read lengths for each butterfly sequencing run
# and write one <accession>_stats.txt file per run.
#
# NOTE(review): these are READ-LENGTH statistics. The MaSuRCA documentation
# describes the mean/stdev fields of PE=/JUMP= lines as library insert-size
# statistics -- confirm before copying these numbers into a config file.
# NOTE(review): this script reads ../../dataset/butterfly/ (lower-case b)
# while Platanus/Butterfly/platanus.sh reads ../../dataset/Butterfly/ --
# confirm which spelling exists on disk.
#SBATCH --job-name=seqStats
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 4
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=8G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o stats_%j.out
#SBATCH -e stats_%j.err

#######################################
# Print read count, mean read length and read-length standard deviation for
# one or more uncompressed FASTQ files (4 lines per record, sequence on the
# 2nd line of each record).
# Arguments: FASTQ file paths (stdin is read when none are given)
# Outputs:   "total N avg=MEAN stddev=SD" on stdout
# Returns:   non-zero when no reads were found or awk could not read a file
#######################################
fastq_length_stats() {
  awk '
    # FNR restarts for every file, so record framing stays correct even if
    # an earlier file does not contain an exact multiple of 4 lines.
    FNR % 4 == 2 {
      n++
      len = length($0)
      total += len
      sumsq += len * len
    }
    END {
      if (n == 0) {
        print "fastq_length_stats: no reads found" > "/dev/stderr"
        exit 1
      }
      mean = total / n
      variance = sumsq / n - mean * mean
      if (variance < 0) variance = 0   # floating-point round-off guard
      # The value labelled stddev is the square root of the variance.
      printf("total %d avg=%f stddev=%f\n", n, mean, sqrt(variance))
    }' "$@"
}

# Same accessions, in the same order, as the original four awk invocations.
for accession in DRR021675 DRR021677 DRR021673 DRR021674; do
  fastq_length_stats ../../dataset/butterfly/"${accession}"*.fastq > "${accession}_stats.txt"
done
15 changes: 15 additions & 0 deletions Platanus/Bacteria/gap_close.sh
@@ -0,0 +1,15 @@
#!/bin/bash
# Slurm batch job: run `platanus gap_close` on an existing scaffold file
# using one paired-end read library (-IP1), writing results with the
# output prefix "Pxut".
#
# Fixes relative to the previous version: the job name is spelled
# "platanus", and the stdout log uses the ".out" extension like the other
# job scripts in this repository (it was "gap_%j.Pxut").
#SBATCH --job-name=platanus_GapClose
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=65G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o gap_%j.out
#SBATCH -e gap_%j.err

module load platanus/1.2.4

# Scaffold file and reads are all taken from this one directory.
# NOTE(review): the sibling scripts read ../../dataset/Bacteria/ instead --
# confirm that these absolute paths are still the intended inputs.
work_dir=/home/CAM/mxu/tutorial/p3

# platanus's stderr is captured in gap_close.log (not in gap_%j.err).
platanus gap_close -o Pxut -c "${work_dir}/Pxut_scaffold.fa" \
  -IP1 "${work_dir}/Sample_1.fastq" "${work_dir}/Sample_2.fastq" \
  -t 16 2> gap_close.log
19 changes: 19 additions & 0 deletions Platanus/Bacteria/platanus.sh
@@ -0,0 +1,19 @@
#!/bin/bash
# Slurm batch job: full Platanus pipeline for the Bacteria sample --
# assemble, then scaffold, then gap_close -- all with output prefix "sample".
#
# Each stage reads a file named after the previous stage's prefix
# (sample_contig.fa, sample_scaffold.fa), so the job now stops at the first
# failing stage instead of running later stages on missing or stale input.
#SBATCH --job-name=platanus_assemble
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=130G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o assemble_%j.out
#SBATCH -e assemble_%j.err

module load platanus/1.2.4

reads_dir=../../dataset/Bacteria
prefix=sample
threads=16   # matches "#SBATCH -c 16"

# Stage 1: contig assembly from Sample_1, Sample_2 and Sample_s reads.
# -m 128 stays below the 130G requested from Slurm above.
platanus assemble -o "${prefix}" -f "${reads_dir}"/Sample_[12s].fastq \
  -t "${threads}" -m 128 || exit

# Stage 2: scaffolding with the paired-end library.
platanus scaffold -o "${prefix}" -c "${prefix}_contig.fa" -b "${prefix}_contigBubble.fa" \
  -IP1 "${reads_dir}/Sample_1.fastq" "${reads_dir}/Sample_2.fastq" \
  -t "${threads}" || exit

# Stage 3: gap closing on the scaffolds; its exit status is the job's status.
platanus gap_close -o "${prefix}" -c "${prefix}_scaffold.fa" \
  -IP1 "${reads_dir}/Sample_1.fastq" "${reads_dir}/Sample_2.fastq" \
  -t "${threads}"
15 changes: 15 additions & 0 deletions Platanus/Bacteria/scaffold.sh
@@ -0,0 +1,15 @@
#!/bin/bash
# Slurm batch job: run `platanus scaffold` on existing contig and bubble
# files using one paired-end read library (-IP1), with output prefix "Pxut".
#
# Fixes relative to the previous version: the job name is spelled
# "platanus", and the stdout log uses the ".out" extension like the other
# job scripts in this repository (it was "Scaffold_%j.Pxut").
#
# NOTE(review): Platanus/Bacteria/platanus.sh writes its contigs with the
# prefix "sample", not "Pxut" -- confirm that Pxut_contig.fa and
# Pxut_contigBubble.fa exist in this directory before submitting.
#SBATCH --job-name=platanus_scaffold
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=65G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o Scaffold_%j.out
#SBATCH -e Scaffold_%j.err

module load platanus/1.2.4

reads_dir=../../dataset/Bacteria

platanus scaffold -o Pxut -c Pxut_contig.fa -b Pxut_contigBubble.fa \
  -IP1 "${reads_dir}/Sample_1.fastq" "${reads_dir}/Sample_2.fastq" \
  -t 16
15 changes: 15 additions & 0 deletions Platanus/Bacteria/trim.sh
@@ -0,0 +1,15 @@
#!/bin/bash
# Batch job that runs Platanus_trim on one pair of bzip2-compressed FASTQ
# files. Unlike the other job scripts in this repository, the scheduler
# directives here use the "#$" prefix rather than "#SBATCH".
#
# NOTE(review): the other scripts load the module as "platanus/1.2.4"
# (lower case) -- confirm the module and binary spelling on the target host.
#$ -N Platanus-trim
#$ -M muyang.xu@uconn.edu
#$ -q highmem.q
#$ -m ea
#$ -S /bin/bash
#$ -cwd
#$ -pe smp 4
#$ -o trim_$JOB_ID.out
#$ -e trim_$JOB_ID.err

module load Platanus/1.2.4

# Both mate files of the run share one directory and one accession prefix.
dataset_dir=/home/CAM/mxu/tutorial/MaSuRCA_illumina/dataset
accession=DRR021673

Platanus_trim "${dataset_dir}/${accession}_1.fastq.bz2" "${dataset_dir}/${accession}_2.fastq.bz2"

Binary file added Platanus/Butterfly/.platanus.sh.swp
Binary file not shown.
20 changes: 20 additions & 0 deletions Platanus/Butterfly/platanus.sh
@@ -0,0 +1,20 @@
#!/bin/bash
# Slurm batch job: Platanus assemble followed by scaffold for the butterfly
# data set, with output prefix "Pxut". Two paired-end libraries are used:
# DRR021673 (-IP1) and DRR021674 (-IP2).
#
# The scaffold stage reads Pxut_contig.fa / Pxut_contigBubble.fa, so the job
# now stops if the assemble stage fails instead of scaffolding missing or
# stale files.
#
# NOTE(review): assemble reads from an absolute path while scaffold reads
# from ../../dataset/Butterfly/ -- confirm both locations hold the same files.
#SBATCH --job-name=platanus_assemble
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH --partition=himem3
#SBATCH --mail-type=END
#SBATCH --mem=256G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o assemble_%j.out
#SBATCH -e assemble_%j.err

module load platanus/1.2.4

prefix=Pxut
threads=16   # matches "#SBATCH -c 16"
assemble_reads_dir=/home/CAM/mxu/tutorial/p3/dataset
scaffold_reads_dir=../../dataset/Butterfly

# Stage 1: contig assembly from both mates of DRR021673 and DRR021674.
platanus assemble -o "${prefix}" \
  -f "${assemble_reads_dir}"/DRR02167[34]_[12].fastq \
  -t "${threads}" -m 128 || exit

# Stage 2: scaffolding; its exit status is the job's status.
platanus scaffold -o "${prefix}" -c "${prefix}_contig.fa" -b "${prefix}_contigBubble.fa" \
  -IP1 "${scaffold_reads_dir}/DRR021673_1.fastq" "${scaffold_reads_dir}/DRR021673_2.fastq" \
  -IP2 "${scaffold_reads_dir}/DRR021674_1.fastq" "${scaffold_reads_dir}/DRR021674_2.fastq" \
  -t "${threads}"



22 changes: 22 additions & 0 deletions Quast/Bacteria/quast.sh
@@ -0,0 +1,22 @@
#!/bin/bash
# Slurm batch job: run quast.py once per Bacteria assembly (SOAPdenovo,
# SPAdes, MaSuRCA, Platanus), each into its own report directory.
#
# NOTE(review): the Platanus input is Pxut_scaffold.fa, whereas
# Platanus/Bacteria/platanus.sh writes files with the prefix "sample" --
# confirm which scaffold file should be evaluated.
#SBATCH --job-name=quast
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=64G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o quast_%j.out
#SBATCH -e quast_%j.err

module load quast/4.6

# assemblies[i] is evaluated into report_dirs[i]; the runs happen in order.
assemblies=(
  ../../SOAPdenovo/Bacteria/graph_Sample_31.scafSeq
  ../../SPAdes/Bacteria/scaffolds.fasta
  ../../MaSuRCA/Bacteria/CA/scaffolds.ref.fa
  ../../Platanus/Bacteria/Pxut_scaffold.fa
)
report_dirs=(
  SOAP/
  SPAdes/
  MaSuRCA/
  Platanus/
)

for idx in "${!assemblies[@]}"; do
  quast.py -t 8 "${assemblies[idx]}" -o "${report_dirs[idx]}"
done

23 changes: 23 additions & 0 deletions Quast/Butterfly/quast.sh
@@ -0,0 +1,23 @@
#!/bin/bash
# Slurm batch job: run quast.py once per Butterfly assembly (SOAPdenovo,
# SPAdes, MaSuRCA, Platanus), each into its own report directory.
#SBATCH --job-name=quast
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH --partition=general
#SBATCH --mail-type=END
#SBATCH --mem=64G
#SBATCH --mail-user=muyang.xu@uconn.edu
#SBATCH -o quast_%j.out
#SBATCH -e quast_%j.err

module load quast/4.6

# Evaluate one assembly file ($1) into one report directory ($2) using
# 8 threads, matching "#SBATCH -c 8" above.
evaluate_assembly() {
  quast.py -t 8 "$1" -o "$2"
}

evaluate_assembly ../../SOAPdenovo/Butterfly/graph_xuthus_31.scafSeq SOAPdenovo/
evaluate_assembly ../../SPAdes/Butterfly/scaffolds.fasta SPAdes/
evaluate_assembly ../../MaSuRCA/Butterfly/CA/scaffolds.ref.fa MaSuRCA/
evaluate_assembly ../../Platanus/Butterfly/Pxut_scaffold.fa Platanus/

0 comments on commit ee7a68b

Please sign in to comment.