diff --git a/.idea/GenomeAssembly.iml b/.idea/GenomeAssembly.iml new file mode 100644 index 0000000..3a4807d --- /dev/null +++ b/.idea/GenomeAssembly.iml @@ -0,0 +1,13 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml new file mode 100644 index 0000000..71f5ff7 --- /dev/null +++ b/.idea/libraries/R_User_Library.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..4a3ffc8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..e09cbfd --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/MaSuRCA/Bacteria/config b/MaSuRCA/Bacteria/config new file mode 100755 index 0000000..6659c15 --- /dev/null +++ b/MaSuRCA/Bacteria/config @@ -0,0 +1,55 @@ +# example configuration file + +# DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields: +# 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads +# 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be +# innies, i.e. --->.<---, and JUMP are assumed to be outties +# <---.--->. If there are any jump libraries that are innies, such as +# longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads +# are optional for PE libraries and mandatory for JUMP libraries. Any +# OTHER sequence data (454, Sanger, Ion torrent, etc) must be first +# converted into Celera Assembler compatible .frg files (see +# http://wgs-assembler.sourceforge.com) +DATA +##PE= pe 525 60 avg_read_length std_dev /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq +PE= pe 221 1952 /home/CAM/mxu/tutorial/Bacteria/Sample_1.fastq /home/CAM/mxu/tutorial/Bacteria/Sample_2.fastq +PE= se 176 4194 /home/CAM/mxu/tutorial/Bacteria/Sample_s.fastq +#JUMP= sh 3600 200 /FULL_PATH/short_1.fastq /FULL_PATH/short_2.fastq +#pacbio reads must be in a single fasta file! make sure you provide absolute path +#PACBIO=/FULL_PATH/pacbio.fa +#OTHER= /home/CAM/mxu/MaSuRCA/Sample_1.frg /home/CAM/mxu/MaSuRCA/Sample_2.frg +END + +PARAMETERS +#set this to 1 if your Illumina jumping library reads are shorter than 100bp +EXTEND_JUMP_READS=0 +#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content +GRAPH_KMER_SIZE = auto +#set this to 1 for all Illumina-only assemblies +#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs +#otherwise keep at 0 +USE_LINKING_MATES = 0 +#specifies whether to run mega-reads correction on the grid +USE_GRID=0 +#specifies queue to use when running on the grid MANDATORY +GRID_QUEUE=all.q +#batch size in the amount of long read sequence for each batch on the grid +GRID_BATCH_SIZE=300000000 +#coverage by the longest Long reads to use +LHE_COVERAGE=30 +#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms +LIMIT_JUMP_COVERAGE = 300 +#these are the additional parameters to Celera Assembler. do not worry about performance, number or processors or batch sizes -- these are computed automatically. +#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms. +CA_PARAMETERS = cgwErrorRate=0.15 +#minimum count k-mers used in error correction 1 means all k-mers are used. one can increase to 2 if Illumina coverage >100 +KMER_COUNT_THRESHOLD = 1 +#whether to attempt to close gaps in scaffolds with Illumina data +CLOSE_GAPS=1 +#auto-detected number of cpus to use +NUM_THREADS = 16 +#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage +JF_SIZE = 200000000 +#set this to 1 to use SOAPdenovo contigging/scaffolding module. Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes from Illumina-only data +SOAP_ASSEMBLY=0 +END diff --git a/MaSuRCA/Bacteria/ma_assemble.sh b/MaSuRCA/Bacteria/ma_assemble.sh new file mode 100755 index 0000000..345edcb --- /dev/null +++ b/MaSuRCA/Bacteria/ma_assemble.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=masurca +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=himem1 +#SBATCH --mail-type=END +#SBATCH --mem=128G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o masurca_%j.out +#SBATCH -e masurca_%j.err + +module load MaSuRCA/3.2.4 + +masurca config +bash assemble.sh diff --git a/MaSuRCA/Bacteria/sample_seq_stats.sh b/MaSuRCA/Bacteria/sample_seq_stats.sh new file mode 100755 index 0000000..c28cdef --- /dev/null +++ b/MaSuRCA/Bacteria/sample_seq_stats.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=seqStats +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 4 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=8G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o stats_%j.out +#SBATCH -e stats_%j.err + + +awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' ../../dataset/Bacteria/Sample_[12].fastq > Sample_stats.txt + + diff --git a/MaSuRCA/Butterfly/config b/MaSuRCA/Butterfly/config new file mode 100755 index 0000000..56eaa2b --- /dev/null +++ b/MaSuRCA/Butterfly/config @@ -0,0 +1,56 @@ +# example configuration file + +# DATA is specified as type {PE,JUMP,OTHER,PACBIO} and 5 fields: +# 1)two_letter_prefix 2)mean 3)stdev 4)fastq(.gz)_fwd_reads +# 5)fastq(.gz)_rev_reads. The PE reads are always assumed to be +# innies, i.e. --->.<---, and JUMP are assumed to be outties +# <---.--->. If there are any jump libraries that are innies, such as +# longjump, specify them as JUMP and specify NEGATIVE mean. Reverse reads +# are optional for PE libraries and mandatory for JUMP libraries. Any +# OTHER sequence data (454, Sanger, Ion torrent, etc) must be first +# converted into Celera Assembler compatible .frg files (see +# http://wgs-assembler.sourceforge.com) +DATA +##PE= pe 525 60 avg_read_length std_dev /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq +PE= p1 147 160 /home/CAM/mxu/tutorial/p3/dataset/DRR021673_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021673_2.fastq +PE= p2 145 251 /home/CAM/mxu/tutorial/p3/dataset/DRR021674_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021674_2.fastq +JUMP= m1 124 1539 /home/CAM/mxu/tutorial/p3/dataset/DRR021675_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021675_2.fastq +JUMP= m2 125 1493 /home/CAM/mxu/tutorial/p3/dataset/DRR021677_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021677_2.fastq +#pacbio reads must be in a single fasta file! make sure you provide absolute path +#PACBIO=/FULL_PATH/pacbio.fa +#OTHER=/FULL_PATH/file.frg +END + +PARAMETERS +#set this to 1 if your Illumina jumping library reads are shorter than 100bp +#EXTEND_JUMP_READS=0 +#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content +GRAPH_KMER_SIZE = auto +#set this to 1 for all Illumina-only assemblies +#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs +#otherwise keep at 0 +USE_LINKING_MATES = 0 +#specifies whether to run mega-reads correction on the grid +USE_GRID=0 +#specifies queue to use when running on the grid MANDATORY +GRID_QUEUE=all.q +#batch size in the amount of long read sequence for each batch on the grid +GRID_BATCH_SIZE=300000000 +#coverage by the longest Long reads to use +##LHE_COVERAGE=30 +#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms +LIMIT_JUMP_COVERAGE = 300 +#these are the additional parameters to Celera Assembler. do not worry about performance, number or processors or batch sizes -- these are computed automatically. +#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms. +CA_PARAMETERS = cgwErrorRate=0.15 +#minimum count k-mers used in error correction 1 means all k-mers are used. one can increase to 2 if Illumina coverage >100 +KMER_COUNT_THRESHOLD = 1 +#whether to attempt to close gaps in scaffolds with Illumina data +CLOSE_GAPS=1 +#auto-detected number of cpus to use +NUM_THREADS = 16 +#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage +JF_SIZE = 200000000 +#set this to 1 to use SOAPdenovo contigging/scaffolding module. Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes from Illumina-only data +SOAP_ASSEMBLY=0 +END diff --git a/MaSuRCA/Butterfly/ma_assemble.sh b/MaSuRCA/Butterfly/ma_assemble.sh new file mode 100755 index 0000000..345edcb --- /dev/null +++ b/MaSuRCA/Butterfly/ma_assemble.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=masurca +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=himem1 +#SBATCH --mail-type=END +#SBATCH --mem=128G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o masurca_%j.out +#SBATCH -e masurca_%j.err + +module load MaSuRCA/3.2.4 + +masurca config +bash assemble.sh diff --git a/MaSuRCA/Butterfly/xut_seq_stats.sh b/MaSuRCA/Butterfly/xut_seq_stats.sh new file mode 100755 index 0000000..3ef3b3c --- /dev/null +++ b/MaSuRCA/Butterfly/xut_seq_stats.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --job-name=seqStats +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 4 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=8G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o stats_%j.out +#SBATCH -e stats_%j.err + + +awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' ../../dataset/butterfly/DRR021675*.fastq > DRR021675_stats.txt + +awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' ../../dataset/butterfly/DRR021677*.fastq > DRR021677_stats.txt + +awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' ../../dataset/butterfly/DRR021673*.fastq > DRR021673_stats.txt + +awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' ../../dataset/butterfly/DRR021674*.fastq > DRR021674_stats.txt diff --git a/Platanus/Bacteria/gap_close.sh b/Platanus/Bacteria/gap_close.sh new file mode 100755 index 0000000..f655591 --- /dev/null +++ b/Platanus/Bacteria/gap_close.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=platnus_GapClose +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=65G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o gap_%j.Pxut +#SBATCH -e gap_%j.err + +module load platanus/1.2.4 + +platanus gap_close -o Pxut -c /home/CAM/mxu/tutorial/p3/Pxut_scaffold.fa -IP1 /home/CAM/mxu/tutorial/p3/Sample_1.fastq /home/CAM/mxu/tutorial/p3/Sample_2.fastq -t 16 2> gap_close.log diff --git a/Platanus/Bacteria/platanus.sh b/Platanus/Bacteria/platanus.sh new file mode 100755 index 0000000..7d63cb6 --- /dev/null +++ b/Platanus/Bacteria/platanus.sh @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name=platnus_assemble +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=130G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o assemble_%j.out +#SBATCH -e assemble_%j.err + +module load platanus/1.2.4 + +platanus assemble -o sample -f ../../dataset/Bacteria/Sample_[12s].fastq -t 16 -m 128 + +platanus scaffold -o sample -c sample_contig.fa -b sample_contigBubble.fa -IP1 ../../dataset/Bacteria/Sample_1.fastq ../../dataset/Bacteria/Sample_2.fastq -t 16 + +platanus gap_close -o sample -c sample_scaffold.fa -IP1 ../../dataset/Bacteria/Sample_1.fastq ../../dataset/Bacteria/Sample_2.fastq -t 16 diff --git a/Platanus/Bacteria/scaffold.sh b/Platanus/Bacteria/scaffold.sh new file mode 100755 index 0000000..30f247f --- /dev/null +++ b/Platanus/Bacteria/scaffold.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=platnus_scaffold +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=65G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o Scaffold_%j.Pxut +#SBATCH -e Scaffold_%j.err + +module load platanus/1.2.4 + +platanus scaffold -o Pxut -c Pxut_contig.fa -b Pxut_contigBubble.fa -IP1 ../../dataset/Bacteria/Sample_1.fastq ../../dataset/Bacteria/Sample_2.fastq -t 16 diff --git a/Platanus/Bacteria/trim.sh b/Platanus/Bacteria/trim.sh new file mode 100755 index 0000000..7ecd0a4 --- /dev/null +++ b/Platanus/Bacteria/trim.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#$ -N Platanus-trim +#$ -M muyang.xu@uconn.edu +#$ -q highmem.q +#$ -m ea +#$ -S /bin/bash +#$ -cwd +#$ -pe smp 4 +#$ -o trim_$JOB_ID.out +#$ -e trim_$JOB_ID.err + +module load Platanus/1.2.4 + +Platanus_trim /home/CAM/mxu/tutorial/MaSuRCA_illumina/dataset/DRR021673_1.fastq.bz2 /home/CAM/mxu/tutorial/MaSuRCA_illumina/dataset/DRR021673_2.fastq.bz2 + diff --git a/Platanus/Butterfly/.platanus.sh.swp b/Platanus/Butterfly/.platanus.sh.swp new file mode 100644 index 0000000..dbacd74 Binary files /dev/null and b/Platanus/Butterfly/.platanus.sh.swp differ diff --git a/Platanus/Butterfly/platanus.sh b/Platanus/Butterfly/platanus.sh new file mode 100755 index 0000000..10e01c1 --- /dev/null +++ b/Platanus/Butterfly/platanus.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --job-name=platnus_assemble +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=himem3 +#SBATCH --mail-type=END +#SBATCH --mem=256G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o assemble_%j.out +#SBATCH -e assemble_%j.err + +module load platanus/1.2.4 + +platanus assemble -o Pxut -f /home/CAM/mxu/tutorial/p3/dataset/DRR02167[34]_[12].fastq -t 16 -m 128 + +platanus scaffold -o Pxut -c Pxut_contig.fa -b Pxut_contigBubble.fa -IP1 ../../dataset/Butterfly/DRR021673_1.fastq ../../dataset/Butterfly/DRR021673_2.fastq -IP2 ../../dataset/Butterfly/DRR021674_1.fastq ../../dataset/Butterfly/DRR021674_2.fastq -t 16 + + + diff --git a/Quast/Bacteria/quast.sh b/Quast/Bacteria/quast.sh new file mode 100755 index 0000000..761c39c --- /dev/null +++ b/Quast/Bacteria/quast.sh @@ -0,0 +1,22 @@ +#!/bin/bash +#SBATCH --job-name=quast +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 8 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=64G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o quast_%j.out +#SBATCH -e quast_%j.err + +module load quast/4.6 + +quast.py -t 8 ../../SOAPdenovo/Bacteria/graph_Sample_31.scafSeq -o SOAP/ + +quast.py -t 8 ../../SPAdes/Bacteria/scaffolds.fasta -o SPAdes/ + +quast.py -t 8 ../../MaSuRCA/Bacteria/CA/scaffolds.ref.fa -o MaSuRCA/ + +quast.py -t 8 ../../Platanus/Bacteria/Pxut_scaffold.fa -o Platanus/ + diff --git a/Quast/Butterfly/quast.sh b/Quast/Butterfly/quast.sh new file mode 100755 index 0000000..2cda1f9 --- /dev/null +++ b/Quast/Butterfly/quast.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH --job-name=quast +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 8 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=64G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o quast_%j.out +#SBATCH -e quast_%j.err + +module load quast/4.6 + + +quast.py -t 8 ../../SOAPdenovo/Butterfly/graph_xuthus_31.scafSeq -o SOAPdenovo/ + +quast.py -t 8 ../../SPAdes/Butterfly/scaffolds.fasta -o SPAdes/ + +quast.py -t 8 ../../MaSuRCA/Butterfly/CA/scaffolds.ref.fa -o MaSuRCA/ + +quast.py -t 8 ../../Platanus/Butterfly/Pxut_scaffold.fa -o Platanus/ + diff --git a/README.md b/README.md new file mode 100644 index 0000000..9f1356a --- /dev/null +++ b/README.md @@ -0,0 +1,681 @@ +# Genome Assembly Tutorial +This repository is a usable, publicly available Genome Assembly tutorial. +All steps have been provided for the UConn CBC Xanadu cluster here with appropriate headers for +the Slurm scheduler that can be modified simply to run.  Commands should never be executed on the submit +nodes of any HPC machine.  If working on the Xanadu cluster, you should use sbatch scriptname after modifying the +script for each stage.  Basic editing of all scripts can be performed on the server with tools, +such as nano, vim, or emacs.  If you are new to Linux, please use +this handy guide for the operating system commands.  +In this guide, you will be working with common genome assemblers, such as +SOAPdenovo, +SPAdes, +MaSuRCA, +Platanus, +and quality assessment tool Quast +If you do not have a Xanadu account and are an affiliate of UConn/UCHC, please apply for one here. + +
+

Contents

+ +
+ +

Data Acquisition

+In this tutorial we will assemble sequences from a Prokaryote sample and Asian swallowtail (Papilio xuthus). +The bacterial sample used in this tutorial is paired-end, meaning that there are forward and reverse reads. which we +will designate as Sample_R1.fastq and Sample_R2.fastq, respectively. + +The butterfly sample is from DDBJ center. It contains 7 libraries in total, 2 pair-end, 5 mate-pair, which are assigned SRA +accession number DRR021673, DRR021674, DRR021675, DRR021676, DRR021677, DRR021678, DRR021679. +Note that all libraires take around 50GB storage in total. Prepare you storage accordingly before proceeding to next +step. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LibraryTypeInsert sizeRead1Read2
Pair-end300DRR021673_1.fastqDRR021673_2.fastq
Pair-end500DRR021674_1.fastqDRR021674_2.fastq
Mate-pair3kbDRR021675_1.fastqDRR021675_2.fastq
DRR021676_1.fastqDRR021676_2.fastq
Mate-pair5kbDRR021677_1.fastqDRR021677_2.fastq
DRR021678_1.fastqDRR021678_2.fastq
Mate-pair8kbDRR021679_1.fastqDRR021679_2.fastq
+ +If you are performing this tutorial on Xanadu. Make sure you are under home directory. + +
cd
+ +before proceeding. Your home directory contains 10TB of storage and will not pollute the capacities of other users on the cluster. + +The workflow may be cloned into the appropriate directory using the terminal command: +
$git clone https://github.com/wolf-adam-eily/refseq_diffexp_funct_annotation_uconn.git
+$cd GenomeAssemblyTutorial 
+$ls  
+ +The bacteria data is located in dataset/Bacteria/. Uncompress the two sequence files with: + +
tar -xJf Sample_R1.tar.xz && tar -xJf Sample_R2.tar.xz
+ +We can obtain butterfly data either through sequence-read-archives (SRA) or by a downloadable link. Here we use the link to download data. Command below downloads +and uncompress part of butterfly data using wget and bzip2. + +
wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019820/DRR021674_1.fastq.bz2 -P dataset/ && bzip2 -d dataset/DRR021674_1.fastq.bz2
+ +The script which downloads all butterfly libraries is dataset/Butterfly/download.sh. + +All data files are in fastq format. For more information about fastq format, see File Formats Tutorial + +

Quality Control with Sickle

+ +The first is to perform quality control on the reads using sickle. Sicle trims low quality reads below a certain threshold + from raw sequencing data. Use command below run the program on bacteria sample: + +
sickle pe -f dataset/bacteria/Sample_R1.fastq -r dataset/bacteria/Sample_R2.fastq -t sanger -o Sample_1.fastq -p Sample_2.fastq -s Sample_s.fastq -q 30 -l 45
+ + The command processes a file containing forward reads and a file containing reverse reads , in addition, it outputs trimmed singles. + + +Don't run this command alone on Xanadu terminal. The slurm script executing this command is sickle/quality_control.sh. + +Since the bufferfly data are already trimmed adaptor sequences, we can proceed without treating them. + + +

SOAPdenovo: de novo sequence assembler

+ +SOAPdenovo is a novel short-read assembly method that can build a de novo draft assembly for the human-sized genomes. The program is specially designed to assemble Illumina GA short reads. + +SOAPdenovo uses a config file to pass information about the sequences into the +program. Notable fields include average insert size and read length, +which differ depending on the sequencing technology, Each library starts with line [LIB] + +Enter command below to load SOAPdenovo on Xanadu: +
module load SOAP-denovo/2.04
+ + +Bacteria +
#maximal read length
+max_rd_len=250
+[LIB]
+#average insert size
+avg_ins=550
+#if sequence needs to be reversed
+reverse_seq=0
+#in which part(s) the reads are used
+asm_flags=3
+#use only first 250 bps of each read
+rd_len_cutoff=250
+#in which order the reads are used while scaffolding
+rank=1
+# cutoff of pair number for a reliable connection (at least 3 for short insert size)
+pair_num_cutoff=3
+#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
+map_len=32
+# path to genes
+q1=../../dataset/Sample_1.fastq
+q2=../../dataset/Sample_2.fastq
+q=../../Sample_s.fastq
+ +Above shows sample config file for the bacterial library, where q1, q2, q designate +the paths to the forward, reverse and singles trimmed reads respectively. The file can be found in SOAPdenovo/Bacteria/config + +To run the assembler we will use the SOAPdenovo-63mer command with the all option ((to perform kmer graph construction, contig error correction, mapping of reads to contigs, and scaffolding). + +
SOAPdenovo-63mer all -s /common/Assembly_Tutorial/Assembly/Sample.config -K 31 -R -o graph_Sample_31 1>ass31.log 2>ass31.err
+ + + + +We repeat this command for kmer size = 35, 41 for later analysis. the slurm script is at SOAPdenovo/Bacteria/assemble.sh + +swallowtail + +For butterfly sample, we will run multiple libraries at once. Therefore, rank for each library. SOAPdenovo will use the pair-end libraries with insert size from smaller to larger to construct scaffolds. +Libraries with the same rank would be used at the same time. It is desired that the pairs in each rank provide adequate physical coverage of the genome. + +
#maximal read length
+max_rd_len=250
+[LIB]
+#average insert size
+avg_ins=300
+#if sequence needs to be reversed
+reverse_seq=0
+#in which part(s) the reads are used
+asm_flags=3
+#use only first 250 bps of each read
+rd_len_cutoff=250
+#in which order the reads are used while scaffolding
+rank=1
+# cutoff of pair number for a reliable connection (at least 3 for short insert size)
+pair_num_cutoff=3
+#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
+map_len=32
+# path to genes
+q1=../../dataset/Butterfly/DRR021673_1.fastq
+q2=../../dataset/Butterfly/DRR021673_2.fastq
+
+[LIB]
+#average insert size
+avg_ins=500
+#if sequence needs to be reversed
+reverse_seq=0
+#in which part(s) the reads are used
+asm_flags=3
+#use only first 250 bps of each read
+rd_len_cutoff=250
+#in which order the reads are used while scaffolding
+rank=2
+# cutoff of pair number for a reliable connection (at least 3 for short insert size)
+pair_num_cutoff=3
+#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
+map_len=32
+# path to genes
+q1=../../dataset/Butterfly/DRR021674_1.fastq
+q2=../../dataset/Butterfly/DRR021674_2.fastq
+
+[LIB]
+#average insert size
+avg_ins=3000
+#if sequence needs to be reversed
+reverse_seq=0
+#in which part(s) the reads are used
+asm_flags=3
+#use only first 250 bps of each read
+rd_len_cutoff=250
+#in which order the reads are used while scaffolding
+rank=3
+# cutoff of pair number for a reliable connection (at least 3 for short insert size)
+pair_num_cutoff=3
+#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
+map_len=32
+# path to genes
+q1=../../dataset/Butterfly/DRR021675_1.fastq
+q2=../../dataset/Butterfly/DRR021675_2.fastq
+
+[LIB]
+#average insert size
+avg_ins=300
+#if sequence needs to be reversed
+reverse_seq=0
+#in which part(s) the reads are used
+asm_flags=3
+#use only first 250 bps of each read
+rd_len_cutoff=250
+#in which order the reads are used while scaffolding
+rank=4
+# cutoff of pair number for a reliable connection (at least 3 for short insert size)
+pair_num_cutoff=3
+#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
+map_len=32
+# path to genes
+q1=../../dataset/Butterfly/DRR021677_1.fastq
+q2=../../dataset/Butterfly/DRR021677_2.fastq
+ +As shown above, four libraries with different insert sizes. The file is at SOAPdenovo/Butterfly/config. Run +command below to start assemble + +
SOAPdenovo-63mer all -s config -K 31-R  -o graph_xuthus_31 1>ass31.log 2>ass31.err
+The slurm script containing the command is at SOAPdenovo/Butterfly/assemble.sh + +

SPAdes: de Bruijn graph based assembler

+ +SPAdes is different from the other assemblers in that it generates a final assembly from multiple kmers. A list of kmers is automatically selected by SPAdes using the maximum read length of the input data, and each individual kmer contributes to the final assembly. To run SPAdes we will use the spades.py command with the --careful option to minimize the number of mismatches in the contigs + +Enter command below to load SPAdes on Xanadu: +
module load SPAdes/3.11.1
+ +Bacteria +
spades.py --careful -o SPAdes_out -1 ../../dataset/BacteriaSample_1.fastq -2 ../../dataset/Bacteria/Sample_2.fastq -s d../../ataset/Bacteria/Sample_s.fastq
+ + + +The script is at SPAdes/Bacteria/assemble.sh. + +swallowtail + +
spades.py --careful -o xuthus_out/ -t 16 -m 250 --pe1-1 ../../dataset/butterfly/DRR021673_1.fastq --pe1-2 ../../dataset/butterfly/DRR021673_2.fastq --pe2-1 ../../dataset/butterfly/DRR021674_1.fastq --pe2-2 ../../dataset/butterfly/DRR021674_2.fastq --mp1-1 ../../dataset/butterfly/DRR021675_1.fastq --mp1-2 ../../dataset/Butterfly/DRR021675_2.fastq
+ + + +The script is at SPAdes/Butterfly/assemble.sh. + + +

MaSuRCA assembler

+ +MaSuRCA is whole genome assembly software. It combines the efficiency of the de Bruijn graph and Overlap-Layout-Consensus (OLC) approaches. +It requires a configuration file to generate a run script. In the configuration file, user are required to specify input files, number +of cores to use, jellyfish hash size and name of SGE queue to use. + +The configuration file consists of 2 main sections, DATA and PARAMETERS. Under DATA, +input library is specified with 5 fields: two-letter prefix, average insert length and standard deviation, path to forward +reads, path to reverse reads (optional). In addition to get average insert length and standard deviation from data source. +we can also use awk to calculate two values from data files. Below is an example of the operation on Bacteria +Sample. + +
awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' Sample_[12].fastq > Sample_stats.txt
+ +The command outputs a text file containing mean and standard deviation of reads. The slurm script for bacteria and butterfly are +stored in MaSuRCA/Bacteria/sample_seq_stats.sh and MaSuRCA/Butterfly/xut_seq_stats.sh respectively. + +Load MaSuRCA module on Xanadu: + +
module load MaSuRCA/3.2.4
+ +Bacteria + +
DATA
+PE= pe 232 1442 /home/CAM/mxu/tutorial/m4/Sample_1.fastq  /home/CAM/mxu/tutorial/m4/Sample_2.fastq 
+PE= se 176 4194 /home/CAM/mxu/tutorial/Bacteria/Sample_s.fastq
+#JUMP= sh 3600 200  /FULL_PATH/short_1.fastq  /FULL_PATH/short_2.fastq
+#pacbio reads must be in a single fasta file! make sure you provide absolute path
+#PACBIO=/FULL_PATH/pacbio.fa
+#OTHER=/FULL_PATH/file.frg
+END
+
+PARAMETERS
+#set this to 1 if your Illumina jumping library reads are shorter than 100bp
+#EXTEND_JUMP_READS=0
+#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content
+GRAPH_KMER_SIZE = auto
+#set this to 1 for all Illumina-only assemblies
+#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs
+#otherwise keep at 0
+USE_LINKING_MATES = 0
+#specifies whether to run mega-reads correction on the grid
+USE_GRID=0
+#specifies queue to use when running on the grid MANDATORY
+GRID_QUEUE=all.q
+#batch size in the amount of long read sequence for each batch on the grid
+GRID_BATCH_SIZE=300000000
+#coverage by the longest Long reads to use
+##LHE_COVERAGE=30
+#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms 
+LIMIT_JUMP_COVERAGE = 60 
+#these are the additional parameters to Celera Assembler.  do not worry about performance, number or processors or batch sizes -- these are computed automatically. 
+#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
+CA_PARAMETERS =  cgwErrorRate=0.15
+#minimum count k-mers used in error correction 1 means all k-mers are used.  one can increase to 2 if Illumina coverage >100
+KMER_COUNT_THRESHOLD = 1
+#whether to attempt to close gaps in scaffolds with Illumina data
+CLOSE_GAPS=1
+#auto-detected number of cpus to use
+NUM_THREADS = 16
+#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage
+JF_SIZE = 200000000
+#set this to 1 to use SOAPdenovo contigging/scaffolding module.  Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes from Illumina-only data
+SOAP_ASSEMBLY=0
+END
+ +The configuration file is shown above. It is also located in MaSuRCA/Bacteria/config. Here under DATA, +flag PE designate both pair-end and single-end reads. + +Next, run MaSuRCA on the config file. + +
masurca config
+ +The command generates a bash script named assemble.sh. Run assemble.sh to start assembly ( +Don't run the script on submit node). + +
bash assemble.sh
+ +The results will be stored in directory CA/ after completion. The slurm script containing commands above is +MaSuRCA/Bacteria/ma_assemble.sh + +swallowtail + +
DATA
+##PE= pe 525 60  avg_read_length std_dev /FULL_PATH/paired_read1.fastq /FULL_PATH/paired_read2.fastq 
+PE= p1 147 160 /home/CAM/mxu/tutorial/p3/dataset/DRR021673_1.fastq  /home/CAM/mxu/tutorial/p3/dataset/DRR021673_2.fastq 
+PE= p2 145 251 /home/CAM/mxu/tutorial/p3/dataset/DRR021674_1.fastq  /home/CAM/mxu/tutorial/p3/dataset/DRR021674_2.fastq 
+JUMP= m1 124 1539 /home/CAM/mxu/tutorial/p3/dataset/DRR021675_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021675_2.fastq 
+JUMP= m2 125 1493 /home/CAM/mxu/tutorial/p3/dataset/DRR021677_1.fastq /home/CAM/mxu/tutorial/p3/dataset/DRR021677_2.fastq 
+#pacbio reads must be in a single fasta file! make sure you provide absolute path
+#PACBIO=/FULL_PATH/pacbio.fa
+#OTHER=/FULL_PATH/file.frg
+END
+
+PARAMETERS
+#set this to 1 if your Illumina jumping library reads are shorter than 100bp
+#EXTEND_JUMP_READS=0
+#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content
+GRAPH_KMER_SIZE = auto
+#set this to 1 for all Illumina-only assemblies
+#set this to 1 if you have less than 20x long reads (454, Sanger, Pacbio) and less than 50x CLONE coverage by Illumina, Sanger or 454 mate pairs
+#otherwise keep at 0
+USE_LINKING_MATES = 0
+#specifies whether to run mega-reads correction on the grid
+USE_GRID=0
+#specifies queue to use when running on the grid MANDATORY
+GRID_QUEUE=all.q
+#batch size in the amount of long read sequence for each batch on the grid
+GRID_BATCH_SIZE=300000000
+#coverage by the longest Long reads to use
+##LHE_COVERAGE=30
+#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms 
+LIMIT_JUMP_COVERAGE = 300 
+#these are the additional parameters to Celera Assembler.  do not worry about performance, number or processors or batch sizes -- these are computed automatically. 
+#set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
+CA_PARAMETERS = cgwErrorRate=0.15
+#minimum count k-mers used in error correction 1 means all k-mers are used.  one can increase to 2 if Illumina coverage >100
+KMER_COUNT_THRESHOLD = 1
+#whether to attempt to close gaps in scaffolds with Illumina data
+CLOSE_GAPS=1
+#auto-detected number of cpus to use
+NUM_THREADS = 16
+#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*estimated_coverage
+JF_SIZE = 200000000
+#set this to 1 to use SOAPdenovo contigging/scaffolding module.  Assembly will be worse but will run faster. Useful for very large (>5Gbp) genomes from Illumina-only data
+SOAP_ASSEMBLY=0
+END
+ +For butterfly, we use 4 libraries as input. 2 pair-end libraries start with flag PE and 2 mate-pair +libraries start with flag JUMP. The configuration script is located at MaSuRCA/Butterfly/config +The script to submit assembly job is at MaSuRCA/Butterfly/ma_assemble.sh + + + +

Platanus: PLATform for Assembling NUcleotide Sequences

+Platanus is a novel de novo sequence assembler that can reconstruct genomic sequences of +highly heterozygous diploids from massively parallel shotgun sequencing data. It consists of 3 separate commands: assemble, +scaffold, and gapclose. We will go through these commands in this section. Enter command below +to load Platanus module on Xanadu. + +
module load platanus/1.2.4
+ +Bacteria + +First we need assemble contigs from trimmed reads. + +
platanus assemble -o sample -f dataset/Bacteria/Sample_[12s].fastq -t 16 -m 128 2
+ + + +The command above takes 3 input files with forward reads, reverse reads, and singles reads respectively. +Assembled contigs will be saved in sample_contig.fa when it is completed. + +
platanus scaffold -o sample -c sample_contig.fa -b sample_contigBubble.fa -IP1 ../../dataset/Bacteria/Sample_1.fastq ../../dataset/Bacteria/Sample_2.fastq  -t 16
+ + + +The scaffold command generates a scaffolds file sample_scaffold.fa. With scaffolds, +we can proceed to gapclose. + +
platanus gap_close -o sample -c sample_scaffold.fa -IP1 ../../dataset/Bacteria/Sample_1.fastq  ../../dataset/Bacteria/Sample_2.fastq -t 16
+ + + +It outputs sample_gapClose.fa which contains gap closed sequences. + +The slurm script with all three step is at Platanus/Bacteria/platanus.sh/. + +swallowtail + +We choose library DRR021673 and DRR021674 as input of assembly. + +
platanus assemble -o Pxut -f /home/CAM/mxu/tutorial/p3/dataset/DRR02167[34]_[12].fastq -t 16 -m 128
+ + + +Assembled contigs will be saved in Pxut_contig.fa when it is completed. + +
platanus scaffold -o Pxut -c Pxut_contig.fa -b Pxut_contigBubble.fa -IP1 ../../dataset/Butterfly/DRR021673_1.fastq ../../dataset/Butterfly/DRR021673_2.fastq -IP2  ../../dataset/Butterfly/DRR021674_1.fastq  ../../dataset/Butterfly/DRR021674_2.fastq -t 16
+ + + +The scaffold command generates a scaffolds file Pxut_scaffold.fa. With scaffolds, +we can proceed to gapclose. + +
platanus gap_close -o Pxut -c Pxut_scaffold.fa -IP1 ../../dataset/Butterfly/DRR021673_1.fastq ../../dataset/Butterfly/DRR021673_2.fastq -IP2  ../../dataset/Butterfly/DRR021674_1.fastq  ../../dataset/Butterfly/DRR021674_2.fastq -t 16
+ + + +It outputs Pxut_gapClose.fa which contains gap closed sequences. + +The slurm script with all three step is at Platanus/Butterfly/platanus.sh/. + + +

Quast: Quality Assessment Tool for Genome Assemblies

+ +Now that we have several assemblies, it’s time to analyze the quality of each assembly. SOAPdenovo has +its own statistics output, but for consistency, we will be using the program QUAST. The statistics we are most +interested in are number of contigs, total length, and N50. A good assembly would have a low number of contigs, +a total length that makes sense for the species, and a high N50 value. To run quast on all of our final assembly +files we will run the following commands, with the only parameters used being the name of the scaffold file(s) and +output directory. + +To load QUAST on Xanadu + +
module load quast/4.6
+ +Sample command that processes output of SOAPdenovo with QUAST. + +
python quast.py -t 8 ../../SOAPdenovo/Bacteria/graph_Sample_31.scafSeq -o SOAP
+ + + +QUAST’s output consists of a directory containing results in multiple formats. For statistics such as +contigs, total length, and N50, we can check report.txt by using less command, or we can download +the output from cluster and open the interactive report in HTML format with a web browser (Optional). + +
scp -r your-username@xanadu-submit-ext.cam.uchc.edu:/path/to/QUAST/output .
+ +Bacteria + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Assembly# contigsLargest contigTotal lengthGC (%)N50
SOAPdenovo276103125357410132.4426176
SPAdes59255551288018432.65147660
MaSuRCA110148785289106232.6045141
Platanus63166346280461432.8014143
+ +The script to run QUAST on bacterial results is located at Quast/Bacteria/quast.sh/. + +Butterfly + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Assembly# contigsLargest contigTotal lengthGC (%)N50
SOAPdenovo2107638631232184868133.56831
SPAdes
MaSuRCA
Platanus2873663286423668199633.8162393
+ +The script to run QUAST on swallowtail's results is located at Quast/Butterfly/quast.sh/. + + +

Citations

+ +Bankevich, A., Nurk, S., Antipov, D., Gurevich, A. A., Dvorkin, M., Kulikov, A. S., … Pevzner, P. A. (2012). SPAdes: A New Genome Assembly Algorithm and Its Applications to Single-Cell Sequencing. Journal of Computational Biology, 19(5), 455–477. http://doi.org/10.1089/cmb.2012.0021 + +Gurevich, A., Saveliev, V., Vyahhi, N., & Tesler, G. (2013). QUAST: quality assessment tool for genome assemblies. Bioinformatics, 29(8), 1072–1075. http://doi.org/10.1093/bioinformatics/btt086 + +Kajitani R, Toshimoto K, Noguchi H, Toyoda A, Ogura Y, Okuno M, Yabana M, Harada M, Nagayasu E, Maruyama H, Kohara Y, Fujiyama A, Hayashi T, Itoh T, “Efficient de novo assembly of highly heterozygous genomes from whole-genome shotgun short reads”. Genome Res. 2014 Aug;24(8):1384-95. doi: 10.1101/gr.170720.113. + +Luo, R., Liu, B., Xie, Y., Li, Z., Huang, W., Yuan, J., … Wang, J. (2012). SOAPdenovo2: an empirically improved memory-efficient short-read de novo assembler. GigaScience, 1, 18. http://doi.org/10.1186/2047-217X-1-18 + +Zimin, A. et al. The MaSuRCA genome Assembler. Bioinformatics (2013). doi:10.1093/bioinformatics/btt476 + diff --git a/SOAPdenovo/Bacteria/assemble.sh b/SOAPdenovo/Bacteria/assemble.sh new file mode 100755 index 0000000..ac2eba5 --- /dev/null +++ b/SOAPdenovo/Bacteria/assemble.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH --job-name=SOAP +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 8 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=64G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o SOAP_%j.out +#SBATCH -e SOAP_%j.err + +SOAPdenovo-63mer all -s config -K 31 -R -o graph_Sample_31 1>ass31.log 2>ass31.err +SOAPdenovo-63mer all -s config -K 35 -R -o graph_Sample_35 1>ass35.log 2>ass35.err +SOAPdenovo-63mer all -s config -K 41 -R -o graph_Sample_41 1>ass41.log 2>ass41.err + + diff --git a/SOAPdenovo/Bacteria/config b/SOAPdenovo/Bacteria/config new file mode 100755 index 0000000..313f593 --- /dev/null +++ b/SOAPdenovo/Bacteria/config @@ -0,0 +1,21 @@ +#maximal read length +max_rd_len=250 +[LIB] +#average insert size +avg_ins=550 +#if sequence needs to be reversed +reverse_seq=0 +#in which part(s) the reads are used +asm_flags=3 +#use only first 250 bps of each read +rd_len_cutoff=250 +#in which order the reads are used while scaffolding +rank=1 +# cutoff of pair number for a reliable connection (at least 3 for short insert size) +pair_num_cutoff=3 +#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) +map_len=32 +# path to genes +q1=/common/Assembly_Tutorial/Quality_Control/Sample_1.fastq +q2=/common/Assembly_Tutorial/Quality_Control/Sample_2.fastq +q=/common/Assembly_Tutorial/Quality_Control/Sample_s.fastq diff --git a/SOAPdenovo/Butterfly/assemble.sh b/SOAPdenovo/Butterfly/assemble.sh new file mode 100755 index 0000000..4617025 --- /dev/null +++ b/SOAPdenovo/Butterfly/assemble.sh @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --job-name=SOAP_assemble +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=himem3 +#SBATCH --mail-type=END +#SBATCH --mem=256G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o SOAP_%j.out +#SBATCH -e SOAP_%j.err + +module load SOAP-denovo/2.04 +SOAPdenovo-63mer all -s config -K 31-R -o graph_xuthus_31 1>ass31.log 2>ass31.err diff --git a/SOAPdenovo/Butterfly/config b/SOAPdenovo/Butterfly/config new file mode 100755 index 0000000..57a4cbe --- /dev/null +++ b/SOAPdenovo/Butterfly/config @@ -0,0 +1,84 @@ +#maximal read length +max_rd_len=250 +[LIB] +#average insert size +avg_ins=300 +#if sequence needs to be reversed +reverse_seq=0 +#in which part(s) the reads are used +asm_flags=3 +#use only first 250 bps of each read +rd_len_cutoff=250 +#in which order the reads are used while scaffolding +rank=1 +# cutoff of pair number for a reliable connection (at least 3 for short insert size) +pair_num_cutoff=3 +#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) +map_len=32 +# path to genes +q1=../dataset/Butterfly/DRR021673_1.fastq +q2=../dataset/Butterfly/DRR021673_2.fastq + +[LIB] +#average insert size +avg_ins=500 +#if sequence needs to be reversed +reverse_seq=0 +#in which part(s) the reads are used +asm_flags=3 +#use only first 250 bps of each read +rd_len_cutoff=250 +#in which order the reads are used while scaffolding +rank=2 +# cutoff of pair number for a reliable connection (at least 3 for short insert size) +pair_num_cutoff=3 +#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) +map_len=32 +# path to genes +q1=../dataset/Butterfly/DRR021674_1.fastq +q2=../dataset/Butterfly/DRR021674_2.fastq + +[LIB] +#average insert size +avg_ins=3000 +#if sequence needs to be reversed +reverse_seq=0 +#in which part(s) the reads are used +asm_flags=3 +#use only first 250 bps of each read +rd_len_cutoff=250 +#in which order the reads are used while scaffolding +rank=3 +# cutoff of pair number for a reliable connection (at least 3 for short insert size) +pair_num_cutoff=3 +#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) +map_len=32 +# path to genes +q1=../dataset/Butterfly/DRR021675_1.fastq +q2=../dataset/Butterfly/DRR021675_2.fastq + +[LIB] +#average insert size +avg_ins=300 +#if sequence needs to be reversed +reverse_seq=0 +#in which part(s) the reads are used +asm_flags=3 +#use only first 250 bps of each read +rd_len_cutoff=250 +#in which order the reads are used while scaffolding +rank=4 +# cutoff of pair number for a reliable connection (at least 3 for short insert size) +pair_num_cutoff=3 +#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size) +map_len=32 +# path to genes +q1=../dataset/Butterfly/DRR021677_1.fastq +q2=../dataset/Butterfly/DRR021677_2.fastq + + + + + + + diff --git a/SPAdes/Butterfly/assemble.sh b/SPAdes/Butterfly/assemble.sh new file mode 100755 index 0000000..453afaa --- /dev/null +++ b/SPAdes/Butterfly/assemble.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=SPADES_assemble +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=himem2 +#SBATCH --mail-type=END +#SBATCH --mem=256G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o SPADES_%j.out +#SBATCH -e SPADES_%j.err + +module load SPAdes/3.11.1 + +spades.py --careful -o xuthus_out/ -t 16 -m 250 --pe1-1 /home/CAM/mxu/tutorial/p3/dataset/DRR021673_1.fastq --pe1-2 /home/CAM/mxu/tutorial/p3/dataset/DRR021673_2.fastq --pe2-1 /home/CAM/mxu/tutorial/p3/dataset/DRR021674_1.fastq --pe2-2 /home/CAM/mxu/tutorial/p3/dataset/DRR021674_2.fastq --mp1-1 /home/CAM/mxu/tutorial/p3/dataset/DRR021675_1.fastq --mp1-2 /home/CAM/mxu/tutorial/p3/dataset/DRR021675_2.fastq diff --git a/dataset/Bacteria/Sample_R1.tar.xz b/dataset/Bacteria/Sample_R1.tar.xz new file mode 100644 index 0000000..bd31cdb Binary files /dev/null and b/dataset/Bacteria/Sample_R1.tar.xz differ diff --git a/dataset/Bacteria/Sample_R2.tar.xz b/dataset/Bacteria/Sample_R2.tar.xz new file mode 100644 index 0000000..00d55ba Binary files /dev/null and b/dataset/Bacteria/Sample_R2.tar.xz differ diff --git a/dataset/Bacteria/Sample_pe_stats.txt b/dataset/Bacteria/Sample_pe_stats.txt new file mode 100755 index 0000000..7c8edb2 --- /dev/null +++ b/dataset/Bacteria/Sample_pe_stats.txt @@ -0,0 +1 @@ +total 894184 avg=221.468301 stddev=1951.726288 diff --git a/dataset/Bacteria/Sample_s_stats.txt b/dataset/Bacteria/Sample_s_stats.txt new file mode 100755 index 0000000..b0874dc --- /dev/null +++ b/dataset/Bacteria/Sample_s_stats.txt @@ -0,0 +1 @@ +total 45449 avg=176.770072 stddev=4194.061239 diff --git a/dataset/Bacteria/uncompress.sh b/dataset/Bacteria/uncompress.sh new file mode 100755 index 0000000..024e047 --- /dev/null +++ b/dataset/Bacteria/uncompress.sh @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --job-name=uncompress +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=128G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o uncompress_%j.out +#SBATCH -e uncompress_%j.err + +tar -xJf Sample_R1.tar.xz && tar -xJf Sample_R2.tar.xz + diff --git a/dataset/Butterfly/download.sh b/dataset/Butterfly/download.sh new file mode 100755 index 0000000..fb7c8c7 --- /dev/null +++ b/dataset/Butterfly/download.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --job-name=platnus_assemble +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 16 +#SBATCH --partition=general +#SBATCH --mail-type=END +#SBATCH --mem=128G +#SBATCH --mail-user=muyang.xu@uconn.edu +#SBATCH -o assemble_%j.out +#SBATCH -e assemble_%j.err + +module load platanus/1.2.4 + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019820/DRR021674_1.fastq.bz2 && bzip2 -d dataset/DRR021674_1.fastq.bz2 + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019820/DRR021674_2.fastq.bz2 && bzip2 -d dataset/DRR021674_2.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019821/DRR021675_1.fastq.bz2 && bzip2 -d dataset/DRR021675_1.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019821/DRR021675_2.fastq.bz2 && bzip2 -d dataset/DRR021675_2.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019821/DRR021676_1.fastq.bz2 && bzip2 -d dataset/DRR021676_1.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019821/DRR021676_2.fastq.bz2 && bzip2 -d dataset/DRR021676_2.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019822/DRR021677_1.fastq.bz2 && bzip2 -d dataset/DRR021677_1.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019822/DRR021677_2.fastq.bz2 && bzip2 -d dataset/DRR021677_2.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019822/DRR021678_1.fastq.bz2 && bzip2 -d dataset/DRR021678_1.fastq.bz2 + + +wget ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002407/DRX019822/DRR021678_2.fastq.bz2 && bzip2 -d dataset/DRR021678_2.fastq.bz2 + + +