Skip to content
Permalink
Browse files

Add files via upload

  • Loading branch information
yuf17006 committed Oct 18, 2018
1 parent 8fdb74a commit 8635e7991b16d88a1e9dae1d0bb9a3a678239004
Showing with 388 additions and 0 deletions.
  1. +174 −0 metagenomepipeline.sh
  2. +47 −0 mlsa.sh
  3. +106 −0 supermatrix.py
  4. +27 −0 varsnps.sh
  5. +34 −0 wrapper_supermatrix.py
@@ -0,0 +1,174 @@


##substitute rare gene of choice for intein


####set up psi blast to get pssms
#### *.fa = amino acid sequences of inteins
#### transcont.fasta = aminoacid sequence of metagenome (use emboss transeq on assembled reads)
for b in *.fa; do psiblast -query "$b" -db transcont.fasta -out ${b%.fa}.blast -outfmt 6 -num_iterations 5 -out_pssm ${b%.fa}.pssm -save_pssm_after_last_round; done
# this takes a while, recommend doin git in a script
#to get rid of converged sequences/pssm
for file in $(grep -l CONVERGED *.blast); do rm -i ${file%.blast}.pssm; done


###################### search an AA metagenome with all intein pssms ###################


#########
######### *.pssm= pssm
######### contigs.fastaa = nucleotide sequence from assembled reads

for b in *.pssm; do tblastn -in_pssm "$b" -db contigs.fasta -out "$b".search -outfmt 6 -evalue 1e-10; done

#use each pssm to blast a metagenome (-d) this command will only work for aa metagenomes
#this must be done in a directory containing all the pssms, an infile for each pssm, and the
#metagenome that the user intends to search which has been previously formated into a db,
#and a list of the names of each pssm in a file called pssm.list

###########extract all the hits from a metagenome using all PSSMs #######################
################ remove redundant sequences hit in multiple PSSMs #####################


cat *.search |cut -f 2 >all.bout
#list all of the hits from a psiblast using all intein PSSMs

sort all.hits >sort.hits
#sort all of the hits alphabetically by contig name from metagenome file

uniq sort.hits >mg_name.hits
#extract only unique hits mg_name.hits is a list of contigs which were found by PSSMs
#can also do in excel, data -> remove duplicates


####need to remove \n in contigs so each sequence is all in one line
#linebreak inbetween multifasta, this is all one line
awk '!/^>/ { printf "%s", $0; n = "\n" }
/^>/ { print n $0; n = "" }
END { printf "%s", n }
' contigs.fasta >> contigs.eol



##### mg.fas = fasta of metagenome assembly, usually contigs.eol from above
for filn in `cat all.hits`; do grep -A 1 $filn contigs.eol> $filn.seqfile;done
#makes a list of all of the contigs and searches for the corresponding fasta sequence in the metagenome .fas file

#### *.seqfile = fasta sequences of contigs identified by pssms
#### intein.db == multifasta of intein sequences containing all inteins from inteins.com (AA)
for filn in `cat all.hits`; do blastx -query $filn.seqfile -db /home/CAM/yfeng/inteindb/intein.db -outfmt 6 -out $filn.blast ;done
#new blast



for f in `cat all.hits`; do head -1 $f.blast >> all.tsbh; done
#works for allf
#extract tsbh for each contig, identifies hit as being BIL, HNT, HE, mid, or intein

cat *.tsbh > all.tsbh
#make one file of all the tsbh for all contigs

for filn in *.seqfile; do blastx -query $filn -db /home/CAM/yfeng/inteindb/intein.db -outfmt "6 qaccver saccver qstart qend sstart send evalue qseq" -max_target_seqs 1 >> seqs.tsbh ;done

#sort all.tsbh by bitscore in excel or equivalent (highlight all -> data -> sort )

# at this point the user needs to determine how many of the hits should be extracts I have
#determined that bits of 100 or more are actually inteins anything under that cannot be reliably aligned
# to the identified intein hit, once the user has determined how many of the hits are above 100
#use tail to grab _ hits from the bottom of the list for instance if the last 20 hits are
#above 100: tail -n 20 mg.int.sorted >mg.int.significant




############ NR blast is optional: used to identify gene/species


################ once hits are extracted they need to be blasted against the #############
################ nr db to identify the extein the intein may be inserted into ############

cut -f 1 mg.int.significant > contigs.list
#makes a list of the contigs in the list which have hits over 100
for filn in `cat contigs.list`; do blastall -p blastp -i $filn.seqfile -d nr -m 8 -o $filn.outfile; done
#blasts nr db with significant contigs

for filn in `cat contigs.list`; do grep -m 1 "gi" $filn.outfile > $filn.nr.tsbh;done
#extracts tsbh from nr db

for filn in `cat contigs.list`; do cut -f 2 $filn.nr.tsbh >$filn.nr.gi;done
#extracts gi # from the tsbh of each contig

for filn in *.fa; do blastx -query $filn -db /isg/shared/databases/blast/nr -outfmt 6 -out $filn.outfile -evalue 1e-10 -max_target_seqs 1 -num_threads 10; done

#extract sequences from nr which correspond to the contigs used to search nr





##########################################################
#########################################################
########################################################
#####################################################


#mapping



#for mapping you need to prepare two sets of nucl sequences
#1) contigs as is: which contains extein/intein/extein
#2) contigs with intein removed: extein/extein
#sometimes the contig seqs might be very large so you have to cut them down to size
#attached is contigtrimmer.pl which does that. usage: perl contigtrimmer.pl all.tsbh
#you may need to adjust or look at script
#if you trim contigs then you need to blastx again to find location of the intein in the sequence
#to artificially remove inteins from the exteins to get the second set of sequences use extein.pl
#similar to contigtrimmer.pl, except it takes theflanking region instead of the intein
#usage: perl extein.pl all.tsbh
#again may need to adjust script

############After you have both sets of sequences you can map them to the reads#####


###do this for both sets of sequences, make sure you change the names of output and input files####

##i would do all of this with a submission script unless the reads file is small####

###build bowtie index
bowtie2-build extonly.txt extonly
#extonly.txt = multifasta with second set of sequences from above
#extonly name of bowtie index (can be anything)



#map reads back to sequences
bowtie2 -x exind -1 /home/CAM/yfeng/metagenomes/lakevida/trimmedf.fastq -2 /home/CAM/yfeng/metagenomes/lakevida/trimmr.fastq -S exmap.sam -p 10
# -x: base name of the bowtie index
# -1 and -2 path to forward and reveerse reads, can also work for unpaired reads
# -S sam file to output to
# -p number of threads


#makes your sam file much smaller so its downloadable, and also convert sam to bam
samtools view -b -F 4 combmap.sam > combmapped.bam
#only thing you should chabge is the .sam and .bam file name
#these other parameters are required for filtering to work

#sort bam file by genome position so it actually makes sense
samtools sort exmapped.bam -o exsort.bam
#change .bam to output file name of previous step

#you can then load the files into IGV and see the mapped reads and coverage or use them in bedtools
#for Integrated genome viewer(IGV) you need to generate a bam index for your map file and a fasta index for your sequences


#bam index
samtools index -b name.bam

#fasta index
samtools faidx name.fasta




47 mlsa.sh
@@ -0,0 +1,47 @@
#assumes blast+ and iqtree, and muscle. also requires supermatrix.py and wrapper_supermatrix.py in the a usable path or PATH

for f in * (names of genomes in fasta).fasta; do makeblastdb -i $f -dbtype nucl; done
#need names of genomes in nucleotide fasta format

for x in *.blast; do while IFS=$'\t' read -r -a myArray; do NAME="${myArray[1]}"; START="${myArray[8]}"; STOP="${myArray[9]}"; done < "$x" ; if [ "$START" -gt "$STOP" ]; then STRAND=2; RANGE="$STOP"-"$START"; else STRAND=1; RANGE="$START"-"$STOP"; fi;
g=${x%.blast}; fastacmd -d /home/yuf17006/Frankia_project/Small_Data_tests/Genomes/"$g" -p F -s "$NAME" -L "$RANGE" -S "$STRAND" -o "$x".grab; done;
#grabs top scoring hit for each gene outputs them to the same gene file

#align them all with muscle
for g in *.grab; do muscle -in $g -out $g.align -maxiters 4; done

#all taxa must have the same names in every gene file



#instructions for using supermatrix concatenater
#Attached are two py scripts. On the cluster, create a directory in your home folder called scripts. copy the two files there.
#In the texteditor of your choice, edit the file .bash_profile (dot files are usually not listed ...)
#In that file should be a description of your PATH. Something like
#PATH=$PATH:$HOME/bin:/opt/454/bin
#You need to add the script folder to the path. in the example above, after editing the line would read
#​P​ATH=$PATH:$HOME/bin:/opt/454/bin:$HOME/scripts
#Then,
#create a folder that contains the alignments you want to concatenate
#Create a text file that lists all the names of the individual alignments.
#save that file (possible Alist.txt)
#Invoke the wrapper by typing
#wrapper_supermatrix.py Alist.txt
#​The contents of alist would look something like this:
#list1_aligned.fst
#list2_aligned.fst
#list3_aligned.fst
#list4_aligned.fst
#list5_aligned.fst
#list7_aligned.fst
#​The program creates a lot of output on the screen and in the directory where your alignments were.
#Essentially if you ave 31 files, the program runs 30 time, creating a concatenated alignment every time. Outfile30 would be the concatenation you want to work with. But check the logs, the missingseqs and partition lists files, in case something went wrong ... And, obviously, look at the final alignment in seaview or similar.

wrapper_supermatrix.py Alist.txt

#take the final concatenated alignments and turn them into codons, proteins or leave them
#can use as input into iqtree
iqtree -st (codon, AA, or NT) -bb 1000+ -nt AUTO

#view tree
@@ -0,0 +1,106 @@
#!/usr/bin/env python

import sys
import os
import textwrap

InFile1 = sys.argv[1]
InFile2 = sys.argv[2]
InFile3 = sys.argv[3]

OInFile1 = open(InFile1, 'rU')
OInFile2 = open(InFile2, 'rU')
OutFile1 = open(InFile3, 'w')

Concatdict1 = {}
TaxaList = []
for Line in OInFile1:
Line = Line.strip("\n")
if Line[0] == ">":
TaxaList.append(Line)
Concatdict1[Line] = []
else:
Concatdict1[TaxaList[len(TaxaList)-1]].append(Line)

Concatdict2 = {}
TaxaList2 = []
for Line in OInFile2:
Line = Line.strip("\n")
if Line[0] == ">":
TaxaList2.append(Line)
Concatdict2[Line] = []
else:
Concatdict2[TaxaList2[len(TaxaList2)-1]].append(Line)

MissingSeqs2 = frozenset(Concatdict1.keys()) - frozenset(Concatdict2.keys()) #Keys in 1 but not 2
MissingSeqs1 = frozenset(Concatdict2.keys()) - frozenset(Concatdict1.keys()) #Keys in 2 but not 1

print MissingSeqs1
print MissingSeqs2

SeqLengthFile1 = []

X = 1
for Element in Concatdict1:
ValueStr = "".join(Concatdict1[Element])
SeqLengthFile1.append(len(ValueStr))
X = X + 1
if X > 1:
break
#print SeqLengthFile1

EmptySeq1 = "-" * SeqLengthFile1[0]


SeqLengthFile2 = []
X = 1
for Element in Concatdict2:
ValueStr2 = "".join(Concatdict2[Element])
SeqLengthFile2.append(len(ValueStr2))
X = X + 1
if X > 1:
break
#print SeqLengthFile2

EmptySeq2 = "-" * SeqLengthFile2[0]


for Elements in MissingSeqs1:
Concatdict1[Elements] = list(EmptySeq1)
for Elements in MissingSeqs2:
Concatdict2[Elements] = list(EmptySeq2)

for Element in Concatdict2:
SeqStr = "".join(Concatdict2[Element])
Concatdict1[Element].append(SeqStr)

for Key in Concatdict1:
print Key
OutFile1.write(Key + "\n")
SeqStr = "".join(Concatdict1[Key])
OutFile1.write(SeqStr + "\n")
#OutFile1.write(textwrap.fill(SeqStr, width=60) + "\n")
print SeqStr
#print len(SeqStr)

OInFile1.close()
OInFile2.close()
OutFile1.close()

OutFile2 = "MissingSeqs.list_" + InFile3
OpenOutFile2 = open(OutFile2, 'w')
OpenOutFile2.write("Missing sequences in " + InFile1 + " are: " + "\n")
for Element in MissingSeqs1:
OpenOutFile2.write(Element + "\n")
OpenOutFile2.write("Missing sequences in " + InFile2 + " are: " + "\n")
for Element in MissingSeqs2:
OpenOutFile2.write(Element + "\n")
OpenOutFile2.close()

OutFile3 = "Partition.list_" + InFile3
openOutFile3 = open(OutFile3, 'w')
openOutFile3.write(InFile1 + " = 1 - " + str(SeqLengthFile1[0]) + "\n")
first_value = SeqLengthFile1[0] + 1
second_value = SeqLengthFile1[0] + SeqLengthFile2[0]
openOutFile3.write(InFile2 + " = " + str(first_value) + " - " + str(second_value) + "\n")
openOutFile3.close()
@@ -0,0 +1,27 @@
#requries bcf tools, samtools, and bowtie2 (or other read mapper)
bowtie2-build ref.fasta refname
#makes bowtie index for reference genome

bowtie2 -x refname -1 read1.fq -2 read2.fq -U unpairedread.fq -S output.sam
#map reads back to reference genome for sam file

samtools view -bS output.sam > output.bam
#convert sam to bam

samtools sort output.bam -O output.sort.bam
#sort by genomic location
#need to do next step or the variant annotations are useless
samtools mpileup --min-BQ number 20-30 \
-f ref.fasta --BCF bamfromprevious.bam | \
bcftools call --consensus-caller \
--variants-only --pval-threshold 0.05 recommended -Ov -Ob > outputsvariants.bcf ;
#gets the variants from read mapping file


bcftools norm -m-any outputsvariants.bcf | bcftools norm -Ov --check-ref w -f ref.fasta > outputsvariants.vcf ;
#convert to vcf based off of alignment with refernce genome


bcftools view outputsvariants.vcf | vcfutils.pl varFilter -d 18 -w 1 -W 3 -a 1 \
-e 0.05 -p > outputsvariants.filtered.vcf ;
#filters those variants that don't meet a certain threshold
@@ -0,0 +1,34 @@
#!/usr/bin/env python


# Supermatrix wrapper
# Have list of alignments you want concatenated
# Feed them to supermatrix script
# Concatenate first two alignments take product and add to next alignment in list ...

import sys
from subprocess import call

InFile1 = sys.argv[1]
File1 = open(InFile1, 'rU')

AlignmentFiles = []
OutFile = "OutFile"

for Line in File1:
Line = Line.strip("\n")
AlignmentFiles.append(Line)
File1.close()

X = 1
call("supermatrix.py " + AlignmentFiles[0] + " " + AlignmentFiles[1] + " " + OutFile + str(X), shell=True)

AlignmentFiles = AlignmentFiles[2:]

for Alignment in AlignmentFiles:
Y = X + 1
call("supermatrix.py " + OutFile + str(X) + " " + Alignment + " " + OutFile + str(Y), shell=True)
X = X + 1



0 comments on commit 8635e79

Please sign in to comment.
You can’t perform that action at this time.