From 8635e7991b16d88a1e9dae1d0bb9a3a678239004 Mon Sep 17 00:00:00 2001
From: Yutian Feng <yutian.feng@uconn.edu>
Date: Thu, 18 Oct 2018 17:45:56 -0400
Subject: [PATCH] Add files via upload

---
 metagenomepipeline.sh  | 174 +++++++++++++++++++++++++++++++++++++++++
 mlsa.sh                |  47 +++++++++++
 supermatrix.py         | 106 +++++++++++++++++++++++++
 varsnps.sh             |  27 +++++++
 wrapper_supermatrix.py |  34 ++++++++
 5 files changed, 388 insertions(+)
 create mode 100644 metagenomepipeline.sh
 create mode 100644 mlsa.sh
 create mode 100644 supermatrix.py
 create mode 100644 varsnps.sh
 create mode 100644 wrapper_supermatrix.py

diff --git a/metagenomepipeline.sh b/metagenomepipeline.sh
new file mode 100644
index 0000000..7c6d432
--- /dev/null
+++ b/metagenomepipeline.sh
@@ -0,0 +1,174 @@
+
+
+##substitute rare gene of choice for intein
+
+
+####set up psi blast to get pssms
+####  *.fa = amino acid sequences of inteins
+####   transcont.fasta = aminoacid sequence of metagenome (use emboss transeq on assembled reads)
+for b in *.fa; do psiblast -query "$b" -db transcont.fasta -out ${b%.fa}.blast -outfmt 6 -num_iterations 5 -out_pssm ${b%.fa}.pssm -save_pssm_after_last_round; done 
+# this takes a while, recommend doin git in  a script
+#to get rid of converged sequences/pssm
+for file in $(grep -l CONVERGED *.blast); do rm -i ${file%.blast}.pssm; done
+
+
+###################### search an AA metagenome with all intein pssms ###################
+
+
+######### 
+######### *.pssm= pssm
+#########  contigs.fastaa = nucleotide sequence from assembled reads
+
+for b in *.pssm; do tblastn -in_pssm "$b" -db contigs.fasta -out "$b".search -outfmt 6 -evalue 1e-10; done
+
+#use each pssm to blast a metagenome (-d) this command will only work for aa metagenomes
+#this must be done in a directory containing all the pssms, an infile for each pssm, and the
+#metagenome that the user intends to search which has been previously formated into a db,
+#and a list of the names of each pssm in a file called pssm.list
+
+###########extract all the hits from a metagenome using all PSSMs #######################
+ ################ remove redundant sequences  hit in multiple PSSMs #####################
+
+
+cat *.search |cut -f 2 >all.bout
+#list all of the hits from a psiblast using all intein PSSMs
+
+sort all.hits >sort.hits
+#sort all of the hits alphabetically by contig name from metagenome file
+
+uniq sort.hits >mg_name.hits
+#extract only unique hits mg_name.hits is a list of contigs which were found by PSSMs
+#can also do in excel, data -> remove duplicates
+
+
+####need to remove \n in contigs so each sequence is all in one line
+#linebreak inbetween multifasta, this is all one line
+awk '!/^>/ { printf "%s", $0; n = "\n" } 
+/^>/ { print n $0; n = "" }
+END { printf "%s", n }
+' contigs.fasta  >> contigs.eol
+
+
+
+##### mg.fas = fasta of metagenome assembly, usually contigs.eol from above
+for filn in `cat all.hits`; do grep -A 1 $filn contigs.eol> $filn.seqfile;done
+#makes a list of all of the contigs and searches for the corresponding fasta sequence in the metagenome .fas file
+
+#### *.seqfile = fasta sequences of contigs identified by pssms
+#### intein.db == multifasta of intein sequences containing all inteins from inteins.com (AA)
+for filn in `cat all.hits`; do blastx -query $filn.seqfile -db /home/CAM/yfeng/inteindb/intein.db -outfmt 6 -out $filn.blast ;done
+#new blast
+
+
+
+for f in `cat all.hits`; do head -1 $f.blast >> all.tsbh; done
+#works for allf
+#extract tsbh for each contig, identifies hit as being BIL, HNT, HE, mid, or intein
+
+cat *.tsbh > all.tsbh
+#make one file of all the tsbh for all contigs 
+
+for filn in *.seqfile; do blastx -query $filn -db /home/CAM/yfeng/inteindb/intein.db -outfmt "6 qaccver saccver qstart qend sstart send evalue qseq" -max_target_seqs 1 >> seqs.tsbh ;done
+
+#sort all.tsbh by bitscore in excel or equivalent  (highlight all -> data -> sort )
+
+# at this point the user needs to determine how many of the hits should be extracts I have 
+#determined that bits of 100 or more are actually inteins anything under that cannot be reliably aligned
+# to the identified intein hit, once the user has determined how many of the hits are above 100
+#use tail to grab _ hits from the bottom of the list for instance if the last 20 hits are 
+#above 100: tail -n 20 mg.int.sorted >mg.int.significant
+
+
+
+
+############ NR blast is optional: used to identify gene/species
+
+
+################ once hits are extracted they need to be blasted against the #############
+################ nr db to identify the extein the intein may be inserted into ############
+
+cut -f 1 mg.int.significant > contigs.list
+#makes a list of the contigs in the list which have hits over 100
+for filn in `cat contigs.list`; do blastall -p blastp -i $filn.seqfile -d nr -m 8 -o $filn.outfile; done
+#blasts nr db with significant contigs
+
+for filn in `cat contigs.list`; do grep -m 1 "gi" $filn.outfile > $filn.nr.tsbh;done
+#extracts tsbh from nr db
+
+for filn in `cat contigs.list`; do cut -f 2 $filn.nr.tsbh >$filn.nr.gi;done
+#extracts gi # from the tsbh of each contig
+
+for filn in *.fa; do blastx -query $filn -db /isg/shared/databases/blast/nr -outfmt 6 -out $filn.outfile -evalue 1e-10 -max_target_seqs 1 -num_threads 10; done
+
+#extract sequences from nr which correspond to the contigs used to search nr 
+
+
+
+
+
+##########################################################
+#########################################################
+########################################################
+#####################################################
+
+
+#mapping
+
+
+
+#for mapping you need to prepare two sets of nucl sequences
+#1) contigs as is: which contains extein/intein/extein
+#2) contigs with intein removed: extein/extein
+#sometimes the contig seqs might be very large so you have to cut them down to size
+#attached is contigtrimmer.pl which does that. usage: perl contigtrimmer.pl all.tsbh
+	#you may need to adjust or look at script
+	#if you trim contigs then you need to blastx again to find location of the intein in the sequence
+#to artificially remove inteins from the exteins to get the second set of sequences use extein.pl 
+	#similar to contigtrimmer.pl, except it takes theflanking region instead of the intein
+	#usage: perl extein.pl all.tsbh
+	#again may need to adjust script
+	
+############After you have both sets of sequences you can map them to the reads#####
+
+
+###do this for both sets of sequences,  make sure you change the names of output and input files####
+
+##i would do all of this with a submission script unless the reads file is small####
+
+###build bowtie index
+bowtie2-build extonly.txt extonly
+#extonly.txt = multifasta with second set of sequences from above
+#extonly name of bowtie index (can be anything)
+
+
+
+#map reads back to sequences
+bowtie2 -x exind -1 /home/CAM/yfeng/metagenomes/lakevida/trimmedf.fastq -2 /home/CAM/yfeng/metagenomes/lakevida/trimmr.fastq -S exmap.sam -p 10 
+#  -x: base name of the bowtie index
+# 	-1 and -2 path to forward and reveerse reads, can also work for unpaired reads
+#	-S sam file to output to
+#	-p number of threads
+
+
+#makes your sam file much smaller so its downloadable, and also convert sam to bam
+samtools view -b -F 4 combmap.sam > combmapped.bam
+#only thing you should chabge is the .sam and .bam file name
+#these other parameters are required for filtering to work
+
+#sort bam file by genome position so it actually makes sense
+samtools sort exmapped.bam -o exsort.bam
+#change .bam to output file name of previous step
+
+#you can then load the files into IGV and see the mapped reads and coverage or use them in bedtools
+#for Integrated genome viewer(IGV) you need to generate a bam index for your map file and a fasta index for your sequences
+
+
+#bam index
+samtools index -b name.bam
+
+#fasta index
+samtools faidx name.fasta 
+
+
+
+
diff --git a/mlsa.sh b/mlsa.sh
new file mode 100644
index 0000000..fa8fb94
--- /dev/null
+++ b/mlsa.sh
@@ -0,0 +1,47 @@
+#assumes blast+ and iqtree, and muscle. also requires supermatrix.py and wrapper_supermatrix.py in the a usable path or PATH
+
+for f in * (names of genomes in fasta).fasta; do makeblastdb -i $f -dbtype nucl; done
+#need names of genomes in nucleotide fasta format
+
+for x in *.blast; do while IFS=$'\t' read -r -a myArray; do NAME="${myArray[1]}"; START="${myArray[8]}"; STOP="${myArray[9]}"; done < "$x" ; if [ "$START" -gt "$STOP" ]; then STRAND=2;  RANGE="$STOP"-"$START"; else STRAND=1; RANGE="$START"-"$STOP"; fi;
+ g=${x%.blast}; fastacmd -d /home/yuf17006/Frankia_project/Small_Data_tests/Genomes/"$g" -p F -s "$NAME"  -L "$RANGE" -S "$STRAND" -o "$x".grab; done;
+#grabs top scoring hit for each gene outputs them to the same gene file
+
+#align them all with muscle
+for g in *.grab; do muscle -in $g -out $g.align -maxiters 4; done
+
+#all taxa must have the same names in every gene file
+
+
+
+#instructions for using supermatrix concatenater
+#Attached are two py scripts.  On the cluster, create a directory in your home folder called scripts.  copy the two files there.  
+#In the texteditor of your choice, edit the file .bash_profile (dot files are usually not listed ...)
+#In that file should be a description of your PATH.  Something like 
+#PATH=$PATH:$HOME/bin:/opt/454/bin
+#You need to add the script folder to the path.  in the example above, after editing the line would read 
+#​P​ATH=$PATH:$HOME/bin:/opt/454/bin:$HOME/scripts 
+​#Then, 
+#create a folder that contains the alignments you want to concatenate
+#Create a text file that lists all the names of the individual alignments.
+#save that file (possible Alist.txt) 
+#Invoke the wrapper by typing 
+#wrapper_supermatrix.py Alist.txt
+#​The contents of alist would look something like this: 
+#list1_aligned.fst
+#list2_aligned.fst
+#list3_aligned.fst
+#list4_aligned.fst
+#list5_aligned.fst
+#list7_aligned.fst
+#​The program creates a lot of output on the screen and in the directory where your alignments were. 
+#Essentially if you ave 31 files, the program runs 30 time, creating a concatenated alignment every time.  Outfile30 would be the concatenation you want to work with.  But check the logs, the missingseqs and partition lists files, in case something went wrong ...   And, obviously, look at the final alignment in seaview or similar.
+​
+
+wrapper_supermatrix.py Alist.txt 
+
+#take the final concatenated alignments and turn them into codons, proteins or leave them 
+#can use as input into iqtree
+iqtree -st (codon, AA, or NT) -bb 1000+ -nt AUTO 
+
+#view tree
\ No newline at end of file
diff --git a/supermatrix.py b/supermatrix.py
new file mode 100644
index 0000000..5fe71e0
--- /dev/null
+++ b/supermatrix.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import textwrap
+
+InFile1 = sys.argv[1]
+InFile2 = sys.argv[2]
+InFile3 = sys.argv[3]
+
+OInFile1 = open(InFile1, 'rU')
+OInFile2 = open(InFile2, 'rU')
+OutFile1 = open(InFile3, 'w')
+
+Concatdict1 = {}
+TaxaList = []
+for Line in OInFile1:
+	Line = Line.strip("\n")
+	if Line[0] == ">":
+		TaxaList.append(Line)
+		Concatdict1[Line] = []
+	else:
+		Concatdict1[TaxaList[len(TaxaList)-1]].append(Line)
+
+Concatdict2 = {}
+TaxaList2 = []
+for Line in OInFile2:
+	Line = Line.strip("\n")
+	if Line[0] == ">":
+		TaxaList2.append(Line)
+		Concatdict2[Line] = []
+	else:
+		Concatdict2[TaxaList2[len(TaxaList2)-1]].append(Line)
+
+MissingSeqs2 = frozenset(Concatdict1.keys()) - frozenset(Concatdict2.keys()) #Keys in 1 but not 2
+MissingSeqs1 = frozenset(Concatdict2.keys()) - frozenset(Concatdict1.keys()) #Keys in 2 but not 1
+
+print MissingSeqs1
+print MissingSeqs2
+
+SeqLengthFile1 = []
+
+X = 1
+for Element in Concatdict1:
+	ValueStr = "".join(Concatdict1[Element])
+	SeqLengthFile1.append(len(ValueStr))
+	X = X + 1
+	if X > 1:
+		break
+#print SeqLengthFile1
+
+EmptySeq1 = "-" * SeqLengthFile1[0]
+
+
+SeqLengthFile2 = []
+X = 1
+for Element in Concatdict2:
+	ValueStr2 = "".join(Concatdict2[Element])
+	SeqLengthFile2.append(len(ValueStr2))
+	X = X + 1
+	if X > 1:
+		break
+#print SeqLengthFile2
+
+EmptySeq2 = "-" * SeqLengthFile2[0]
+
+
+for Elements in MissingSeqs1:
+	Concatdict1[Elements] = list(EmptySeq1)
+for Elements in MissingSeqs2:
+	Concatdict2[Elements] = list(EmptySeq2)
+	
+for Element in Concatdict2:
+	SeqStr = "".join(Concatdict2[Element])
+	Concatdict1[Element].append(SeqStr)
+
+for Key in Concatdict1:
+	print Key
+	OutFile1.write(Key + "\n")
+	SeqStr = "".join(Concatdict1[Key])
+	OutFile1.write(SeqStr + "\n")
+	#OutFile1.write(textwrap.fill(SeqStr, width=60) + "\n")
+	print SeqStr
+	#print len(SeqStr)
+
+OInFile1.close()
+OInFile2.close()	
+OutFile1.close()
+
+OutFile2 = "MissingSeqs.list_" + InFile3	
+OpenOutFile2 = open(OutFile2, 'w')
+OpenOutFile2.write("Missing sequences in " + InFile1 + " are: " + "\n")
+for Element in MissingSeqs1:
+	OpenOutFile2.write(Element + "\n")
+OpenOutFile2.write("Missing sequences in " + InFile2 + " are: " + "\n")
+for Element in MissingSeqs2:
+	OpenOutFile2.write(Element + "\n")
+OpenOutFile2.close()
+
+OutFile3 = "Partition.list_" + InFile3
+openOutFile3 = open(OutFile3, 'w')
+openOutFile3.write(InFile1 + " = 1 - " + str(SeqLengthFile1[0]) + "\n")
+first_value = SeqLengthFile1[0] + 1
+second_value = SeqLengthFile1[0] + SeqLengthFile2[0]
+openOutFile3.write(InFile2 + " = " + str(first_value) + " - " + str(second_value) + "\n")
+openOutFile3.close()
diff --git a/varsnps.sh b/varsnps.sh
new file mode 100644
index 0000000..4bdbfad
--- /dev/null
+++ b/varsnps.sh
@@ -0,0 +1,27 @@
+#requries bcf tools, samtools, and bowtie2 (or other read mapper)
+bowtie2-build ref.fasta refname
+#makes bowtie index for reference genome
+
+bowtie2 -x refname -1 read1.fq -2 read2.fq -U unpairedread.fq -S output.sam
+#map reads back to reference genome for sam file
+
+samtools view -bS output.sam > output.bam
+#convert sam to bam
+
+samtools sort output.bam -O output.sort.bam
+#sort by genomic location
+#need to do next step or the variant annotations are useless
+samtools mpileup --min-BQ number 20-30 \
+-f ref.fasta --BCF bamfromprevious.bam | \
+ bcftools call --consensus-caller \
+--variants-only --pval-threshold 0.05 recommended -Ov -Ob > outputsvariants.bcf ;
+#gets the variants from read mapping file
+
+
+bcftools norm -m-any outputsvariants.bcf | bcftools norm -Ov --check-ref w -f ref.fasta > outputsvariants.vcf ;
+#convert to vcf based off of alignment with refernce genome
+
+
+bcftools view outputsvariants.vcf | vcfutils.pl varFilter -d 18 -w 1 -W 3 -a 1 \
+-e 0.05 -p > outputsvariants.filtered.vcf ;
+#filters those variants that don't meet a certain threshold
diff --git a/wrapper_supermatrix.py b/wrapper_supermatrix.py
new file mode 100644
index 0000000..4fae2fc
--- /dev/null
+++ b/wrapper_supermatrix.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+
+# Supermatrix wrapper
+# Have list of alignments you want concatenated
+# Feed them to supermatrix script
+# Concatenate first two alignments take product and add to next alignment in list ...
+
+import sys
+from subprocess import call
+
+InFile1 = sys.argv[1]
+File1 = open(InFile1, 'rU')
+
+AlignmentFiles = []
+OutFile = "OutFile"
+
+for Line in File1:
+	Line = Line.strip("\n")
+	AlignmentFiles.append(Line)
+File1.close()
+
+X = 1
+call("supermatrix.py " + AlignmentFiles[0] + " " + AlignmentFiles[1] + " " + OutFile + str(X), shell=True)
+
+AlignmentFiles = AlignmentFiles[2:]
+
+for Alignment in AlignmentFiles:
+	Y = X + 1
+	call("supermatrix.py " + OutFile + str(X) + " " + Alignment + " " + OutFile + str(Y), shell=True)
+	X = X + 1
+
+
+