Add files via upload

yuf17006 · Oct 18, 2018 · 1b2e8d7 · 1b2e8d7
1 parent e7a57b0
commit 1b2e8d7
Show file tree

Hide file tree

Showing 2 changed files with 153 additions and 0 deletions.
diff --git a/mlsa.sh b/mlsa.sh
@@ -0,0 +1,47 @@
+#assumes blast+ and iqtree, and muscle. also requires supermatrix.py and wrapper_supermatrix.py in the a usable path or PATH
+
+for f in * (names of genomes in fasta).fasta; do makeblastdb -i $f -dbtype nucl; done
+#need names of genomes in nucleotide fasta format
+
+for x in *.blast; do while IFS=$'\t' read -r -a myArray; do NAME="${myArray[1]}"; START="${myArray[8]}"; STOP="${myArray[9]}"; done < "$x" ; if [ "$START" -gt "$STOP" ]; then STRAND=2;  RANGE="$STOP"-"$START"; else STRAND=1; RANGE="$START"-"$STOP"; fi;
+ g=${x%.blast}; fastacmd -d /home/yuf17006/Frankia_project/Small_Data_tests/Genomes/"$g" -p F -s "$NAME"  -L "$RANGE" -S "$STRAND" -o "$x".grab; done;
+#grabs top scoring hit for each gene outputs them to the same gene file
+
+#align them all with muscle
+for g in *.grab; do muscle -in $g -out $g.align -maxiters 4; done
+
+#all taxa must have the same names in every gene file
+
+
+
+#instructions for using supermatrix concatenater
+#Attached are two py scripts.  On the cluster, create a directory in your home folder called scripts.  copy the two files there.  
+#In the texteditor of your choice, edit the file .bash_profile (dot files are usually not listed ...)
+#In that file should be a description of your PATH.  Something like 
+#PATH=$PATH:$HOME/bin:/opt/454/bin
+#You need to add the script folder to the path.  in the example above, after editing the line would read 
+#PATH=$PATH:$HOME/bin:/opt/454/bin:$HOME/scripts 
+#Then, 
+#create a folder that contains the alignments you want to concatenate
+#Create a text file that lists all the names of the individual alignments.
+#save that file (possible Alist.txt) 
+#Invoke the wrapper by typing 
+#wrapper_supermatrix.py Alist.txt
+#The contents of alist would look something like this: 
+#list1_aligned.fst
+#list2_aligned.fst
+#list3_aligned.fst
+#list4_aligned.fst
+#list5_aligned.fst
+#list7_aligned.fst
+#The program creates a lot of output on the screen and in the directory where your alignments were. 
+#Essentially if you ave 31 files, the program runs 30 time, creating a concatenated alignment every time.  Outfile30 would be the concatenation you want to work with.  But check the logs, the missingseqs and partition lists files, in case something went wrong ...   And, obviously, look at the final alignment in seaview or similar.
+
+
+wrapper_supermatrix.py Alist.txt 
+
+#take the final concatenated alignments and turn them into codons, proteins or leave them 
+#can use as input into iqtree
+iqtree -st (codon, AA, or NT) -bb 1000+ -nt AUTO 
+
+#view tree
diff --git a/supermatrix.py b/supermatrix.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import textwrap
+
+InFile1 = sys.argv[1]
+InFile2 = sys.argv[2]
+InFile3 = sys.argv[3]
+
+OInFile1 = open(InFile1, 'rU')
+OInFile2 = open(InFile2, 'rU')
+OutFile1 = open(InFile3, 'w')
+
+Concatdict1 = {}
+TaxaList = []
+for Line in OInFile1:
+	Line = Line.strip("\n")
+	if Line[0] == ">":
+		TaxaList.append(Line)
+		Concatdict1[Line] = []
+	else:
+		Concatdict1[TaxaList[len(TaxaList)-1]].append(Line)
+
+Concatdict2 = {}
+TaxaList2 = []
+for Line in OInFile2:
+	Line = Line.strip("\n")
+	if Line[0] == ">":
+		TaxaList2.append(Line)
+		Concatdict2[Line] = []
+	else:
+		Concatdict2[TaxaList2[len(TaxaList2)-1]].append(Line)
+
+MissingSeqs2 = frozenset(Concatdict1.keys()) - frozenset(Concatdict2.keys()) #Keys in 1 but not 2
+MissingSeqs1 = frozenset(Concatdict2.keys()) - frozenset(Concatdict1.keys()) #Keys in 2 but not 1
+
+print MissingSeqs1
+print MissingSeqs2
+
+SeqLengthFile1 = []
+
+X = 1
+for Element in Concatdict1:
+	ValueStr = "".join(Concatdict1[Element])
+	SeqLengthFile1.append(len(ValueStr))
+	X = X + 1
+	if X > 1:
+		break
+#print SeqLengthFile1
+
+EmptySeq1 = "-" * SeqLengthFile1[0]
+
+
+SeqLengthFile2 = []
+X = 1
+for Element in Concatdict2:
+	ValueStr2 = "".join(Concatdict2[Element])
+	SeqLengthFile2.append(len(ValueStr2))
+	X = X + 1
+	if X > 1:
+		break
+#print SeqLengthFile2
+
+EmptySeq2 = "-" * SeqLengthFile2[0]
+
+
+for Elements in MissingSeqs1:
+	Concatdict1[Elements] = list(EmptySeq1)
+for Elements in MissingSeqs2:
+	Concatdict2[Elements] = list(EmptySeq2)
+
+for Element in Concatdict2:
+	SeqStr = "".join(Concatdict2[Element])
+	Concatdict1[Element].append(SeqStr)
+
+for Key in Concatdict1:
+	print Key
+	OutFile1.write(Key + "\n")
+	SeqStr = "".join(Concatdict1[Key])
+	OutFile1.write(SeqStr + "\n")
+	#OutFile1.write(textwrap.fill(SeqStr, width=60) + "\n")
+	print SeqStr
+	#print len(SeqStr)
+
+OInFile1.close()
+OInFile2.close()	
+OutFile1.close()
+
+OutFile2 = "MissingSeqs.list_" + InFile3	
+OpenOutFile2 = open(OutFile2, 'w')
+OpenOutFile2.write("Missing sequences in " + InFile1 + " are: " + "\n")
+for Element in MissingSeqs1:
+	OpenOutFile2.write(Element + "\n")
+OpenOutFile2.write("Missing sequences in " + InFile2 + " are: " + "\n")
+for Element in MissingSeqs2:
+	OpenOutFile2.write(Element + "\n")
+OpenOutFile2.close()
+
+OutFile3 = "Partition.list_" + InFile3
+openOutFile3 = open(OutFile3, 'w')
+openOutFile3.write(InFile1 + " = 1 - " + str(SeqLengthFile1[0]) + "\n")
+first_value = SeqLengthFile1[0] + 1
+second_value = SeqLengthFile1[0] + SeqLengthFile2[0]
+openOutFile3.write(InFile2 + " = " + str(first_value) + " - " + str(second_value) + "\n")
+openOutFile3.close()