diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e0116c1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+
+data/gene_name_info/query_full_name.txt
+data/gene_name_info/query_ids.txt
+data/gene_name_info/query_snps.txt
+data/gene_name_info/query_symbol.txt
+results/baseline_doc/pubmed.zinc.0.15.txt
+results/baseline_doc/pubmed.zinc.1.15.txt
diff --git a/REPRODUCING.md b/REPRODUCING.md
new file mode 100644
index 0000000..3146537
--- /dev/null
+++ b/REPRODUCING.md
@@ -0,0 +1,36 @@
+This [Code Ocean](https://codeocean.com) Compute Capsule will allow you to reproduce the results published by the author on your local machine<sup>1</sup>. Follow the instructions below, or consult [our knowledge base](https://help.codeocean.com/user-manual/sharing-and-finding-published-capsules/exporting-capsules-and-reproducing-results-on-your-local-machine) for more information. Don't hesitate to reach out to [Support](mailto:support@codeocean.com) if you have any questions.
+
+<sup>1</sup> You may need access to additional hardware and/or software licenses.
+
+# Prerequisites
+
+- [Docker Community Edition (CE)](https://www.docker.com/community-edition)
+- [nvidia-container-runtime](https://docs.docker.com/config/containers/resource_constraints/#gpu) for code that leverages the GPU
+- MATLAB/MOSEK/Stata licenses where applicable
+
+# Instructions
+
+## The computational environment (Docker image)
+
+This capsule is private and its environment cannot be downloaded at this time. You will need to rebuild the environment locally.
+
+> If there's any software requiring a license that needs to be run during the build stage, you'll need to make your license available. See [our knowledge base](https://help.codeocean.com/user-manual/sharing-and-finding-published-capsules/exporting-capsules-and-reproducing-results-on-your-local-machine) for more information.
+
+In your terminal, navigate to the folder where you've extracted the capsule and execute the following command:
+```shell
+cd environment && docker build . --tag 6ef700ed-ff07-4a42-bf13-65d4165511b6; cd ..
+```
+
+> This step will recreate the environment (i.e., the Docker image) locally, fetching and installing any required dependencies in the process. If any external resources have become unavailable for any reason, the environment will fail to build.
+
+## Running the capsule to reproduce the results
+
+In your terminal, navigate to the folder where you've extracted the capsule and execute the following command, adjusting parameters as needed:
+```shell
+docker run --platform linux/amd64 --rm --gpus all \
+ --workdir /code \
+ --volume "$PWD/data":/data \
+ --volume "$PWD/code":/code \
+ --volume "$PWD/results":/results \
+ 6ef700ed-ff07-4a42-bf13-65d4165511b6 bash run
+```
diff --git a/code/Extrinsic_application_CVD_prediction.py b/code/Extrinsic_application_CVD_prediction.py
new file mode 100644
index 0000000..9617b75
--- /dev/null
+++ b/code/Extrinsic_application_CVD_prediction.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 30 21:59:06 2022
+
+@author: Jihye Moon
+"""
+import sys
+import os
+import pathlib
+
+import pandas as pd
+import numpy as np
+
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import StratifiedShuffleSplit as strata
+
+import lib.ML_models as ml
+sys.path.append('lib')
+import loading_literature_embedding as emb
+
+def data_split(X_train_index, X_test_index, X, y):
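+    # Split the held-out indices in half: the first half becomes the test set,
+    # the second half the validation set. Features are flattened to 2-D and
+    # standardized with statistics fitted on the training split only.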
+ valid_data = int(len(X_test_index)/2)
+ test_data = int(len(X_test_index))-valid_data
+
+ test = X_test_index[0:test_data]; valid = X_test_index[test_data:test_data+valid_data]
+
+ X_train = X[X_train_index]; X_test = X[test]; X_valid = X[valid]
+
+ y_train = y[X_train_index]
+ y_test = y[test]
+ y_valid = y[valid]
+
+ X_train = np.reshape(X_train, (X_train.shape[0], -1)); X_test = np.reshape(X_test, (X_test.shape[0], -1))
+ X_valid = np.reshape(X_valid, (X_valid.shape[0], -1))
+ y_train = np.squeeze(y_train); y_test = np.squeeze(y_test); y_valid = np.squeeze(y_valid)
+
+ scaler = StandardScaler()
+ scaler.fit(X_train)
+ X_train = scaler.transform(X_train); X_test = scaler.transform(X_test); X_valid = scaler.transform(X_valid)
+ return X_train, X_test, X_valid, y_train, y_test, y_valid
+
+def loading_variable_embedding(data_path):
+ var_symbol = list(pd.read_csv(data_path+'/variables_symbol.csv').drop(columns='Unnamed: 0')['0'])
+ var_name = list(pd.read_csv(data_path+'/variables_preprocessed_names.csv').drop(columns='Unnamed: 0')['0'])
+ tar_symbol = list(pd.read_csv(data_path+'/target_variables_symbol.csv').drop(columns='Unnamed: 0')['0'])
+ tar_name = list(pd.read_csv(data_path+'/target_variables_preprocessed_names.csv').drop(columns='Unnamed: 0')['0'])
+
+ variables_indexing={}; disease_variables_indexing={}
+
+ for i in range(len(var_name)):
+ variables_indexing[var_symbol[i]] = var_name[i]
+
+ for i in range(len(tar_name)):
+ disease_variables_indexing[tar_symbol[i]] = tar_name[i]
+
+ additional_dictionary = {'uricosurics':'uricosuric'}
+    # If a variable name is so unusual that it cannot be found in the embedding
+    # vocabulary, add a mapping for it here to avoid errors in the feature
+    # selection step.
+
+ embedding_list, index2variables, embedding, removal, removed_words = emb2simi.variable2embed(words_list, syn0norm, variables_indexing, additional_dictionary)
+
+ if removal==[]:
+ print(" === NO problem for your variables")
+ target_embedding_list, index2target, target_embedding, _, _ = emb2simi.variable2embed(words_list, syn0norm, disease_variables_indexing, additional_dictionary)
+
+ return embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, \
+ target_embedding_list, index2target, index2variables, target_embedding, embedding
+ else:
+ print(" === Check if there are errors for your variable names")
+ return 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+def CVD_Prediction_with_FS_DR(data_path, Xt, y):
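+    # 5-fold stratified CV. In each fold, our FS selects feature_size=128
+    # variables by matching variable names against the literature embedding,
+    # and our DR reduces all variables to 128 dimensions using the embedding;
+    # both feature sets go to the ML predictors via pr.run_save() (with the
+    # 'SMOTE' option).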
+ feature_size = 128; i=0
+ split_info = strata(n_splits=5, test_size=0.2, random_state=12)
+ total_FS_Pre=[]; total_FS_prob=[]
+ total_DR_pre=[]; total_DR_prob=[]
+ embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, target_embedding_list, index2target, index2variables, target_embedding, embedding = loading_variable_embedding(data_path)
+ for X_train_index, X_test_index in split_info.split(Xt.values, y):
+ result_dir = os.path.join(output_path +str(i))
+ pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True)
+ X_train, X_test, X_valid, y_train, y_test, y_valid = data_split(X_train_index, X_test_index, Xt.values, y)
+        pr.save_label(y_test, 'CVD_label', result_dir) # save y_test labels to evaluate CVD prediction performance for each fold
+ print("=== run Our feature selector --- our FS selected features via feature name , our FS uses same feature set for 5-fold cross validation. ")
+ embed_name = fs.Our_FS(emb2simi, str(i)+'rf_embedding_features', embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, embedding, target_embedding_list, index2target, index2variables, target_embedding, feature_size, result_dir)
+
+ print("=== run Our dimensionality reductor ")
+ A1, A2, A3 = dr.Our_DR(embedding, X_train, X_test, X_valid, feature_size)
+
+ print("=== Running with MLs with Feature Selection (Our FS)")
+        X2 = Xt[embed_name].values ### keep only the 128 variables selected by our FS
+ valid_data = int(len(X_test_index)/2); test_data = int(len(X_test_index))-valid_data
+ test = X_test_index[0:test_data]; valid = X_test_index[test_data:test_data+valid_data] # split test data
+ X_train2 = X2[X_train_index]; X_test2 = X2[test]; X_valid2 = X2[valid]
+
+ X_train2 = np.reshape(X_train2, (X_train2.shape[0], -1))
+ X_test2 = np.reshape(X_test2, (X_test2.shape[0], -1))
+ X_valid2 = np.reshape(X_valid2, (X_valid2.shape[0], -1))
+
+ scaler = StandardScaler()
+ scaler.fit(X_train2)
+ X_train2 = scaler.transform(X_train2); X_test2 = scaler.transform(X_test2); X_valid2 = scaler.transform(X_valid2)
+
+ Our_FS_total_prediction, Our_FS_total_prob = pr.run_save(X_train2, y_train, X_test2, y_test, X_valid2, y_valid, 'FS.embedding', 'SMOTE', feature_size, result_dir)
+ total_FS_Pre.append(Our_FS_total_prediction); total_FS_prob.append(Our_FS_total_prob)
+ print("=== Running MLs with Dimensionality Reduction (Our DR)")
+ Our_DR_total_prediction, Our_DR_total_prob = pr.run_save(A1, y_train, A2, y_test, A3, y_valid, 'DR.embedding', 'SMOTE', feature_size, result_dir)
+        total_DR_pre.append(Our_DR_total_prediction); total_DR_prob.append(Our_DR_total_prob)
+ i+=1
+    print('All results are saved in', output_path)
+ return total_FS_Pre, total_FS_prob, total_DR_pre, total_DR_prob
+
+data_path = '../data/Example'
+model_path = '../data/old_model'
+output_path = '../results/prediction/'
+
+fs = ml.feature_selectors()
+dr = ml.dimension_reducers()
+pr = ml.predictors()
+
+gene_name = '../data/gene_name_info/query_full_name'; gene_symb='../data/gene_name_info/query_symbol'
+emb2simi=emb.embedding_vector()
+
+words_list, index2word, syn0norm, _ = emb2simi.setting(model_path, gene_symb)
+
+Xt = pd.read_csv(data_path+'/Example_X.csv').drop(columns='Unnamed: 0')
+y = pd.read_csv(data_path+'/Example_y.csv').drop(columns='Unnamed: 0').values
+
+total_FS_Pre, total_FS_prob, total_DR_pre, total_DR_prob = CVD_Prediction_with_FS_DR(data_path, Xt, y)
diff --git a/code/LICENSE b/code/LICENSE
new file mode 100644
index 0000000..08320cf
--- /dev/null
+++ b/code/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Jihye Moon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/code/README.md b/code/README.md
new file mode 100644
index 0000000..c87cead
--- /dev/null
+++ b/code/README.md
@@ -0,0 +1,266 @@
+#### **A Literature Embedding Model for Cardiovascular Disease Prediction using Risk Factors, Symptoms, and Genotype Information**
+##### Authors: Jihye Moon, Hugo F. Posada-Quintero, and *Ki. H. Chon
+**Contact address**: ki.chon@uconn.edu (*Corresponding author), jihye.moon@uconn.edu (Q&A for code).
+(Accepted by Expert Systems with Applications on August 24, 2022)
+
+### Contents
+
+This capsule provides the implementation of 1) **literature data collection and preprocessing**, and 2) **literature embedding model training and evaluation**. The pre-trained literature embedding model identifies CVD risk factors and associated information for a given input query (e.g., stroke). Also, since our literature embedding model contains representations for CVD-related words, it can serve as a **Feature Selection (FS) or Dimensionality Reduction (DR) model on cohort data** for CVD prediction/classification tasks (extrinsic method). In our manuscript we used MESA cohort data consisting of 6,814 subjects and 564 variables. Since our cohort data requires permission, users must prepare their own cohort data to use the literature embedding model for FS or DR tasks. The cohort data must contain variables per subject along with the variable names. This capsule's guideline also provides a pipeline for FS and DR on input cohort data.
+
+1. [Introduction](#introduction)
+2. [Code Implementations and Guidelines](#guidelines)
+    0. [DEMO](#demo)
+    1. [DEMO A) CVD risk factors, genes, and associated information identifications](#demo1)
+    2. [DEMO B) All steps to build a literature embedding model (data collection ~ model training)](#demo2)
+ 3. [Reproduction DEMO](#default_demo)
+ 1. [Literature data collection](#collection)
+ 2. [Literature data preprocessing](#preprocessing)
+ 3. [Literature embedding model training](#training)
+ 4. [Literature embedding model evaluation](#evaluation)
+ 5. [FS and DR applications on cohort data](#applications)
+3. [Results](#results)
+4. [GitHub Source](#github)
+
+### 1. Introduction
+Accurate prediction of cardiovascular disease (CVD) requires multifaceted information consisting of not only a patient’s medical history but also genomic data, symptoms, lifestyle, and risk factors, which are often not incorporated into the decision-making process because the data are vast, difficult to obtain, and require complex algorithms. **Estimating CVD risk factors is now a significant goal for more accurate CVD prediction and treatment**.
+##### Previous work's limitation
+CVD risk factors can be identified from phenotype variables, genetic arrays, text, and image data. Several approaches have been introduced to identify CVD risk factors; they can be categorized as (1) cohort-based CVD risk factor identification and (2) literature-based CVD risk factor identification and information management. Category (1) enables objective validation of the identified risk factors using CVD patient data, but the number of available features is limited, which may restrict the identification of new CVD risk factors. Category (2) enables the management of significant risk factors using publicly available literature data; however, most such methods were not validated using CVD patient data. Hence, **it is critical to develop a novel method to collect information on risk factors, associated symptoms, and mechanisms, and it must be objectively validated using CVD patients to be relevant for better clinical diagnosis and treatment management.**
+##### Our proposed work
+In our paper, **we proposed a literature embedding model trained on literature data freely accessible online.** Our model enables the retrieval of CVD risk factors, associated information, and genes independently from population-based data. Even though it was trained only on literature, our model can select accurate CVD-related features from population-based cohort data when used as an FS or DR model, which leads to better CVD prediction.
+
+### 2. Code implementation and guidelines
+This section provides descriptions for [0. Demo](#demo) and details for [1. Literature data collection](#collection), [2. Literature data preprocessing](#preprocessing), [3. Literature embedding model training](#training), [4. Literature embedding model evaluation](#evaluation), and [5. FS and DR applications on cohort data](#applications).
+The DEMO subsection gives an overview of our code, and the other five subsections detail the code for each purpose.
+
+We prepared five main scripts, one per goal:
+ 1) step1_data_collection.py,
+ 2) step2_data_preprocessing.py,
+ 3) step3_literature_embedding_training.py,
+ 4) step4_CVD_risk_factor_identification.py,
+ 5) Extrinsic_application_CVD_prediction.py.
+
+We feed different inputs to each main code for each purpose. Details are described below.
+
+#### 2.0. DEMO
+
+We prepared three DEMOs:
+ 1) **DEMO A**: It provides **CVD risk factors, genes, and associated information identifications** using a pre-trained literature model.
+ 2) **DEMO B**: It provides all steps for **literature data collection**, **literature data preprocessing**, and **literature embedding model training and intrinsic evaluation (CVD risk factor identifications)**
+ 3) **Reproduction DEMO**: It shows DEMO A's results and provides **literature embedding model training and evaluation steps**.
+
+On the Code Ocean platform, DEMO A is the default.
+
+##### 2.0.1. DEMO A) CVD risk factors, genes, and associated information identifications
+To run DEMO A, run the following command:
+~~~~ {.sourceCode .shell}
+./run.sh 'demo_a'
+~~~~
+
+The command imports our pre-trained literature embedding model at EMBEDDING_PATH='../data/old_model' and captures CVD risk factors and associated information for three queries ('stroke', 'atrial fibrillation', 'ventricular fibrillation').
+The input query-related risk factors, associated information, and gene names will be displayed and saved in STEP4_OUTPUT_PATH='../results/demo_a'.
+
+##### 2.0.2. DEMO B) All steps to build a literature embedding model (data collection ~ model training)
+To run DEMO B, run the following command on **your local computer**:
+~~~~ {.sourceCode .shell}
+./run.sh 'demo_b'
+~~~~
+DEMO B runs all steps: literature data collection and preprocessing, followed by literature embedding model training and evaluation for CVD risk factor identification. By default, DEMO B collects only a limited amount of literature data; to collect all data, set NUM_WORD_BASED_DATA=0 and NUM_GENE_BASED_DATA=0.
+
+~~~~ {.sourceCode .shell}
+./run.sh 'demo_b'
+ echo 'demo b -- '
+ QUERY_WORD='zinc' ## you can define query word to collect literature data
+    NUM_WORD_BASED_DATA=500000 #if NUM_WORD_BASED_DATA=0, it collects all possible keyword-based literature
+ NUM_GENE_BASED_DATA=100 #if NUM_GENE_BASED_DATA=0, it collects all possible gene-related literature
+ BASE_PATH='../results/'
+ DATA_COLLECTION_PATH='../results/demo_b'
+ PREPROCESSEING_PATH='../results/demo_b'
+ EMBEDDING_NAME='pre_trained_demo'
+ EMBEDDING_PATH='../results/pre_trained_demo'
+ EPOCH=2
+ STEP4_OUTPUT_PATH='../results/CVD_searches'
+
+ python -u step1_data_collection.py $QUERY_WORD $NUM_WORD_BASED_DATA $NUM_GENE_BASED_DATA $DATA_COLLECTION_PATH
+ python -u step2_data_preprocessing.py $DATA_COLLECTION_PATH $PREPROCESSEING_PATH
+ python -u step3_literature_embedding_training.py $PREPROCESSEING_PATH $EPOCH $EMBEDDING_NAME
+ python -u step4_CVD_risk_factor_identification.py $EMBEDDING_NAME $STEP4_OUTPUT_PATH
+~~~~
+
+This DEMO B generates the collected literature data, the pre-processed literature data, and the trained literature embedding model at './results'.
+
+##### 2.0.3. Reproduction DEMO
+
+The reproduction DEMO is run with the following command:
+~~~~ {.sourceCode .shell}
+./run.sh
+
+or
+
+./run.sh 'demo_r'
+~~~~
+
+This reproduction DEMO shows 1) CVD risk factor identifications using our paper's pre-trained literature model and 2) all steps for a literature model training process and risk factor searches using the newly pre-trained model.
+For 2), we prepared a pre-collected literature data set at PREPROCESSEING_PATH='../data/old_preprocessed_data'.
+
+#### 2.1. Literature data collection
+This subsection explains the details of step1_data_collection.py. The script receives four inputs:
+
+~~~~ {.sourceCode .shell}
+ QUERY_WORD='zinc'
+ NUM_WORD_BASED_DATA=0
+ NUM_GENE_BASED_DATA=0
+    DATA_COLLECTION_PATH='../results/$USER_DEFINED'
+
+ python -u step1_data_collection.py $QUERY_WORD $NUM_WORD_BASED_DATA $NUM_GENE_BASED_DATA $DATA_COLLECTION_PATH
+~~~~
+
+In our manuscript, we collected 16k published articles from PubMed using search keywords consisting of a word (“heart”) and human gene names, then trained a literature embedding model using the collected abstracts. Table 1 below shows examples of abstracts collected by this code.
+
+*Table 1. An example of collected abstracts*
+|Document type|Keyword|Example|
+|:---|:---|:---|
+|Keyword-based Literature from PubMed|Heart|Waist-to-hip ratio (WHR) is a strong predictor of mortality in patients with **heart** failure (HF). Left ventricular diastolic filling function has predictable maturational progression, with significant differences in the intraventricular pressure difference between infants from birth to 2 years. |
+|Gene Name-based Literature from PubMed|HMGA1|**HMGA1** has been shown to regulate genes involved with systemic inflammatory processes. We hypothesized that **HMGA1** is important in the function of mesenchymal stromal cells, which are known to modulate inflammatory responses due to sepsis.|
+
+The number of collected documents can be controlled:
+~~~
+If NUM_WORD_BASED_DATA==0:
+    It collects all documents for $QUERY_WORD.
+elif NUM_WORD_BASED_DATA==100000:
+    It collects 100,000 documents for $QUERY_WORD.
+
+If NUM_GENE_BASED_DATA==0:
+    It collects all documents for all gene names.
+elif NUM_GENE_BASED_DATA==10:
+    It collects documents for 10*NUM_GENE_BASED_DATA gene names.
+~~~
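+
+As a reference, the sketch below shows the kind of PubMed request that step1_data_collection.py issues, assuming Biopython's Entrez module (bundled under code/lib); the real script adds batching, retries, and the gene-name-based queries, so treat this only as an illustration:
+
+~~~~ {.sourceCode .python}
+from Bio import Entrez
+
+Entrez.email = "your_email@example.com"  # required by NCBI; use your own address
+
+# Find PubMed IDs matching the query word, then fetch their abstracts.
+handle = Entrez.esearch(db="pubmed", term="zinc", retmax=100)
+id_list = Entrez.read(handle)["IdList"]
+handle.close()
+
+handle = Entrez.efetch(db="pubmed", id=",".join(id_list),
+                       rettype="abstract", retmode="text")
+abstracts = handle.read()
+handle.close()
+~~~~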
+
+#### 2.2. Literature data preprocessing
+This subsection explains the details of step2_data_preprocessing.py. The script receives two inputs:
+~~~
+    DATA_COLLECTION_PATH='../results/$USER_DEFINED'
+    PREPROCESSEING_PATH='../results/$USER_DEFINED'
+
+ python -u step2_data_preprocessing.py $DATA_COLLECTION_PATH $PREPROCESSEING_PATH
+~~~
+
+*Table 2. An example of text preprocessing*
+| Document | Gene Name | Sentence |
+|:---|:---|:---|
+| Original | HMGA1 | Mesenchymal stromal cells expressing a dominant-negative high mobility group A1 transgene exhibit improved function during sepsis. |
+| Pre-processed | #HMGA1 | mesenchymal stromal cells expressing dominant-negative high mobility group a# transgene exhibit improved function sepsis |
+
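+A minimal sketch of this style of preprocessing (lowercasing, stop-word removal, digit masking, and '#'-prefixed gene symbols); the authoritative rules live in step2_data_preprocessing.py and may differ in detail:
+
+~~~~ {.sourceCode .python}
+import re
+
+STOPWORDS = {"a", "an", "the", "of", "to", "in", "and", "during", "with"}
+
+def preprocess(sentence, gene_symbols):
+    tokens = []
+    for word in sentence.split():
+        word = word.strip(".,;:()")
+        if word in gene_symbols:          # mark gene names with '#'
+            tokens.append("#" + word)
+            continue
+        word = word.lower()
+        if word in STOPWORDS:             # drop common stop words
+            continue
+        word = re.sub(r"\d+", "#", word)  # mask digits, e.g. 'a1' -> 'a#'
+        tokens.append(word)
+    return " ".join(tokens)
+
+print(preprocess("HMGA1 transgene exhibit improved function during sepsis.", {"HMGA1"}))
+~~~~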
+
+#### 2.3. Literature embedding model training
+This subsection explains the details of step3_literature_embedding_training.py. The script receives three inputs:
+~~~
+ EMBEDDING_PATH='../results/$MODEL_PATH'
+    EPOCH=2 # sets the number of epochs for literature embedding model training
+
+ python -u step3_literature_embedding_training.py $PREPROCESSEING_PATH $EPOCH $EMBEDDING_PATH
+~~~
+EMBEDDING_PATH is the output path for the embedding model, and EPOCH is the number of training epochs; EPOCH=10 is recommended.
+Our literature embedding model learns literature representations in the following three steps. To train on 'heart'-related literature, the model uses a basic skip-gram structure, as shown in Fig. 1(a). To train on gene-name-related literature, the model uses the structures in Fig. 1(b) and (c).
+
+Fig. 1. Skip-gram structure of Word2vec
+| (a) step 1| (b) step 2| (c) step 3|
+| :--- | :--- | :--- |
+|Skip-gram structure to predict context words using a center word in the same document|Our proposed structure (1) to predict captured document's word contexts with gene name that used as search query |Our proposed structure (2) to predict gene-name-associated words in captured document using gene name|
+
+Users can set hyper-parameters in step3_literature_embedding_training.py:
+~~~~ {step3_literature_embedding_training.py}
+    window_size = 2 # The number of context words per center word for literature model training. Details are in our manuscript.
+    min_count = 5 # Words appearing fewer than min_count = 5 times in the corpus are excluded.
+    min_size = 2 # Words with length <= min_size = 2 characters are excluded.
+    dimension = 128 # Embedding model's dimension
+    num_sampled = 16 # Negative sampling parameter
+    batch_size = 256 # Training batch size
+~~~~
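+
+For intuition only, these hyper-parameters map onto an off-the-shelf skip-gram trainer such as gensim (gensim is not a dependency of this capsule, and our model additionally trains the gene-name structures of Fig. 1(b) and (c)):
+
+~~~~ {.sourceCode .python}
+from gensim.models import Word2Vec  # illustration only; not used by this capsule
+
+# A toy corpus; each inner list is one preprocessed sentence.
+sentences = [["waist", "hip", "ratio", "predicts", "mortality", "heart", "failure"]] * 10
+
+model = Word2Vec(
+    sentences,
+    vector_size=128,  # dimension
+    window=2,         # window_size
+    min_count=5,      # min_count
+    negative=16,      # num_sampled
+    sg=1,             # skip-gram, as in Fig. 1(a)
+)
+~~~~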
+
+#### 2.4. Literature embedding model evaluation (CVD risk factor searches)
+This subsection explains the details of step4_CVD_risk_factor_identification.py. The script receives two inputs:
+
+~~~~ {.sourceCode .shell}
+ EMBEDDING_PATH='../results/$MODEL_PATH'
+ STEP4_OUTPUT_PATH='../results/$SEARCH_PATH'
+ python -u step4_CVD_risk_factor_identification.py $EMBEDDING_PATH $STEP4_OUTPUT_PATH
+~~~~
+
+Users can set their own queries in step4_CVD_risk_factor_identification.py as shown below:
+
+~~~~ {.sourceCode .python}
+queries = ['stroke', 'atrial fibrillation', 'ventricular fibrillation'] # put your own queries in the list
+~~~~
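+
+Internally, a query is answered by cosine similarity against the embedding matrix. Below is a minimal sketch reusing the words_list/syn0norm names returned by emb2simi.setting(), assuming syn0norm rows are unit-normalized (as the name suggests); the actual ranking logic lives in lib/loading_literature_embedding.py:
+
+~~~~ {.sourceCode .python}
+import numpy as np
+
+def nearest_words(query, words_list, syn0norm, topn=10):
+    """Return the topn vocabulary words closest to the query word."""
+    idx = words_list.index(query)
+    sims = syn0norm @ syn0norm[idx]         # cosine similarity (rows are unit-norm)
+    best = np.argsort(-sims)[1 : topn + 1]  # skip the query word itself
+    return [(words_list[i], float(sims[i])) for i in best]
+~~~~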
+
+#### 2.5. FS and DR applications on cohort data
+This subsection explains the details of Extrinsic_application_CVD_prediction.py. The script has three inputs, defined in Extrinsic_application_CVD_prediction.py:
+ * data_path = '../data/Example'
+ * model_path = '../data/old_model'
+ * output_path = '../results/prediction/'
+
+Users must prepare their cohort data, the pre-trained embedding model path, and the output path. After running Extrinsic_application_CVD_prediction.py on their cohort data, users obtain the prediction results and labels produced by our FS and DR processes for each fold at output_path. CVD prediction performance can then be evaluated using performance_metrics.metric(label, prediction_results) in lib/performance_metrics.py. All other FS and DR methods (Random Forest, Decision Tree, H2FS, PCA, and UMAP) are in lib/ML_models.py.
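+
+For example, one fold could be scored along these lines (a sketch; the file names are hypothetical placeholders for whatever run_save()/save_label() wrote into your output_path):
+
+~~~~ {.sourceCode .python}
+import numpy as np
+import lib.performance_metrics as performance_metrics
+
+# Hypothetical file names; adapt to the fold outputs in your output_path.
+label = np.loadtxt("../results/prediction/0/CVD_label.txt")
+prediction_results = np.loadtxt("../results/prediction/0/predictions.txt")
+
+performance_metrics.metric(label, prediction_results)
+~~~~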
+
+##### Cohort data format #####
+
+Users must prepare cohort data with variable names. To show the input data format, we generated example data Example_X (variables per subject) and Example_y (CVD labels per subject) using lib/ExpCohort_Generator.py; see that file for details.
+
+The **format** of the input cohort data (Example_X) should be as below:
+
+*Table 3. The data format example generated by ExpCohort_Generator.py (variable)*
+| Subject | bca | nit | fhha | sbld | pulrate |
+|-----|----------|----------|----------|----------|----------|
+| 0 | 0.296735 | 0.292552 | 0.074269 | 0.886255 | 0.235104 |
+| 1 | 0.699152 | 0.626459 | 0.917815 | 0.988134 | 0.167721 |
+| 2 | 0.484408 | 0.327285 | 0.351393 | 0.946728 | 0.366808 |
+| 3 | 0.970385 | 0.811354 | 0.068369 | 0.246754 | 0.198345 |
+| .. | ... | ... | ... | ... | ... |
+| N | 0.905146 | 0.855485 | 0.657306 | 0.385825 | 0.957396 |
+
+The **format** of CVD label per subject (Example_y) should be like below:
+
+*Table 4. The cohort data format example generated by ExpCohort_Generator.py (label)*
+| Subject | CVD (Yes=1, No=0) |
+|:---|:---|
+| 1 | 0 |
+| 2 | 1 |
+| 3 | 1 |
+| ... | ... |
+| N | 0 |
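+
+A short sketch of how such files can be assembled with pandas (lib/ExpCohort_Generator.py is the reference implementation; the variable names below are placeholders):
+
+~~~~ {.sourceCode .python}
+import numpy as np
+import pandas as pd
+
+rng = np.random.default_rng(0)
+variables = ["bca", "nit", "fhha", "sbld", "pulrate"]
+
+# 100 subjects: random variable values and binary CVD labels.
+Example_X = pd.DataFrame(rng.random((100, len(variables))), columns=variables)
+Example_y = pd.DataFrame({"CVD": rng.integers(0, 2, size=100)})
+
+Example_X.to_csv("Example_X.csv")  # to_csv keeps the index column that the
+Example_y.to_csv("Example_y.csv")  # main script drops as 'Unnamed: 0'
+~~~~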
+
+### 3. Results
+
+In our manuscript, we used three queries ('stroke', 'atrial fibrillation', 'ventricular fibrillation') for CVD risk factor identification. We analyzed whether the captured words and genes were correctly identified as risk factors and associated symptoms for the input query words. Our model accurately (average accuracy of >96%) captured associated risk factors, symptoms, and genes for a given input query word. Details are described in our published manuscript.
+
+We also used our embedding model for FS and DR tasks on cohort data for CVD prediction. Our FS and DR methods provide better performance and the fastest computation time compared with other popular FS and DR methods: Random Forest, Decision Tree, H2FS, UMAP, and PCA.
+
+Our model has the potential to facilitate the collation of multifaceted information from vast, publicly available data, so that risk factors and symptoms can be identified efficiently and accurately, supporting better-informed decisions for CVD prediction and treatment.
+
+### 4. GitHub Source
+
+This project is also hosted on GitHub ([link](https://github.com/JihyeMooon/CVD_literature_embedding)) and is actively developed.
+
+### Error note
+In the literature data collection process, some errors can occur due to network connections.
+
+If you get an error at the 25/33 point of 'collecting_doc_using_word_based_query', like below:
+~~~
+ 25 / 33
+ Going to download records from 1250001 to 1260000
+ Going to download records from 1260001 to 1270000
+
+ raise HTTPError(req.full_url, code, msg, hdrs, fp)
+ or IncompleteRead: IncompleteRead(20458171 bytes read)
+~~~
+Then run collecting_doc_using_word_based_query again, with 'w2d_starting_point = 25'.
+
+If you have problems with 'collecting_doc_using_gene_based_query', like below:
+~~~
+ Example: if we get error at 5 / 2634
+~~~
+Then run collecting_doc_using_gene_based_query again, with 'g2d_starting_point = 5'.
+
+
+
+
\ No newline at end of file
diff --git a/code/gene_extraction.py b/code/gene_extraction.py
new file mode 100644
index 0000000..c90b3b9
--- /dev/null
+++ b/code/gene_extraction.py
@@ -0,0 +1,134 @@
+from Bio import Entrez
+from Bio import SeqIO
+import time
+from urllib.error import HTTPError
+from http.client import IncompleteRead
+
+# Set your email address for Entrez
+Entrez.email = "lrmercadod@gmail.com"
+Entrez.api_key = "f095f0c0aad9480d90ee0b869acb43670d08"
+
+# Search for human genes in the Gene database
+handle = Entrez.esearch(db="gene", term="Homo sapiens[Organism]", retmax=10000000)
+human_record = Entrez.read(handle)
+handle.close()
+
+# Search for human ZIP11 gene
+handle = Entrez.esearch(db="gene", term="ZIP11 AND Homo sapiens[Organism]", retmax=10000000)
+human_zip11_record = Entrez.read(handle)
+handle.close()
+
+# Search for mouse ZIP11 gene
+handle = Entrez.esearch(db="gene", term="ZIP11 AND Mus musculus[Organism]", retmax=10000000)
+mouse_zip11_record = Entrez.read(handle)
+handle.close()
+
+# Get the list of gene IDs
+human_gene_ids = human_record["IdList"]
+human_zip11_ids = human_zip11_record["IdList"]
+mouse_zip11_ids = mouse_zip11_record["IdList"]
+
+# Combine all gene IDs
+gene_ids = human_gene_ids + human_zip11_ids + mouse_zip11_ids
+
+# Open the output files
+symbol_file = open("query_symbol.txt", "a", encoding="utf-8") # Append mode
+id_file = open("query_ids.txt", "a", encoding="utf-8") # Append mode
+full_name_file = open("query_full_name.txt", "a", encoding="utf-8") # Append mode
+snp_file = open("query_snps.txt", "a", encoding="utf-8") # Append mode
+error_file = open("error_log.txt", "a", encoding="utf-8") # Append mode for error logging
+
+max_retries = 5
+retry_delay = 2
+batch_size = 500
+batch_delay = 2
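+# NCBI E-utilities allow about 3 requests per second without an API key and
+# about 10 per second with one; the retry and batch delays above help keep
+# this script within those limits.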
+
+# Load the last processed batch from the checkpoint file
+checkpoint_file = "checkpoint.txt"
+try:
+ with open(checkpoint_file, "r") as file:
+ last_processed_batch = int(file.read())
+except FileNotFoundError:
+ last_processed_batch = 0
+
+# Iterate over the gene IDs in batches and fetch the gene information
+for i in range(last_processed_batch * batch_size, len(gene_ids), batch_size):
+ batch_ids = gene_ids[i:i+batch_size]
+
+ for gene_id in batch_ids:
+ retries = 0
+ while retries < max_retries:
+ try:
+ handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
+ gene_record = Entrez.read(handle)
+ handle.close()
+ break
+ except (HTTPError, IncompleteRead) as e:
+ print(f"Error: {str(e)}. Retrying...")
+ retries += 1
+ time.sleep(retry_delay)
+ else:
+ print(f"Failed to fetch gene information for gene ID: {gene_id}")
+ continue
+
+ # Extract the relevant information
+ if "Entrezgene_gene" in gene_record[0] and "Gene-ref" in gene_record[0]["Entrezgene_gene"]:
+ gene_ref = gene_record[0]["Entrezgene_gene"]["Gene-ref"]
+ gene_symbol = gene_ref.get("Gene-ref_locus", "")
+ gene_full_name = gene_ref.get("Gene-ref_desc", "")
+ else:
+ gene_symbol = ""
+ gene_full_name = ""
+
+ # Retrieve SNP information for the gene
+ retries = 0
+ while retries < max_retries:
+ try:
+ handle = Entrez.elink(dbfrom="gene", db="snp", id=gene_id)
+ snp_record = Entrez.read(handle)
+ handle.close()
+
+ if snp_record[0]["LinkSetDb"]:
+ snp_ids = [link["Id"] for link in snp_record[0]["LinkSetDb"][0]["Link"]]
+ for snp_id in snp_ids:
+ try:
+ snp_file.write(str(snp_id) + "\n")
+ except OSError as e:
+ error_file.write(f"Error writing SNP ID {snp_id} for gene ID {gene_id}: {str(e)}\n")
+ else:
+ try:
+ snp_file.write("N/A\n")
+ except OSError as e:
+ error_file.write(f"Error writing 'N/A' to snp_file for gene ID {gene_id}: {str(e)}\n")
+ break
+ except (IndexError, RuntimeError, IncompleteRead) as e:
+ print(f"Error retrieving SNP information for gene ID: {gene_id}. Retrying...")
+ retries += 1
+ time.sleep(retry_delay)
+ else:
+ print(f"Failed to retrieve SNP information for gene ID: {gene_id}")
+ try:
+ snp_file.write("N/A\n")
+ except OSError as e:
+ error_file.write(f"Error writing 'N/A' to snp_file for gene ID {gene_id}: {str(e)}\n")
+
+ # Write the information to the respective files
+ symbol_file.write(gene_symbol + "\n")
+ id_file.write(gene_id + "\n")
+ full_name_file.write(gene_full_name + "\n")
+
+ # Update the checkpoint file with the last processed batch
+ with open(checkpoint_file, "w") as file:
+ file.write(str(i // batch_size))
+
+ print(f"Processed batch {i//batch_size + 1} of {len(gene_ids)//batch_size + 1}")
+ time.sleep(batch_delay)
+
+# Close the output files
+symbol_file.close()
+id_file.close()
+full_name_file.close()
+snp_file.close()
+error_file.close()
+
+print("Gene extraction completed.")
\ No newline at end of file
diff --git a/code/lib/Bio/Affy/CelFile.py b/code/lib/Bio/Affy/CelFile.py
new file mode 100644
index 0000000..ee95b0d
--- /dev/null
+++ b/code/lib/Bio/Affy/CelFile.py
@@ -0,0 +1,502 @@
+# Copyright 2004 by Harry Zuzan. All rights reserved.
+# Copyright 2016 by Adam Kurkiewicz. All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Reading information from Affymetrix CEL files version 3 and 4."""
+
+
+import struct
+
+try:
+ import numpy
+except ImportError:
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError(
+ "Install NumPy if you want to use Bio.Affy.CelFile"
+ ) from None
+
+
+class ParserError(ValueError):
+ """Affymetrix parser error."""
+
+ def __init__(self, *args):
+ """Initialise class."""
+ super().__init__(*args)
+
+
+class Record:
+ """Stores the information in a cel file.
+
+ Example usage:
+
+ >>> from Bio.Affy import CelFile
+ >>> with open("Affy/affy_v3_example.CEL") as handle:
+ ... c = CelFile.read(handle)
+ ...
+ >>> print(c.ncols, c.nrows)
+ 5 5
+ >>> print(c.intensities)
+ [[ 234. 170. 22177. 164. 22104.]
+ [ 188. 188. 21871. 168. 21883.]
+ [ 188. 193. 21455. 198. 21300.]
+ [ 188. 182. 21438. 188. 20945.]
+ [ 193. 20370. 174. 20605. 168.]]
+ >>> print(c.stdevs)
+ [[ 24. 34.5 2669. 19.7 3661.2]
+ [ 29.8 29.8 2795.9 67.9 2792.4]
+ [ 29.8 88.7 2976.5 62. 2914.5]
+ [ 29.8 76.2 2759.5 49.2 2762. ]
+ [ 38.8 2611.8 26.6 2810.7 24.1]]
+ >>> print(c.npix)
+ [[25 25 25 25 25]
+ [25 25 25 25 25]
+ [25 25 25 25 25]
+ [25 25 25 25 25]
+ [25 25 25 25 25]]
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.version = None
+ self.GridCornerUL = None
+ self.GridCornerUR = None
+ self.GridCornerLR = None
+ self.GridCornerLL = None
+ self.DatHeader = None
+ self.Algorithm = None
+ self.AlgorithmParameters = None
+ self.NumberCells = None
+ self.intensities = None
+ self.stdevs = None
+ self.npix = None
+ self.nrows = None
+ self.ncols = None
+ self.nmask = None
+ self.mask = None
+ self.noutliers = None
+ self.outliers = None
+ self.modified = None
+
+
+def read(handle, version=None):
+ """Read Affymetrix CEL file and return Record object.
+
+ CEL files format versions 3 and 4 are supported.
+ Please specify the CEL file format as 3 or 4 if known for the version
+ argument. If the version number is not specified, the parser will attempt
+ to detect the version from the file contents.
+
+ The Record object returned by this function stores the intensities from
+ the CEL file in record.intensities.
+    Currently, record.mask and record.outliers are not set when parsing
+ version 4 CEL files.
+
+ Example Usage:
+
+ >>> from Bio.Affy import CelFile
+ >>> with open("Affy/affy_v3_example.CEL") as handle:
+ ... record = CelFile.read(handle)
+ ...
+ >>> record.version == 3
+ True
+ >>> print("%i by %i array" % record.intensities.shape)
+ 5 by 5 array
+
+ >>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
+ ... record = CelFile.read(handle, version=4)
+ ...
+ >>> record.version == 4
+ True
+ >>> print("%i by %i array" % record.intensities.shape)
+ 5 by 5 array
+
+ """
+ try:
+ data = handle.read(0)
+ except AttributeError:
+ raise ValueError("handle should be a file handle") from None
+ data = handle.read(4)
+ if not data:
+ raise ValueError("Empty file.")
+ if data == b"[CEL":
+ raise ValueError("CEL file in version 3 format should be opened in text mode")
+ if data == "[CEL":
+ # Version 3 format. Continue to read the header here before passing
+ # control to _read_v3 to avoid having to seek to the beginning of
+ # the file.
+ data += next(handle)
+ if data.strip() != "[CEL]":
+ raise ValueError("Failed to parse Affy Version 3 CEL file.")
+ line = next(handle)
+ keyword, value = line.split("=", 1)
+ if keyword != "Version":
+ raise ValueError("Failed to parse Affy Version 3 CEL file.")
+ version = int(value)
+ if version != 3:
+ raise ValueError("Incorrect version number in Affy Version 3 CEL file.")
+ return _read_v3(handle)
+ try:
+        magicNumber = struct.unpack("<i", data)[0]
+
+            for atom in atom_dict:
+                if atom_dict[atom] > max_size:
+ max_atoms = [atom]
+ max_size = atom_dict[atom]
+ elif atom_dict[atom] == max_size:
+ max_atoms.append(atom)
+
+ if require_multiple and num_atoms == 1:
+ consensus += ambiguous
+ elif (len(max_atoms) == 1) and (
+ (float(max_size) / float(num_atoms)) >= threshold
+ ):
+ consensus += max_atoms[0]
+ else:
+ consensus += ambiguous
+
+ return Seq(consensus)
+
+ def gap_consensus(self, threshold=0.7, ambiguous="X", require_multiple=False):
+ """Output a fast consensus sequence of the alignment, allowing gaps.
+
+ Same as dumb_consensus(), but allows gap on the output.
+
+ Things to do:
+ - Let the user define that with only one gap, the result
+ character in consensus is gap.
+ - Let the user select gap character, now
+ it takes the same as input.
+
+ """
+ consensus = ""
+
+ # find the length of the consensus we are creating
+ con_len = self.alignment.get_alignment_length()
+
+ # go through each seq item
+ for n in range(con_len):
+ # keep track of the counts of the different atoms we get
+ atom_dict = {}
+ num_atoms = 0
+
+ for record in self.alignment:
+ # make sure we haven't run past the end of any sequences
+ # if they are of different lengths
+ if n < len(record.seq):
+ if record.seq[n] not in atom_dict:
+ atom_dict[record.seq[n]] = 1
+ else:
+ atom_dict[record.seq[n]] += 1
+
+ num_atoms += 1
+
+ max_atoms = []
+ max_size = 0
+
+ for atom in atom_dict:
+ if atom_dict[atom] > max_size:
+ max_atoms = [atom]
+ max_size = atom_dict[atom]
+ elif atom_dict[atom] == max_size:
+ max_atoms.append(atom)
+
+ if require_multiple and num_atoms == 1:
+ consensus += ambiguous
+ elif (len(max_atoms) == 1) and (
+ (float(max_size) / float(num_atoms)) >= threshold
+ ):
+ consensus += max_atoms[0]
+ else:
+ consensus += ambiguous
+
+ return Seq(consensus)
+
+ def replacement_dictionary(self, skip_chars=None, letters=None):
+ """Generate a replacement dictionary to plug into a substitution matrix.
+
+ This should look at an alignment, and be able to generate the number
+ of substitutions of different residues for each other in the
+ aligned object.
+
+ Will then return a dictionary with this information::
+
+ {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
+
+ This also treats weighted sequences. The following example shows how
+ we calculate the replacement dictionary. Given the following
+ multiple sequence alignment::
+
+ GTATC 0.5
+ AT--C 0.8
+ CTGTC 1.0
+
+ For the first column we have::
+
+ ('A', 'G') : 0.5 * 0.8 = 0.4
+ ('C', 'G') : 0.5 * 1.0 = 0.5
+ ('A', 'C') : 0.8 * 1.0 = 0.8
+
+ We then continue this for all of the columns in the alignment, summing
+ the information for each substitution in each column, until we end
+ up with the replacement dictionary.
+
+ Arguments:
+ - skip_chars - Not used; setting it to anything other than None
+ will raise a ValueError
+         - letters - An iterable (e.g. a string or list) of characters to include.
+ """
+ if skip_chars is not None:
+ raise ValueError(
+ "argument skip_chars has been deprecated; instead, please use 'letters' to specify the characters you want to include"
+ )
+ rep_dict = {(letter1, letter2): 0 for letter1 in letters for letter2 in letters}
+
+ # iterate through each record
+ for rec_num1 in range(len(self.alignment)):
+ # iterate through each record from one beyond the current record
+ # to the end of the list of records
+ for rec_num2 in range(rec_num1 + 1, len(self.alignment)):
+ # for each pair of records, compare the sequences and add
+ # the pertinent info to the dictionary
+ self._pair_replacement(
+ self.alignment[rec_num1].seq,
+ self.alignment[rec_num2].seq,
+ self.alignment[rec_num1].annotations.get("weight", 1.0),
+ self.alignment[rec_num2].annotations.get("weight", 1.0),
+ rep_dict,
+ letters,
+ )
+
+ return rep_dict
+
+ def _pair_replacement(self, seq1, seq2, weight1, weight2, dictionary, letters):
+ """Compare two sequences and generate info on the replacements seen (PRIVATE).
+
+ Arguments:
+ - seq1, seq2 - The two sequences to compare.
+ - weight1, weight2 - The relative weights of seq1 and seq2.
+ - dictionary - The dictionary containing the starting replacement
+ info that we will modify.
+ - letters - A list of characters to include when calculating replacements.
+
+ """
+ # loop through each residue in the sequences
+ for residue1, residue2 in zip(seq1, seq2):
+ if residue1 in letters and residue2 in letters:
+ dictionary[(residue1, residue2)] += weight1 * weight2
+
+ def _get_all_letters(self):
+ """Return a string containing the expected letters in the alignment (PRIVATE)."""
+ set_letters = set()
+ for record in self.alignment:
+ set_letters.update(record.seq)
+ list_letters = sorted(set_letters)
+ all_letters = "".join(list_letters)
+ return all_letters
+
+ def pos_specific_score_matrix(self, axis_seq=None, chars_to_ignore=None):
+ """Create a position specific score matrix object for the alignment.
+
+ This creates a position specific score matrix (pssm) which is an
+ alternative method to look at a consensus sequence.
+
+ Arguments:
+ - chars_to_ignore - A list of all characters not to include in
+ the pssm.
+ - axis_seq - An optional argument specifying the sequence to
+ put on the axis of the PSSM. This should be a Seq object. If nothing
+ is specified, the consensus sequence, calculated with default
+ parameters, will be used.
+
+ Returns:
+ - A PSSM (position specific score matrix) object.
+
+ """
+ # determine all of the letters we have to deal with
+ all_letters = self._get_all_letters()
+ assert all_letters
+
+ if chars_to_ignore is None:
+ chars_to_ignore = []
+ if not isinstance(chars_to_ignore, list):
+ raise TypeError("chars_to_ignore should be a list.")
+
+ gap_char = "-"
+ chars_to_ignore.append(gap_char)
+
+ for char in chars_to_ignore:
+ all_letters = all_letters.replace(char, "")
+
+ if axis_seq:
+ left_seq = axis_seq
+ assert len(axis_seq) == self.alignment.get_alignment_length()
+ else:
+ left_seq = self.dumb_consensus()
+
+ pssm_info = []
+ # now start looping through all of the sequences and getting info
+ for residue_num in range(len(left_seq)):
+ score_dict = dict.fromkeys(all_letters, 0)
+ for record in self.alignment:
+ try:
+ this_residue = record.seq[residue_num]
+ # if we hit an index error we've run out of sequence and
+ # should not add new residues
+ except IndexError:
+ this_residue = None
+
+ if this_residue and this_residue not in chars_to_ignore:
+ weight = record.annotations.get("weight", 1.0)
+ try:
+ score_dict[this_residue] += weight
+ except KeyError:
+ raise ValueError(
+ "Residue %s not found" % this_residue
+ ) from None
+
+ pssm_info.append((left_seq[residue_num], score_dict))
+
+ return PSSM(pssm_info)
+
+ def information_content(
+ self,
+ start=0,
+ end=None,
+ e_freq_table=None,
+ log_base=2,
+ chars_to_ignore=None,
+ pseudo_count=0,
+ ):
+ """Calculate the information content for each residue along an alignment.
+
+ Arguments:
+         - start, end - The starting and ending points to calculate the
+ information content. These points should be relative to the first
+ sequence in the alignment, starting at zero (ie. even if the 'real'
+ first position in the seq is 203 in the initial sequence, for
+ the info content, we need to use zero). This defaults to the entire
+ length of the first sequence.
+ - e_freq_table - A dictionary specifying the expected frequencies
+ for each letter (e.g. {'G' : 0.4, 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}).
+ Gap characters should not be included, since these should not have
+ expected frequencies.
+ - log_base - The base of the logarithm to use in calculating the
+ information content. This defaults to 2 so the info is in bits.
+ - chars_to_ignore - A listing of characters which should be ignored
+ in calculating the info content. Defaults to none.
+
+ Returns:
+ - A number representing the info content for the specified region.
+
+ Please see the Biopython manual for more information on how information
+ content is calculated.
+
+ """
+ # if no end was specified, then we default to the end of the sequence
+ if end is None:
+ end = len(self.alignment[0].seq)
+ if chars_to_ignore is None:
+ chars_to_ignore = []
+
+ if start < 0 or end > len(self.alignment[0].seq):
+ raise ValueError(
+ "Start (%s) and end (%s) are not in the range %s to %s"
+ % (start, end, 0, len(self.alignment[0].seq))
+ )
+ # determine random expected frequencies, if necessary
+ random_expected = None
+ # determine all of the letters we have to deal with
+ all_letters = self._get_all_letters()
+ for char in chars_to_ignore:
+ all_letters = all_letters.replace(char, "")
+
+ info_content = {}
+ for residue_num in range(start, end):
+ freq_dict = self._get_letter_freqs(
+ residue_num,
+ self.alignment,
+ all_letters,
+ chars_to_ignore,
+ pseudo_count,
+ e_freq_table,
+ random_expected,
+ )
+ # print(freq_dict, end="")
+ column_score = self._get_column_info_content(
+ freq_dict, e_freq_table, log_base, random_expected
+ )
+ info_content[residue_num] = column_score
+ # sum up the score
+ total_info = sum(info_content.values())
+ # fill in the ic_vector member: holds IC for each column
+ # reset ic_vector to empty list at each call
+ self.ic_vector = []
+ for (i, k) in enumerate(info_content):
+ self.ic_vector.append(info_content[i + start])
+ return total_info
+
+ def _get_letter_freqs(
+ self,
+ residue_num,
+ all_records,
+ letters,
+ to_ignore,
+ pseudo_count=0,
+ e_freq_table=None,
+ random_expected=None,
+ ):
+ """Determine the frequency of specific letters in the alignment (PRIVATE).
+
+ Arguments:
+ - residue_num - The number of the column we are getting frequencies
+ from.
+ - all_records - All of the SeqRecords in the alignment.
+ - letters - The letters we are interested in getting the frequency
+ for.
+ - to_ignore - Letters we are specifically supposed to ignore.
+ - pseudo_count - Optional argument specifying the Pseudo count (k)
+ to add in order to prevent a frequency of 0 for a letter.
+ - e_freq_table - An optional argument specifying a dictionary with
+ the expected frequencies for each letter.
+ - random_expected - Optional argument that specify the frequency to use
+ when e_freq_table is not defined.
+
+ This will calculate the frequencies of each of the specified letters
+        in the alignment at the given residue position, and return this as a
+ dictionary where the keys are the letters and the values are the
+ frequencies. Pseudo count can be added to prevent a null frequency
+ """
+ freq_info = dict.fromkeys(letters, 0)
+
+ total_count = 0
+
+ gap_char = "-"
+
+ if pseudo_count < 0:
+ raise ValueError(
+ "Positive value required for pseudo_count, %s provided" % (pseudo_count)
+ )
+
+ # collect the count info into the dictionary for all the records
+ for record in all_records:
+ try:
+ if record.seq[residue_num] not in to_ignore:
+ weight = record.annotations.get("weight", 1.0)
+ freq_info[record.seq[residue_num]] += weight
+ total_count += weight
+ except KeyError:
+ raise ValueError(
+ "Residue %s not found in letters %s"
+ % (record.seq[residue_num], letters)
+ ) from None
+
+ if e_freq_table:
+            # check if all the residues in freq_info are in e_freq_table
+ for key in freq_info:
+ if key != gap_char and key not in e_freq_table:
+ raise ValueError("%s not found in expected frequency table" % key)
+
+ if total_count == 0:
+ # This column must be entirely ignored characters
+ for letter in freq_info:
+ assert freq_info[letter] == 0
+ # TODO - Map this to NA or NaN?
+ else:
+ # now convert the counts into frequencies
+ for letter in freq_info:
+ if pseudo_count and (random_expected or e_freq_table):
+                    # use either the expected frequency table or the random expected frequency
+ if e_freq_table:
+ ajust_freq = e_freq_table[letter]
+ else:
+ ajust_freq = random_expected
+
+ ajusted_letter_count = freq_info[letter] + ajust_freq * pseudo_count
+ ajusted_total = total_count + pseudo_count
+ freq_info[letter] = ajusted_letter_count / ajusted_total
+
+ else:
+ freq_info[letter] = freq_info[letter] / total_count
+
+ return freq_info
+
+ def _get_column_info_content(
+ self, obs_freq, e_freq_table, log_base, random_expected
+ ):
+ """Calculate the information content for a column (PRIVATE).
+
+ Arguments:
+ - obs_freq - The frequencies observed for each letter in the column.
+ - e_freq_table - An optional argument specifying a dictionary with
+ the expected frequencies for each letter.
+ - log_base - The base of the logarithm to use in calculating the
+ info content.
+
+ """
+ gap_char = "-"
+
+ if e_freq_table:
+ # check the expected freq information to make sure it is good
+ for key in obs_freq:
+ if key != gap_char and key not in e_freq_table:
+ raise ValueError(
+ f"Frequency table provided does not contain observed letter {key}"
+ )
+
+ total_info = 0.0
+
+ for letter in obs_freq:
+ inner_log = 0.0
+ # if we have expected frequencies, modify the log value by them
+ # gap characters do not have expected frequencies, so they
+ # should just be the observed frequency.
+ if letter != gap_char:
+ if e_freq_table:
+ inner_log = obs_freq[letter] / e_freq_table[letter]
+ else:
+ inner_log = obs_freq[letter] / random_expected
+ # if the observed frequency is zero, we don't add any info to the
+ # total information content
+ if inner_log > 0:
+ letter_info = (
+ obs_freq[letter] * math.log(inner_log) / math.log(log_base)
+ )
+ total_info += letter_info
+ return total_info
+
+ def get_column(self, col):
+ """Return column of alignment."""
+ # TODO - Deprecate this and implement slicing?
+ return self.alignment[:, col]
+
+
+class PSSM:
+ """Represent a position specific score matrix.
+
+ This class is meant to make it easy to access the info within a PSSM
+ and also make it easy to print out the information in a nice table.
+
+ Let's say you had an alignment like this::
+
+ GTATC
+ AT--C
+ CTGTC
+
+ The position specific score matrix (when printed) looks like::
+
+ G A T C
+ G 1 1 0 1
+ T 0 0 3 0
+ A 1 1 0 0
+ T 0 0 2 0
+ C 0 0 0 3
+
+ You can access a single element of the PSSM using the following::
+
+ your_pssm[sequence_number][residue_count_name]
+
+ For instance, to get the 'T' residue for the second element in the
+ above alignment you would need to do:
+
+ your_pssm[1]['T']
+ """
+
+ def __init__(self, pssm):
+ """Initialize with pssm data to represent.
+
+ The pssm passed should be a list with the following structure:
+
+ list[0] - The letter of the residue being represented (for instance,
+ from the example above, the first few list[0]s would be GTAT...
+ list[1] - A dictionary with the letter substitutions and counts.
+ """
+ self.pssm = pssm
+
+ def __getitem__(self, pos):
+ return self.pssm[pos][1]
+
+ def __str__(self):
+ out = " "
+ all_residues = sorted(self.pssm[0][1])
+
+ # first print out the top header
+ for res in all_residues:
+ out += " %s" % res
+ out += "\n"
+
+ # for each item, write out the substitutions
+ for item in self.pssm:
+ out += "%s " % item[0]
+ for res in all_residues:
+ out += " %.1f" % item[1][res]
+
+ out += "\n"
+ return out
+
+ def get_residue(self, pos):
+ """Return the residue letter at the specified position."""
+ return self.pssm[pos][0]
+
+
+def print_info_content(summary_info, fout=None, rep_record=0):
+ """3 column output: position, aa in representative sequence, ic_vector value."""
+ fout = fout or sys.stdout
+ if not summary_info.ic_vector:
+ summary_info.information_content()
+ rep_sequence = summary_info.alignment[rep_record].seq
+ for pos, ic in enumerate(summary_info.ic_vector):
+ fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos], ic))
diff --git a/code/lib/Bio/Align/Applications/_ClustalOmega.py b/code/lib/Bio/Align/Applications/_ClustalOmega.py
new file mode 100644
index 0000000..2181bc5
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_ClustalOmega.py
@@ -0,0 +1,269 @@
+# Copyright 2011 by Andreas Wilm. All rights reserved.
+# Based on ClustalW wrapper copyright 2009 by Cymon J. Cox.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program Clustal Omega."""
+
+
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class ClustalOmegaCommandline(AbstractCommandline):
+ """Command line wrapper for clustal omega.
+
+ http://www.clustal.org/omega
+
+ Notes
+ -----
+ Last checked against version: 1.2.0
+
+ References
+ ----------
+ Sievers F, Wilm A, Dineen DG, Gibson TJ, Karplus K, Li W, Lopez R,
+ McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG (2011).
+ Fast, scalable generation of high-quality protein multiple
+ sequence alignments using Clustal Omega.
+ Molecular Systems Biology 7:539 https://doi.org/10.1038/msb.2011.75
+
+ Examples
+ --------
+ >>> from Bio.Align.Applications import ClustalOmegaCommandline
+ >>> in_file = "unaligned.fasta"
+ >>> out_file = "aligned.fasta"
+ >>> clustalomega_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True)
+ >>> print(clustalomega_cline)
+ clustalo -i unaligned.fasta -o aligned.fasta --auto -v
+
+ You would typically run the command line with clustalomega_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
+
+ """
+
+ def __init__(self, cmd="clustalo", **kwargs):
+ """Initialize the class."""
+ # order parameters in the same order as clustalo --help
+ self.parameters = [
+ # Sequence Input
+ _Option(
+ ["-i", "--in", "--infile", "infile"],
+ "Multiple sequence input file",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["--hmm-in", "HMM input", "hmm_input"],
+ "HMM input files",
+ filename=True,
+ equate=False,
+ ),
+ _Switch(["--dealign", "dealign"], "Dealign input sequences"),
+ _Option(
+ ["--profile1", "--p1", "profile1"],
+ "Pre-aligned multiple sequence file (aligned columns will be kept fix).",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["--profile2", "--p2", "profile2"],
+ "Pre-aligned multiple sequence file (aligned columns will be kept fix).",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-t", "--seqtype", "seqtype"],
+ "{Protein, RNA, DNA} Force a sequence type (default: auto).",
+ equate=False,
+ checker_function=lambda x: x
+ in ["protein", "rna", "dna", "Protein", "RNA", "DNA", "PROTEIN"],
+ ),
+ _Switch(
+ ["--is-profile", "isprofile"],
+ "disable check if profile, force profile (default no)",
+ ),
+ _Option(
+ ["--infmt", "infmt"],
+ """Forced sequence input file format (default: auto)
+
+ Allowed values: a2m, fa[sta], clu[stal], msf, phy[lip], selex, st[ockholm], vie[nna]
+ """,
+ equate=False,
+ checker_function=lambda x: x
+ in [
+ "a2m",
+ "fa",
+ "fasta",
+ "clu",
+ "clustal",
+ "msf",
+ "phy",
+ "phylip",
+ "selex",
+ "st",
+ "stockholm",
+ "vie",
+ "vienna",
+ ],
+ ),
+ # Clustering
+ _Option(
+ ["--distmat-in", "distmat_in"],
+ "Pairwise distance matrix input file (skips distance computation).",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["--distmat-out", "distmat_out"],
+ "Pairwise distance matrix output file.",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["--guidetree-in", "guidetree_in"],
+ "Guide tree input file (skips distance computation and guide-tree clustering step).",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["--guidetree-out", "guidetree_out"],
+ "Guide tree output file.",
+ filename=True,
+ equate=False,
+ ),
+ _Switch(
+ ["--full", "distmat_full"],
+ "Use full distance matrix for guide-tree calculation (slow; mBed is default)",
+ ),
+ _Switch(
+ ["--full-iter", "distmat_full_iter"],
+ "Use full distance matrix for guide-tree calculation during iteration (mBed is default)",
+ ),
+ _Option(
+ ["--cluster-size", "clustersize"],
+ "soft maximum of sequences in sub-clusters",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["--clustering-out", "clusteringout"],
+ "Clustering output file",
+ filename=True,
+ ),
+ _Switch(
+ ["--use-kimura", "usekimura"],
+ "use Kimura distance correction for aligned sequences (default no)",
+ ),
+ _Switch(
+ ["--percent-id", "percentid"],
+ "convert distances into percent identities (default no)",
+ ),
+ # Alignment Output
+ _Option(
+ ["-o", "--out", "--outfile", "outfile"],
+ "Multiple sequence alignment output file (default: stdout).",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["--outfmt", "outfmt"],
+ "MSA output file format:"
+ " a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]"
+ " (default: fasta).",
+ equate=False,
+ checker_function=lambda x: x
+ in [
+ "a2m",
+ "fa",
+ "fasta",
+ "clu",
+ "clustal",
+ "msf",
+ "phy",
+ "phylip",
+ "selex",
+ "st",
+ "stockholm",
+ "vie",
+ "vienna",
+ ],
+ ),
+ _Switch(
+ ["--residuenumber", "--resno", "residuenumber"],
+ "in Clustal format print residue numbers (default no)",
+ ),
+ _Option(
+ ["--wrap", "wrap"],
+ "number of residues before line-wrap in output",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["--output-order", "outputorder"],
+ "MSA output order like in input/guide-tree",
+ checker_function=lambda x: x in ["input-order", "tree-order"],
+ ),
+ # Iteration
+ _Option(
+ ["--iterations", "--iter", "iterations"],
+ "Number of (combined guide-tree/HMM) iterations",
+ equate=False,
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["--max-guidetree-iterations", "max_guidetree_iterations"],
+ "Maximum number of guidetree iterations",
+ equate=False,
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["--max-hmm-iterations", "max_hmm_iterations"],
+ "Maximum number of HMM iterations",
+ equate=False,
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ # Limits (will exit early, if exceeded):
+ _Option(
+ ["--maxnumseq", "maxnumseq"],
+ "Maximum allowed number of sequences",
+ equate=False,
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["--maxseqlen", "maxseqlen"],
+ "Maximum allowed sequence length",
+ equate=False,
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ # Miscellaneous:
+ _Switch(
+ ["--auto", "auto"],
+ "Set options automatically (might overwrite some of your options)",
+ ),
+ _Option(
+ ["--threads", "threads"],
+ "Number of processors to use",
+ equate=False,
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["-l", "--log", "log"],
+ "Log all non-essential output to this file.",
+ filename=True,
+ equate=False,
+ ),
+ _Switch(["-h", "--help", "help"], "Print help and exit."),
+ _Switch(["-v", "--verbose", "verbose"], "Verbose output"),
+ _Switch(["--version", "version"], "Print version information and exit"),
+ _Switch(
+ ["--long-version", "long_version"],
+ "Print long version information and exit",
+ ),
+ _Switch(["--force", "force"], "Force file overwriting."),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_Clustalw.py b/code/lib/Bio/Align/Applications/_Clustalw.py
new file mode 100644
index 0000000..777e411
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Clustalw.py
@@ -0,0 +1,486 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program Clustal W."""
+
+
+import os
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class ClustalwCommandline(AbstractCommandline):
+ """Command line wrapper for clustalw (version one or two).
+
+ http://www.clustal.org/
+
+ Notes
+ -----
+ Last checked against versions: 1.83 and 2.1
+
+ References
+ ----------
+ Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
+ McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
+ Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
+ Bioinformatics, 23, 2947-2948.
+
+ Examples
+ --------
+ >>> from Bio.Align.Applications import ClustalwCommandline
+ >>> in_file = "unaligned.fasta"
+ >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file)
+ >>> print(clustalw_cline)
+ clustalw2 -infile=unaligned.fasta
+
+ You would typically run the command line with clustalw_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
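+
+    As a minimal sketch (assuming clustalw2 is installed and
+    unaligned.fasta exists): by default ClustalW writes the alignment
+    to a .aln file named after the input (plus a .dnd guide tree), so
+    you could do::
+
+        stdout, stderr = clustalw_cline()
+        from Bio import AlignIO
+        align = AlignIO.read("unaligned.aln", "clustal")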
+
+ """
+
+ # TODO - Should we default to cmd="clustalw2" now?
+ def __init__(self, cmd="clustalw", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-infile", "-INFILE", "INFILE", "infile"],
+ "Input sequences.",
+ filename=True,
+ ),
+ _Option(
+ ["-profile1", "-PROFILE1", "PROFILE1", "profile1"],
+ "Profiles (old alignment).",
+ filename=True,
+ ),
+ _Option(
+ ["-profile2", "-PROFILE2", "PROFILE2", "profile2"],
+ "Profiles (old alignment).",
+ filename=True,
+ ),
+ # ################# VERBS (do things) #############################
+ _Switch(
+ ["-options", "-OPTIONS", "OPTIONS", "options"],
+ "List the command line parameters",
+ ),
+ _Switch(
+ ["-help", "-HELP", "HELP", "help"], "Outline the command line params."
+ ),
+ _Switch(
+ ["-check", "-CHECK", "CHECK", "check"],
+ "Outline the command line params.",
+ ),
+ _Switch(
+ ["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"],
+ "Output full help content.",
+ ),
+ _Switch(
+ ["-align", "-ALIGN", "ALIGN", "align"], "Do full multiple alignment."
+ ),
+ _Switch(["-tree", "-TREE", "TREE", "tree"], "Calculate NJ tree."),
+ _Switch(
+ ["-pim", "-PIM", "PIM", "pim"],
+ "Output percent identity matrix (while calculating the tree).",
+ ),
+ _Option(
+ ["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"],
+ "Bootstrap a NJ tree (n= number of bootstraps; def. = 1000).",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Switch(
+ ["-convert", "-CONVERT", "CONVERT", "convert"],
+ "Output the input sequences in a different file format.",
+ ),
+ # #################### PARAMETERS (set things) #########################
+ # ***General settings:****
+ # Makes no sense in biopython
+ # _Option(["-interactive", "-INTERACTIVE", "INTERACTIVE", "interactive"],
+ # [],
+ # lambda x: 0, # Does not take value
+ # False,
+ # "read command line, then enter normal interactive menus",
+ # False),
+ _Switch(
+ ["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"],
+ "Use FAST algorithm for the alignment guide tree",
+ ),
+ _Option(
+ ["-type", "-TYPE", "TYPE", "type"],
+ "PROTEIN or DNA sequences",
+ checker_function=lambda x: x in ["PROTEIN", "DNA", "protein", "dna"],
+ ),
+ _Switch(
+ ["-negative", "-NEGATIVE", "NEGATIVE", "negative"],
+ "Protein alignment with negative values in matrix",
+ ),
+ _Option(
+ ["-outfile", "-OUTFILE", "OUTFILE", "outfile"],
+ "Output sequence alignment file name",
+ filename=True,
+ ),
+ _Option(
+ ["-output", "-OUTPUT", "OUTPUT", "output"],
+ "Output format: CLUSTAL(default), GCG, GDE, PHYLIP, PIR, NEXUS and FASTA",
+ checker_function=lambda x: x
+ in [
+ "CLUSTAL",
+ "GCG",
+ "GDE",
+ "PHYLIP",
+ "PIR",
+ "NEXUS",
+ "FASTA",
+ "clustal",
+ "gcg",
+ "gde",
+ "phylip",
+ "pir",
+ "nexus",
+ "fasta",
+ ],
+ ),
+ _Option(
+ ["-outorder", "-OUTORDER", "OUTORDER", "outorder"],
+ "Output taxon order: INPUT or ALIGNED",
+ checker_function=lambda x: x
+ in ["INPUT", "input", "ALIGNED", "aligned"],
+ ),
+ _Option(
+ ["-case", "-CASE", "CASE", "case"],
+ "LOWER or UPPER (for GDE output only)",
+ checker_function=lambda x: x in ["UPPER", "upper", "LOWER", "lower"],
+ ),
+ _Option(
+ ["-seqnos", "-SEQNOS", "SEQNOS", "seqnos"],
+ "OFF or ON (for Clustal output only)",
+ checker_function=lambda x: x in ["ON", "on", "OFF", "off"],
+ ),
+ _Option(
+ ["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"],
+ "OFF or ON (NEW- for all output formats)",
+ checker_function=lambda x: x in ["ON", "on", "OFF", "off"],
+ ),
+ _Option(
+ ["-range", "-RANGE", "RANGE", "range"],
+ "Sequence range to write starting m to m+n. "
+ "Input as string eg. '24,200'",
+ ),
+ _Option(
+ ["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"],
+ "Maximum allowed input sequence length",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Switch(
+ ["-quiet", "-QUIET", "QUIET", "quiet"],
+ "Reduce console output to minimum",
+ ),
+ _Option(
+ ["-stats", "-STATS", "STATS", "stats"],
+ "Log some alignment statistics to file",
+ filename=True,
+ ),
+ # ***Fast Pairwise Alignments:***
+ _Option(
+ ["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"],
+ "Word size",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"],
+ "Number of best diags.",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-window", "-WINDOW", "WINDOW", "window"],
+ "Window around best diags.",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"],
+ "Gap penalty",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-score", "-SCORE", "SCORE", "score"],
+ "Either: PERCENT or ABSOLUTE",
+ checker_function=lambda x: x
+ in ["percent", "PERCENT", "absolute", "ABSOLUTE"],
+ ),
+ # ***Slow Pairwise Alignments:***
+ _Option(
+ ["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"],
+ "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
+ checker_function=lambda x: (
+ x
+ in [
+ "BLOSUM",
+ "PAM",
+ "GONNET",
+ "ID",
+ "blosum",
+ "pam",
+ "gonnet",
+ "id",
+ ]
+ or os.path.exists(x)
+ ),
+ filename=True,
+ ),
+ _Option(
+ ["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"],
+ "DNA weight matrix=IUB, CLUSTALW or filename",
+ checker_function=lambda x: (
+ x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x)
+ ),
+ filename=True,
+ ),
+ _Option(
+ ["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"],
+ "Gap opening penalty",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"],
+ "Gap extension penalty",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ # ***Multiple Alignments:***
+ _Option(
+ ["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
+ "Output file name for newly created guide tree",
+ filename=True,
+ ),
+ _Option(
+ ["-usetree", "-USETREE", "USETREE", "usetree"],
+ "File name of guide tree",
+                checker_function=os.path.exists,
+ filename=True,
+ ),
+ _Option(
+ ["-matrix", "-MATRIX", "MATRIX", "matrix"],
+ "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
+ checker_function=lambda x: (
+ x
+ in [
+ "BLOSUM",
+ "PAM",
+ "GONNET",
+ "ID",
+ "blosum",
+ "pam",
+ "gonnet",
+ "id",
+ ]
+ or os.path.exists(x)
+ ),
+ filename=True,
+ ),
+ _Option(
+ ["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"],
+ "DNA weight matrix=IUB, CLUSTALW or filename",
+ checker_function=lambda x: (
+ x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists(x)
+ ),
+ filename=True,
+ ),
+ _Option(
+ ["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"],
+ "Gap opening penalty",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-gapext", "-GAPEXT", "GAPEXT", "gapext"],
+ "Gap extension penalty",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Switch(
+ ["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"],
+ "No end gap separation pen.",
+ ),
+ _Option(
+ ["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"],
+ "Gap separation pen. range",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Switch(
+ ["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"], "Residue-specific gaps off"
+ ),
+ _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"], "Hydrophilic gaps off"),
+ _Switch(
+ ["-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues"],
+ "List hydrophilic res.",
+ ),
+ _Option(
+ ["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"],
+ "% ident. for delay",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ # Already handled in General Settings section, but appears a second
+ # time under Multiple Alignments in the help
+ # _Option(["-type", "-TYPE", "TYPE", "type"],
+ # "PROTEIN or DNA",
+ # checker_function=lambda x: x in ["PROTEIN", "DNA",
+ # "protein", "dna"]),
+ _Option(
+ ["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"],
+ "Transitions weighting",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-iteration", "-ITERATION", "ITERATION", "iteration"],
+ "NONE or TREE or ALIGNMENT",
+ checker_function=lambda x: x
+ in ["NONE", "TREE", "ALIGNMENT", "none", "tree", "alignment"],
+ ),
+ _Option(
+ ["-numiter", "-NUMITER", "NUMITER", "numiter"],
+ "maximum number of iterations to perform",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Switch(
+ ["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"],
+ "Disable sequence weighting",
+ ),
+ # ***Profile Alignments:***
+ _Switch(
+ ["-profile", "-PROFILE", "PROFILE", "profile"],
+ "Merge two alignments by profile alignment",
+ ),
+ _Option(
+ ["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"],
+ "Output file name for new guide tree of profile1",
+ filename=True,
+ ),
+ _Option(
+ ["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"],
+ "Output file for new guide tree of profile2",
+ filename=True,
+ ),
+ _Option(
+ ["-usetree1", "-USETREE1", "USETREE1", "usetree1"],
+ "File name of guide tree for profile1",
+                checker_function=os.path.exists,
+ filename=True,
+ ),
+ _Option(
+ ["-usetree2", "-USETREE2", "USETREE2", "usetree2"],
+ "File name of guide tree for profile2",
+                checker_function=os.path.exists,
+ filename=True,
+ ),
+ # ***Sequence to Profile Alignments:***
+ _Switch(
+ ["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"],
+ "Sequentially add profile2 sequences to profile1 alignment",
+ ),
+ # These are already handled in the Multiple Alignments section,
+ # but appear a second time here in the help.
+ # _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
+ # "File for new guide tree",
+ # filename=True),
+ # _Option(["-usetree", "-USETREE", "USETREE", "usetree"],
+ # "File for old guide tree",
+ # checker_function=lambda x: os.path.exists,
+ # filename=True),
+ # ***Structure Alignments:***
+ _Switch(
+ ["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"],
+ "Do not use secondary structure-gap penalty mask for profile 1",
+ ),
+ _Switch(
+ ["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"],
+ "Do not use secondary structure-gap penalty mask for profile 2",
+ ),
+ _Option(
+ ["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"],
+ "STRUCTURE or MASK or BOTH or NONE output in alignment file",
+ checker_function=lambda x: x
+ in [
+ "STRUCTURE",
+ "MASK",
+ "BOTH",
+ "NONE",
+ "structure",
+ "mask",
+ "both",
+ "none",
+ ],
+ ),
+ _Option(
+ ["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"],
+ "Gap penalty for helix core residues",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"],
+ "gap penalty for strand core residues",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"],
+ "Gap penalty for loop regions",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"],
+ "Gap penalty for structure termini",
+ checker_function=lambda x: (isinstance(x, int) or isinstance(x, float)),
+ ),
+ _Option(
+ ["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"],
+ "Number of residues inside helix to be treated as terminal",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"],
+ "Number of residues outside helix to be treated as terminal",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"],
+ "Number of residues inside strand to be treated as terminal",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Option(
+ ["-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout"],
+ "Number of residues outside strand to be treated as terminal",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ # ***Trees:***
+ _Option(
+ ["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"],
+ "nj OR phylip OR dist OR nexus",
+ checker_function=lambda x: x
+ in ["NJ", "PHYLIP", "DIST", "NEXUS", "nj", "phylip", "dist", "nexus"],
+ ),
+ _Option(
+ ["-seed", "-SEED", "SEED", "seed"],
+ "Seed number for bootstraps.",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Switch(
+ ["-kimura", "-KIMURA", "KIMURA", "kimura"], "Use Kimura's correction."
+ ),
+ _Switch(
+ ["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"],
+ "Ignore positions with gaps.",
+ ),
+ _Option(
+ ["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"],
+ "Node OR branch position of bootstrap values in tree display",
+ checker_function=lambda x: x in ["NODE", "BRANCH", "node", "branch"],
+ ),
+ _Option(
+ ["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"],
+ "NJ or UPGMA",
+ checker_function=lambda x: x in ["NJ", "UPGMA", "nj", "upgma"],
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_Dialign.py b/code/lib/Bio/Align/Applications/_Dialign.py
new file mode 100644
index 0000000..52be1b1
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Dialign.py
@@ -0,0 +1,243 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program DIALIGN2-2."""
+
+from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline
+
+
+class DialignCommandline(AbstractCommandline):
+ """Command line wrapper for the multiple alignment program DIALIGN2-2.
+
+ http://bibiserv.techfak.uni-bielefeld.de/dialign/welcome.html
+
+ Notes
+ -----
+ Last checked against version: 2.2
+
+ References
+ ----------
+ B. Morgenstern (2004). DIALIGN: Multiple DNA and Protein Sequence
+ Alignment at BiBiServ. Nucleic Acids Research 32, W33-W36.
+
+ Examples
+ --------
+    To align a FASTA file (unaligned.fasta) with the output files named
+    aligned.*, including a FASTA output file (aligned.fa), use:
+
+ >>> from Bio.Align.Applications import DialignCommandline
+ >>> dialign_cline = DialignCommandline(input="unaligned.fasta",
+ ... fn="aligned", fa=True)
+ >>> print(dialign_cline)
+ dialign2-2 -fa -fn aligned unaligned.fasta
+
+ You would typically run the command line with dialign_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
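+
+    As a minimal sketch (assuming dialign2-2 is installed, with its
+    DIALIGN2_DIR environment variable pointing at the program's
+    parameter files, and unaligned.fasta exists), the FASTA output
+    requested above could then be parsed with::
+
+        stdout, stderr = dialign_cline()
+        from Bio import AlignIO
+        align = AlignIO.read("aligned.fa", "fasta")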
+
+ """
+
+ def __init__(self, cmd="dialign2-2", **kwargs):
+ """Initialize the class."""
+ self.program_name = cmd
+ self.parameters = [
+ _Switch(
+ ["-afc", "afc"],
+ r"Creates additional output file '\*.afc' "
+ "containing data of all fragments considered "
+ "for alignment WARNING: this file can be HUGE !",
+ ),
+ _Switch(
+ ["-afc_v", "afc_v"],
+ "Like '-afc' but verbose: fragments are explicitly "
+ "printed. WARNING: this file can be EVEN BIGGER !",
+ ),
+ _Switch(
+ ["-anc", "anc"],
+ "Anchored alignment. Requires a file .anc "
+ "containing anchor points.",
+ ),
+ _Switch(
+ ["-cs", "cs"],
+ "If segments are translated, not only the 'Watson "
+ "strand' but also the 'Crick strand' is looked at.",
+ ),
+ _Switch(["-cw", "cw"], "Additional output file in CLUSTAL W format."),
+ _Switch(
+ ["-ds", "ds"],
+ "'dna alignment speed up' - non-translated nucleic acid "
+ "fragments are taken into account only if they start "
+ "with at least two matches. Speeds up DNA alignment at "
+ "the expense of sensitivity.",
+ ),
+ _Switch(["-fa", "fa"], "Additional output file in FASTA format."),
+ _Switch(
+ ["-ff", "ff"],
+ r"Creates file \*.frg containing information about all "
+ "fragments that are part of the respective optimal "
+ "pairwise alignmnets plus information about "
+ "consistency in the multiple alignment",
+ ),
+ _Option(
+ ["-fn", "fn"],
+ "Output files are named ..",
+ equate=False,
+ ),
+ _Switch(
+ ["-fop", "fop"],
+ r"Creates file \*.fop containing coordinates of all "
+ "fragments that are part of the respective pairwise alignments.",
+ ),
+ _Switch(
+ ["-fsm", "fsm"],
+ r"Creates file \*.fsm containing coordinates of all "
+ "fragments that are part of the final alignment",
+ ),
+ _Switch(
+ ["-iw", "iw"],
+ "Overlap weights switched off (by default, overlap "
+ "weights are used if up to 35 sequences are aligned). "
+ "This option speeds up the alignment but may lead "
+ "to reduced alignment quality.",
+ ),
+ _Switch(
+ ["-lgs", "lgs"],
+ "'long genomic sequences' - combines the following "
+ "options: -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, "
+ "-fop, -ff, -cs, -ds, -pst ",
+ ),
+ _Switch(
+ ["-lgs_t", "lgs_t"],
+ "Like '-lgs' but with all segment pairs assessed "
+ "at the peptide level (rather than 'mixed alignments' "
+ "as with the '-lgs' option). Therefore faster than "
+ "-lgs but not very sensitive for non-coding regions.",
+ ),
+ _Option(
+ ["-lmax", "lmax"],
+ "Maximum fragment length = x (default: x = 40 or "
+ "x = 120 for 'translated' fragments). Shorter x "
+ "speeds up the program but may affect alignment quality.",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ _Switch(
+ ["-lo", "lo"],
+ r"(Long Output) Additional file \*.log with information "
+ "about fragments selected for pairwise alignment and "
+ "about consistency in multi-alignment procedure.",
+ ),
+ _Switch(
+ ["-ma", "ma"],
+ "'mixed alignments' consisting of P-fragments and "
+ "N-fragments if nucleic acid sequences are aligned.",
+ ),
+ _Switch(
+ ["-mask", "mask"],
+ "Residues not belonging to selected fragments are "
+ r"replaced by '\*' characters in output alignment "
+ "(rather than being printed in lower-case characters)",
+ ),
+ _Switch(
+ ["-mat", "mat"],
+ r"Creates file \*mat with substitution counts derived "
+ "from the fragments that have been selected for alignment.",
+ ),
+ _Switch(
+ ["-mat_thr", "mat_thr"],
+ "Like '-mat' but only fragments with weight score "
+ "> t are considered",
+ ),
+ _Switch(
+ ["-max_link", "max_link"],
+ "'maximum linkage' clustering used to construct "
+ "sequence tree (instead of UPGMA).",
+ ),
+ _Switch(["-min_link", "min_link"], "'minimum linkage' clustering used."),
+ _Option(["-mot", "mot"], "'motif' option.", equate=False),
+ _Switch(["-msf", "msf"], "Separate output file in MSF format."),
+ _Switch(
+ ["-n", "n"],
+ "Input sequences are nucleic acid sequences. "
+ "No translation of fragments.",
+ ),
+ _Switch(
+ ["-nt", "nt"],
+ "Input sequences are nucleic acid sequences and "
+ "'nucleic acid segments' are translated to 'peptide "
+ "segments'.",
+ ),
+ _Switch(
+ ["-nta", "nta"],
+ "'no textual alignment' - textual alignment suppressed. "
+ "This option makes sense if other output files are of "
+ "interest -- e.g. the fragment files created with -ff, "
+ "-fop, -fsm or -lo.",
+ ),
+ _Switch(
+ ["-o", "o"],
+ "Fast version, resulting alignments may be slightly different.",
+ ),
+ _Switch(
+ ["-ow", "ow"],
+ "Overlap weights enforced (By default, overlap weights "
+ "are used only if up to 35 sequences are aligned since "
+ "calculating overlap weights is time consuming).",
+ ),
+ _Switch(
+ ["-pst", "pst"],
+ r"'print status'. Creates and updates a file \*.sta with "
+ "information about the current status of the program "
+ "run. This option is recommended if large data sets "
+ "are aligned since it allows the user to estimate the "
+ "remaining running time.",
+ ),
+ _Switch(
+ ["-smin", "smin"],
+ "Minimum similarity value for first residue pair "
+ "(or codon pair) in fragments. Speeds up protein "
+ "alignment or alignment of translated DNA fragments "
+ "at the expense of sensitivity.",
+ ),
+ _Option(
+ ["-stars", "stars"],
+ r"Maximum number of '\*' characters indicating degree "
+ "of local similarity among sequences. By default, no "
+ "stars are used but numbers between 0 and 9, instead.",
+ checker_function=lambda x: x in range(0, 10),
+ equate=False,
+ ),
+ _Switch(["-stdo", "stdo"], "Results written to standard output."),
+ _Switch(
+ ["-ta", "ta"],
+ "Standard textual alignment printed (overrides "
+ "suppression of textual alignments in special "
+ "options, e.g. -lgs)",
+ ),
+ _Option(
+ ["-thr", "thr"],
+ "Threshold T = x.",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ _Switch(
+ ["-xfr", "xfr"],
+ "'exclude fragments' - list of fragments can be "
+ "specified that are NOT considered for pairwise alignment",
+ ),
+ _Argument(
+ ["input"],
+ "Input file name. Must be FASTA format",
+ filename=True,
+ is_required=True,
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_MSAProbs.py b/code/lib/Bio/Align/Applications/_MSAProbs.py
new file mode 100644
index 0000000..74b26a1
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_MSAProbs.py
@@ -0,0 +1,89 @@
+# Copyright 2013 by Christian Brueffer. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple sequence alignment program MSAProbs."""
+
+from Bio.Application import _Argument, _Option, _Switch, AbstractCommandline
+
+
+class MSAProbsCommandline(AbstractCommandline):
+ """Command line wrapper for MSAProbs.
+
+ http://msaprobs.sourceforge.net
+
+ Notes
+ -----
+ Last checked against version: 0.9.7
+
+ References
+ ----------
+ Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: "MSAProbs: multiple
+ sequence alignment based on pair hidden Markov models and partition
+    function posterior probabilities". Bioinformatics, 2010, 26(16): 1958-1964.
+
+ Examples
+ --------
+ >>> from Bio.Align.Applications import MSAProbsCommandline
+ >>> in_file = "unaligned.fasta"
+ >>> out_file = "aligned.cla"
+ >>> cline = MSAProbsCommandline(infile=in_file, outfile=out_file, clustalw=True)
+ >>> print(cline)
+ msaprobs -o aligned.cla -clustalw unaligned.fasta
+
+ You would typically run the command line with cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
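+
+    As a minimal sketch (assuming msaprobs is installed and
+    unaligned.fasta exists), the CLUSTAL-format output requested above
+    could then be parsed with::
+
+        stdout, stderr = cline()
+        from Bio import AlignIO
+        align = AlignIO.read(out_file, "clustal")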
+
+ """
+
+ def __init__(self, cmd="msaprobs", **kwargs):
+ """Initialize the class."""
+ # order of parameters is the same as in msaprobs -help
+ self.parameters = [
+ _Option(
+ ["-o", "--outfile", "outfile"],
+ "specify the output file name (STDOUT by default)",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-num_threads", "numthreads"],
+ "specify the number of threads used, and otherwise detect automatically",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ _Switch(
+ ["-clustalw", "clustalw"],
+ "use CLUSTALW output format instead of FASTA format",
+ ),
+ _Option(
+ ["-c", "consistency"],
+ "use 0 <= REPS <= 5 (default: 2) passes of consistency transformation",
+ checker_function=lambda x: isinstance(x, int) and 0 <= x <= 5,
+ ),
+ _Option(
+ ["-ir", "--iterative-refinement", "iterative_refinement"],
+ "use 0 <= REPS <= 1000 (default: 10) passes of iterative-refinement",
+ checker_function=lambda x: isinstance(x, int) and 0 <= x <= 1000,
+ ),
+ _Switch(["-v", "verbose"], "report progress while aligning (default: off)"),
+ _Option(
+ ["-annot", "annot"],
+ "write annotation for multiple alignment to FILENAME",
+ filename=True,
+ ),
+ _Switch(
+ ["-a", "--alignment-order", "alignment_order"],
+ "print sequences in alignment order rather than input order (default: off)",
+ ),
+ _Option(["-version", "version"], "print out version of MSAPROBS"),
+ _Argument(["infile"], "Multiple sequence input file", filename=True),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_Mafft.py b/code/lib/Bio/Align/Applications/_Mafft.py
new file mode 100644
index 0000000..4a0b901
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Mafft.py
@@ -0,0 +1,435 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment programme MAFFT."""
+
+
+from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
+
+
+class MafftCommandline(AbstractCommandline):
+ """Command line wrapper for the multiple alignment program MAFFT.
+
+ http://align.bmr.kyushu-u.ac.jp/mafft/software/
+
+ Notes
+ -----
+ Last checked against version: MAFFT v6.717b (2009/12/03)
+
+ References
+ ----------
+ Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
+ multiple ncRNA alignment by incorporating structural information into
+ a MAFFT-based framework (describes RNA structural alignment methods)
+
+ Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
+ developments in the MAFFT multiple sequence alignment program
+ (outlines version 6)
+
+ Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
+ algorithm to build an approximate tree from a large number of
+ unaligned sequences (describes the PartTree algorithm)
+
+ Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
+ version 5: improvement in accuracy of multiple sequence alignment
+ (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
+ strategies)
+
+ Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
+
+ Examples
+ --------
+ >>> from Bio.Align.Applications import MafftCommandline
+ >>> mafft_exe = "/opt/local/mafft"
+ >>> in_file = "../Doc/examples/opuntia.fasta"
+ >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
+ >>> print(mafft_cline)
+ /opt/local/mafft ../Doc/examples/opuntia.fasta
+
+ If the mafft binary is on the path (typically the case on a Unix style
+ operating system) then you don't need to supply the executable location:
+
+ >>> from Bio.Align.Applications import MafftCommandline
+ >>> in_file = "../Doc/examples/opuntia.fasta"
+ >>> mafft_cline = MafftCommandline(input=in_file)
+ >>> print(mafft_cline)
+ mafft ../Doc/examples/opuntia.fasta
+
+ You would typically run the command line with mafft_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
+
+ Note that MAFFT will write the alignment to stdout, which you may
+ want to save to a file and then parse, e.g.::
+
+ stdout, stderr = mafft_cline()
+ with open("aligned.fasta", "w") as handle:
+ handle.write(stdout)
+ from Bio import AlignIO
+ align = AlignIO.read("aligned.fasta", "fasta")
+
+ Alternatively, to parse the output with AlignIO directly you can
+ use StringIO to turn the string into a handle::
+
+ stdout, stderr = mafft_cline()
+ from io import StringIO
+ from Bio import AlignIO
+ align = AlignIO.read(StringIO(stdout), "fasta")
+
+ """
+
+ def __init__(self, cmd="mafft", **kwargs):
+ """Initialize the class."""
+ BLOSUM_MATRICES = ["30", "45", "62", "80"]
+ self.parameters = [
+ # **** Algorithm ****
+ # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
+ # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
+ _Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
+ # Distance is calculated based on the number of shared 6mers. Default: on
+ _Switch(
+ ["--6merpair", "6merpair", "sixmerpair"],
+ "Distance is calculated based on the number of shared "
+ "6mers. Default: on",
+ ),
+ # All pairwise alignments are computed with the Needleman-Wunsch
+ # algorithm. More accurate but slower than --6merpair. Suitable for a
+ # set of globally alignable sequences. Applicable to up to ~200
+ # sequences. A combination with --maxiterate 1000 is recommended (G-
+ # INS-i). Default: off (6mer distance is used)
+ _Switch(
+ ["--globalpair", "globalpair"],
+ "All pairwise alignments are computed with the "
+ "Needleman-Wunsch algorithm. Default: off",
+ ),
+ # All pairwise alignments are computed with the Smith-Waterman
+ # algorithm. More accurate but slower than --6merpair. Suitable for a
+ # set of locally alignable sequences. Applicable to up to ~200
+ # sequences. A combination with --maxiterate 1000 is recommended (L-
+ # INS-i). Default: off (6mer distance is used)
+ _Switch(
+ ["--localpair", "localpair"],
+ "All pairwise alignments are computed with the "
+ "Smith-Waterman algorithm. Default: off",
+ ),
+ # All pairwise alignments are computed with a local algorithm with
+ # the generalized affine gap cost (Altschul 1998). More accurate but
+ # slower than --6merpair. Suitable when large internal gaps are
+ # expected. Applicable to up to ~200 sequences. A combination with --
+ # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
+ # distance is used)
+ _Switch(
+ ["--genafpair", "genafpair"],
+ "All pairwise alignments are computed with a local "
+ "algorithm with the generalized affine gap cost "
+ "(Altschul 1998). Default: off",
+ ),
+ # All pairwise alignments are computed with FASTA (Pearson and Lipman
+ # 1988). FASTA is required. Default: off (6mer distance is used)
+ _Switch(
+ ["--fastapair", "fastapair"],
+ "All pairwise alignments are computed with FASTA "
+ "(Pearson and Lipman 1988). Default: off",
+ ),
+ # Weighting factor for the consistency term calculated from pairwise
+            # alignments. Valid when either of --globalpair, --localpair, --
+ # genafpair, --fastapair or --blastpair is selected. Default: 2.7
+ _Option(
+ ["--weighti", "weighti"],
+ "Weighting factor for the consistency term calculated "
+ "from pairwise alignments. Default: 2.7",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Guide tree is built number times in the progressive stage. Valid
+ # with 6mer distance. Default: 2
+ _Option(
+ ["--retree", "retree"],
+ "Guide tree is built number times in the progressive "
+ "stage. Valid with 6mer distance. Default: 2",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # Number cycles of iterative refinement are performed. Default: 0
+ _Option(
+ ["--maxiterate", "maxiterate"],
+ "Number cycles of iterative refinement are performed. Default: 0",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # Number of threads to use. Default: 1
+ _Option(
+ ["--thread", "thread"],
+ "Number of threads to use. Default: 1",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # Use FFT approximation in group-to-group alignment. Default: on
+ _Switch(
+ ["--fft", "fft"],
+ "Use FFT approximation in group-to-group alignment. Default: on",
+ ),
+ # Do not use FFT approximation in group-to-group alignment. Default:
+ # off
+ _Switch(
+ ["--nofft", "nofft"],
+ "Do not use FFT approximation in group-to-group "
+ "alignment. Default: off",
+ ),
+ # Alignment score is not checked in the iterative refinement stage.
+ # Default: off (score is checked)
+ _Switch(
+ ["--noscore", "noscore"],
+ "Alignment score is not checked in the iterative "
+ "refinement stage. Default: off (score is checked)",
+ ),
+ # Use the Myers-Miller (1988) algorithm. Default: automatically
+ # turned on when the alignment length exceeds 10,000 (aa/nt).
+ _Switch(
+ ["--memsave", "memsave"],
+ "Use the Myers-Miller (1988) algorithm. Default: "
+ "automatically turned on when the alignment length "
+ "exceeds 10,000 (aa/nt).",
+ ),
+ # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
+            # the 6mer distance. Recommended when a large number (> ~10,000) of
+            # sequences are input. Default: off
+ _Switch(
+ ["--parttree", "parttree"],
+ "Use a fast tree-building method with the 6mer "
+ "distance. Default: off",
+ ),
+ # The PartTree algorithm is used with distances based on DP. Slightly
+            # more accurate and slower than --parttree. Recommended when a large
+            # number (> ~10,000) of sequences are input. Default: off
+ _Switch(
+ ["--dpparttree", "dpparttree"],
+ "The PartTree algorithm is used with distances "
+ "based on DP. Default: off",
+ ),
+ # The PartTree algorithm is used with distances based on FASTA.
+            # Slightly more accurate and slower than --parttree. Recommended when
+            # a large number (> ~10,000) of sequences are input. FASTA is
+ # required. Default: off
+ _Switch(
+ ["--fastaparttree", "fastaparttree"],
+ "The PartTree algorithm is used with distances based "
+ "on FASTA. Default: off",
+ ),
+ # The number of partitions in the PartTree algorithm. Default: 50
+ _Option(
+ ["--partsize", "partsize"],
+ "The number of partitions in the PartTree algorithm. Default: 50",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # Do not make alignment larger than number sequences. Valid only with
+ # the --*parttree options. Default: the number of input sequences
+ _Switch(
+ ["--groupsize", "groupsize"],
+ "Do not make alignment larger than number sequences. "
+ "Default: the number of input sequences",
+ ),
+ # Adjust direction according to the first sequence
+ # Mafft V6 beta function
+ _Switch(
+ ["--adjustdirection", "adjustdirection"],
+ "Adjust direction according to the first sequence. Default off.",
+ ),
+ # Adjust direction according to the first sequence
+ # for highly diverged data; very slow
+ # Mafft V6 beta function
+ _Switch(
+ ["--adjustdirectionaccurately", "adjustdirectionaccurately"],
+ "Adjust direction according to the first sequence,"
+ "for highly diverged data; very slow"
+ "Default off.",
+ ),
+ # **** Parameter ****
+ # Gap opening penalty at group-to-group alignment. Default: 1.53
+ _Option(
+ ["--op", "op"],
+ "Gap opening penalty at group-to-group alignment. Default: 1.53",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Offset value, which works like gap extension penalty, for group-to-
+            # group alignment. Default: 0.123
+ _Option(
+ ["--ep", "ep"],
+ "Offset value, which works like gap extension penalty, "
+ "for group-to- group alignment. Default: 0.123",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Gap opening penalty at local pairwise alignment. Valid when the --
+ # localpair or --genafpair option is selected. Default: -2.00
+ _Option(
+ ["--lop", "lop"],
+ "Gap opening penalty at local pairwise alignment. Default: 0.123",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Offset value at local pairwise alignment. Valid when the --
+ # localpair or --genafpair option is selected. Default: 0.1
+ _Option(
+ ["--lep", "lep"],
+ "Offset value at local pairwise alignment. Default: 0.1",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Gap extension penalty at local pairwise alignment. Valid when the -
+ # -localpair or --genafpair option is selected. Default: -0.1
+ _Option(
+ ["--lexp", "lexp"],
+ "Gap extension penalty at local pairwise alignment. Default: -0.1",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Gap opening penalty to skip the alignment. Valid when the --
+ # genafpair option is selected. Default: -6.00
+ _Option(
+ ["--LOP", "LOP"],
+ "Gap opening penalty to skip the alignment. Default: -6.00",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # Gap extension penalty to skip the alignment. Valid when the --
+ # genafpair option is selected. Default: 0.00
+ _Option(
+ ["--LEXP", "LEXP"],
+ "Gap extension penalty to skip the alignment. Default: 0.00",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
+ # number=30, 45, 62 or 80. Default: 62
+ _Option(
+ ["--bl", "bl"],
+ "BLOSUM number matrix is used. Default: 62",
+ checker_function=lambda x: x in BLOSUM_MATRICES,
+ equate=False,
+ ),
+ # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
+ # Default: BLOSUM62
+ _Option(
+ ["--jtt", "jtt"],
+ "JTT PAM number (Jones et al. 1992) matrix is used. "
+ "number>0. Default: BLOSUM62",
+ equate=False,
+ ),
+ # Transmembrane PAM number (Jones et al. 1994) matrix is used.
+ # number>0. Default: BLOSUM62
+ _Option(
+ ["--tm", "tm"],
+ "Transmembrane PAM number (Jones et al. 1994) "
+ "matrix is used. number>0. Default: BLOSUM62",
+ filename=True, # to ensure spaced inputs are quoted
+ equate=False,
+ ),
+ # Use a user-defined AA scoring matrix. The format of matrixfile is
+ # the same to that of BLAST. Ignored when nucleotide sequences are
+ # input. Default: BLOSUM62
+ _Option(
+ ["--aamatrix", "aamatrix"],
+ "Use a user-defined AA scoring matrix. Default: BLOSUM62",
+ filename=True, # to ensure spaced inputs are quoted
+ equate=False,
+ ),
+ # Incorporate the AA/nuc composition information into the scoring
+ # matrix. Default: off
+ _Switch(
+ ["--fmodel", "fmodel"],
+ "Incorporate the AA/nuc composition information into "
+ "the scoring matrix (True) or not (False, default)",
+ ),
+ # **** Output ****
+ # Name length for CLUSTAL and PHYLIP format output
+ _Option(
+ ["--namelength", "namelength"],
+ """Name length in CLUSTAL and PHYLIP output.
+
+ MAFFT v6.847 (2011) added --namelength for use with
+ the --clustalout option for CLUSTAL output.
+
+ MAFFT v7.024 (2013) added support for this with the
+ --phylipout option for PHYLIP output (default 10).
+ """,
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # Output format: clustal format. Default: off (fasta format)
+ _Switch(
+ ["--clustalout", "clustalout"],
+ "Output format: clustal (True) or fasta (False, default)",
+ ),
+ # Output format: phylip format.
+ # Added in beta with v6.847, fixed in v6.850 (2011)
+ _Switch(
+ ["--phylipout", "phylipout"],
+ "Output format: phylip (True), or fasta (False, default)",
+ ),
+ # Output order: same as input. Default: on
+ _Switch(
+ ["--inputorder", "inputorder"],
+ "Output order: same as input (True, default) or alignment "
+ "based (False)",
+ ),
+ # Output order: aligned. Default: off (inputorder)
+ _Switch(
+ ["--reorder", "reorder"],
+ "Output order: aligned (True) or in input order (False, default)",
+ ),
+ # Guide tree is output to the input.tree file. Default: off
+ _Switch(
+ ["--treeout", "treeout"],
+ "Guide tree is output to the input.tree file (True) or "
+ "not (False, default)",
+ ),
+ # Do not report progress. Default: off
+ _Switch(
+ ["--quiet", "quiet"],
+ "Do not report progress (True) or not (False, default).",
+ ),
+ # **** Input ****
+            # Assume the sequences are nucleotide. Default: auto
+ _Switch(
+ ["--nuc", "nuc"],
+ "Assume the sequences are nucleotide (True/False). Default: auto",
+ ),
+            # Assume the sequences are amino acid. Default: auto
+ _Switch(
+ ["--amino", "amino"],
+ "Assume the sequences are amino acid (True/False). Default: auto",
+ ),
+ # MAFFT has multiple --seed commands where the unaligned input is
+ # aligned to the seed alignment. There can be multiple seeds in the
+ # form: "mafft --seed align1 --seed align2 [etc] input"
+ # Effectively for n number of seed alignments.
+ # TODO - Can we use class _ArgumentList here?
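+            # Note: as a single-valued _Option, this wrapper passes at most
+            # one --seed per command line; supporting several seeds would
+            # need something like the _ArgumentList idea above.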
+ _Option(
+ ["--seed", "seed"],
+ "Seed alignments given in alignment_n (fasta format) "
+ "are aligned with sequences in input.",
+ filename=True,
+ equate=False,
+ ),
+ # The input (must be FASTA format)
+ _Argument(["input"], "Input file name", filename=True, is_required=True),
+ # mafft-profile takes a second alignment input as an argument:
+ # mafft-profile align1 align2
+ _Argument(
+ ["input1"],
+ "Second input file name for the mafft-profile command",
+ filename=True,
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_Muscle.py b/code/lib/Bio/Align/Applications/_Muscle.py
new file mode 100644
index 0000000..6a67e2a
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Muscle.py
@@ -0,0 +1,685 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program MUSCLE."""
+
+
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class MuscleCommandline(AbstractCommandline):
+ r"""Command line wrapper for the multiple alignment program MUSCLE.
+
+ http://www.drive5.com/muscle/
+
+ Notes
+ -----
+ Last checked against version: 3.7, briefly against 3.8
+
+ References
+ ----------
+ Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
+ accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.
+
+ Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
+ reduced time and space complexity. BMC Bioinformatics 5(1): 113.
+
+ Examples
+ --------
+ >>> from Bio.Align.Applications import MuscleCommandline
+ >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe"
+ >>> in_file = r"C:\My Documents\unaligned.fasta"
+ >>> out_file = r"C:\My Documents\aligned.fasta"
+ >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
+ >>> print(muscle_cline)
+ "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"
+
+ You would typically run the command line with muscle_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
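+
+    As a minimal sketch (assuming muscle is installed and the input
+    file exists), the FASTA output file could then be parsed with::
+
+        stdout, stderr = muscle_cline()
+        from Bio import AlignIO
+        align = AlignIO.read(out_file, "fasta")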
+
+ """
+
+ def __init__(self, cmd="muscle", **kwargs):
+ """Initialize the class."""
+ CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
+ DISTANCE_MEASURES_ITER1 = [
+ "kmer6_6",
+ "kmer20_3",
+ "kmer20_4",
+ "kbit20_3",
+ "kmer4_6",
+ ]
+ DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [
+ "pctid_kimura",
+ "pctid_log",
+ ]
+ OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
+ TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
+
+        # The nucleotide arguments for the sequence type parameter in MUSCLE
+        # (-seqtype) were updated at some point in MUSCLE version 3.8. Prior to
+        # the update, 'nucleo' was used for nucleotide; this has since changed
+        # to 'rna' and 'dna'. 'nucleo' is kept for backwards compatibility with
+        # older MUSCLE versions.
+ SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"]
+ WEIGHTING_SCHEMES = [
+ "none",
+ "clustalw",
+ "henikoff",
+ "henikoffpb",
+ "gsc",
+ "threeway",
+ ]
+ self.parameters = [
+ # Can't use "in" as the final alias as this
+ # is a reserved word in python:
+ _Option(
+ ["-in", "in", "input"], "Input filename", filename=True, equate=False
+ ),
+ _Option(["-out", "out"], "Output filename", filename=True, equate=False),
+ _Switch(
+ ["-diags", "diags"], "Find diagonals (faster for similar sequences)"
+ ),
+ _Switch(["-profile", "profile"], "Perform a profile alignment"),
+ _Option(
+ ["-in1", "in1"],
+ "First input filename for profile alignment",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-in2", "in2"],
+ "Second input filename for a profile alignment",
+ filename=True,
+ equate=False,
+ ),
+ # anchorspacing Integer 32 Minimum spacing
+ # between anchor cols
+ _Option(
+ ["-anchorspacing", "anchorspacing"],
+ "Minimum spacing between anchor columns",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # center Floating point [1] Center parameter.
+ # Should be negative.
+ _Option(
+ ["-center", "center"],
+ "Center parameter - should be negative",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # cluster1 upgma upgmb Clustering method.
+ _Option(
+ ["-cluster1", "cluster1"],
+ "Clustering method used in iteration 1",
+ checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
+ equate=False,
+ ),
+ # cluster2 upgmb cluster1 is used
+ # neighborjoining in iteration 1 and
+ # 2, cluster2 in
+ # later iterations.
+ _Option(
+ ["-cluster2", "cluster2"],
+ "Clustering method used in iteration 2",
+ checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
+ equate=False,
+ ),
+ # diaglength Integer 24 Minimum length of
+ # diagonal.
+ _Option(
+ ["-diaglength", "diaglength"],
+ "Minimum length of diagonal",
+ checker_function=lambda x: isinstance(x, int),
+                equate=False,
+ ),
+ # diagmargin Integer 5 Discard this many
+ # positions at ends
+ # of diagonal.
+ _Option(
+ ["-diagmargin", "diagmargin"],
+ "Discard this many positions at ends of diagonal",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # distance1 kmer6_6 Kmer6_6(amino) or Distance measure
+ # kmer20_3 Kmer4_6(nucleo) for iteration 1
+ # kmer20_4
+ # kbit20_3
+ # kmer4_6
+ _Option(
+ ["-distance1", "distance1"],
+ "Distance measure for iteration 1",
+ checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
+ equate=False,
+ ),
+ # distance2 kmer6_6 pctid_kimura Distance measure
+ # kmer20_3 for iterations
+ # kmer20_4 2, 3 ...
+ # kbit20_3
+ # pctid_kimura
+ # pctid_log
+ _Option(
+ ["-distance2", "distance2"],
+ "Distance measure for iteration 2",
+ checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
+ equate=False,
+ ),
+ # gapextend Floating point [1] The gap extend score
+ _Option(
+ ["-gapextend", "gapextend"],
+ "Gap extension penalty",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # gapopen Floating point [1] The gap open score
+ # Must be negative.
+ _Option(
+ ["-gapopen", "gapopen"],
+ "Gap open score - negative number",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # hydro Integer 5 Window size for
+ # determining whether
+ # a region is
+ # hydrophobic.
+ _Option(
+ ["-hydro", "hydro"],
+ "Window size for hydrophobic region",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # hydrofactor Floating point 1.2 Multiplier for gap
+ # open/close
+ # penalties in
+ # hydrophobic regions
+ _Option(
+ ["-hydrofactor", "hydrofactor"],
+ "Multiplier for gap penalties in hydrophobic regions",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # log File name None. Log file name
+ # (delete existing
+ # file).
+ _Option(["-log", "log"], "Log file name", filename=True, equate=False),
+ # loga File name None. Log file name
+ # (append to existing
+ # file).
+ _Option(
+ ["-loga", "loga"],
+ "Log file name (append to existing file)",
+ filename=True,
+ equate=False,
+ ),
+ # matrix File name None. File name for
+ # substitution matrix
+ # in NCBI or WU-BLAST
+ # format. If you
+ # specify your own
+ # matrix, you should
+ # also specify:
+ # -gapopen
+ # -gapextend
+ # -center 0.0
+ _Option(
+ ["-matrix", "matrix"],
+ "path to NCBI or WU-BLAST format protein substitution "
+ "matrix - also set -gapopen, -gapextend and -center",
+ filename=True,
+ equate=False,
+ ),
+ # diagbreak Integer 1 Maximum distance
+ # between two
+ # diagonals that
+ # allows them to
+ # merge into one
+ # diagonal.
+ _Option(
+ ["-diagbreak", "diagbreak"],
+ "Maximum distance between two diagonals that allows "
+ "them to merge into one diagonal",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ _Option(
+ ["-maxdiagbreak", "maxdiagbreak"], # deprecated 3.8
+ "Deprecated in v3.8, use -diagbreak instead.",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # maxhours Floating point None. Maximum time to
+ # run in hours. The
+ # actual time may
+ # exceed requested
+ # limit by a few
+ # minutes. Decimals
+ # are allowed, so 1.5
+ # means one hour and
+ # 30 minutes.
+ _Option(
+ ["-maxhours", "maxhours"],
+ "Maximum time to run in hours",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # maxiters Integer 1, 2 ... 16 Maximum number of
+ # iterations.
+ _Option(
+ ["-maxiters", "maxiters"],
+ "Maximum number of iterations",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # maxtrees Integer 1 Maximum number of
+ # new trees to build
+ # in iteration 2.
+ _Option(
+ ["-maxtrees", "maxtrees"],
+ "Maximum number of trees to build in iteration 2",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # minbestcolscore Floating point [1] Minimum score a
+ # column must have to
+ # be an anchor.
+ _Option(
+ ["-minbestcolscore", "minbestcolscore"],
+ "Minimum score a column must have to be an anchor",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # minsmoothscore Floating point [1] Minimum smoothed
+ # score a column must
+ # have to be an
+ # anchor.
+ _Option(
+ ["-minsmoothscore", "minsmoothscore"],
+ "Minimum smoothed score a column must have to be an anchor",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # objscore sp spm Objective score
+ # ps used by tree
+ # dp dependent
+ # xp refinement.
+ # spf sp=sum-of-pairs
+ # spm score. (dimer
+ # approximation)
+ # spm=sp for < 100
+ # seqs, otherwise spf
+ # dp=dynamic
+ # programming score.
+ # ps=average profile-
+ # sequence score.
+ # xp=cross profile
+ # score.
+ _Option(
+ ["-objscore", "objscore"],
+ "Objective score used by tree dependent refinement",
+ checker_function=lambda x: x in OBJECTIVE_SCORES,
+ equate=False,
+ ),
+ # refinewindow Integer 200 Length of window
+ # for -refinew.
+ _Option(
+ ["-refinewindow", "refinewindow"],
+ "Length of window for -refinew",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # root1 pseudo pseudo Method used to root
+ _Option(
+ ["-root1", "root1"],
+ "Method used to root tree in iteration 1",
+ checker_function=lambda x: x in TREE_ROOT_METHODS,
+ equate=False,
+ ),
+ # root2 midlongestspan tree; root1 is
+ # minavgleafdist used in iteration 1
+ # and 2, root2 in
+ # later iterations.
+ _Option(
+ ["-root2", "root2"],
+ "Method used to root tree in iteration 2",
+ checker_function=lambda x: x in TREE_ROOT_METHODS,
+ equate=False,
+ ),
+ # scorefile File name None File name where to
+ # write a score file.
+ # This contains one
+ # line for each column
+ # in the alignment.
+ # The line contains
+ # the letters in the
+ # column followed by
+ # the average BLOSUM62
+ # score over pairs of
+ # letters in the
+ # column.
+ _Option(
+ ["-scorefile", "scorefile"],
+ "Score file name, contains one line for each column"
+ " in the alignment with average BLOSUM62 score",
+ filename=True,
+ equate=False,
+ ),
+ # seqtype protein auto Sequence type.
+ # dna (MUSCLE version > 3.8)
+ # rna (MUSCLE version > 3.8)
+ # auto
+ # nucleo (only valid for MUSCLE versions < 3.8)
+ _Option(
+ ["-seqtype", "seqtype"],
+ "Sequence type",
+ checker_function=lambda x: x in SEQUENCE_TYPES,
+ equate=False,
+ ),
+ # smoothscoreceil Floating point [1] Maximum value of
+ # column score for
+ # smoothing purposes.
+ _Option(
+ ["-smoothscoreceil", "smoothscoreceil"],
+ "Maximum value of column score for smoothing",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # smoothwindow Integer 7 Window used for
+ # anchor column
+ # smoothing.
+ _Option(
+ ["-smoothwindow", "smoothwindow"],
+ "Window used for anchor column smoothing",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ # spscore File name Compute SP
+ # objective score of
+ # multiple alignment.
+ _Option(
+ ["-spscore", "spscore"],
+ "Compute SP objective score of multiple alignment",
+ filename=True,
+ equate=False,
+ ),
+ # SUEFF Floating point value 0.1 Constant used in
+ # between 0 and 1. UPGMB clustering.
+ # Determines the
+ # relative fraction
+ # of average linkage
+ # (SUEFF) vs. nearest
+ # neighbor linkage
+            #                                  (1 - SUEFF).
+ _Option(
+ ["-sueff", "sueff"],
+ "Constant used in UPGMB clustering",
+ checker_function=lambda x: isinstance(x, float),
+ equate=False,
+ ),
+ # tree1 File name None Save tree
+ _Option(
+ ["-tree1", "tree1"], "Save Newick tree from iteration 1", equate=False
+ ),
+ # tree2 first or second
+ # iteration to given
+ # file in Newick
+ # (Phylip-compatible)
+ # format.
+ _Option(
+ ["-tree2", "tree2"], "Save Newick tree from iteration 2", equate=False
+ ),
+ # usetree File name None Use given tree as
+            #                                                  guide tree. Must be
+            #                                                  in Newick
+            #                                                  (Phylip-compatible)
+ # format.
+ _Option(
+ ["-usetree", "usetree"],
+ "Use given Newick tree as guide tree",
+ filename=True,
+ equate=False,
+ ),
+ # weight1 none|henikoff|henikoffpb|gsc|clustalw|threeway
+ # (default clustalw) Sequence weighting scheme;
+ # weight1 is used in iterations 1 and 2.
+ _Option(
+ ["-weight1", "weight1"],
+ "Weighting scheme used in iteration 1",
+ checker_function=lambda x: x in WEIGHTING_SCHEMES,
+ equate=False,
+ ),
+ # weight2 Same choices and default as weight1; weight2 is
+ # used for tree-dependent refinement.
+ # none=all sequences have equal weight.
+ # henikoff=Henikoff & Henikoff weighting scheme.
+ # henikoffpb=Modified Henikoff scheme as used in
+ # PSI-BLAST.
+ # clustalw=CLUSTALW method.
+ # threeway=Gotoh three-way method.
+ _Option(
+ ["-weight2", "weight2"],
+ "Weighting scheme used in iteration 2",
+ checker_function=lambda x: x in WEIGHTING_SCHEMES,
+ equate=False,
+ ),
+ # ################### FORMATS ####################################
+ # Multiple formats can be specified on the command line
+ # If -msf appears it will be used regardless of other formats
+ # specified. If -clw appears (and not -msf), clustalw format will
+ # be used regardless of other formats specified. If both -clw and
+ # -clwstrict are specified -clwstrict will be used regardless of
+ # other formats specified. If -fasta is specified and not -msf,
+ # -clw, or clwstrict, fasta will be used. If -fasta and -html are
+ # specified -fasta will be used. Only if -html is specified alone
+ # will html be used. I kid ye not.
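+ # For example (a sketch of the precedence rules above, using
+ # hypothetical input files):
+ # muscle -in in.fa -fasta -html -> FASTA output (fasta beats html)
+ # muscle -in in.fa -clw -clwstrict -> strict CLUSTALW output
+ # muscle -in in.fa -msf -clw -fasta -> MSF output (msf always wins)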
+ # clw no Write output in CLUSTALW format
+ # (default is FASTA).
+ _Switch(
+ ["-clw", "clw"],
+ "Write output in CLUSTALW format (with a MUSCLE header)",
+ ),
+ # clwstrict no Write output in CLUSTALW format with
+ # the "CLUSTAL W (1.81)" header rather
+ # than the MUSCLE version. This is
+ # useful when a post-processing step is
+ # picky about the file header.
+ _Switch(
+ ["-clwstrict", "clwstrict"],
+ "Write output in CLUSTALW format with version 1.81 header",
+ ),
+ # fasta yes Write output in FASTA format.
+ # Alternatives include clw,
+ # clwstrict, msf and html.
+ _Switch(["-fasta", "fasta"], "Write output in FASTA format"),
+ # html no Write output in HTML format (default
+ # is FASTA).
+ _Switch(["-html", "html"], "Write output in HTML format"),
+ # msf no Write output in MSF format (default
+ # is FASTA).
+ _Switch(["-msf", "msf"], "Write output in MSF format"),
+ # Phylip interleaved - undocumented as of 3.7
+ _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"),
+ # Phylip sequential - undocumented as of 3.7
+ _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"),
+ # ################# Additional specified output files #########
+ _Option(
+ ["-phyiout", "phyiout"],
+ "Write PHYLIP interleaved output to specified filename",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-physout", "physout"],
+ "Write PHYLIP sequential format to specified filename",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-htmlout", "htmlout"],
+ "Write HTML output to specified filename",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-clwout", "clwout"],
+ "Write CLUSTALW output (with MUSCLE header) to specified filename",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-clwstrictout", "clwstrictout"],
+ "Write CLUSTALW output (with version 1.81 header) to "
+ "specified filename",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-msfout", "msfout"],
+ "Write MSF format output to specified filename",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-fastaout", "fastaout"],
+ "Write FASTA format output to specified filename",
+ filename=True,
+ equate=False,
+ ),
+ # ############# END FORMATS ###################################
+ # anchors yes Use anchor optimization in tree
+ # dependent refinement iterations.
+ _Switch(
+ ["-anchors", "anchors"],
+ "Use anchor optimisation in tree dependent refinement iterations",
+ ),
+ # noanchors no Disable anchor optimization. Default
+ # is anchors.
+ _Switch(
+ ["-noanchors", "noanchors"],
+ "Do not use anchor optimisation in tree dependent "
+ "refinement iterations",
+ ),
+ # brenner no Use Steven Brenner's method for
+ # computing the root alignment.
+ _Switch(
+ ["-brenner", "brenner"], "Use Steve Brenner's root alignment method"
+ ),
+ # cluster no Perform fast clustering of input
+ # sequences. Use the tree1 option to
+ # save the tree.
+ _Switch(
+ ["-cluster", "cluster"],
+ "Perform fast clustering of input sequences, "
+ "use -tree1 to save tree",
+ ),
+ # dimer no Use dimer approximation for the
+ # SP score (faster, less accurate).
+ _Switch(
+ ["-dimer", "dimer"],
+ "Use faster (slightly less accurate) dimer approximation"
+ "for the SP score",
+ ),
+ # group yes Group similar sequences together
+ # in the output. This is the default.
+ # See also stable.
+ _Switch(["-group", "group"], "Group similar sequences in output"),
+ # ############# log-expectation profile score ####################
+ # One of either -le, -sp, or -sv
+ #
+ # According to the doc, spn is default and the only option for
+ # nucleotides: this doesn't appear to be true. -le, -sp, and -sv
+ # can be used and produce numerically different logs
+ # (what is going on?)
+ #
+ # spn fails on proteins
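+ #
+ # A sketch of typical usage (set one switch at a time; "in.fa" is a
+ # hypothetical input file):
+ # MuscleCommandline(input="in.fa", le=True) # protein (the default)
+ # MuscleCommandline(input="in.fa", spn=True) # nucleotide scoring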
+ # le maybe Use log-expectation profile score
+ # (VTML240). Alternatives are to use sp
+ # or sv. This is the default for amino
+ # acid sequences.
+ _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"),
+ # sv no Use sum-of-pairs profile score
+ # (VTML240). Default is le.
+ _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"),
+ # sp no Use sum-of-pairs protein profile
+ # score (PAM200). Default is le.
+ _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"),
+ # spn maybe Use sum-of-pairs nucleotide profile
+ # score (BLASTZ parameters). This is
+ # the only option for nucleotides,
+ # and is therefore the default.
+ _Switch(
+ ["-spn", "spn"], "Use sum-of-pairs protein nucleotide profile score"
+ ),
+ # ########## END log-expectation profile score ###################
+ # quiet no Do not display progress messages.
+ _Switch(["-quiet", "quiet"], "Do not display progress messages"),
+ # refine no Input file is already aligned, skip
+ # first two iterations and begin tree
+ # dependent refinement.
+ _Switch(["-refine", "refine"], "Only do tree dependent refinement"),
+ # refinew no Refine an alignment by dividing it
+ # into non-overlapping windows and
+ # re-aligning each window. Typically
+ # used for whole-genome nucleotide
+ # alignments.
+ _Switch(
+ ["-refinew", "refinew"],
+ "Only do tree dependent refinement using sliding window approach",
+ ),
+ # core yes in muscle, no in muscled. Do not catch
+ # exceptions.
+ _Switch(["-core", "core"], "Do not catch exceptions"),
+ # nocore no in muscle, yes in muscled. Catch exceptions
+ # and give an error message if possible.
+ _Switch(["-nocore", "nocore"], "Catch exceptions"),
+ # stable no Preserve input order of sequences
+ # in output file. Default is to group
+ # sequences by similarity (group).
+ _Switch(
+ ["-stable", "stable"],
+ "Do not group similar sequences in output (not supported in v3.8)",
+ ),
+ # termgaps4 yes Use 4-way test for treatment of
+ # terminal gaps.
+ # (Cannot be disabled in this version).
+ #
+ # termgapsfull no Terminal gaps penalized with
+ # full penalty. [1] Not fully
+ # supported in this version
+ #
+ # termgapshalf yes Terminal gaps penalized with
+ # half penalty. [1] Not fully
+ # supported in this version
+ #
+ # termgapshalflonger no Terminal gaps penalized with
+ # half penalty if gap relative
+ # to longer sequence, otherwise with
+ # full penalty. [1] Not fully
+ # supported in this version
+ #
+ # verbose no Write parameter settings and
+ # progress messages to log file.
+ _Switch(["-verbose", "verbose"], "Write parameter settings and progress"),
+ # version no Write version string to
+ # stdout and exit
+ _Switch(["-version", "version"], "Write version string to stdout and exit"),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_Prank.py b/code/lib/Bio/Align/Applications/_Prank.py
new file mode 100644
index 0000000..4d07c56
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Prank.py
@@ -0,0 +1,236 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program PRANK."""
+
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class PrankCommandline(AbstractCommandline):
+ """Command line wrapper for the multiple alignment program PRANK.
+
+ http://www.ebi.ac.uk/goldman-srv/prank/prank/
+
+ Notes
+ -----
+ Last checked against version: 081202
+
+ References
+ ----------
+ Loytynoja, A. and Goldman, N. 2005. An algorithm for progressive
+ multiple alignment of sequences with insertions. Proceedings of
+ the National Academy of Sciences, 102: 10557--10562.
+
+ Loytynoja, A. and Goldman, N. 2008. Phylogeny-aware gap placement
+ prevents errors in sequence alignment and evolutionary analysis.
+ Science, 320: 1632.
+
+ Examples
+ --------
+ To align a FASTA file (unaligned.fasta) with the output in aligned
+ FASTA format with the output filename starting with "aligned" (you
+ can't pick the filename explicitly), no tree output and no XML output,
+ use:
+
+ >>> from Bio.Align.Applications import PrankCommandline
+ >>> prank_cline = PrankCommandline(d="unaligned.fasta",
+ ... o="aligned", # prefix only!
+ ... f=8, # FASTA output
+ ... notree=True, noxml=True)
+ >>> print(prank_cline)
+ prank -d=unaligned.fasta -o=aligned -f=8 -noxml -notree
+
+ You would typically run the command line with prank_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
+
+ """
+
+ def __init__(self, cmd="prank", **kwargs):
+ """Initialize the class."""
+ OUTPUT_FORMAT_VALUES = list(range(1, 18))
+ self.parameters = [
+ # ################# input/output parameters: ##################
+ # -d=sequence_file
+ _Option(["-d", "d"], "Input filename", filename=True, is_required=True),
+ # -t=tree_file [default: no tree, generate approximate NJ tree]
+ _Option(["-t", "t"], "Input guide tree filename", filename=True),
+ # -tree="tree_string" [tree in newick format; in double quotes]
+ _Option(["-tree", "tree"], "Input guide tree as Newick string"),
+ # -m=model_file [default: HKY2/WAG]
+ _Option(
+ ["-m", "m"], "User-defined alignment model filename. Default: HKY2/WAG"
+ ),
+ # -o=output_file [default: 'output']
+ _Option(
+ ["-o", "o"],
+ "Output filenames prefix. Default: 'output'\n "
+ "Will write: output.?.fas (depending on requested "
+ "format), output.?.xml and output.?.dnd",
+ filename=True,
+ ),
+ # -f=output_format [default: 8]
+ _Option(
+ ["-f", "f"],
+ "Output alignment format. Default: 8 FASTA\n"
+ "Option are:\n"
+ "1. IG/Stanford 8. Pearson/Fasta\n"
+ "2. GenBank/GB 11. Phylip3.2\n"
+ "3. NBRF 12. Phylip\n"
+ "4. EMBL 14. PIR/CODATA\n"
+ "6. DNAStrider 15. MSF\n"
+ "7. Fitch 17. PAUP/NEXUS",
+ checker_function=lambda x: x in OUTPUT_FORMAT_VALUES,
+ ),
+ _Switch(
+ ["-noxml", "noxml"],
+ "Do not output XML files (PRANK versions earlier than v.120626)",
+ ),
+ _Switch(
+ ["-notree", "notree"],
+ "Do not output dnd tree files (PRANK versions earlier than v.120626)",
+ ),
+ _Switch(
+ ["-showxml", "showxml"], "Output XML files (PRANK v.120626 and later)"
+ ),
+ _Switch(
+ ["-showtree", "showtree"],
+ "Output dnd tree files (PRANK v.120626 and later)",
+ ),
+ _Switch(["-shortnames", "shortnames"], "Truncate names at first space"),
+ _Switch(["-quiet", "quiet"], "Reduce verbosity"),
+ # ###################### model parameters: ######################
+ # +F [force insertions to be always skipped]
+ # -F [equivalent]
+ _Switch(
+ ["-F", "+F", "F"], "Force insertions to be always skipped: same as +F"
+ ),
+ # -dots [show insertion gaps as dots]
+ _Switch(["-dots", "dots"], "Show insertion gaps as dots"),
+ # -gaprate=# [gap opening rate; default: dna 0.025 / prot 0.0025]
+ _Option(
+ ["-gaprate", "gaprate"],
+ "Gap opening rate. Default: dna 0.025 prot 0.0025",
+ checker_function=lambda x: isinstance(x, float),
+ ),
+ # -gapext=# [gap extension probability; default: dna 0.5 / prot 0.5]
+ _Option(
+ ["-gapext", "gapext"],
+ "Gap extension probability. Default: dna 0.5 / prot 0.5",
+ checker_function=lambda x: isinstance(x, float),
+ ),
+ # -dnafreqs=#,#,#,# [ACGT; default: empirical]
+ _Option(
+ ["-dnafreqs", "dnafreqs"],
+ "DNA frequencies - 'A,C,G,T'. eg '25,25,25,25' as a quote "
+ "surrounded string value. Default: empirical",
+ checker_function=lambda x: isinstance(x, bytes),
+ ),
+ # -kappa=# [ts/tv rate ratio; default:2]
+ _Option(
+ ["-kappa", "kappa"],
+ "Transition/transversion ratio. Default: 2",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ # -rho=# [pur/pyr rate ratio; default:1]
+ _Option(
+ ["-rho", "rho"],
+ "Purine/pyrimidine ratio. Default: 1",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ # -codon [for DNA: use empirical codon model]
+ _Switch(["-codon", "codon"], "Codon aware alignment or not"),
+ # -termgap [penalise terminal gaps normally]
+ _Switch(["-termgap", "termgap"], "Penalise terminal gaps normally"),
+ # ############### other parameters: ################################
+ # -nopost [do not compute posterior support; default: compute]
+ _Switch(
+ ["-nopost", "nopost"],
+ "Do not compute posterior support. Default: compute",
+ ),
+ # -pwdist=# [expected pairwise distance for computing guidetree;
+ # default: dna 0.25 / prot 0.5]
+ _Option(
+ ["-pwdist", "pwdist"],
+ "Expected pairwise distance for computing guidetree. "
+ "Default: dna 0.25 / prot 0.5",
+ checker_function=lambda x: isinstance(x, float),
+ ),
+ _Switch(
+ ["-once", "once"], "Run only once. Default: twice if no guidetree given"
+ ),
+ _Switch(["-twice", "twice"], "Always run twice"),
+ _Switch(["-skipins", "skipins"], "Skip insertions in posterior support"),
+ _Switch(
+ ["-uselogs", "uselogs"],
+ "Slower but should work for a greater number of sequences",
+ ),
+ _Switch(["-writeanc", "writeanc"], "Output ancestral sequences"),
+ _Switch(
+ ["-printnodes", "printnodes"], "Output each node; mostly for debugging"
+ ),
+ # -matresize=# [matrix resizing multiplier]
+ # Doesn't specify type but Float and Int work
+ _Option(
+ ["-matresize", "matresize"],
+ "Matrix resizing multiplier",
+ checker_function=lambda x: (isinstance(x, float) or isinstance(x, int)),
+ ),
+ # -matinitsize=# [matrix initial size multiplier]
+ # Doesn't specify type but Float and Int work
+ _Option(
+ ["-matinitsize", "matinitsize"],
+ "Matrix initial size multiplier",
+ checker_function=lambda x: (isinstance(x, float) or isinstance(x, int)),
+ ),
+ _Switch(["-longseq", "longseq"], "Save space in pairwise alignments"),
+ _Switch(["-pwgenomic", "pwgenomic"], "Do pairwise alignment, no guidetree"),
+ # -pwgenomicdist=# [distance for pairwise alignment; default: 0.3]
+ _Option(
+ ["-pwgenomicdist", "pwgenomicdist"],
+ "Distance for pairwise alignment. Default: 0.3",
+ checker_function=lambda x: isinstance(x, float),
+ ),
+ # -scalebranches=# [scale branch lengths; default: dna 1 / prot 2]
+ _Option(
+ ["-scalebranches", "scalebranches"],
+ "Scale branch lengths. Default: dna 1 / prot 2",
+ checker_function=lambda x: isinstance(x, int),
+ ),
+ # -fixedbranches=# [use fixed branch lengths]
+ # Assume looking for a float
+ _Option(
+ ["-fixedbranches", "fixedbranches"],
+ "Use fixed branch lengths of input value",
+ checker_function=lambda x: isinstance(x, float),
+ ),
+ # -maxbranches=# [set maximum branch length]
+ # Assume looking for a float
+ _Option(
+ ["-maxbranches", "maxbranches"],
+ "Use maximum branch lengths of input value",
+ checker_function=lambda x: isinstance(x, float),
+ ),
+ # -realbranches [disable branch length truncation]
+ _Switch(
+ ["-realbranches", "realbranches"], "Disable branch length truncation"
+ ),
+ _Switch(["-translate", "translate"], "Translate to protein"),
+ _Switch(
+ ["-mttranslate", "mttranslate"], "Translate to protein using mt table"
+ ),
+ # ##################### other: ####################
+ _Switch(
+ ["-convert", "convert"],
+ "Convert input alignment to new format. Do not perform alignment",
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_Probcons.py b/code/lib/Bio/Align/Applications/_Probcons.py
new file mode 100644
index 0000000..e94e026
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_Probcons.py
@@ -0,0 +1,137 @@
+# Copyright 2009 by Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program PROBCONS."""
+
+from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
+
+
+class ProbconsCommandline(AbstractCommandline):
+ """Command line wrapper for the multiple alignment program PROBCONS.
+
+ http://probcons.stanford.edu/
+
+ Notes
+ -----
+ Last checked against version: 1.12
+
+ References
+ ----------
+ Do, C.B., Mahabhashyam, M.S.P., Brudno, M., and Batzoglou, S. 2005.
+ PROBCONS: Probabilistic Consistency-based Multiple Sequence Alignment.
+ Genome Research 15: 330-340.
+
+ Examples
+ --------
+ To align a FASTA file (unaligned.fasta) with the output in ClustalW
+ format, and otherwise default settings, use:
+
+ >>> from Bio.Align.Applications import ProbconsCommandline
+ >>> probcons_cline = ProbconsCommandline(input="unaligned.fasta",
+ ... clustalw=True)
+ >>> print(probcons_cline)
+ probcons -clustalw unaligned.fasta
+
+ You would typically run the command line with probcons_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
+
+ Note that PROBCONS will write the alignment to stdout, which you may
+ want to save to a file and then parse, e.g.::
+
+ stdout, stderr = probcons_cline()
+ with open("aligned.aln", "w") as handle:
+ handle.write(stdout)
+ from Bio import AlignIO
+ align = AlignIO.read("aligned.fasta", "clustalw")
+
+ Alternatively, to parse the output with AlignIO directly you can
+ use StringIO to turn the string into a handle::
+
+ stdout, stderr = probcons_cline()
+ from io import StringIO
+ from Bio import AlignIO
+ align = AlignIO.read(StringIO(stdout), "clustal")
+
+ """
+
+ def __init__(self, cmd="probcons", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # Note that some options cannot be assigned via properties using the
+ # original documented option (because hyphens are not valid for names in
+ # python), e.g. cmdline.pre-training = 3 will not work.
+ # In these cases the shortened option name should be used:
+ # cmdline.pre = 3
+ _Switch(
+ ["-clustalw", "clustalw"], "Use CLUSTALW output format instead of MFA"
+ ),
+ _Option(
+ ["-c", "c", "--consistency", "consistency"],
+ "Use 0 <= REPS <= 5 (default: 2) passes of consistency transformation",
+ checker_function=lambda x: x in range(0, 6),
+ equate=False,
+ ),
+ _Option(
+ ["-ir", "--iterative-refinement", "iterative-refinement", "ir"],
+ "Use 0 <= REPS <= 1000 (default: 100) passes of iterative-refinement",
+ checker_function=lambda x: x in range(0, 1001),
+ equate=False,
+ ),
+ _Option(
+ ["-pre", "--pre-training", "pre-training", "pre"],
+ "Use 0 <= REPS <= 20 (default: 0) rounds of pretraining",
+ checker_function=lambda x: x in range(0, 21),
+ equate=False,
+ ),
+ _Switch(["-pairs", "pairs"], "Generate all-pairs pairwise alignments"),
+ _Switch(
+ ["-viterbi", "viterbi"],
+ "Use Viterbi algorithm to generate all pairs "
+ "(automatically enables -pairs)",
+ ),
+ _Switch(
+ ["-verbose", "verbose"], "Report progress while aligning (default: off)"
+ ),
+ _Option(
+ ["-annot", "annot"],
+ "Write annotation for multiple alignment to FILENAME",
+ equate=False,
+ ),
+ _Option(
+ ["-t", "t", "--train", "train"],
+ "Compute EM transition probabilities, store in FILENAME "
+ "(default: no training)",
+ equate=False,
+ ),
+ _Switch(
+ ["-e", "e", "--emissions", "emissions"],
+ "Also reestimate emission probabilities (default: off)",
+ ),
+ _Option(
+ ["-p", "p", "--paramfile", "paramfile"],
+ "Read parameters from FILENAME",
+ equate=False,
+ ),
+ _Switch(
+ ["-a", "--alignment-order", "alignment-order", "a"],
+ "Print sequences in alignment order rather than input "
+ "order (default: off)",
+ ),
+ # Input file name
+ _Argument(
+ ["input"],
+ "Input file name. Must be multiple FASTA alignment (MFA) format",
+ filename=True,
+ is_required=True,
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/_TCoffee.py b/code/lib/Bio/Align/Applications/_TCoffee.py
new file mode 100644
index 0000000..de337bc
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/_TCoffee.py
@@ -0,0 +1,125 @@
+# Copyright 2009 by Cymon J. Cox and Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Command line wrapper for the multiple alignment program TCOFFEE."""
+
+
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class TCoffeeCommandline(AbstractCommandline):
+ """Commandline object for the TCoffee alignment program.
+
+ http://www.tcoffee.org/Projects_home_page/t_coffee_home_page.html
+
+ The T-Coffee command line tool has a lot of switches and options.
+ This wrapper implements a VERY limited number of options - if you
+ would like to help improve it please get in touch.
+
+ Notes
+ -----
+ Last checked against: Version_6.92
+
+ References
+ ----------
+ T-Coffee: A novel method for multiple sequence alignments.
+ Notredame, Higgins, Heringa, JMB,302(205-217) 2000
+
+ Examples
+ --------
+ To align a FASTA file (unaligned.fasta) with the output in ClustalW
+ format (file aligned.aln), and otherwise default settings, use:
+
+ >>> from Bio.Align.Applications import TCoffeeCommandline
+ >>> tcoffee_cline = TCoffeeCommandline(infile="unaligned.fasta",
+ ... output="clustalw",
+ ... outfile="aligned.aln")
+ >>> print(tcoffee_cline)
+ t_coffee -output clustalw -infile unaligned.fasta -outfile aligned.aln
+
+ You would typically run the command line with tcoffee_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
+
+ """
+
+ SEQ_TYPES = ["dna", "protein", "dna_protein"]
+
+ def __init__(self, cmd="t_coffee", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-output", "output"],
+ """Specify the output type.
+
+ One (or more separated by a comma) of:
+ 'clustalw_aln', 'clustalw', 'gcg', 'msf_aln',
+ 'pir_aln', 'fasta_aln', 'phylip', 'pir_seq', 'fasta_seq'
+ """,
+ equate=False,
+ ),
+ _Option(
+ ["-infile", "infile"],
+ "Specify the input file.",
+ filename=True,
+ is_required=True,
+ equate=False,
+ ),
+ # Indicates the name of the alignment output by t_coffee. If the
+ # default is used, the alignment is named <your sequences>.aln
+ _Option(
+ ["-outfile", "outfile"],
+ "Specify the output file. Default: .aln",
+ filename=True,
+ equate=False,
+ ),
+ _Switch(
+ ["-convert", "convert"], "Specify you want to perform a file conversion"
+ ),
+ _Option(
+ ["-type", "type"],
+ "Specify the type of sequence being aligned",
+ checker_function=lambda x: x in self.SEQ_TYPES,
+ equate=False,
+ ),
+ _Option(
+ ["-outorder", "outorder"],
+ "Specify the order of sequence to output"
+ "Either 'input', 'aligned' or of "
+ "Fasta file with sequence order",
+ equate=False,
+ ),
+ _Option(
+ ["-matrix", "matrix"],
+ "Specify the filename of the substitution matrix to use. "
+ "Default: blosum62mt",
+ equate=False,
+ ),
+ _Option(
+ ["-gapopen", "gapopen"],
+ "Indicates the penalty applied for opening a gap (negative integer)",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ _Option(
+ ["-gapext", "gapext"],
+ "Indicates the penalty applied for extending a gap (negative integer)",
+ checker_function=lambda x: isinstance(x, int),
+ equate=False,
+ ),
+ _Switch(["-quiet", "quiet"], "Turn off log output"),
+ _Option(
+ ["-mode", "mode"],
+ "Specifies a special mode: genome, quickaln, dali, 3dcoffee",
+ equate=False,
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/Applications/__init__.py b/code/lib/Bio/Align/Applications/__init__.py
new file mode 100644
index 0000000..778a7dd
--- /dev/null
+++ b/code/lib/Bio/Align/Applications/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2009 by Peter Cock & Cymon J. Cox. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Alignment command line tool wrappers (OBSOLETE).
+
+We have decided to remove this module in a future release, and instead
+recommend building your command line and invoking it via the subprocess
+module directly.
+"""
+
+from ._Muscle import MuscleCommandline
+from ._Clustalw import ClustalwCommandline
+from ._ClustalOmega import ClustalOmegaCommandline
+from ._Prank import PrankCommandline
+from ._Mafft import MafftCommandline
+from ._Dialign import DialignCommandline
+from ._Probcons import ProbconsCommandline
+from ._TCoffee import TCoffeeCommandline
+from ._MSAProbs import MSAProbsCommandline
+
+# Make this explicit, then they show up in the API docs
+__all__ = (
+ "MuscleCommandline",
+ "ClustalwCommandline",
+ "ClustalOmegaCommandline",
+ "PrankCommandline",
+ "MafftCommandline",
+ "DialignCommandline",
+ "ProbconsCommandline",
+ "TCoffeeCommandline",
+ "MSAProbsCommandline",
+)
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_ClustalOmega.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_ClustalOmega.cpython-37.pyc
new file mode 100644
index 0000000..047e02a
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_ClustalOmega.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Clustalw.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Clustalw.cpython-37.pyc
new file mode 100644
index 0000000..5cfed83
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Clustalw.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Dialign.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Dialign.cpython-37.pyc
new file mode 100644
index 0000000..48be4d1
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Dialign.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_MSAProbs.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_MSAProbs.cpython-37.pyc
new file mode 100644
index 0000000..9a47d9c
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_MSAProbs.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Mafft.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Mafft.cpython-37.pyc
new file mode 100644
index 0000000..d7dc9b7
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Mafft.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Muscle.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Muscle.cpython-37.pyc
new file mode 100644
index 0000000..1fc62ff
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Muscle.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Prank.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Prank.cpython-37.pyc
new file mode 100644
index 0000000..191a273
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Prank.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_Probcons.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_Probcons.cpython-37.pyc
new file mode 100644
index 0000000..a0b18ca
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_Probcons.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/_TCoffee.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/_TCoffee.cpython-37.pyc
new file mode 100644
index 0000000..a2f271d
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/_TCoffee.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/Applications/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Align/Applications/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..5912300
Binary files /dev/null and b/code/lib/Bio/Align/Applications/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/__init__.py b/code/lib/Bio/Align/__init__.py
new file mode 100644
index 0000000..ac5b1cd
--- /dev/null
+++ b/code/lib/Bio/Align/__init__.py
@@ -0,0 +1,2326 @@
+# Copyright 2000, 2004 by Brad Chapman.
+# Revisions copyright 2010-2013, 2015-2018 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for dealing with sequence alignments.
+
+One of the most important things in this module is the MultipleSeqAlignment
+class, used in the Bio.AlignIO module.
+
+"""
+
+import sys
+
+from Bio.Align import _aligners
+from Bio.Align import substitution_matrices
+from Bio.Seq import Seq, MutableSeq, reverse_complement, UndefinedSequenceError
+from Bio.SeqRecord import SeqRecord, _RestrictedDict
+
+# Import errors may occur here if a compiled aligners.c file
+# (_aligners.pyd or _aligners.so) is missing or if the user is
+# importing from within the Biopython source tree, see PR #2007:
+# https://github.com/biopython/biopython/pull/2007
+
+
+class MultipleSeqAlignment:
+ """Represents a classical multiple sequence alignment (MSA).
+
+ By this we mean a collection of sequences (usually shown as rows) which
+ are all the same length (usually with gap characters for insertions or
+ padding). The data can then be regarded as a matrix of letters, with well
+ defined columns.
+
+ You would typically create an MSA by loading an alignment file with the
+ AlignIO module:
+
+ >>> from Bio import AlignIO
+ >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
+ >>> print(align)
+ Alignment with 7 rows and 156 columns
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
+ TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
+
+ In some respects you can treat these objects as lists of SeqRecord objects,
+ each representing a row of the alignment. Iterating over an alignment gives
+ the SeqRecord object for each row:
+
+ >>> len(align)
+ 7
+ >>> for record in align:
+ ... print("%s %i" % (record.id, len(record)))
+ ...
+ gi|6273285|gb|AF191659.1|AF191 156
+ gi|6273284|gb|AF191658.1|AF191 156
+ gi|6273287|gb|AF191661.1|AF191 156
+ gi|6273286|gb|AF191660.1|AF191 156
+ gi|6273290|gb|AF191664.1|AF191 156
+ gi|6273289|gb|AF191663.1|AF191 156
+ gi|6273291|gb|AF191665.1|AF191 156
+
+ You can also access individual rows as SeqRecord objects via their index:
+
+ >>> print(align[0].id)
+ gi|6273285|gb|AF191659.1|AF191
+ >>> print(align[-1].id)
+ gi|6273291|gb|AF191665.1|AF191
+
+ And extract columns as strings:
+
+ >>> print(align[:, 1])
+ AAAAAAA
+
+ Or, take just the first ten columns as a sub-alignment:
+
+ >>> print(align[:, :10])
+ Alignment with 7 rows and 10 columns
+ TATACATTAA gi|6273285|gb|AF191659.1|AF191
+ TATACATTAA gi|6273284|gb|AF191658.1|AF191
+ TATACATTAA gi|6273287|gb|AF191661.1|AF191
+ TATACATAAA gi|6273286|gb|AF191660.1|AF191
+ TATACATTAA gi|6273290|gb|AF191664.1|AF191
+ TATACATTAA gi|6273289|gb|AF191663.1|AF191
+ TATACATTAA gi|6273291|gb|AF191665.1|AF191
+
+ Combining this alignment slicing with alignment addition allows you to
+ remove a section of the alignment. For example, taking just the first
+ and last ten columns:
+
+ >>> print(align[:, :10] + align[:, -10:])
+ Alignment with 7 rows and 20 columns
+ TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191
+ TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191
+ TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191
+ TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191
+ TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191
+ TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191
+ TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191
+
+ Note - This object replaced the older Alignment object defined in module
+ Bio.Align.Generic but is not fully backwards compatible with it.
+
+ Note - This object does NOT attempt to model the kind of alignments used
+ in next generation sequencing with multiple sequencing reads which are
+ much shorter than the alignment, and where there is usually a consensus or
+ reference sequence with special status.
+ """
+
+ def __init__(
+ self, records, alphabet=None, annotations=None, column_annotations=None
+ ):
+ """Initialize a new MultipleSeqAlignment object.
+
+ Arguments:
+ - records - A list (or iterator) of SeqRecord objects, whose
+ sequences are all the same length. This may be an
+ empty list.
+ - alphabet - For backward compatibility only; its value should always
+ be None.
+ - annotations - Information about the whole alignment (dictionary).
+ - column_annotations - Per column annotation (restricted dictionary).
+ This holds Python sequences (lists, strings, tuples)
+ whose length matches the number of columns. A typical
+ use would be a secondary structure consensus string.
+
+ You would normally load a MSA from a file using Bio.AlignIO, but you
+ can do this from a list of SeqRecord objects too:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
+ >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
+ >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
+ >>> align = MultipleSeqAlignment([a, b, c],
+ ... annotations={"tool": "demo"},
+ ... column_annotations={"stats": "CCCXCCC"})
+ >>> print(align)
+ Alignment with 3 rows and 7 columns
+ AAAACGT Alpha
+ AAA-CGT Beta
+ AAAAGGT Gamma
+ >>> align.annotations
+ {'tool': 'demo'}
+ >>> align.column_annotations
+ {'stats': 'CCCXCCC'}
+ """
+ if alphabet is not None:
+ raise ValueError("The alphabet argument is no longer supported")
+
+ self._records = []
+ if records:
+ self.extend(records)
+
+ # Annotations about the whole alignment
+ if annotations is None:
+ annotations = {}
+ elif not isinstance(annotations, dict):
+ raise TypeError("annotations argument should be a dict")
+ self.annotations = annotations
+
+ # Annotations about each column of the alignment
+ if column_annotations is None:
+ column_annotations = {}
+ # Handle this via the property set function which will validate it
+ self.column_annotations = column_annotations
+
+ def _set_per_column_annotations(self, value):
+ if not isinstance(value, dict):
+ raise TypeError(
+ "The per-column-annotations should be a (restricted) dictionary."
+ )
+ # Turn this into a restricted-dictionary (and check the entries)
+ if len(self):
+ # Use the standard method to get the length
+ expected_length = self.get_alignment_length()
+ self._per_col_annotations = _RestrictedDict(length=expected_length)
+ self._per_col_annotations.update(value)
+ else:
+ # Bit of a problem case... number of columns is undefined
+ self._per_col_annotations = None
+ if value:
+ raise ValueError(
+ "Can't set per-column-annotations without an alignment"
+ )
+
+ def _get_per_column_annotations(self):
+ if self._per_col_annotations is None:
+ # This happens if empty at initialisation
+ if len(self):
+ # Use the standard method to get the length
+ expected_length = self.get_alignment_length()
+ else:
+ # Should this raise an exception? Compare SeqRecord behaviour...
+ expected_length = 0
+ self._per_col_annotations = _RestrictedDict(length=expected_length)
+ return self._per_col_annotations
+
+ column_annotations = property(
+ fget=_get_per_column_annotations,
+ fset=_set_per_column_annotations,
+ doc="""Dictionary of per-letter-annotation for the sequence.""",
+ )
+
+ def _str_line(self, record, length=50):
+ """Return a truncated string representation of a SeqRecord (PRIVATE).
+
+ This is a PRIVATE function used by the __str__ method.
+ """
+ if record.seq.__class__.__name__ == "CodonSeq":
+ if len(record.seq) <= length:
+ return "%s %s" % (record.seq, record.id)
+ else:
+ return "%s...%s %s" % (
+ record.seq[: length - 3],
+ record.seq[-3:],
+ record.id,
+ )
+ else:
+ if len(record.seq) <= length:
+ return "%s %s" % (record.seq, record.id)
+ else:
+ return "%s...%s %s" % (
+ record.seq[: length - 6],
+ record.seq[-3:],
+ record.id,
+ )
+
+ def __str__(self):
+ """Return a multi-line string summary of the alignment.
+
+ This output is intended to be readable, but large alignments are
+ shown truncated. A maximum of 20 rows (sequences) and 50 columns
+ are shown, with the record identifiers. This should fit nicely on a
+ single screen. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha")
+ >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta")
+ >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma")
+ >>> align = MultipleSeqAlignment([a, b, c])
+ >>> print(align)
+ Alignment with 3 rows and 12 columns
+ ACTGCTAGCTAG Alpha
+ ACT-CTAGCTAG Beta
+ ACTGCTAGATAG Gamma
+
+ See also the alignment's format method.
+ """
+ rows = len(self._records)
+ lines = [
+ "Alignment with %i rows and %i columns"
+ % (rows, self.get_alignment_length())
+ ]
+ if rows <= 20:
+ lines.extend(self._str_line(rec) for rec in self._records)
+ else:
+ lines.extend(self._str_line(rec) for rec in self._records[:18])
+ lines.append("...")
+ lines.append(self._str_line(self._records[-1]))
+ return "\n".join(lines)
+
+ def __repr__(self):
+ """Return a representation of the object for debugging.
+
+ The representation cannot be used with eval() to recreate the object,
+ which is usually possible with simple python objects. For example:
+
+ <Bio.Align.MultipleSeqAlignment instance (2 records of length 14) at a3c184c>
+
+ The hex string is the memory address of the object, see help(id).
+ This provides a simple way to visually distinguish alignments of
+ the same size.
+ """
+ # A doctest for __repr__ would be nice, but __class__ comes out differently
+ # if run via the __main__ trick.
+ return "<%s instance (%i records of length %i) at %x>" % (
+ self.__class__,
+ len(self._records),
+ self.get_alignment_length(),
+ id(self),
+ )
+ # This version is useful for doing eval(repr(alignment)),
+ # but it can be VERY long:
+ # return "%s(%r)" \
+ # % (self.__class__, self._records)
+
+ def __format__(self, format_spec):
+ """Return the alignment as a string in the specified file format.
+
+ The format should be a lower case string supported as an output
+ format by Bio.AlignIO (such as "fasta", "clustal", "phylip",
+ "stockholm", etc), which is used to turn the alignment into a
+ string.
+
+ e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha", description="")
+ >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta", description="")
+ >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma", description="")
+ >>> align = MultipleSeqAlignment([a, b, c])
+ >>> print(format(align, "fasta"))
+ >Alpha
+ ACTGCTAGCTAG
+ >Beta
+ ACT-CTAGCTAG
+ >Gamma
+ ACTGCTAGATAG
+
+ >>> print(format(align, "phylip"))
+ 3 12
+ Alpha ACTGCTAGCT AG
+ Beta ACT-CTAGCT AG
+ Gamma ACTGCTAGAT AG
+
+ """
+ if format_spec:
+ from io import StringIO
+ from Bio import AlignIO
+
+ handle = StringIO()
+ AlignIO.write([self], handle, format_spec)
+ return handle.getvalue()
+ else:
+ # Follow python convention and default to using __str__
+ return str(self)
+
+ def __iter__(self):
+ """Iterate over alignment rows as SeqRecord objects.
+
+ e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha")
+ >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta")
+ >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma")
+ >>> align = MultipleSeqAlignment([a, b, c])
+ >>> for record in align:
+ ... print(record.id)
+ ... print(record.seq)
+ ...
+ Alpha
+ ACTGCTAGCTAG
+ Beta
+ ACT-CTAGCTAG
+ Gamma
+ ACTGCTAGATAG
+ """
+ return iter(self._records)
+
+ def __len__(self):
+ """Return the number of sequences in the alignment.
+
+ Use len(alignment) to get the number of sequences (i.e. the number of
+ rows), and alignment.get_alignment_length() to get the length of the
+ longest sequence (i.e. the number of columns).
+
+ This is easy to remember if you think of the alignment as being like a
+ list of SeqRecord objects.
+ """
+ return len(self._records)
+
+ def get_alignment_length(self):
+ """Return the maximum length of the alignment.
+
+ All objects in the alignment should (hopefully) have the same
+ length. This function will go through and find this length
+ by finding the maximum length of sequences in the alignment.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha")
+ >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta")
+ >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma")
+ >>> align = MultipleSeqAlignment([a, b, c])
+ >>> align.get_alignment_length()
+ 12
+
+ If you want to know the number of sequences in the alignment,
+ use len(align) instead:
+
+ >>> len(align)
+ 3
+
+ """
+ max_length = 0
+
+ for record in self._records:
+ if len(record.seq) > max_length:
+ max_length = len(record.seq)
+
+ return max_length
+
+ def extend(self, records):
+ """Add more SeqRecord objects to the alignment as rows.
+
+ They must all have the same length as the original alignment. For
+ example,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
+ >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
+ >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
+ >>> d = SeqRecord(Seq("AAAACGT"), id="Delta")
+ >>> e = SeqRecord(Seq("AAA-GGT"), id="Epsilon")
+
+ First we create a small alignment (three rows):
+
+ >>> align = MultipleSeqAlignment([a, b, c])
+ >>> print(align)
+ Alignment with 3 rows and 7 columns
+ AAAACGT Alpha
+ AAA-CGT Beta
+ AAAAGGT Gamma
+
+ Now we can extend this alignment with another two rows:
+
+ >>> align.extend([d, e])
+ >>> print(align)
+ Alignment with 5 rows and 7 columns
+ AAAACGT Alpha
+ AAA-CGT Beta
+ AAAAGGT Gamma
+ AAAACGT Delta
+ AAA-GGT Epsilon
+
+ Because the alignment object allows iteration over the rows as
+ SeqRecords, you can use the extend method with a second alignment
+ (provided its sequences have the same length as the original alignment).
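+
+ For instance, extending with a second (hypothetical) one-row
+ alignment of matching length:
+
+ >>> align.extend(MultipleSeqAlignment([SeqRecord(Seq("AAAACGT"), id="Zeta")]))
+ >>> len(align)
+ 6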
+ """
+ if len(self):
+ # Use the standard method to get the length
+ expected_length = self.get_alignment_length()
+ else:
+ # Take the first record's length
+ records = iter(records) # records arg could be list or iterator
+ try:
+ rec = next(records)
+ except StopIteration:
+ # Special case, no records
+ return
+ expected_length = len(rec)
+ self._append(rec, expected_length)
+ # Can now set up the per-column-annotations as well; these were
+ # set to None while the length was unknown:
+ self.column_annotations = {}
+ # Now continue to the rest of the records as usual
+
+ for rec in records:
+ self._append(rec, expected_length)
+
+ def append(self, record):
+ """Add one more SeqRecord object to the alignment as a new row.
+
+ This must have the same length as the original alignment (unless this is
+ the first record).
+
+ >>> from Bio import AlignIO
+ >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
+ >>> print(align)
+ Alignment with 7 rows and 156 columns
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
+ TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
+ >>> len(align)
+ 7
+
+ We'll now construct a dummy record to append as an example:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> dummy = SeqRecord(Seq("N"*156), id="dummy")
+
+ Now append this to the alignment,
+
+ >>> align.append(dummy)
+ >>> print(align)
+ Alignment with 8 rows and 156 columns
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
+ TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
+ TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
+ TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
+ NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy
+ >>> len(align)
+ 8
+
+ """
+ if self._records:
+ self._append(record, self.get_alignment_length())
+ else:
+ self._append(record)
+
+ def _append(self, record, expected_length=None):
+ """Validate and append a record (PRIVATE)."""
+ if not isinstance(record, SeqRecord):
+ raise TypeError("New sequence is not a SeqRecord object")
+
+ # Currently the get_alignment_length() call is expensive, so we need
+ # to avoid calling it repeatedly for __init__ and extend, hence this
+ # private _append method
+ if expected_length is not None and len(record) != expected_length:
+ # TODO - Use the following more helpful error, but update unit tests
+ # raise ValueError("New sequence is not of length %i"
+ # % self.get_alignment_length())
+ raise ValueError("Sequences must all be the same length")
+
+ self._records.append(record)
+
+ def __add__(self, other):
+ """Combine two alignments with the same number of rows by adding them.
+
+ If you have two multiple sequence alignments (MSAs), there are two ways to think
+ about adding them - by row or by column. Using the extend method adds by row.
+ Using the addition operator adds by column. For example,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a1 = SeqRecord(Seq("AAAAC"), id="Alpha")
+ >>> b1 = SeqRecord(Seq("AAA-C"), id="Beta")
+ >>> c1 = SeqRecord(Seq("AAAAG"), id="Gamma")
+ >>> a2 = SeqRecord(Seq("GT"), id="Alpha")
+ >>> b2 = SeqRecord(Seq("GT"), id="Beta")
+ >>> c2 = SeqRecord(Seq("GT"), id="Gamma")
+ >>> left = MultipleSeqAlignment([a1, b1, c1],
+ ... annotations={"tool": "demo", "name": "start"},
+ ... column_annotations={"stats": "CCCXC"})
+ >>> right = MultipleSeqAlignment([a2, b2, c2],
+ ... annotations={"tool": "demo", "name": "end"},
+ ... column_annotations={"stats": "CC"})
+
+ Now, let's look at these two alignments:
+
+ >>> print(left)
+ Alignment with 3 rows and 5 columns
+ AAAAC Alpha
+ AAA-C Beta
+ AAAAG Gamma
+ >>> print(right)
+ Alignment with 3 rows and 2 columns
+ GT Alpha
+ GT Beta
+ GT Gamma
+
+ And add them:
+
+ >>> combined = left + right
+ >>> print(combined)
+ Alignment with 3 rows and 7 columns
+ AAAACGT Alpha
+ AAA-CGT Beta
+ AAAAGGT Gamma
+
+ For this to work, both alignments must have the same number of records (here
+ they both have 3 rows):
+
+ >>> len(left)
+ 3
+ >>> len(right)
+ 3
+ >>> len(combined)
+ 3
+
+ The individual rows are SeqRecord objects, and these can be added together. Refer
+ to the SeqRecord documentation for details of how the annotation is handled. This
+ example is a special case in that both original alignments shared the same names,
+ meaning when the rows are added they also get the same name.
+
+ Any common annotations are preserved, but differing annotation is lost. This is
+ the same behaviour used in the SeqRecord annotations and is designed to prevent
+ accidental propagation of inappropriate values:
+
+ >>> combined.annotations
+ {'tool': 'demo'}
+
+ Similarly any common per-column-annotations are combined:
+
+ >>> combined.column_annotations
+ {'stats': 'CCCXCCC'}
+
+ """
+ if not isinstance(other, MultipleSeqAlignment):
+ raise NotImplementedError
+ if len(self) != len(other):
+ raise ValueError(
+ "When adding two alignments they must have the same length"
+ " (i.e. same number or rows)"
+ )
+ merged = (left + right for left, right in zip(self, other))
+ # Take any common annotation:
+ annotations = {}
+ for k, v in self.annotations.items():
+ if k in other.annotations and other.annotations[k] == v:
+ annotations[k] = v
+ column_annotations = {}
+ for k, v in self.column_annotations.items():
+ if k in other.column_annotations:
+ column_annotations[k] = v + other.column_annotations[k]
+ return MultipleSeqAlignment(
+ merged, annotations=annotations, column_annotations=column_annotations
+ )
+
+ def __getitem__(self, index):
+ """Access part of the alignment.
+
+ Depending on the indices, you can get a SeqRecord object
+ (representing a single row), a string (for a single column or
+ a single character) or another alignment
+ (representing some part or all of the alignment).
+
+ align[r,c] gives a single character as a string
+ align[r] gives a row as a SeqRecord
+ align[r,:] gives a row as a SeqRecord
+ align[:,c] gives a column as a string
+
+ align[:] and align[:,:] give a copy of the alignment
+
+ Anything else gives a sub alignment, e.g.
+ align[0:2] or align[0:2,:] uses only row 0 and 1
+ align[:,1:3] uses only columns 1 and 2
+ align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2
+
+ We'll use the following example alignment here for illustration:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
+ >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
+ >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
+ >>> d = SeqRecord(Seq("AAAACGT"), id="Delta")
+ >>> e = SeqRecord(Seq("AAA-GGT"), id="Epsilon")
+ >>> align = MultipleSeqAlignment([a, b, c, d, e])
+
+ You can access a row of the alignment as a SeqRecord using an integer
+ index (think of the alignment as a list of SeqRecord objects here):
+
+ >>> first_record = align[0]
+ >>> print("%s %s" % (first_record.id, first_record.seq))
+ Alpha AAAACGT
+ >>> last_record = align[-1]
+ >>> print("%s %s" % (last_record.id, last_record.seq))
+ Epsilon AAA-GGT
+
+ You can also use python's slice notation to create a sub-alignment
+ containing only some of the SeqRecord objects:
+
+ >>> sub_alignment = align[2:5]
+ >>> print(sub_alignment)
+ Alignment with 3 rows and 7 columns
+ AAAAGGT Gamma
+ AAAACGT Delta
+ AAA-GGT Epsilon
+
+ This includes support for a step, i.e. align[start:end:step], which
+ can be used to select every second sequence:
+
+ >>> sub_alignment = align[::2]
+ >>> print(sub_alignment)
+ Alignment with 3 rows and 7 columns
+ AAAACGT Alpha
+ AAAAGGT Gamma
+ AAA-GGT Epsilon
+
+ Or to get a copy of the alignment with the rows in reverse order:
+
+ >>> rev_alignment = align[::-1]
+ >>> print(rev_alignment)
+ Alignment with 5 rows and 7 columns
+ AAA-GGT Epsilon
+ AAAACGT Delta
+ AAAAGGT Gamma
+ AAA-CGT Beta
+ AAAACGT Alpha
+
+ You can also use two indices to specify both rows and columns. Using simple
+ integers gives you the entry as a single character string. e.g.
+
+ >>> align[3, 4]
+ 'C'
+
+ This is equivalent to:
+
+ >>> align[3][4]
+ 'C'
+
+ or:
+
+ >>> align[3].seq[4]
+ 'C'
+
+ To get a single column (as a string) use this syntax:
+
+ >>> align[:, 4]
+ 'CCGCG'
+
+ Or, to get part of a column,
+
+ >>> align[1:3, 4]
+ 'CG'
+
+ However, in general you get a sub-alignment,
+
+ >>> print(align[1:5, 3:6])
+ Alignment with 4 rows and 3 columns
+ -CG Beta
+ AGG Gamma
+ ACG Delta
+ -GG Epsilon
+
+ This should all seem familiar to anyone who has used the NumPy
+ array or matrix objects.
+ """
+ if isinstance(index, int):
+ # e.g. result = align[x]
+ # Return a SeqRecord
+ return self._records[index]
+ elif isinstance(index, slice):
+ # e.g. sub_align = align[i:j:k]
+ new = MultipleSeqAlignment(self._records[index])
+ if self.column_annotations and len(new) == len(self):
+ # All rows kept (although could have been reversed)
+ # Preserve the column annotations too,
+ for k, v in self.column_annotations.items():
+ new.column_annotations[k] = v
+ return new
+ elif len(index) != 2:
+ raise TypeError("Invalid index type.")
+
+ # Handle double indexing
+ row_index, col_index = index
+ if isinstance(row_index, int):
+ # e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord
+ return self._records[row_index][col_index]
+ elif isinstance(col_index, int):
+ # e.g. col_or_part_col = align[1:5, 6], gives a string
+ return "".join(rec[col_index] for rec in self._records[row_index])
+ else:
+ # e.g. sub_align = align[1:4, 5:7], gives another alignment
+ new = MultipleSeqAlignment(
+ rec[col_index] for rec in self._records[row_index]
+ )
+ if self.column_annotations and len(new) == len(self):
+ # All rows kept (although could have been reversed)
+ # Preserve the column annotations too,
+ for k, v in self.column_annotations.items():
+ new.column_annotations[k] = v[col_index]
+ return new
+
+ def sort(self, key=None, reverse=False):
+ """Sort the rows (SeqRecord objects) of the alignment in place.
+
+ This sorts the rows alphabetically using the SeqRecord object id by
+ default. The sorting can be controlled by supplying a key function
+ which must map each SeqRecord to a sort value.
+
+ This is useful if you want to add two alignments which use the same
+ record identifiers, but in a different order. For example,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> align1 = MultipleSeqAlignment([
+ ... SeqRecord(Seq("ACGT"), id="Human"),
+ ... SeqRecord(Seq("ACGG"), id="Mouse"),
+ ... SeqRecord(Seq("ACGC"), id="Chicken"),
+ ... ])
+ >>> align2 = MultipleSeqAlignment([
+ ... SeqRecord(Seq("CGGT"), id="Mouse"),
+ ... SeqRecord(Seq("CGTT"), id="Human"),
+ ... SeqRecord(Seq("CGCT"), id="Chicken"),
+ ... ])
+
+ If you simply try to add these without sorting, you get this:
+
+ >>> print(align1 + align2)
+ Alignment with 3 rows and 8 columns
+ ACGTCGGT <unknown id>
+ ACGGCGTT <unknown id>
+ ACGCCGCT Chicken
+
+ Consult the SeqRecord documentation which explains why you get a
+ default value when annotation like the identifier doesn't match up.
+ However, if we sort the alignments first, then add them we get the
+ desired result:
+
+ >>> align1.sort()
+ >>> align2.sort()
+ >>> print(align1 + align2)
+ Alignment with 3 rows and 8 columns
+ ACGCCGCT Chicken
+ ACGTCGTT Human
+ ACGGCGGT Mouse
+
+ As an example using a different sort order, you could sort on the
+ GC content of each sequence.
+
+ >>> from Bio.SeqUtils import GC
+ >>> print(align1)
+ Alignment with 3 rows and 4 columns
+ ACGC Chicken
+ ACGT Human
+ ACGG Mouse
+ >>> align1.sort(key = lambda record: GC(record.seq))
+ >>> print(align1)
+ Alignment with 3 rows and 4 columns
+ ACGT Human
+ ACGC Chicken
+ ACGG Mouse
+
+ There is also a reverse argument, so if you wanted to sort by ID
+ but backwards:
+
+ >>> align1.sort(reverse=True)
+ >>> print(align1)
+ Alignment with 3 rows and 4 columns
+ ACGG Mouse
+ ACGT Human
+ ACGC Chicken
+
+ """
+ if key is None:
+ self._records.sort(key=lambda r: r.id, reverse=reverse)
+ else:
+ self._records.sort(key=key, reverse=reverse)
+
+ @property
+ def substitutions(self):
+ """Return an Array with the number of substitutions of letters in the alignment.
+
+ As an example, consider a multiple sequence alignment of three DNA sequences:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Align import MultipleSeqAlignment
+ >>> seq1 = SeqRecord(Seq("ACGT"), id="seq1")
+ >>> seq2 = SeqRecord(Seq("A--A"), id="seq2")
+ >>> seq3 = SeqRecord(Seq("ACGT"), id="seq3")
+ >>> seq4 = SeqRecord(Seq("TTTC"), id="seq4")
+ >>> alignment = MultipleSeqAlignment([seq1, seq2, seq3, seq4])
+ >>> print(alignment)
+ Alignment with 4 rows and 4 columns
+ ACGT seq1
+ A--A seq2
+ ACGT seq3
+ TTTC seq4
+
+ >>> m = alignment.substitutions
+ >>> print(m)
+ A C G T
+ A 3.0 0.5 0.0 2.5
+ C 0.5 1.0 0.0 2.0
+ G 0.0 0.0 1.0 1.0
+ T 2.5 2.0 1.0 1.0
+
+
+ Note that the matrix is symmetric, with counts divided equally on both
+ sides of the diagonal. For example, the total number of substitutions
+ between A and T in the alignment is 2.5 + 2.5 = 5.
+
+ Any weights associated with the sequences are taken into account when
+ calculating the substitution matrix. For example, given the following
+ multiple sequence alignment::
+
+ GTATC 0.5
+ AT--C 0.8
+ CTGTC 1.0
+
+ For the first column we have::
+
+ ('A', 'G') : 0.5 * 0.8 = 0.4
+ ('C', 'G') : 0.5 * 1.0 = 0.5
+ ('A', 'C') : 0.8 * 1.0 = 0.8
+
+ """
+ letters = set.union(*[set(record.seq) for record in self])
+ try:
+ letters.remove("-")
+ except KeyError:
+ pass
+ letters = "".join(sorted(letters))
+ m = substitution_matrices.Array(letters, dims=2)
+ for rec_num1, alignment1 in enumerate(self):
+ seq1 = alignment1.seq
+ weight1 = alignment1.annotations.get("weight", 1.0)
+ for rec_num2, alignment2 in enumerate(self):
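+ # count each unordered pair of rows only once
+ # (the break below keeps rec_num2 < rec_num1)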
+ if rec_num1 == rec_num2:
+ break
+ seq2 = alignment2.seq
+ weight2 = alignment2.annotations.get("weight", 1.0)
+ for residue1, residue2 in zip(seq1, seq2):
+ if residue1 == "-":
+ continue
+ if residue2 == "-":
+ continue
+ m[(residue1, residue2)] += weight1 * weight2
+
+ m += m.transpose()
+ m /= 2.0
+
+ return m
+
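+ # A short sketch of the weighting described in the docstring above
+ # (hypothetical records; weights default to 1.0 when absent):
+ #
+ #     seq1.annotations["weight"] = 0.5
+ #     seq2.annotations["weight"] = 0.8
+ #     m = alignment.substitutions  # each pair now contributes weight1 * weight2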
+
+class PairwiseAlignment:
+ """Represents a pairwise sequence alignment.
+
+ Internally, the pairwise alignment is stored as the path through
+ the traceback matrix, i.e. a tuple of pairs of indices corresponding
+ to the vertices of the path in the traceback matrix.
+ """
+
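+ # For example, aligning "GAACT" to "GAT" as
+ #
+ #     GAACT
+ #     ||--|
+ #     GA--T
+ #
+ # corresponds to the path ((0, 0), (2, 2), (4, 2), (5, 3)): each pair is a
+ # (target, query) index vertex, and each segment between consecutive
+ # vertices is either an aligned block or a gap in one of the two sequences.
+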
+ def __init__(self, target, query, path, score):
+ """Initialize a new PairwiseAlignment object.
+
+ Arguments:
+ - target - The first sequence, as a plain string, without gaps.
+ - query - The second sequence, as a plain string, without gaps.
+ - path - The path through the traceback matrix, defining an
+ alignment.
+ - score - The alignment score.
+
+ You would normally obtain a PairwiseAlignment object by iterating
+ over a PairwiseAlignments object.
+ """
+ self.target = target
+ self.query = query
+ self.score = score
+ self.path = path
+
+ def __eq__(self, other):
+ return self.path == other.path
+
+ def __ne__(self, other):
+ return self.path != other.path
+
+ def __lt__(self, other):
+ return self.path < other.path
+
+ def __le__(self, other):
+ return self.path <= other.path
+
+ def __gt__(self, other):
+ return self.path > other.path
+
+ def __ge__(self, other):
+ return self.path >= other.path
+
+ def __getitem__(self, key):
+ """Return self[key].
+
+ Currently, this is implemented only for indices of the form
+
+ self[:, :]
+
+ which returns a copy of the PairwiseAlignment object, and
+
+ self[:, i:]
+ self[:, :j]
+ self[:, i:j]
+
+ which returns a new PairwiseAlignment object spanning the indicated
+ columns.
+
+ >>> from Bio.Align import PairwiseAligner
+ >>> aligner = PairwiseAligner()
+ >>> alignments = aligner.align("ACCGGTTT", "ACGGGTT")
+ >>> alignment = alignments[0]
+ >>> print(alignment)
+ ACCGG-TTT
+ ||-||-||-
+ AC-GGGTT-
+
+ >>> alignment[:, 1:] # doctest:+ELLIPSIS
+ <Bio.Align.PairwiseAlignment object at 0x...>
+
+ >>> print(alignment[:, 1:])
+ ACCGG-TTT
+ |-||-||-
+ AC-GGGTT-
+
+ >>> print(alignment[:, 2:])
+ ACCGG-TTT
+ -||-||-
+ AC-GGGTT-
+
+ >>> print(alignment[:, 3:])
+ ACCGG-TTT
+ ||-||-
+ ACGGGTT-
+
+ >>> print(alignment[:, 3:-1])
+ ACCGG-TTT
+ ||-||
+ ACGGGTT
+
+ """
+ if isinstance(key, slice):
+ if key.indices(len(self)) == (0, 2, 1):
+ target = self.target
+ query = self.query
+ path = self.path
+ score = self.score
+ return PairwiseAlignment(target, query, path, score)
+ raise NotImplementedError
+ if isinstance(key, int):
+ raise NotImplementedError
+ if isinstance(key, tuple):
+ try:
+ row, col = key
+ except ValueError:
+ raise ValueError("only tuples of length 2 can be alignment indices")
+ if isinstance(row, int):
+ raise NotImplementedError
+ if isinstance(row, slice):
+ if row.indices(len(self)) != (0, 2, 1):
+ raise NotImplementedError
+ if isinstance(col, int):
+ raise NotImplementedError
+ if isinstance(col, slice):
+ n, m = self.shape
+ start_index, stop_index, step = col.indices(m)
+ if step != 1:
+ raise NotImplementedError
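+ # Walk the path segment by segment, counting alignment
+ # columns, and clip the segments containing start_index
+ # and stop_index to produce the new path.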
+ path = []
+ index = 0
+ path_iterator = iter(self.path)
+ starts = next(path_iterator)
+ for ends in path_iterator:
+ index += max(e - s for s, e in zip(starts, ends))
+ if start_index < index:
+ offset = index - start_index
+ point = tuple(
+ e - offset if s < e else s for s, e in zip(starts, ends)
+ )
+ path.append(point)
+ break
+ starts = ends
+ while True:
+ if stop_index <= index:
+ offset = index - stop_index
+ point = tuple(
+ e - offset if s < e else s for s, e in zip(starts, ends)
+ )
+ path.append(point)
+ break
+ path.append(ends)
+ starts = ends
+ ends = next(path_iterator)
+ index += max(e - s for s, e in zip(starts, ends))
+ path = tuple(path)
+ target = self.target
+ query = self.query
+ if path == self.path:
+ score = self.score
+ else:
+ score = None
+ return PairwiseAlignment(target, query, path, score)
+ raise TypeError("second index must be an integer or slice")
+ raise TypeError("first index must be an integer or slice")
+ raise TypeError("alignment indices must be integers, slices, or tuples")
+
+ def _convert_sequence_string(self, sequence):
+ if isinstance(sequence, (bytes, bytearray)):
+ return sequence.decode()
+ if isinstance(sequence, str):
+ return sequence
+ if isinstance(sequence, Seq):
+ return str(sequence)
+ try: # check if target is a SeqRecord
+ sequence = sequence.seq
+ except AttributeError:
+ pass
+ else:
+ return str(sequence)
+ try:
+ view = memoryview(sequence)
+ except TypeError:
+ pass
+ else:
+ if view.format == "c":
+ return str(sequence)
+ return None
+
+ def __format__(self, format_spec):
+ return self.format(format_spec)
+
+ def format(self, fmt="", **kwargs):
+ """Return the alignment as a string in the specified file format.
+
+ Arguments:
+ - fmt - File format. Acceptable values are
+ "" : create a human-readable representation of the
+ alignment (default);
+ "bed": create a line representing the alignment in
+ the Browser Extensible Data (BED) file format;
+ "psl": create a line representing the alignment in
+ the Pattern Space Layout (PSL) file format as
+ generated by BLAT;
+ "sam": create a line representing the alignment in
+ the Sequence Alignment/Map (SAM) format.
+ - mask - PSL format only. Specify if repeat regions in the target
+ sequence are masked and should be reported in the
+ `repMatches` field of the PSL file instead of in the
+ `matches` field. Acceptable values are
+ None : no masking (default);
+ "lower": masking by lower-case characters;
+ "upper": masking by upper-case characters.
+ - wildcard - PSL format only. Report alignments to the wildcard
+ character in the target or query sequence in the
+ `nCount` field of the PSL file instead of in the
+ `matches`, `misMatches`, or `repMatches` fields.
+ Default value is 'N'.
+ """
+ if fmt == "":
+ return self._format_pretty(**kwargs)
+ elif fmt == "psl":
+ return self._format_psl(**kwargs)
+ elif fmt == "bed":
+ return self._format_bed(**kwargs)
+ elif fmt == "sam":
+ return self._format_sam(**kwargs)
+ else:
+ raise ValueError("Unknown format %s" % fmt)
+
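+ # A minimal usage sketch (with `alignment` a hypothetical
+ # PairwiseAlignment, as in the doctests elsewhere in this class):
+ #
+ #     alignment.format()        # human-readable, same as str(alignment)
+ #     alignment.format("psl")   # one PSL line
+ #     format(alignment, "sam")  # via __format__, one SAM line
+ #     alignment.format("psl", mask="lower", wildcard="N")
+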
+ def _format_pretty(self):
+ seq1 = self._convert_sequence_string(self.target)
+ if seq1 is None:
+ return self._format_generalized()
+ seq2 = self._convert_sequence_string(self.query)
+ if seq2 is None:
+ return self._format_generalized()
+ n1 = len(seq1)
+ n2 = len(seq2)
+ aligned_seq1 = ""
+ aligned_seq2 = ""
+ pattern = ""
+ path = self.path
+ if path[0][1] > path[-1][1]: # mapped to reverse strand
+ path = tuple((c1, n2 - c2) for (c1, c2) in path)
+ seq2 = reverse_complement(seq2)
+ end1, end2 = path[0]
+ if end1 > 0 or end2 > 0:
+ end = max(end1, end2)
+ aligned_seq1 += " " * (end - end1) + seq1[:end1]
+ aligned_seq2 += " " * (end - end2) + seq2[:end2]
+ pattern += " " * end
+ start1 = end1
+ start2 = end2
+ for end1, end2 in path[1:]:
+ if end1 == start1:
+ gap = end2 - start2
+ aligned_seq1 += "-" * gap
+ aligned_seq2 += seq2[start2:end2]
+ pattern += "-" * gap
+ elif end2 == start2:
+ gap = end1 - start1
+ aligned_seq1 += seq1[start1:end1]
+ aligned_seq2 += "-" * gap
+ pattern += "-" * gap
+ else:
+ s1 = seq1[start1:end1]
+ s2 = seq2[start2:end2]
+ aligned_seq1 += s1
+ aligned_seq2 += s2
+ for c1, c2 in zip(s1, s2):
+ if c1 == c2:
+ pattern += "|"
+ else:
+ pattern += "."
+ start1 = end1
+ start2 = end2
+ aligned_seq1 += seq1[end1:]
+ aligned_seq2 += seq2[end2:]
+ return "%s\n%s\n%s\n" % (aligned_seq1, pattern, aligned_seq2)
+
+ def _format_generalized(self):
+ seq1 = self.target
+ seq2 = self.query
+ aligned_seq1 = []
+ aligned_seq2 = []
+ pattern = []
+ path = self.path
+ end1, end2 = path[0]
+ if end1 > 0 or end2 > 0:
+ if end1 <= end2:
+ for c2 in seq2[: end2 - end1]:
+ s2 = str(c2)
+ s1 = " " * len(s2)
+ aligned_seq1.append(s1)
+ aligned_seq2.append(s2)
+ pattern.append(s1)
+ else: # end1 > end2
+ for c1 in seq1[: end1 - end2]:
+ s1 = str(c1)
+ s2 = " " * len(s1)
+ aligned_seq1.append(s1)
+ aligned_seq2.append(s2)
+ pattern.append(s2)
+ start1 = end1
+ start2 = end2
+ for end1, end2 in path[1:]:
+ if end1 == start1:
+ for c2 in seq2[start2:end2]:
+ s2 = str(c2)
+ s1 = "-" * len(s2)
+ aligned_seq1.append(s1)
+ aligned_seq2.append(s2)
+ pattern.append(s1)
+ start2 = end2
+ elif end2 == start2:
+ for c1 in seq1[start1:end1]:
+ s1 = str(c1)
+ s2 = "-" * len(s1)
+ aligned_seq1.append(s1)
+ aligned_seq2.append(s2)
+ pattern.append(s2)
+ start1 = end1
+ else:
+ for c1, c2 in zip(seq1[start1:end1], seq2[start2:end2]):
+ s1 = str(c1)
+ s2 = str(c2)
+ m1 = len(s1)
+ m2 = len(s2)
+ if c1 == c2:
+ p = "|"
+ else:
+ p = "."
+ if m1 < m2:
+ space = (m2 - m1) * " "
+ s1 += space
+ pattern.append(p * m1 + space)
+ elif m1 > m2:
+ space = (m1 - m2) * " "
+ s2 += space
+ pattern.append(p * m2 + space)
+ else:
+ pattern.append(p * m1)
+ aligned_seq1.append(s1)
+ aligned_seq2.append(s2)
+ start1 = end1
+ start2 = end2
+ aligned_seq1 = " ".join(aligned_seq1)
+ aligned_seq2 = " ".join(aligned_seq2)
+ pattern = " ".join(pattern)
+ return "%s\n%s\n%s\n" % (aligned_seq1, pattern, aligned_seq2)
+
+ def _format_bed(self):
+ query = self.query
+ target = self.target
+ # variable names follow those in the BED file format specification
+ try:
+ chrom = target.id
+ except AttributeError:
+ chrom = "target"
+ try:
+ name = query.id
+ except AttributeError:
+ name = "query"
+ path = self.path
+ if path[0][1] < path[-1][1]: # mapped to forward strand
+ strand = "+"
+ else: # mapped to reverse strand
+ strand = "-"
+ n2 = len(query)
+ path = tuple((c1, n2 - c2) for (c1, c2) in path)
+ score = self.score
+ blockSizes = []
+ tStarts = []
+ tStart, qStart = path[0]
+ for tEnd, qEnd in path[1:]:
+ tCount = tEnd - tStart
+ qCount = qEnd - qStart
+ if tCount == 0:
+ qStart = qEnd
+ elif qCount == 0:
+ tStart = tEnd
+ else:
+ assert tCount == qCount
+ tStarts.append(tStart)
+ blockSizes.append(tCount)
+ tStart = tEnd
+ qStart = qEnd
+ chromStart = tStarts[0]
+ chromEnd = tStarts[-1] + blockSizes[-1]
+ blockStarts = [tStart - chromStart for tStart in tStarts]
+ blockCount = len(blockSizes)
+ blockSizes = ",".join(map(str, blockSizes)) + ","
+ blockStarts = ",".join(map(str, blockStarts)) + ","
+ thickStart = chromStart
+ thickEnd = chromEnd
+ itemRgb = "0"
+ words = [
+ chrom,
+ str(chromStart),
+ str(chromEnd),
+ name,
+ str(score),
+ strand,
+ str(thickStart),
+ str(thickEnd),
+ itemRgb,
+ str(blockCount),
+ blockSizes,
+ blockStarts,
+ ]
+ line = "\t".join(words) + "\n"
+ return line
+
+ def _format_psl(self, mask=False, wildcard="N"):
+ path = self.path
+ if not path: # alignment consists of gaps only
+ return ""
+ query = self.query
+ target = self.target
+ try:
+ qName = query.id
+ except AttributeError:
+ qName = "query"
+ try:
+ query = query.seq
+ except AttributeError:
+ pass
+ try:
+ tName = target.id
+ except AttributeError:
+ tName = "target"
+ try:
+ target = target.seq
+ except AttributeError:
+ pass
+ n1 = len(target)
+ n2 = len(query)
+ try:
+ seq1 = bytes(target)
+ except TypeError: # string
+ seq1 = bytes(target, "ASCII")
+ except UndefinedSequenceError: # sequence contents is unknown
+ seq1 = None
+ if path[0][1] < path[-1][1]: # mapped to forward strand
+ strand = "+"
+ seq2 = query
+ else: # mapped to reverse strand
+ strand = "-"
+ seq2 = reverse_complement(query)
+ path = tuple((c1, n2 - c2) for (c1, c2) in path)
+ try:
+ seq2 = bytes(seq2)
+ except TypeError: # string
+ seq2 = bytes(seq2, "ASCII")
+ except UndefinedSequenceError: # sequence contents is unknown
+ seq2 = None
+ if wildcard is not None:
+ if mask == "upper":
+ wildcard = ord(wildcard.lower())
+ else:
+ wildcard = ord(wildcard.upper())
+ # variable names follow those in the PSL file format specification
+ matches = 0
+ misMatches = 0
+ repMatches = 0
+ nCount = 0
+ qNumInsert = 0
+ qBaseInsert = 0
+ tNumInsert = 0
+ tBaseInsert = 0
+ qSize = n2
+ tSize = n1
+ blockSizes = []
+ qStarts = []
+ tStarts = []
+ tStart, qStart = path[0]
+ for tEnd, qEnd in path[1:]:
+ tCount = tEnd - tStart
+ qCount = qEnd - qStart
+ if tCount == 0:
+ if qStart > 0 and qEnd < qSize:
+ qNumInsert += 1
+ qBaseInsert += qCount
+ qStart = qEnd
+ elif qCount == 0:
+ if tStart > 0 and tEnd < tSize:
+ tNumInsert += 1
+ tBaseInsert += tCount
+ tStart = tEnd
+ else:
+ assert tCount == qCount
+ tStarts.append(tStart)
+ qStarts.append(qStart)
+ blockSizes.append(tCount)
+ if seq1 is None or seq2 is None:
+ # contents of at least one sequence is unknown;
+ # count all alignments as matches:
+ matches += tCount
+ else:
+ s1 = seq1[tStart:tEnd]
+ s2 = seq2[qStart:qEnd]
+ if mask == "lower":
+ for u1, u2, c1 in zip(s1.upper(), s2.upper(), s1):
+ if u1 == wildcard or u2 == wildcard:
+ nCount += 1
+ elif u1 == u2:
+ if u1 == c1:
+ matches += 1
+ else:
+ repMatches += 1
+ else:
+ misMatches += 1
+ elif mask == "upper":
+ for u1, u2, c1 in zip(s1.lower(), s2.lower(), s1):
+ if u1 == wildcard or u2 == wildcard:
+ nCount += 1
+ elif u1 == u2:
+ if u1 == c1:
+ matches += 1
+ else:
+ repMatches += 1
+ else:
+ misMatches += 1
+ else:
+ for u1, u2 in zip(s1.upper(), s2.upper()):
+ if u1 == wildcard or u2 == wildcard:
+ nCount += 1
+ elif u1 == u2:
+ matches += 1
+ else:
+ misMatches += 1
+ tStart = tEnd
+ qStart = qEnd
+ tStart = tStarts[0] # start of alignment in target
+ qStart = qStarts[0] # start of alignment in query
+ tEnd = tStarts[-1] + blockSizes[-1] # end of alignment in target
+ qEnd = qStarts[-1] + blockSizes[-1] # end of alignment in query
+ if strand == "-":
+ qStart, qEnd = qSize - qEnd, qSize - qStart
+ blockCount = len(blockSizes)
+ blockSizes = ",".join(map(str, blockSizes)) + ","
+ qStarts = ",".join(map(str, qStarts)) + ","
+ tStarts = ",".join(map(str, tStarts)) + ","
+ words = [
+ str(matches),
+ str(misMatches),
+ str(repMatches),
+ str(nCount),
+ str(qNumInsert),
+ str(qBaseInsert),
+ str(tNumInsert),
+ str(tBaseInsert),
+ strand,
+ qName,
+ str(qSize),
+ str(qStart),
+ str(qEnd),
+ tName,
+ str(tSize),
+ str(tStart),
+ str(tEnd),
+ str(blockCount),
+ blockSizes,
+ qStarts,
+ tStarts,
+ ]
+ line = "\t".join(words) + "\n"
+ return line
+
+ def _format_sam(self):
+ query = self.query
+ target = self.target
+ try:
+ qName = query.id
+ except AttributeError:
+ qName = "query"
+ else:
+ query = query.seq
+ try:
+ rName = target.id
+ except AttributeError:
+ rName = "target"
+ else:
+ target = target.seq
+ n1 = len(target)
+ n2 = len(query)
+ pos = None
+ qSize = n2
+ tSize = n1
+ cigar = []
+ path = self.path
+ if path[0][1] < path[-1][1]: # mapped to forward strand
+ flag = 0
+ seq = query
+ else: # mapped to reverse strand
+ flag = 16
+ seq = reverse_complement(query)
+ path = tuple((c1, n2 - c2) for (c1, c2) in path)
+ try:
+ seq = bytes(seq)
+ except TypeError: # string
+ pass
+ else:
+ seq = str(seq, "ASCII")
+ tStart, qStart = path[0]
+ for tEnd, qEnd in path[1:]:
+ tCount = tEnd - tStart
+ qCount = qEnd - qStart
+ if tCount == 0:
+ length = qCount
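+ # unaligned query letters are soft-clipped ("S") before the
+ # alignment starts (pos is None) or once the target is
+ # exhausted (tEnd == tSize); internal ones are insertions ("I")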
+ if pos is None or tEnd == tSize:
+ operation = "S"
+ else:
+ operation = "I"
+ qStart = qEnd
+ elif qCount == 0:
+ if tStart > 0 and tEnd < tSize:
+ length = tCount
+ operation = "D"
+ else:
+ operation = None
+ tStart = tEnd
+ else:
+ assert tCount == qCount
+ if pos is None:
+ pos = tStart
+ tStart = tEnd
+ qStart = qEnd
+ operation = "M"
+ length = tCount
+ if operation is not None:
+ cigar.append(str(length) + operation)
+ mapQ = 255 # not available
+ rNext = "*"
+ pNext = 0
+ tLen = 0
+ qual = "*"
+ cigar = "".join(cigar)
+ tag = "AS:i:%d" % int(round(self.score))
+ words = [
+ qName,
+ str(flag),
+ rName,
+ str(pos + 1), # 1-based coordinates
+ str(mapQ),
+ cigar,
+ rNext,
+ str(pNext),
+ str(tLen),
+ seq,
+ qual,
+ tag,
+ ]
+ line = "\t".join(words) + "\n"
+ return line
+
+ def __str__(self):
+ return self.format()
+
+ def __len__(self):
+ """Return the number of sequences in the alignment, which is always 2."""
+ return 2
+
+ @property
+ def shape(self):
+ """Return the shape of the alignment as a tuple of two integer values.
+
+ The first integer value is the number of sequences in the alignment as
+ returned by len(alignment), which is always 2 for pairwise alignments.
+
+ The second integer value is the number of columns in the alignment when
+ it is printed, and is equal to the sum of the number of matches, number
+ of mismatches, and the total length of gaps in the target and query.
+ Sequence sections beyond the aligned segment are not included in the
+ number of columns.
+
+ For example,
+
+ >>> from Bio import Align
+ >>> aligner = Align.PairwiseAligner()
+ >>> aligner.mode = "global"
+ >>> alignments = aligner.align("GACCTG", "CGATCG")
+ >>> alignment = alignments[0]
+ >>> print(alignment)
+ -GACCT-G
+ -||--|-|
+ CGA--TCG
+
+ >>> len(alignment)
+ 2
+ >>> alignment.shape
+ (2, 8)
+ >>> aligner.mode = "local"
+ >>> alignments = aligner.align("GACCTG", "CGATCG")
+ >>> alignment = alignments[0]
+ >>> print(alignment)
+ GACCT-G
+ ||--|-|
+ CGA--TCG
+
+ >>> len(alignment)
+ 2
+ >>> alignment.shape
+ (2, 7)
+ """
+ path = self.path
+ if path[0][1] > path[-1][1]: # mapped to reverse strand
+ n2 = len(self.query)
+ path = tuple((c1, n2 - c2) for (c1, c2) in path)
+ start = path[0]
+ n = len(start)
+ m = 0
+ for end in path[1:]:
+ m += max(e - s for s, e in zip(start, end))
+ start = end
+ return (n, m)
+
+ @property
+ def aligned(self):
+ """Return the indices of subsequences aligned to each other.
+
+ This property returns the start and end indices of subsequences
+ in the target and query sequence that were aligned to each other.
+ If the alignment between target (t) and query (q) consists of N
+ chunks, you get two tuples of length N:
+
+ (((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)),
+ ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN)))
+
+ For example,
+
+ >>> from Bio import Align
+ >>> aligner = Align.PairwiseAligner()
+ >>> alignments = aligner.align("GAACT", "GAT")
+ >>> alignment = alignments[0]
+ >>> print(alignment)
+ GAACT
+ ||--|
+ GA--T
+
+ >>> alignment.aligned
+ (((0, 2), (4, 5)), ((0, 2), (2, 3)))
+ >>> alignment = alignments[1]
+ >>> print(alignment)
+ GAACT
+ |-|-|
+ G-A-T
+
+ >>> alignment.aligned
+ (((0, 1), (2, 3), (4, 5)), ((0, 1), (1, 2), (2, 3)))
+
+ Note that different alignments may have the same subsequences
+ aligned to each other. In particular, this may occur if alignments
+ differ from each other in terms of their gap placement only:
+
+ >>> aligner.mismatch_score = -10
+ >>> alignments = aligner.align("AAACAAA", "AAAGAAA")
+ >>> len(alignments)
+ 2
+ >>> print(alignments[0])
+ AAAC-AAA
+ |||--|||
+ AAA-GAAA
+
+ >>> alignments[0].aligned
+ (((0, 3), (4, 7)), ((0, 3), (4, 7)))
+ >>> print(alignments[1])
+ AAA-CAAA
+ |||--|||
+ AAAG-AAA
+
+ >>> alignments[1].aligned
+ (((0, 3), (4, 7)), ((0, 3), (4, 7)))
+
+ The property can be used to identify alignments that are identical
+ to each other in terms of their aligned sequences.
+ """
+ segments1 = []
+ segments2 = []
+ path = self.path
+ if path[0][1] < path[-1][1]: # mapped to forward strand
+ i1, i2 = path[0]
+ for node in path[1:]:
+ j1, j2 = node
+ if j1 > i1 and j2 > i2:
+ segment1 = (i1, j1)
+ segment2 = (i2, j2)
+ segments1.append(segment1)
+ segments2.append(segment2)
+ i1, i2 = j1, j2
+ else: # mapped to reverse strand
+ n2 = len(self.query)
+ i1, i2 = path[0]
+ i2 = n2 - i2
+ for node in path[1:]:
+ j1, j2 = node
+ j2 = n2 - j2
+ if j1 > i1 and j2 > i2:
+ segment1 = (i1, j1)
+ segment2 = (n2 - i2, n2 - j2)
+ segments1.append(segment1)
+ segments2.append(segment2)
+ i1, i2 = j1, j2
+ return tuple(segments1), tuple(segments2)
+
+ def sort(self, key=None, reverse=False):
+ """Sort the sequences of the alignment in place.
+
+ By default, this sorts the sequences alphabetically using their id
+ attribute if available, or by their sequence contents otherwise.
+ For example,
+
+ >>> from Bio.Align import PairwiseAligner
+ >>> aligner = PairwiseAligner()
+ >>> aligner.gap_score = -1
+ >>> alignments = aligner.align("AATAA", "AAGAA")
+ >>> len(alignments)
+ 1
+ >>> alignment = alignments[0]
+ >>> print(alignment)
+ AATAA
+ ||.||
+ AAGAA
+
+ >>> alignment.sort()
+ >>> print(alignment)
+ AAGAA
+ ||.||
+ AATAA
+
+
+ Alternatively, a key function can be supplied that maps each sequence
+ to a sort value. For example, you could sort on the GC content of each
+ sequence.
+
+ >>> from Bio.SeqUtils import GC
+ >>> alignment.sort(key=GC)
+ >>> print(alignment)
+ AATAA
+ ||.||
+ AAGAA
+
+
+ You can reverse the sort order by passing `reverse=True`:
+
+ >>> alignment.sort(key=GC, reverse=True)
+ >>> print(alignment)
+ AAGAA
+ ||.||
+ AATAA
+
+
+ The sequences are now sorted by decreasing GC content value.
+ """
+ path = self.path
+ sequences = self.target, self.query
+ if key is None:
+ try:
+ values = [sequence.id for sequence in sequences]
+ except AttributeError:
+ values = sequences
+ else:
+ values = [key(sequence) for sequence in sequences]
+ indices = sorted(range(len(sequences)), key=values.__getitem__, reverse=reverse)
+ sequences = [sequences[index] for index in indices]
+ self.target, self.query = sequences
+ path = tuple(tuple(row[index] for index in indices) for row in path)
+ self.path = path
+
+ def map(self, alignment):
+ r"""Map the alignment to self.target and return the resulting alignment.
+
+ Here, self.query and alignment.target are the same sequence.
+
+ A typical example is where self is the pairwise alignment between a
+ chromosome and a transcript, the argument is the pairwise alignment
+ between the transcript and a sequence (e.g., as obtained by RNA-seq),
+ and we want to find the alignment of the sequence to the chromosome:
+
+ >>> from Bio import Align
+ >>> aligner = Align.PairwiseAligner()
+ >>> aligner.mode = 'local'
+ >>> aligner.open_gap_score = -1
+ >>> aligner.extend_gap_score = 0
+ >>> chromosome = "AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA"
+ >>> transcript = "CCCCCCCGGGGGG"
+ >>> alignments1 = aligner.align(chromosome, transcript)
+ >>> len(alignments1)
+ 1
+ >>> alignment1 = alignments1[0]
+ >>> print(alignment1)
+ AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA
+ |||||||-----------||||||
+ CCCCCCC-----------GGGGGG
+
+ >>> sequence = "CCCCGGGG"
+ >>> alignments2 = aligner.align(transcript, sequence)
+ >>> len(alignments2)
+ 1
+ >>> alignment2 = alignments2[0]
+ >>> print(alignment2)
+ CCCCCCCGGGGGG
+ ||||||||
+ CCCCGGGG
+
+ >>> alignment = alignment1.map(alignment2)
+ >>> print(alignment)
+ AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA
+ ||||-----------||||
+ CCCC-----------GGGG
+
+ >>> format(alignment, "psl")
+ '8\t0\t0\t0\t0\t0\t1\t11\t+\tquery\t8\t0\t8\ttarget\t40\t11\t30\t2\t4,4,\t0,4,\t11,26,\n'
+
+ Mapping the alignment does not depend on the sequence contents. If we
+ delete the sequence contents, the same alignment is found in PSL format
+ (though we obviously lose the ability to print the sequence alignment):
+
+ >>> alignment1.target = Seq(None, len(alignment1.target))
+ >>> alignment1.query = Seq(None, len(alignment1.query))
+ >>> alignment2.target = Seq(None, len(alignment2.target))
+ >>> alignment2.query = Seq(None, len(alignment2.query))
+ >>> alignment = alignment1.map(alignment2)
+ >>> format(alignment, "psl")
+ '8\t0\t0\t0\t0\t0\t1\t11\t+\tquery\t8\t0\t8\ttarget\t40\t11\t30\t2\t4,4,\t0,4,\t11,26,\n'
+ """
+ from numpy import array
+
+ alignment1, alignment2 = self, alignment
+ if len(alignment1.query) != len(alignment2.target):
+ raise ValueError(
+ "length of alignment1 query sequence (%d) != length of alignment2 target sequence (%d)"
+ % (len(alignment1.query), len(alignment2.target))
+ )
+ target = alignment1.target
+ query = alignment2.query
+ path1 = alignment1.path
+ path2 = alignment2.path
+ n1 = len(alignment1.query)
+ n2 = len(alignment2.query)
+ if path1[0][1] < path1[-1][1]: # mapped to forward strand
+ strand1 = "+"
+ else: # mapped to reverse strand
+ strand1 = "-"
+ if path2[0][1] < path2[-1][1]: # mapped to forward strand
+ strand2 = "+"
+ else: # mapped to reverse strand
+ strand2 = "-"
+ path1 = array(path1)
+ path2 = array(path2)
+ if strand1 == "+":
+ if strand2 == "-": # mapped to reverse strand
+ path2[:, 1] = n2 - path2[:, 1]
+ else: # mapped to reverse strand
+ path1[:, 1] = n1 - path1[:, 1]
+ path2[:, 0] = n1 - path2[::-1, 0]
+ if strand2 == "+":
+ path2[:, 1] = n2 - path2[::-1, 1]
+ else: # mapped to reverse strand
+ path2[:, 1] = path2[::-1, 1]
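+ # Walk the aligned blocks of path2 (e.g. transcript-to-sequence) and
+ # project each through the aligned blocks of path1 (e.g.
+ # chromosome-to-transcript), clipping where the blocks only partially
+ # overlap, to obtain the merged path in target coordinates.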
+ path = []
+ tEnd, qEnd = sys.maxsize, sys.maxsize
+ path1 = iter(path1)
+ tStart1, qStart1 = sys.maxsize, sys.maxsize
+ for tEnd1, qEnd1 in path1:
+ if tStart1 < tEnd1 and qStart1 < qEnd1:
+ break
+ tStart1, qStart1 = tEnd1, qEnd1
+ tStart2, qStart2 = sys.maxsize, sys.maxsize
+ for tEnd2, qEnd2 in path2:
+ while qStart2 < qEnd2 and tStart2 < tEnd2:
+ while True:
+ if tStart2 < qStart1:
+ if tEnd2 < qStart1:
+ size = tEnd2 - tStart2
+ else:
+ size = qStart1 - tStart2
+ break
+ elif tStart2 < qEnd1:
+ offset = tStart2 - qStart1
+ if tEnd2 > qEnd1:
+ size = qEnd1 - tStart2
+ else:
+ size = tEnd2 - tStart2
+ qStart = qStart2
+ tStart = tStart1 + offset
+ if tStart > tEnd and qStart > qEnd:
+ # adding a gap both in target and in query;
+ # add gap to target first:
+ path.append([tStart, qEnd])
+ qEnd = qStart2 + size
+ tEnd = tStart + size
+ path.append([tStart, qStart])
+ path.append([tEnd, qEnd])
+ break
+ tStart1, qStart1 = sys.maxsize, sys.maxsize
+ for tEnd1, qEnd1 in path1:
+ if tStart1 < tEnd1 and qStart1 < qEnd1:
+ break
+ tStart1, qStart1 = tEnd1, qEnd1
+ else:
+ size = qEnd2 - qStart2
+ break
+ qStart2 += size
+ tStart2 += size
+ tStart2, qStart2 = tEnd2, qEnd2
+ if strand1 != strand2:
+ path = tuple((c1, n2 - c2) for (c1, c2) in path)
+ alignment = PairwiseAlignment(target, query, path, None)
+ return alignment
+
+ @property
+ def substitutions(self):
+ """Return an Array with the number of substitutions of letters in the alignment.
+
+ As an example, consider a sequence alignment of two RNA sequences:
+
+ >>> from Bio.Align import PairwiseAligner
+ >>> target = "ATACTTACCTGGCAGGGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCATTGCACTCCGGATGTGCTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGCATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTTTCCCCTG" # human spliceosomal small nuclear RNA U1
+ >>> query = "ATACTTACCTGACAGGGGAGGCACCATGATCACACAGGTGGTCCTCCCAGGGCGAGGCTCTTCCATTGCACTGCGGGAGGGTTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGTATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTATCCCCCG" # sea lamprey spliceosomal small RNA U1
+ >>> aligner = PairwiseAligner()
+ >>> aligner.gap_score = -10
+ >>> alignments = aligner.align(target, query)
+ >>> len(alignments)
+ 1
+ >>> alignment = alignments[0]
+ >>> print(alignment)
+ ATACTTACCTGGCAGGGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCATTGCACTCCGGATGTGCTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGCATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTTTCCCCTG
+ |||||||||||.||||||||..|||||||||||..|||||||..|||||||||||||||..|||||||||||.|||..|.|.|||||||||||||||||||||||||||||||||||||||.||||||||||||||||||||||||||||||||||.|||||.|
+ ATACTTACCTGACAGGGGAGGCACCATGATCACACAGGTGGTCCTCCCAGGGCGAGGCTCTTCCATTGCACTGCGGGAGGGTTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGTATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTATCCCCCG
+
+ >>> m = alignment.substitutions
+ >>> print(m)
+ A C G T
+ A 28.0 1.0 2.0 1.0
+ C 0.0 39.0 1.0 2.0
+ G 2.0 0.0 45.0 0.0
+ T 2.0 5.0 1.0 35.0
+
+
+ Note that the matrix is not symmetric: rows correspond to the target
+ sequence, and columns to the query sequence. For example, the number
+ of T's in the target sequence that are aligned to a C in the query
+ sequence is
+
+ >>> m['T', 'C']
+ 5.0
+
+ and the number of C's in the target sequence that are aligned to a T in
+ the query sequence is
+
+ >>> m['C', 'T']
+ 2.0
+
+ For some applications (for example, to define a scoring matrix from
+ the substitution matrix), a symmetric matrix may be preferred, which
+ can be calculated as follows:
+
+ >>> m += m.transpose()
+ >>> m /= 2.0
+ >>> print(m)
+ A C G T
+ A 28.0 0.5 2.0 1.5
+ C 0.5 39.0 0.5 3.5
+ G 2.0 0.5 45.0 0.5
+ T 1.5 3.5 0.5 35.0
+
+
+ The matrix is now symmetric, with counts divided equally on both sides
+ of the diagonal:
+
+ >>> m['C', 'T']
+ 3.5
+ >>> m['T', 'C']
+ 3.5
+
+ The total number of substitutions between T's and C's in the alignment
+ is 3.5 + 3.5 = 7.
+ """
+ target = self.target
+ try:
+ target = target.seq
+ except AttributeError:
+ pass
+ query = self.query
+ try:
+ query = query.seq
+ except AttributeError:
+ pass
+ sequences = (str(target), str(query))
+ letters = set.union(*[set(sequence) for sequence in sequences])
+ letters = "".join(sorted(letters))
+ m = substitution_matrices.Array(letters, dims=2)
+ n = len(sequences)
+ for i1 in range(n):
+ path1 = [p[i1] for p in self.path]
+ sequence1 = sequences[i1]
+ for i2 in range(i1 + 1, n):
+ path2 = [p[i2] for p in self.path]
+ sequence2 = sequences[i2]
+ start1, start2 = sys.maxsize, sys.maxsize
+ for end1, end2 in zip(path1, path2):
+ if start1 < end1 and start2 < end2: # aligned
+ segment1 = sequence1[start1:end1]
+ segment2 = sequence2[start2:end2]
+ for c1, c2 in zip(segment1, segment2):
+ m[c1, c2] += 1.0
+ start1, start2 = end1, end2
+ return m
+
+
+class PairwiseAlignments:
+ """Implements an iterator over pairwise alignments returned by the aligner.
+
+ This class also supports indexing, which is fast for increasing indices,
+ but may be slow for random access of a large number of alignments.
+
+ Note that pairwise aligners can return an astronomical number of alignments,
+ even for relatively short sequences, if they align poorly to each other. We
+ therefore recommend first checking the number of alignments, accessible as
+ len(alignments), which can be calculated quickly even if the number of
+ alignments is very large.
+ """
+
+ def __init__(self, seqA, seqB, score, paths):
+ """Initialize a new PairwiseAlignments object.
+
+ Arguments:
+ - seqA - The first sequence, as a plain string, without gaps.
+ - seqB - The second sequence, as a plain string, without gaps.
+ - score - The alignment score.
+ - paths - An iterator over the paths in the traceback matrix;
+ each path defines one alignment.
+
+ You would normally obtain a PairwiseAlignments object by calling
+ aligner.align(seqA, seqB), where aligner is a PairwiseAligner object.
+ """
+ self.seqA = seqA
+ self.seqB = seqB
+ self.score = score
+ self.paths = paths
+ self.index = -1
+
+ def __len__(self):
+ return len(self.paths)
+
+ def __getitem__(self, index):
+ if index == self.index:
+ return self.alignment
+ if index < self.index:
+ self.paths.reset()
+ self.index = -1
+ while self.index < index:
+ try:
+ alignment = next(self)
+ except StopIteration:
+ raise IndexError("index out of range") from None
+ return alignment
+
+ def __iter__(self):
+ self.paths.reset()
+ self.index = -1
+ return self
+
+ def __next__(self):
+ path = next(self.paths)
+ self.index += 1
+ alignment = PairwiseAlignment(self.seqA, self.seqB, path, self.score)
+ self.alignment = alignment
+ return alignment
+
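+ # A minimal usage sketch (hypothetical `aligner` and sequences):
+ #
+ #     alignments = aligner.align("TACCG", "ACG")
+ #     print(len(alignments))  # cheap even for huge numbers of alignments
+ #     first = alignments[0]   # indexing advances the underlying iterator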
+
+class PairwiseAligner(_aligners.PairwiseAligner):
+ """Performs pairwise sequence alignment using dynamic programming.
+
+ This provides functions to get global and local alignments between two
+ sequences. A global alignment finds the best concordance between all
+ characters in two sequences. A local alignment finds just the
+ subsequences that align the best.
+
+ To perform a pairwise sequence alignment, first create a PairwiseAligner
+ object. This object stores the match and mismatch scores, as well as the
+ gap scores. Typically, match scores are positive, while mismatch scores
+ and gap scores are negative or zero. By default, the match score is 1,
+ and the mismatch and gap scores are zero. Based on the values of the gap
+ scores, a PairwiseAligner object automatically chooses the appropriate
+ alignment algorithm (the Needleman-Wunsch, Smith-Waterman, Gotoh, or
+ Waterman-Smith-Beyer global or local alignment algorithm).
+
+ Calling the "score" method on the aligner with two sequences as arguments
+ will calculate the alignment score between the two sequences.
+ Calling the "align" method on the aligner with two sequences as arguments
+ will return a generator yielding the alignments between the two
+ sequences.
+
+ Some examples:
+
+ >>> from Bio import Align
+ >>> aligner = Align.PairwiseAligner()
+ >>> alignments = aligner.align("TACCG", "ACG")
+ >>> for alignment in sorted(alignments):
+ ... print("Score = %.1f:" % alignment.score)
+ ... print(alignment)
+ ...
+ Score = 3.0:
+ TACCG
+ -|-||
+ -A-CG
+
+ Score = 3.0:
+ TACCG
+ -||-|
+ -AC-G
+
+
+ Specify the aligner mode as local to generate local alignments:
+
+ >>> aligner.mode = 'local'
+ >>> alignments = aligner.align("TACCG", "ACG")
+ >>> for alignment in sorted(alignments):
+ ... print("Score = %.1f:" % alignment.score)
+ ... print(alignment)
+ ...
+ Score = 3.0:
+ TACCG
+ |-||
+ A-CG
+
+ Score = 3.0:
+ TACCG
+ ||-|
+ AC-G
+
+
+ Do a global alignment. Identical characters are given 2 points,
+ 1 point is deducted for each non-identical character.
+
+ >>> aligner.mode = 'global'
+ >>> aligner.match_score = 2
+ >>> aligner.mismatch_score = -1
+ >>> for alignment in aligner.align("TACCG", "ACG"):
+ ... print("Score = %.1f:" % alignment.score)
+ ... print(alignment)
+ ...
+ Score = 6.0:
+ TACCG
+ -||-|
+ -AC-G
+
+ Score = 6.0:
+ TACCG
+ -|-||
+ -A-CG
+
+
+ Same as above, except now 0.5 points are deducted when opening a
+ gap, and 0.1 points are deducted when extending it.
+
+ >>> aligner.open_gap_score = -0.5
+ >>> aligner.extend_gap_score = -0.1
+ >>> aligner.target_end_gap_score = 0.0
+ >>> aligner.query_end_gap_score = 0.0
+ >>> for alignment in aligner.align("TACCG", "ACG"):
+ ... print("Score = %.1f:" % alignment.score)
+ ... print(alignment)
+ ...
+ Score = 5.5:
+ TACCG
+ -|-||
+ -A-CG
+
+ Score = 5.5:
+ TACCG
+ -||-|
+ -AC-G
+
+
+ The alignment function can also use known matrices already included in
+ Biopython:
+
+ >>> from Bio.Align import substitution_matrices
+ >>> aligner = Align.PairwiseAligner()
+ >>> aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+ >>> alignments = aligner.align("KEVLA", "EVL")
+ >>> alignments = list(alignments)
+ >>> print("Number of alignments: %d" % len(alignments))
+ Number of alignments: 1
+ >>> alignment = alignments[0]
+ >>> print("Score = %.1f" % alignment.score)
+ Score = 13.0
+ >>> print(alignment)
+ KEVLA
+ -|||-
+ -EVL-
+
+
+ You can also set the value of attributes directly during construction
+ of the PairwiseAligner object by providing them as keyword arguments:
+
+ >>> aligner = Align.PairwiseAligner(mode='global', match_score=2, mismatch_score=-1)
+ >>> for alignment in aligner.align("TACCG", "ACG"):
+ ... print("Score = %.1f:" % alignment.score)
+ ... print(alignment)
+ ...
+ Score = 6.0:
+ TACCG
+ -||-|
+ -AC-G
+
+ Score = 6.0:
+ TACCG
+ -|-||
+ -A-CG
+
+
+ """
+
+ def __init__(self, **kwargs):
+ """Initialize a new PairwiseAligner with the keyword arguments as attributes.
+
+ Loops over the keyword arguments and sets them as attributes on the object.
+ """
+ super().__init__()
+ for name, value in kwargs.items():
+ setattr(self, name, value)
+
+ def __setattr__(self, key, value):
+ if key not in dir(_aligners.PairwiseAligner):
+ # To prevent confusion, don't allow users to create new attributes.
+ # On CPython, __slots__ can be used for this, but currently
+ # __slots__ does not behave the same way on PyPy at least.
+ raise AttributeError("'PairwiseAligner' object has no attribute '%s'" % key)
+ _aligners.PairwiseAligner.__setattr__(self, key, value)
+
+ def align(self, seqA, seqB, strand="+"):
+ """Return the alignments of two sequences using PairwiseAligner."""
+ if isinstance(seqA, (Seq, MutableSeq)):
+ sA = bytes(seqA)
+ else:
+ sA = seqA
+ if strand == "+":
+ sB = seqB
+ else: # strand == "-":
+ sB = reverse_complement(seqB)
+ if isinstance(sB, (Seq, MutableSeq)):
+ sB = bytes(sB)
+ score, paths = _aligners.PairwiseAligner.align(self, sA, sB, strand)
+ alignments = PairwiseAlignments(seqA, seqB, score, paths)
+ return alignments
+
+ def score(self, seqA, seqB, strand="+"):
+ """Return the alignments score of two sequences using PairwiseAligner."""
+ if isinstance(seqA, (Seq, MutableSeq)):
+ seqA = bytes(seqA)
+ if strand == "-":
+ seqB = reverse_complement(seqB)
+ if isinstance(seqB, (Seq, MutableSeq)):
+ seqB = bytes(seqB)
+ return _aligners.PairwiseAligner.score(self, seqA, seqB, strand)
+
+ def __getstate__(self):
+ state = {
+ "wildcard": self.wildcard,
+ "target_internal_open_gap_score": self.target_internal_open_gap_score,
+ "target_internal_extend_gap_score": self.target_internal_extend_gap_score,
+ "target_left_open_gap_score": self.target_left_open_gap_score,
+ "target_left_extend_gap_score": self.target_left_extend_gap_score,
+ "target_right_open_gap_score": self.target_right_open_gap_score,
+ "target_right_extend_gap_score": self.target_right_extend_gap_score,
+ "query_internal_open_gap_score": self.query_internal_open_gap_score,
+ "query_internal_extend_gap_score": self.query_internal_extend_gap_score,
+ "query_left_open_gap_score": self.query_left_open_gap_score,
+ "query_left_extend_gap_score": self.query_left_extend_gap_score,
+ "query_right_open_gap_score": self.query_right_open_gap_score,
+ "query_right_extend_gap_score": self.query_right_extend_gap_score,
+ "mode": self.mode,
+ }
+ if self.substitution_matrix is None:
+ state["match_score"] = self.match_score
+ state["mismatch_score"] = self.mismatch_score
+ else:
+ state["substitution_matrix"] = self.substitution_matrix
+ return state
+
+ def __setstate__(self, state):
+ self.wildcard = state["wildcard"]
+ self.target_internal_open_gap_score = state["target_internal_open_gap_score"]
+ self.target_internal_extend_gap_score = state[
+ "target_internal_extend_gap_score"
+ ]
+ self.target_left_open_gap_score = state["target_left_open_gap_score"]
+ self.target_left_extend_gap_score = state["target_left_extend_gap_score"]
+ self.target_right_open_gap_score = state["target_right_open_gap_score"]
+ self.target_right_extend_gap_score = state["target_right_extend_gap_score"]
+ self.query_internal_open_gap_score = state["query_internal_open_gap_score"]
+ self.query_internal_extend_gap_score = state["query_internal_extend_gap_score"]
+ self.query_left_open_gap_score = state["query_left_open_gap_score"]
+ self.query_left_extend_gap_score = state["query_left_extend_gap_score"]
+ self.query_right_open_gap_score = state["query_right_open_gap_score"]
+ self.query_right_extend_gap_score = state["query_right_extend_gap_score"]
+ self.mode = state["mode"]
+ substitution_matrix = state.get("substitution_matrix")
+ if substitution_matrix is None:
+ self.match_score = state["match_score"]
+ self.mismatch_score = state["mismatch_score"]
+ else:
+ self.substitution_matrix = substitution_matrix
+
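+ # __getstate__ and __setstate__ make the aligner picklable, e.g. for use
+ # with multiprocessing. A minimal sketch:
+ #
+ #     import pickle
+ #     aligner = PairwiseAligner(mode="local", match_score=2)
+ #     clone = pickle.loads(pickle.dumps(aligner))
+ #     assert clone.match_score == aligner.match_score
+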
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Align/__pycache__/AlignInfo.cpython-37.pyc b/code/lib/Bio/Align/__pycache__/AlignInfo.cpython-37.pyc
new file mode 100644
index 0000000..7955c2f
Binary files /dev/null and b/code/lib/Bio/Align/__pycache__/AlignInfo.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Align/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..54c2237
Binary files /dev/null and b/code/lib/Bio/Align/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/_aligners.c b/code/lib/Bio/Align/_aligners.c
new file mode 100644
index 0000000..e78f252
--- /dev/null
+++ b/code/lib/Bio/Align/_aligners.c
@@ -0,0 +1,6988 @@
+/* Copyright 2018-2019 by Michiel de Hoon. All rights reserved.
+ * This file is part of the Biopython distribution and governed by your
+ * choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+ * Please see the LICENSE file that should have been included as part of this
+ * package.
+ */
+
+
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "float.h"
+
+
+#define HORIZONTAL 0x1
+#define VERTICAL 0x2
+#define DIAGONAL 0x4
+#define STARTPOINT 0x8
+#define ENDPOINT 0x10
+#define M_MATRIX 0x1
+#define Ix_MATRIX 0x2
+#define Iy_MATRIX 0x4
+#define DONE 0x3
+#define NONE 0x7
+
+#define OVERFLOW_ERROR -1
+#define MEMORY_ERROR -2
+
+#define MISSING_LETTER -1
+
+#define SAFE_ADD(t, s) \
+{ if (s != OVERFLOW_ERROR) { \
+ term = t; \
+ if (term > PY_SSIZE_T_MAX - s) s = OVERFLOW_ERROR; \
+ else s += term; \
+ } \
+}
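+/* SAFE_ADD accumulates term t into s while saturating at OVERFLOW_ERROR:
+ * if the sum would exceed PY_SSIZE_T_MAX, s becomes OVERFLOW_ERROR and
+ * stays there, so a single check of the final count suffices. */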
+
+
+typedef enum {NeedlemanWunschSmithWaterman,
+ Gotoh,
+ WatermanSmithBeyer,
+ Unknown} Algorithm;
+
+typedef enum {Global, Local} Mode;
+
+typedef struct {
+ unsigned char trace : 5;
+ unsigned char path : 3;
+} Trace;
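+/* trace holds the set of traceback directions (HORIZONTAL, VERTICAL,
+ * DIAGONAL, plus the STARTPOINT/ENDPOINT markers) entering a cell;
+ * path holds the single direction taken by the path currently being
+ * generated. */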
+
+typedef struct {
+ unsigned char Ix : 4;
+ unsigned char Iy : 4;
+} TraceGapsGotoh;
+
+typedef struct {
+ int* MIx;
+ int* IyIx;
+ int* MIy;
+ int* IxIy;
+} TraceGapsWatermanSmithBeyer;
+
+typedef struct {
+ PyObject_HEAD
+ Trace** M;
+ union { TraceGapsGotoh** gotoh;
+ TraceGapsWatermanSmithBeyer** waterman_smith_beyer; } gaps;
+ int nA;
+ int nB;
+ int iA;
+ int iB;
+ Mode mode;
+ Algorithm algorithm;
+ Py_ssize_t length;
+ unsigned char strand;
+} PathGenerator;
+
+static PyObject*
+PathGenerator_create_path(PathGenerator* self, int i, int j) {
+ PyObject* tuple;
+ PyObject* row;
+ PyObject* value;
+ int path;
+ const int ii = i;
+ const int jj = j;
+ int n = 1;
+ int direction = 0;
+ Trace** M = self->M;
+ const unsigned char strand = self->strand;
+
+ while (1) {
+ path = M[i][j].path;
+ if (!path) break;
+ if (path != direction) {
+ n++;
+ direction = path;
+ }
+ switch (path) {
+ case HORIZONTAL: j++; break;
+ case VERTICAL: i++; break;
+ case DIAGONAL: i++; j++; break;
+ }
+ }
+
+ i = ii;
+ j = jj;
+ direction = 0;
+ tuple = PyTuple_New(n);
+ if (!tuple) return NULL;
+
+ n = 0;
+ switch (strand) {
+ case '+':
+ while (1) {
+ path = M[i][j].path;
+ if (path != direction) {
+ row = PyTuple_New(2);
+ if (!row) break;
+ value = PyLong_FromLong(i);
+ if (!value) {
+ Py_DECREF(row); /* all references were stolen */
+ break;
+ }
+ PyTuple_SET_ITEM(row, 0, value);
+ value = PyLong_FromLong(j);
+ if (!value) {
+ Py_DECREF(row); /* all references were stolen */
+ break;
+ }
+ PyTuple_SET_ITEM(row, 1, value);
+ PyTuple_SET_ITEM(tuple, n, row);
+ n++;
+ direction = path;
+ }
+ switch (path) {
+ case HORIZONTAL: j++; break;
+ case VERTICAL: i++; break;
+ case DIAGONAL: i++; j++; break;
+ default: return tuple;
+ }
+ }
+ break;
+ case '-': {
+ const int nB = self->nB;
+ while (1) {
+ path = M[i][j].path;
+ if (path != direction) {
+ row = PyTuple_New(2);
+ if (!row) break;
+ value = PyLong_FromLong(i);
+ if (!value) {
+ Py_DECREF(row); /* all references were stolen */
+ break;
+ }
+ PyTuple_SET_ITEM(row, 0, value);
+ value = PyLong_FromLong(nB-j);
+ if (!value) {
+ Py_DECREF(row); /* all references were stolen */
+ break;
+ }
+ PyTuple_SET_ITEM(row, 1, value);
+ PyTuple_SET_ITEM(tuple, n, row);
+ n++;
+ direction = path;
+ }
+ switch (path) {
+ case HORIZONTAL: j++; break;
+ case VERTICAL: i++; break;
+ case DIAGONAL: i++; j++; break;
+ default: return tuple;
+ }
+ }
+ break;
+ }
+ }
+ Py_DECREF(tuple); /* all references were stolen */
+ return PyErr_NoMemory();
+}
+
+static Py_ssize_t
+PathGenerator_needlemanwunsch_length(PathGenerator* self)
+{
+ int i;
+ int j;
+ int trace;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ Py_ssize_t term;
+ Py_ssize_t count = MEMORY_ERROR;
+ Py_ssize_t temp;
+ Py_ssize_t* counts;
+ counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!counts) goto exit;
+ counts[0] = 1;
+ for (j = 1; j <= nB; j++) {
+ trace = M[0][j].trace;
+ count = 0;
+ if (trace & HORIZONTAL) SAFE_ADD(counts[j-1], count);
+ counts[j] = count;
+ }
+ for (i = 1; i <= nA; i++) {
+ trace = M[i][0].trace;
+ count = 0;
+ if (trace & VERTICAL) SAFE_ADD(counts[0], count);
+ temp = counts[0];
+ counts[0] = count;
+ for (j = 1; j <= nB; j++) {
+ trace = M[i][j].trace;
+ count = 0;
+ if (trace & HORIZONTAL) SAFE_ADD(counts[j-1], count);
+ if (trace & VERTICAL) SAFE_ADD(counts[j], count);
+ if (trace & DIAGONAL) SAFE_ADD(temp, count);
+ temp = counts[j];
+ counts[j] = count;
+ }
+ }
+ PyMem_Free(counts);
+exit:
+ return count;
+}
+
+static Py_ssize_t
+PathGenerator_smithwaterman_length(PathGenerator* self)
+{
+ int i;
+ int j;
+ int trace;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ Py_ssize_t term;
+ Py_ssize_t count = MEMORY_ERROR;
+ Py_ssize_t total = 0;
+ Py_ssize_t temp;
+ Py_ssize_t* counts;
+ counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!counts) goto exit;
+ counts[0] = 1;
+ for (j = 1; j <= nB; j++) counts[j] = 1;
+ for (i = 1; i <= nA; i++) {
+ temp = counts[0];
+ counts[0] = 1;
+ for (j = 1; j <= nB; j++) {
+ trace = M[i][j].trace;
+ count = 0;
+ if (trace & DIAGONAL) SAFE_ADD(temp, count);
+ if (M[i][j].trace & ENDPOINT) SAFE_ADD(count, total);
+ if (trace & HORIZONTAL) SAFE_ADD(counts[j-1], count);
+ if (trace & VERTICAL) SAFE_ADD(counts[j], count);
+ temp = counts[j];
+ if (count == 0 && (trace & STARTPOINT)) count = 1;
+ counts[j] = count;
+ }
+ }
+ count = total;
+ PyMem_Free(counts);
+exit:
+ return count;
+}
+
+static Py_ssize_t
+PathGenerator_gotoh_global_length(PathGenerator* self)
+{
+ int i;
+ int j;
+ int trace;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsGotoh** gaps = self->gaps.gotoh;
+ Py_ssize_t count = MEMORY_ERROR;
+ Py_ssize_t term;
+ Py_ssize_t M_temp;
+ Py_ssize_t Ix_temp;
+ Py_ssize_t Iy_temp;
+ Py_ssize_t* M_counts = NULL;
+ Py_ssize_t* Ix_counts = NULL;
+ Py_ssize_t* Iy_counts = NULL;
+ M_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!M_counts) goto exit;
+ Ix_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Ix_counts) goto exit;
+ Iy_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Iy_counts) goto exit;
+ M_counts[0] = 1;
+ Ix_counts[0] = 0;
+ Iy_counts[0] = 0;
+ for (j = 1; j <= nB; j++) {
+ M_counts[j] = 0;
+ Ix_counts[j] = 0;
+ Iy_counts[j] = 1;
+ }
+ for (i = 1; i <= nA; i++) {
+ M_temp = M_counts[0];
+ M_counts[0] = 0;
+ Ix_temp = Ix_counts[0];
+ Ix_counts[0] = 1;
+ Iy_temp = Iy_counts[0];
+ Iy_counts[0] = 0;
+ for (j = 1; j <= nB; j++) {
+ count = 0;
+ trace = M[i][j].trace;
+ if (trace & M_MATRIX) SAFE_ADD(M_temp, count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_temp, count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_temp, count);
+ M_temp = M_counts[j];
+ M_counts[j] = count;
+ count = 0;
+ trace = gaps[i][j].Ix;
+ if (trace & M_MATRIX) SAFE_ADD(M_temp, count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j], count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j], count);
+ Ix_temp = Ix_counts[j];
+ Ix_counts[j] = count;
+ count = 0;
+ trace = gaps[i][j].Iy;
+ if (trace & M_MATRIX) SAFE_ADD(M_counts[j-1], count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j-1], count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j-1], count);
+ Iy_temp = Iy_counts[j];
+ Iy_counts[j] = count;
+ }
+ }
+ count = 0;
+ if (M[nA][nB].trace) SAFE_ADD(M_counts[nB], count);
+ if (gaps[nA][nB].Ix) SAFE_ADD(Ix_counts[nB], count);
+ if (gaps[nA][nB].Iy) SAFE_ADD(Iy_counts[nB], count);
+exit:
+ if (M_counts) PyMem_Free(M_counts);
+ if (Ix_counts) PyMem_Free(Ix_counts);
+ if (Iy_counts) PyMem_Free(Iy_counts);
+ return count;
+}
+
+static Py_ssize_t
+PathGenerator_gotoh_local_length(PathGenerator* self)
+{
+ int i;
+ int j;
+ int trace;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsGotoh** gaps = self->gaps.gotoh;
+ Py_ssize_t term;
+ Py_ssize_t count = MEMORY_ERROR;
+ Py_ssize_t total = 0;
+ Py_ssize_t M_temp;
+ Py_ssize_t Ix_temp;
+ Py_ssize_t Iy_temp;
+ Py_ssize_t* M_counts = NULL;
+ Py_ssize_t* Ix_counts = NULL;
+ Py_ssize_t* Iy_counts = NULL;
+ M_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!M_counts) goto exit;
+ Ix_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Ix_counts) goto exit;
+ Iy_counts = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Iy_counts) goto exit;
+ M_counts[0] = 1;
+ Ix_counts[0] = 0;
+ Iy_counts[0] = 0;
+ for (j = 1; j <= nB; j++) {
+ M_counts[j] = 1;
+ Ix_counts[j] = 0;
+ Iy_counts[j] = 0;
+ }
+ for (i = 1; i <= nA; i++) {
+ M_temp = M_counts[0];
+ M_counts[0] = 1;
+ Ix_temp = Ix_counts[0];
+ Ix_counts[0] = 0;
+ Iy_temp = Iy_counts[0];
+ Iy_counts[0] = 0;
+ for (j = 1; j <= nB; j++) {
+ count = 0;
+ trace = M[i][j].trace;
+ if (trace & M_MATRIX) SAFE_ADD(M_temp, count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_temp, count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_temp, count);
+ if (count == 0 && (trace & STARTPOINT)) count = 1;
+ M_temp = M_counts[j];
+ M_counts[j] = count;
+ if (M[i][j].trace & ENDPOINT) SAFE_ADD(count, total);
+ count = 0;
+ trace = gaps[i][j].Ix;
+ if (trace & M_MATRIX) SAFE_ADD(M_temp, count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j], count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j], count);
+ Ix_temp = Ix_counts[j];
+ Ix_counts[j] = count;
+ count = 0;
+ trace = gaps[i][j].Iy;
+ if (trace & M_MATRIX) SAFE_ADD(M_counts[j-1], count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_counts[j-1], count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_counts[j-1], count);
+ Iy_temp = Iy_counts[j];
+ Iy_counts[j] = count;
+ }
+ }
+ count = total;
+exit:
+ if (M_counts) PyMem_Free(M_counts);
+ if (Ix_counts) PyMem_Free(Ix_counts);
+ if (Iy_counts) PyMem_Free(Iy_counts);
+ return count;
+}
+
+static Py_ssize_t
+PathGenerator_waterman_smith_beyer_global_length(PathGenerator* self)
+{
+ int i;
+ int j;
+ int trace;
+ int* p;
+ int gap;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer;
+ Py_ssize_t count = MEMORY_ERROR;
+ Py_ssize_t term;
+ Py_ssize_t** M_count = NULL;
+ Py_ssize_t** Ix_count = NULL;
+ Py_ssize_t** Iy_count = NULL;
+ M_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*));
+ if (!M_count) goto exit;
+ Ix_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*));
+ if (!Ix_count) goto exit;
+ Iy_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*));
+ if (!Iy_count) goto exit;
+ for (i = 0; i <= nA; i++) {
+ M_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!M_count[i]) goto exit;
+ Ix_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Ix_count[i]) goto exit;
+ Iy_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Iy_count[i]) goto exit;
+ }
+ for (i = 0; i <= nA; i++) {
+ for (j = 0; j <= nB; j++) {
+ count = 0;
+ trace = M[i][j].trace;
+ if (trace & M_MATRIX) SAFE_ADD(M_count[i-1][j-1], count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_count[i-1][j-1], count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_count[i-1][j-1], count);
+ if (count == 0) count = 1; /* happens at M[0][0] only */
+ M_count[i][j] = count;
+ count = 0;
+ p = gaps[i][j].MIx;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(M_count[i-gap][j], count);
+ p++;
+ }
+ }
+ p = gaps[i][j].IyIx;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(Iy_count[i-gap][j], count);
+ p++;
+ }
+ }
+ Ix_count[i][j] = count;
+ count = 0;
+ p = gaps[i][j].MIy;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(M_count[i][j-gap], count);
+ p++;
+ }
+ }
+ p = gaps[i][j].IxIy;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(Ix_count[i][j-gap], count);
+ p++;
+ }
+ }
+ Iy_count[i][j] = count;
+ }
+ }
+ count = 0;
+ if (M[nA][nB].trace)
+ SAFE_ADD(M_count[nA][nB], count);
+ if (gaps[nA][nB].MIx[0] || gaps[nA][nB].IyIx[0])
+ SAFE_ADD(Ix_count[nA][nB], count);
+ if (gaps[nA][nB].MIy[0] || gaps[nA][nB].IxIy[0])
+ SAFE_ADD(Iy_count[nA][nB], count);
+exit:
+ if (M_count) {
+ if (Ix_count) {
+ if (Iy_count) {
+ for (i = 0; i <= nA; i++) {
+ if (!M_count[i]) break;
+ PyMem_Free(M_count[i]);
+ if (!Ix_count[i]) break;
+ PyMem_Free(Ix_count[i]);
+ if (!Iy_count[i]) break;
+ PyMem_Free(Iy_count[i]);
+ }
+ PyMem_Free(Iy_count);
+ }
+ PyMem_Free(Ix_count);
+ }
+ PyMem_Free(M_count);
+ }
+ return count;
+}
+
+static Py_ssize_t
+PathGenerator_waterman_smith_beyer_local_length(PathGenerator* self)
+{
+ int i;
+ int j;
+ int trace;
+ int* p;
+ int gap;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer;
+ Py_ssize_t term;
+ Py_ssize_t count = MEMORY_ERROR;
+ Py_ssize_t total = 0;
+ Py_ssize_t** M_count = NULL;
+ Py_ssize_t** Ix_count = NULL;
+ Py_ssize_t** Iy_count = NULL;
+ M_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*));
+ if (!M_count) goto exit;
+ Ix_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*));
+ if (!Ix_count) goto exit;
+ Iy_count = PyMem_Malloc((nA+1)*sizeof(Py_ssize_t*));
+ if (!Iy_count) goto exit;
+ for (i = 0; i <= nA; i++) {
+ M_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!M_count[i]) goto exit;
+ Ix_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Ix_count[i]) goto exit;
+ Iy_count[i] = PyMem_Malloc((nB+1)*sizeof(Py_ssize_t));
+ if (!Iy_count[i]) goto exit;
+ }
+ for (i = 0; i <= nA; i++) {
+ for (j = 0; j <= nB; j++) {
+ count = 0;
+ trace = M[i][j].trace;
+ if (trace & M_MATRIX) SAFE_ADD(M_count[i-1][j-1], count);
+ if (trace & Ix_MATRIX) SAFE_ADD(Ix_count[i-1][j-1], count);
+ if (trace & Iy_MATRIX) SAFE_ADD(Iy_count[i-1][j-1], count);
+ if (count == 0 && (trace & STARTPOINT)) count = 1;
+ M_count[i][j] = count;
+ if (M[i][j].trace & ENDPOINT) SAFE_ADD(count, total);
+ count = 0;
+ p = gaps[i][j].MIx;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(M_count[i-gap][j], count);
+ p++;
+ }
+ }
+ p = gaps[i][j].IyIx;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(Iy_count[i-gap][j], count);
+ p++;
+ }
+ }
+ Ix_count[i][j] = count;
+ count = 0;
+ p = gaps[i][j].MIy;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(M_count[i][j-gap], count);
+ p++;
+ }
+ }
+ p = gaps[i][j].IxIy;
+ if (p) {
+ while (1) {
+ gap = *p;
+ if (!gap) break;
+ SAFE_ADD(Ix_count[i][j-gap], count);
+ p++;
+ }
+ }
+ Iy_count[i][j] = count;
+ }
+ }
+ count = total;
+exit:
+ if (M_count) {
+ if (Ix_count) {
+ if (Iy_count) {
+ for (i = 0; i <= nA; i++) {
+ if (!M_count[i]) break;
+ PyMem_Free(M_count[i]);
+ if (!Ix_count[i]) break;
+ PyMem_Free(Ix_count[i]);
+ if (!Iy_count[i]) break;
+ PyMem_Free(Iy_count[i]);
+ }
+ PyMem_Free(Iy_count);
+ }
+ PyMem_Free(Ix_count);
+ }
+ PyMem_Free(M_count);
+ }
+ return count;
+}
+
+static Py_ssize_t PathGenerator_length(PathGenerator* self) {
+ Py_ssize_t length = self->length;
+ if (length == 0) {
+ switch (self->algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ switch (self->mode) {
+ case Global:
+ length = PathGenerator_needlemanwunsch_length(self);
+ break;
+ case Local:
+ length = PathGenerator_smithwaterman_length(self);
+ break;
+ default:
+ /* should not happen, but some compilers complain
+ * that length can be used uninitialized.
+ */
+ PyErr_SetString(PyExc_RuntimeError, "Unknown mode");
+ return -1;
+ }
+ break;
+ case Gotoh:
+ switch (self->mode) {
+ case Global:
+ length = PathGenerator_gotoh_global_length(self);
+ break;
+ case Local:
+ length = PathGenerator_gotoh_local_length(self);
+ break;
+ default:
+ /* should not happen, but some compilers complain
+ * that length can be used uninitialized.
+ */
+ PyErr_SetString(PyExc_RuntimeError, "Unknown mode");
+ return -1;
+ }
+ break;
+ case WatermanSmithBeyer:
+ switch (self->mode) {
+ case Global:
+ length = PathGenerator_waterman_smith_beyer_global_length(self);
+ break;
+ case Local:
+ length = PathGenerator_waterman_smith_beyer_local_length(self);
+ break;
+ default:
+                    /* should not happen, but some compilers complain that
+                     * length can be used uninitialized.
+                     */
+ PyErr_SetString(PyExc_RuntimeError, "Unknown mode");
+ return -1;
+ }
+ break;
+ case Unknown:
+ default:
+ PyErr_SetString(PyExc_RuntimeError, "Unknown algorithm");
+ return -1;
+ }
+ self->length = length;
+ }
+ switch (length) {
+ case OVERFLOW_ERROR:
+ PyErr_Format(PyExc_OverflowError,
+ "number of optimal alignments is larger than %zd",
+ PY_SSIZE_T_MAX);
+ break;
+ case MEMORY_ERROR:
+ PyErr_SetNone(PyExc_MemoryError);
+ break;
+ default:
+ break;
+ }
+ return length;
+}
+
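+/* Free the trace matrix M and the algorithm-specific gap data: per-row
+ * arrays for Gotoh, and four zero-terminated gap lists per cell for
+ * Waterman-Smith-Beyer. */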
+static void
+PathGenerator_dealloc(PathGenerator* self)
+{
+ int i;
+ const int nA = self->nA;
+ const Algorithm algorithm = self->algorithm;
+ Trace** M = self->M;
+ if (M) {
+ for (i = 0; i <= nA; i++) {
+ if (!M[i]) break;
+ PyMem_Free(M[i]);
+ }
+ PyMem_Free(M);
+ }
+ switch (algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ break;
+ case Gotoh: {
+ TraceGapsGotoh** gaps = self->gaps.gotoh;
+ if (gaps) {
+ for (i = 0; i <= nA; i++) {
+ if (!gaps[i]) break;
+ PyMem_Free(gaps[i]);
+ }
+ PyMem_Free(gaps);
+ }
+ break;
+ }
+ case WatermanSmithBeyer: {
+ TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer;
+ if (gaps) {
+ int j;
+ const int nB = self->nB;
+ int* trace;
+ for (i = 0; i <= nA; i++) {
+ if (!gaps[i]) break;
+ for (j = 0; j <= nB; j++) {
+ trace = gaps[i][j].MIx;
+ if (trace) PyMem_Free(trace);
+ trace = gaps[i][j].IyIx;
+ if (trace) PyMem_Free(trace);
+ trace = gaps[i][j].MIy;
+ if (trace) PyMem_Free(trace);
+ trace = gaps[i][j].IxIy;
+ if (trace) PyMem_Free(trace);
+ }
+ PyMem_Free(gaps[i]);
+ }
+ PyMem_Free(gaps);
+ }
+ break;
+ }
+ case Unknown:
+ default:
+ PyErr_WriteUnraisable((PyObject*)self);
+ break;
+ }
+ Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
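+/* Iterator step for global Needleman-Wunsch alignments. Each call prunes the
+ * previously returned path back to the first cell with an unexplored
+ * alternative trace, then follows the traceback from there to the origin;
+ * trace directions are explored in the order HORIZONTAL, VERTICAL, DIAGONAL. */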
+static PyObject* PathGenerator_next_needlemanwunsch(PathGenerator* self)
+{
+ int i = 0;
+ int j = 0;
+ int path;
+ int trace = 0;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+
+ path = M[i][j].path;
+ if (path == DONE) return NULL;
+ if (path == 0) {
+ /* Generate the first path. */
+ i = nA;
+ j = nB;
+ }
+ else {
+ /* We already have a path. Prune the path to see if there are
+ * any alternative paths. */
+ while (1) {
+ if (path == HORIZONTAL) {
+ trace = M[i][++j].trace;
+ if (trace & VERTICAL) {
+ M[--i][j].path = VERTICAL;
+ break;
+ }
+ if (trace & DIAGONAL) {
+ M[--i][--j].path = DIAGONAL;
+ break;
+ }
+ }
+ else if (path == VERTICAL) {
+ trace = M[++i][j].trace;
+ if (trace & DIAGONAL) {
+ M[--i][--j].path = DIAGONAL;
+ break;
+ }
+ }
+ else /* DIAGONAL */ {
+ i++;
+ j++;
+ }
+ path = M[i][j].path;
+ if (!path) {
+ /* we reached the end of the alignment without finding
+ * an alternative path */
+ M[0][0].path = DONE;
+ return NULL;
+ }
+ }
+ }
+ /* Follow the traceback until we reach the origin. */
+ while (1) {
+ trace = M[i][j].trace;
+ if (trace & HORIZONTAL) M[i][--j].path = HORIZONTAL;
+ else if (trace & VERTICAL) M[--i][j].path = VERTICAL;
+ else if (trace & DIAGONAL) M[--i][--j].path = DIAGONAL;
+ else break;
+ }
+ return PathGenerator_create_path(self, 0, 0);
+}
+
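+/* Iterator step for local Smith-Waterman alignments. As in the global case,
+ * the previous path is pruned first; once it is exhausted, the next cell
+ * marked ENDPOINT (restricted to paths ending in the M matrix) starts a new
+ * path, and the traceback stops at a cell marked STARTPOINT. */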
+static PyObject* PathGenerator_next_smithwaterman(PathGenerator* self)
+{
+ int trace = 0;
+ int i = self->iA;
+ int j = self->iB;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ int path = M[0][0].path;
+
+ if (path == DONE || path == NONE) return NULL;
+
+ path = M[i][j].path;
+ if (path) {
+ /* We already have a path. Prune the path to see if there are
+ * any alternative paths. */
+ while (1) {
+ if (path == HORIZONTAL) {
+ trace = M[i][++j].trace;
+ if (trace & VERTICAL) {
+ M[--i][j].path = VERTICAL;
+ break;
+ }
+ else if (trace & DIAGONAL) {
+ M[--i][--j].path = DIAGONAL;
+ break;
+ }
+ }
+ else if (path == VERTICAL) {
+ trace = M[++i][j].trace;
+ if (trace & DIAGONAL) {
+ M[--i][--j].path = DIAGONAL;
+ break;
+ }
+ }
+ else /* DIAGONAL */ {
+ i++;
+ j++;
+ }
+ path = M[i][j].path;
+ if (!path) break;
+ }
+ }
+
+ if (path) {
+ trace = M[i][j].trace;
+ } else {
+        /* Find a suitable end point for a path.
+         * Only allow end points lying in the M matrix. */
+ while (1) {
+ if (j < nB) j++;
+ else if (i < nA) {
+ i++;
+ j = 0;
+ }
+ else {
+ /* we reached the end of the sequences without finding
+ * an alternative path */
+ M[0][0].path = DONE;
+ return NULL;
+ }
+ trace = M[i][j].trace;
+ if (trace & ENDPOINT) {
+ trace &= DIAGONAL; /* exclude paths ending in a gap */
+ break;
+ }
+ }
+ M[i][j].path = 0;
+ }
+
+ /* Follow the traceback until we reach the origin. */
+ while (1) {
+ if (trace & HORIZONTAL) M[i][--j].path = HORIZONTAL;
+ else if (trace & VERTICAL) M[--i][j].path = VERTICAL;
+ else if (trace & DIAGONAL) M[--i][--j].path = DIAGONAL;
+ else if (trace & STARTPOINT) {
+ self->iA = i;
+ self->iB = j;
+ return PathGenerator_create_path(self, i, j);
+ }
+ else {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Unexpected trace in PathGenerator_next_smithwaterman");
+ return NULL;
+ }
+ trace = M[i][j].trace;
+ }
+}
+
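+/* Iterator step for global Gotoh alignments. The variable m tracks which of
+ * the three matrices (M, Ix, Iy) the traceback currently occupies; pruning
+ * probes the matrices in the order M, Ix, Iy to find an alternative path. */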
+static PyObject* PathGenerator_next_gotoh_global(PathGenerator* self)
+{
+ int i = 0;
+ int j = 0;
+ int m;
+ int path;
+ int trace = 0;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsGotoh** gaps = self->gaps.gotoh;
+
+ m = M_MATRIX;
+ path = M[i][j].path;
+ if (path == DONE) return NULL;
+ if (path == 0) {
+ i = nA;
+ j = nB;
+ }
+ else {
+ /* We already have a path. Prune the path to see if there are
+ * any alternative paths. */
+ while (1) {
+ path = M[i][j].path;
+ if (path == 0) {
+ switch (m) {
+ case M_MATRIX: m = Ix_MATRIX; break;
+ case Ix_MATRIX: m = Iy_MATRIX; break;
+ case Iy_MATRIX: m = 0; break;
+ }
+ break;
+ }
+ switch (path) {
+ case HORIZONTAL: trace = gaps[i][++j].Iy; break;
+ case VERTICAL: trace = gaps[++i][j].Ix; break;
+ case DIAGONAL: trace = M[++i][++j].trace; break;
+ }
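+            /* Intentional fall-through: if the current matrix offers no
+             * alternative trace, try the next matrix (M -> Ix -> Iy); if
+             * none is left, restore m from the path direction and keep
+             * pruning. */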
+ switch (m) {
+ case M_MATRIX:
+ if (trace & Ix_MATRIX) {
+ m = Ix_MATRIX;
+ break;
+ }
+ case Ix_MATRIX:
+ if (trace & Iy_MATRIX) {
+ m = Iy_MATRIX;
+ break;
+ }
+ case Iy_MATRIX:
+ default:
+ switch (path) {
+ case HORIZONTAL: m = Iy_MATRIX; break;
+ case VERTICAL: m = Ix_MATRIX; break;
+ case DIAGONAL: m = M_MATRIX; break;
+ }
+ continue;
+ }
+ switch (path) {
+ case HORIZONTAL: j--; break;
+ case VERTICAL: i--; break;
+ case DIAGONAL: i--; j--; break;
+ }
+ M[i][j].path = path;
+ break;
+ }
+ }
+
+ if (path == 0) {
+ /* Generate a new path. */
+ switch (m) {
+ case M_MATRIX:
+ if (M[nA][nB].trace) {
+ /* m = M_MATRIX; */
+ break;
+ }
+ case Ix_MATRIX:
+ if (gaps[nA][nB].Ix) {
+ m = Ix_MATRIX;
+ break;
+ }
+ case Iy_MATRIX:
+ if (gaps[nA][nB].Iy) {
+ m = Iy_MATRIX;
+ break;
+ }
+ default:
+ /* exhausted this generator */
+ M[0][0].path = DONE;
+ return NULL;
+ }
+ }
+
+ switch (m) {
+ case M_MATRIX:
+ trace = M[i][j].trace;
+ path = DIAGONAL;
+ i--; j--;
+ break;
+ case Ix_MATRIX:
+ trace = gaps[i][j].Ix;
+ path = VERTICAL;
+ i--;
+ break;
+ case Iy_MATRIX:
+ trace = gaps[i][j].Iy;
+ path = HORIZONTAL;
+ j--;
+ break;
+ }
+
+ while (1) {
+ if (trace & M_MATRIX) {
+ trace = M[i][j].trace;
+ M[i][j].path = path;
+ path = DIAGONAL;
+ i--; j--;
+ }
+ else if (trace & Ix_MATRIX) {
+ M[i][j].path = path;
+ trace = gaps[i][j].Ix;
+ path = VERTICAL;
+ i--;
+ }
+ else if (trace & Iy_MATRIX) {
+ M[i][j].path = path;
+ trace = gaps[i][j].Iy;
+ path = HORIZONTAL;
+ j--;
+ }
+ else break;
+ }
+ return PathGenerator_create_path(self, 0, 0);
+}
+
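+/* Iterator step for local Gotoh alignments; like the global version, except
+ * that new paths start at cells marked ENDPOINT and the traceback returns as
+ * soon as the trace equals STARTPOINT. */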
+static PyObject* PathGenerator_next_gotoh_local(PathGenerator* self)
+{
+ int trace = 0;
+ int i;
+ int j;
+ int m = M_MATRIX;
+ int iA = self->iA;
+ int iB = self->iB;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsGotoh** gaps = self->gaps.gotoh;
+ int path = M[0][0].path;
+
+ if (path == DONE) return NULL;
+
+ path = M[iA][iB].path;
+
+ if (path) {
+ i = iA;
+ j = iB;
+ while (1) {
+ /* We already have a path. Prune the path to see if there are
+ * any alternative paths. */
+ path = M[i][j].path;
+ if (path == 0) {
+ m = M_MATRIX;
+ iA = i;
+ iB = j;
+ break;
+ }
+ switch (path) {
+ case HORIZONTAL: trace = gaps[i][++j].Iy; break;
+ case VERTICAL: trace = gaps[++i][j].Ix; break;
+ case DIAGONAL: trace = M[++i][++j].trace; break;
+ }
+ switch (m) {
+ case M_MATRIX:
+ if (trace & Ix_MATRIX) {
+ m = Ix_MATRIX;
+ break;
+ }
+ case Ix_MATRIX:
+ if (trace & Iy_MATRIX) {
+ m = Iy_MATRIX;
+ break;
+ }
+ case Iy_MATRIX:
+ default:
+ switch (path) {
+ case HORIZONTAL: m = Iy_MATRIX; break;
+ case VERTICAL: m = Ix_MATRIX; break;
+ case DIAGONAL: m = M_MATRIX; break;
+ }
+ continue;
+ }
+ switch (path) {
+ case HORIZONTAL: j--; break;
+ case VERTICAL: i--; break;
+ case DIAGONAL: i--; j--; break;
+ }
+ M[i][j].path = path;
+ break;
+ }
+ }
+
+ if (path == 0) {
+ /* Find the end point for a new path. */
+ while (1) {
+ if (iB < nB) iB++;
+ else if (iA < nA) {
+ iA++;
+ iB = 0;
+ }
+ else {
+ /* we reached the end of the alignment without finding
+ * an alternative path */
+ M[0][0].path = DONE;
+ return NULL;
+ }
+ if (M[iA][iB].trace & ENDPOINT) {
+ M[iA][iB].path = 0;
+ break;
+ }
+ }
+ m = M_MATRIX;
+ i = iA;
+ j = iB;
+ }
+
+ while (1) {
+ switch (m) {
+ case M_MATRIX: trace = M[i][j].trace; break;
+ case Ix_MATRIX: trace = gaps[i][j].Ix; break;
+ case Iy_MATRIX: trace = gaps[i][j].Iy; break;
+ }
+ if (trace == STARTPOINT) {
+ self->iA = i;
+ self->iB = j;
+ return PathGenerator_create_path(self, i, j);
+ }
+ switch (m) {
+ case M_MATRIX:
+ path = DIAGONAL;
+ i--;
+ j--;
+ break;
+ case Ix_MATRIX:
+ path = VERTICAL;
+ i--;
+ break;
+ case Iy_MATRIX:
+ path = HORIZONTAL;
+ j--;
+ break;
+ }
+ if (trace & M_MATRIX) m = M_MATRIX;
+ else if (trace & Ix_MATRIX) m = Ix_MATRIX;
+ else if (trace & Iy_MATRIX) m = Iy_MATRIX;
+ else {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Unexpected trace in PathGenerator_next_gotoh_local");
+ return NULL;
+ }
+ M[i][j].path = path;
+ }
+ return NULL;
+}
+
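+/* Iterator step for global Waterman-Smith-Beyer alignments. Because gaps may
+ * have arbitrary length, each cell stores zero-terminated lists of gap
+ * lengths (MIx, IyIx, MIy, IxIy); pruning walks these lists to find the next
+ * unexplored gap length. */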
+static PyObject*
+PathGenerator_next_waterman_smith_beyer_global(PathGenerator* self)
+{
+ int i = 0, j = 0;
+ int iA, iB;
+ int trace;
+ int* gapM;
+ int* gapXY;
+
+ int m = M_MATRIX;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer;
+
+ int gap;
+ int path = M[0][0].path;
+
+ if (path == DONE) return NULL;
+
+ if (path) {
+ /* We already have a path. Prune the path to see if there are
+ * any alternative paths. */
+ while (1) {
+ if (!path) {
+ m <<= 1;
+ break;
+ }
+ switch (path) {
+ case HORIZONTAL:
+ iA = i;
+ iB = j;
+ while (M[i][iB].path == HORIZONTAL) iB++;
+ break;
+ case VERTICAL:
+ iA = i;
+ while (M[iA][j].path == VERTICAL) iA++;
+ iB = j;
+ break;
+ case DIAGONAL:
+ iA = i + 1;
+ iB = j + 1;
+ break;
+ default:
+ PyErr_SetString(PyExc_RuntimeError,
+ "Unexpected path in PathGenerator_next_waterman_smith_beyer_global");
+ return NULL;
+ }
+ if (i == iA) { /* HORIZONTAL */
+ gapM = gaps[iA][iB].MIy;
+ gapXY = gaps[iA][iB].IxIy;
+ if (m == M_MATRIX) {
+ gap = iB - j;
+ while (*gapM != gap) gapM++;
+ gapM++;
+ gap = *gapM;
+ if (gap) {
+ j = iB - gap;
+ while (j < iB) M[i][--iB].path = HORIZONTAL;
+ break;
+ }
+ } else if (m == Ix_MATRIX) {
+ gap = iB - j;
+ while (*gapXY != gap) gapXY++;
+ gapXY++;
+ }
+ gap = *gapXY;
+ if (gap) {
+ m = Ix_MATRIX;
+ j = iB - gap;
+ while (j < iB) M[i][--iB].path = HORIZONTAL;
+ break;
+ }
+ /* no alternative found; continue pruning */
+ m = Iy_MATRIX;
+ j = iB;
+ }
+ else if (j == iB) { /* VERTICAL */
+ gapM = gaps[iA][iB].MIx;
+ gapXY = gaps[iA][iB].IyIx;
+ if (m == M_MATRIX) {
+ gap = iA - i;
+ while (*gapM != gap) gapM++;
+ gapM++;
+ gap = *gapM;
+ if (gap) {
+ i = iA - gap;
+ while (i < iA) M[--iA][j].path = VERTICAL;
+ break;
+ }
+ } else if (m == Iy_MATRIX) {
+ gap = iA - i;
+ while (*gapXY != gap) gapXY++;
+ gapXY++;
+ }
+ gap = *gapXY;
+ if (gap) {
+ m = Iy_MATRIX;
+ i = iA - gap;
+ while (i < iA) M[--iA][j].path = VERTICAL;
+ break;
+ }
+ /* no alternative found; continue pruning */
+ m = Ix_MATRIX;
+ i = iA;
+ }
+ else { /* DIAGONAL */
+ i = iA - 1;
+ j = iB - 1;
+ trace = M[iA][iB].trace;
+ switch (m) {
+ case M_MATRIX:
+ if (trace & Ix_MATRIX) {
+ m = Ix_MATRIX;
+ M[i][j].path = DIAGONAL;
+ break;
+ }
+ case Ix_MATRIX:
+ if (trace & Iy_MATRIX) {
+ m = Iy_MATRIX;
+ M[i][j].path = DIAGONAL;
+ break;
+ }
+ case Iy_MATRIX:
+ default:
+ /* no alternative found; continue pruning */
+ m = M_MATRIX;
+ i = iA;
+ j = iB;
+ path = M[i][j].path;
+ continue;
+ }
+ /* alternative found; build path until starting point */
+ break;
+ }
+ path = M[i][j].path;
+ }
+ }
+
+ if (!path) {
+ /* Find a suitable end point for a path. */
+ switch (m) {
+ case M_MATRIX:
+ if (M[nA][nB].trace) {
+ /* m = M_MATRIX; */
+ break;
+ }
+ case Ix_MATRIX:
+ if (gaps[nA][nB].MIx[0] || gaps[nA][nB].IyIx[0]) {
+ m = Ix_MATRIX;
+ break;
+ }
+ case Iy_MATRIX:
+ if (gaps[nA][nB].MIy[0] || gaps[nA][nB].IxIy[0]) {
+ m = Iy_MATRIX;
+ break;
+ }
+ default:
+ M[0][0].path = DONE;
+ return NULL;
+ }
+ i = nA;
+ j = nB;
+ }
+
+ /* Follow the traceback until we reach the origin. */
+ while (1) {
+ switch (m) {
+ case M_MATRIX:
+ trace = M[i][j].trace;
+ if (trace & M_MATRIX) m = M_MATRIX;
+ else if (trace & Ix_MATRIX) m = Ix_MATRIX;
+ else if (trace & Iy_MATRIX) m = Iy_MATRIX;
+ else return PathGenerator_create_path(self, i, j);
+ i--;
+ j--;
+ M[i][j].path = DIAGONAL;
+ break;
+ case Ix_MATRIX:
+ gap = gaps[i][j].MIx[0];
+ if (gap) m = M_MATRIX;
+ else {
+ gap = gaps[i][j].IyIx[0];
+ m = Iy_MATRIX;
+ }
+ iA = i - gap;
+ while (iA < i) M[--i][j].path = VERTICAL;
+ M[i][j].path = VERTICAL;
+ break;
+ case Iy_MATRIX:
+ gap = gaps[i][j].MIy[0];
+ if (gap) m = M_MATRIX;
+ else {
+ gap = gaps[i][j].IxIy[0];
+ m = Ix_MATRIX;
+ }
+ iB = j - gap;
+ while (iB < j) M[i][--j].path = HORIZONTAL;
+ M[i][j].path = HORIZONTAL;
+ break;
+ }
+ }
+}
+
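+/* Iterator step for local Waterman-Smith-Beyer alignments; like the global
+ * version, but new paths start at cells marked ENDPOINT and the traceback
+ * returns at a STARTPOINT trace. */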
+static PyObject*
+PathGenerator_next_waterman_smith_beyer_local(PathGenerator* self)
+{
+ int i, j, m;
+ int trace = 0;
+ int* gapM;
+ int* gapXY;
+
+ int iA = self->iA;
+ int iB = self->iB;
+ const int nA = self->nA;
+ const int nB = self->nB;
+ Trace** M = self->M;
+ TraceGapsWatermanSmithBeyer** gaps = self->gaps.waterman_smith_beyer;
+
+ int gap;
+ int path = M[0][0].path;
+
+ if (path == DONE) return NULL;
+ m = 0;
+ path = M[iA][iB].path;
+ if (path) {
+ /* We already have a path. Prune the path to see if there are
+ * any alternative paths. */
+ m = M_MATRIX;
+ i = iA;
+ j = iB;
+ while (1) {
+ path = M[i][j].path;
+ switch (path) {
+ case HORIZONTAL:
+ iA = i;
+ iB = j;
+ while (M[i][iB].path == HORIZONTAL) iB++;
+ break;
+ case VERTICAL:
+ iA = i;
+ iB = j;
+ while (M[iA][j].path == VERTICAL) iA++;
+ break;
+ case DIAGONAL:
+ iA = i + 1;
+ iB = j + 1;
+ break;
+ default:
+ iA = -1;
+ break;
+ }
+ if (iA < 0) {
+ m = 0;
+ iA = i;
+ iB = j;
+ break;
+ }
+ if (i == iA) { /* HORIZONTAL */
+ gapM = gaps[iA][iB].MIy;
+ gapXY = gaps[iA][iB].IxIy;
+ if (m == M_MATRIX) {
+ gap = iB - j;
+ while (*gapM != gap) gapM++;
+ gapM++;
+ gap = *gapM;
+ if (gap) {
+ j = iB - gap;
+ while (j < iB) M[i][--iB].path = HORIZONTAL;
+ break;
+ }
+ } else if (m == Ix_MATRIX) {
+ gap = iB - j;
+ while (*gapXY != gap) gapXY++;
+ gapXY++;
+ }
+ gap = *gapXY;
+ if (gap) {
+ m = Ix_MATRIX;
+ j = iB - gap;
+ M[i][j].path = HORIZONTAL;
+ while (iB > j) M[i][--iB].path = HORIZONTAL;
+ break;
+ }
+ /* no alternative found; continue pruning */
+ m = Iy_MATRIX;
+ j = iB;
+ }
+ else if (j == iB) { /* VERTICAL */
+ gapM = gaps[iA][iB].MIx;
+ gapXY = gaps[iA][iB].IyIx;
+ if (m == M_MATRIX) {
+ gap = iA - i;
+ while (*gapM != gap) gapM++;
+ gapM++;
+ gap = *gapM;
+ if (gap) {
+ i = iA - gap;
+ while (i < iA) M[--iA][j].path = VERTICAL;
+ break;
+ }
+ } else if (m == Iy_MATRIX) {
+ gap = iA - i;
+ while (*gapXY != gap) gapXY++;
+ gapXY++;
+ }
+ gap = *gapXY;
+ if (gap) {
+ m = Iy_MATRIX;
+ i = iA - gap;
+ M[i][j].path = VERTICAL;
+ while (iA > i) M[--iA][j].path = VERTICAL;
+ break;
+ }
+ /* no alternative found; continue pruning */
+ m = Ix_MATRIX;
+ i = iA;
+ }
+ else { /* DIAGONAL */
+ i = iA - 1;
+ j = iB - 1;
+ trace = M[iA][iB].trace;
+ switch (m) {
+ case M_MATRIX:
+ if (trace & Ix_MATRIX) {
+ m = Ix_MATRIX;
+ M[i][j].path = DIAGONAL;
+ break;
+ }
+ case Ix_MATRIX:
+ if (trace & Iy_MATRIX) {
+ m = Iy_MATRIX;
+ M[i][j].path = DIAGONAL;
+ break;
+ }
+ case Iy_MATRIX:
+ default:
+ /* no alternative found; continue pruning */
+ m = M_MATRIX;
+ i = iA;
+ j = iB;
+ continue;
+ }
+ /* alternative found; build path until starting point */
+ break;
+ }
+ }
+ }
+
+ if (m == 0) {
+        /* No alternative was found while pruning; scan forward from
+         * (iA, iB) for the next cell marked ENDPOINT. */
+ while (1) {
+ if (iB < nB) iB++;
+ else if (iA < nA) {
+ iA++;
+ iB = 0;
+ }
+ else {
+ /* exhausted this generator */
+ M[0][0].path = DONE;
+ return NULL;
+ }
+ if (M[iA][iB].trace & ENDPOINT) break;
+ }
+ M[iA][iB].path = 0;
+ m = M_MATRIX;
+ i = iA;
+ j = iB;
+ }
+
+ /* Follow the traceback until we reach the origin. */
+ while (1) {
+ switch (m) {
+ case Ix_MATRIX:
+ gapM = gaps[i][j].MIx;
+ gapXY = gaps[i][j].IyIx;
+ iB = j;
+ gap = *gapM;
+ if (gap) m = M_MATRIX;
+ else {
+ gap = *gapXY;
+ m = Iy_MATRIX;
+ }
+ iA = i - gap;
+ while (i > iA) M[--i][iB].path = VERTICAL;
+ break;
+ case Iy_MATRIX:
+ gapM = gaps[i][j].MIy;
+ gapXY = gaps[i][j].IxIy;
+ iA = i;
+ gap = *gapM;
+ if (gap) m = M_MATRIX;
+ else {
+ gap = *gapXY;
+ m = Ix_MATRIX;
+ }
+ iB = j - gap;
+ while (j > iB) M[iA][--j].path = HORIZONTAL;
+ break;
+ case M_MATRIX:
+ iA = i-1;
+ iB = j-1;
+ trace = M[i][j].trace;
+ if (trace & M_MATRIX) m = M_MATRIX;
+ else if (trace & Ix_MATRIX) m = Ix_MATRIX;
+ else if (trace & Iy_MATRIX) m = Iy_MATRIX;
+ else if (trace == STARTPOINT) {
+ self->iA = i;
+ self->iB = j;
+ return PathGenerator_create_path(self, i, j);
+ }
+ else {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Unexpected trace in PathGenerator_next_waterman_smith_beyer_local");
+ return NULL;
+ }
+ M[iA][iB].path = DIAGONAL;
+ break;
+ }
+ i = iA;
+ j = iB;
+ }
+}
+
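+/* tp_iternext implementation: dispatch to the path generator matching the
+ * algorithm and the alignment mode. */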
+static PyObject *
+PathGenerator_next(PathGenerator* self)
+{
+ const Mode mode = self->mode;
+ const Algorithm algorithm = self->algorithm;
+ switch (algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ switch (mode) {
+ case Global:
+ return PathGenerator_next_needlemanwunsch(self);
+ case Local:
+ return PathGenerator_next_smithwaterman(self);
+ }
+ case Gotoh:
+ switch (mode) {
+ case Global:
+ return PathGenerator_next_gotoh_global(self);
+ case Local:
+ return PathGenerator_next_gotoh_local(self);
+ }
+ case WatermanSmithBeyer:
+ switch (mode) {
+ case Global:
+ return PathGenerator_next_waterman_smith_beyer_global(self);
+ case Local:
+ return PathGenerator_next_waterman_smith_beyer_local(self);
+ }
+ case Unknown:
+ default:
+ PyErr_SetString(PyExc_RuntimeError, "Unknown algorithm");
+ return NULL;
+ }
+}
+
+static const char PathGenerator_reset__doc__[] = "reset the iterator";
+
+static PyObject*
+PathGenerator_reset(PathGenerator* self)
+{
+ switch (self->mode) {
+ case Local:
+ self->iA = 0;
+ self->iB = 0;
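+            /* fall through: a Local reset also performs the Global reset */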
+ case Global: {
+ Trace** M = self->M;
+ switch (self->algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ case Gotoh: {
+ if (M[0][0].path != NONE) M[0][0].path = 0;
+ break;
+ }
+ case WatermanSmithBeyer: {
+ M[0][0].path = 0;
+ break;
+ }
+ case Unknown:
+ default:
+ break;
+ }
+ }
+ }
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static PyMethodDef PathGenerator_methods[] = {
+ {"reset",
+ (PyCFunction)PathGenerator_reset,
+ METH_NOARGS,
+ PathGenerator_reset__doc__
+ },
+ {NULL} /* Sentinel */
+};
+
+static PySequenceMethods PathGenerator_as_sequence = {
+ (lenfunc)PathGenerator_length, /* sq_length */
+ NULL, /* sq_concat */
+ NULL, /* sq_repeat */
+ NULL, /* sq_item */
+ NULL, /* sq_ass_item */
+ NULL, /* sq_contains */
+ NULL, /* sq_inplace_concat */
+ NULL, /* sq_inplace_repeat */
+};
+
+static PyTypeObject PathGenerator_Type = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "Path generator", /* tp_name */
+ sizeof(PathGenerator), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)PathGenerator_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_reserved */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ &PathGenerator_as_sequence, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ PyObject_SelfIter, /* tp_iter */
+ (iternextfunc)PathGenerator_next, /* tp_iternext */
+ PathGenerator_methods, /* tp_methods */
+};
+
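+/* The Aligner object holds the alignment mode, the match and mismatch
+ * scores, twelve component gap scores (internal/left/right x open/extend,
+ * for target and query) or user-supplied gap score callables, an optional
+ * substitution matrix with its alphabet mapping, and a wildcard character. */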
+typedef struct {
+ PyObject_HEAD
+ Mode mode;
+ Algorithm algorithm;
+ double match;
+ double mismatch;
+ double epsilon;
+ double target_internal_open_gap_score;
+ double target_internal_extend_gap_score;
+ double target_left_open_gap_score;
+ double target_left_extend_gap_score;
+ double target_right_open_gap_score;
+ double target_right_extend_gap_score;
+ double query_internal_open_gap_score;
+ double query_internal_extend_gap_score;
+ double query_left_open_gap_score;
+ double query_left_extend_gap_score;
+ double query_right_open_gap_score;
+ double query_right_extend_gap_score;
+ PyObject* target_gap_function;
+ PyObject* query_gap_function;
+ Py_buffer substitution_matrix;
+ PyObject* alphabet;
+ int* mapping;
+ int wildcard;
+} Aligner;
+
+
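+/* Store the alphabet and, for string alphabets, build a lookup table mapping
+ * each code point to its index in the alphabet (MISSING_LETTER otherwise).
+ * Passing Py_None clears both. Returns the alphabet size (0 when cleared),
+ * or -1 on error. */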
+static Py_ssize_t
+set_alphabet(Aligner* self, PyObject* alphabet)
+{
+ Py_ssize_t size;
+ if (alphabet == Py_None) {
+ if (self->alphabet) {
+ Py_DECREF(self->alphabet);
+ self->alphabet = NULL;
+ }
+ if (self->mapping) {
+ PyMem_Free(self->mapping);
+ self->mapping = NULL;
+ }
+ return 0;
+ }
+ else if (PyUnicode_Check(alphabet)) {
+ int* mapping;
+ int i;
+ int n;
+ int kind;
+ void* characters;
+ if (PyUnicode_READY(alphabet) == -1) return -1;
+ size = PyUnicode_GET_LENGTH(alphabet);
+ if (size == 0) {
+ PyErr_SetString(PyExc_ValueError, "alphabet has zero length");
+ return -1;
+ }
+ kind = PyUnicode_KIND(alphabet);
+ switch (kind) {
+ case PyUnicode_1BYTE_KIND: {
+ n = 1 << 8 * sizeof(Py_UCS1);
+ break;
+ }
+ case PyUnicode_2BYTE_KIND: {
+ n = 1 << 8 * sizeof(Py_UCS2);
+ break;
+ }
+ case PyUnicode_4BYTE_KIND: {
+ n = 0x110000; /* Maximum code point in Unicode 6.0
+ * is 0x10ffff = 1114111 */
+ break;
+ }
+ case PyUnicode_WCHAR_KIND:
+ default:
+ PyErr_SetString(PyExc_ValueError, "could not interpret alphabet");
+ return -1;
+ }
+ characters = PyUnicode_DATA(alphabet);
+ mapping = PyMem_Malloc(n*sizeof(int));
+ if (!mapping) return -1;
+ for (i = 0; i < n; i++) mapping[i] = MISSING_LETTER;
+ for (i = 0; i < size; i++) {
+ Py_UCS4 character = PyUnicode_READ(kind, characters, i);
+ if (mapping[character] != MISSING_LETTER) {
+ PyObject* c = PyUnicode_FromKindAndData(kind, &character, 1);
+ PyErr_Format(PyExc_ValueError,
+ "alphabet contains '%S' more than once", c);
+ Py_XDECREF(c);
+ PyMem_Free(mapping);
+ return -1;
+ }
+ mapping[character] = i;
+ }
+ Py_INCREF(alphabet);
+ if (self->mapping) PyMem_Free(self->mapping);
+ self->mapping = mapping;
+ }
+ else {
+ /* alphabet is not a string; cannot use mapping */
+ PyObject* sequence = PySequence_Fast(alphabet,
+ "alphabet should support the sequence protocol (e.g.,\n"
+ "strings, lists, and tuples can be valid alphabets).");
+ if (!sequence) return -1;
+ size = PySequence_Fast_GET_SIZE(sequence);
+ Py_DECREF(sequence);
+ if (self->mapping) {
+ PyMem_Free(self->mapping);
+ self->mapping = NULL;
+ }
+ Py_INCREF(alphabet);
+ }
+ Py_XDECREF(self->alphabet);
+ self->alphabet = alphabet;
+ return size;
+}
+
+static int
+Aligner_init(Aligner *self, PyObject *args, PyObject *kwds)
+{
+ self->mode = Global;
+ self->match = 1.0;
+ self->mismatch = 0.0;
+ self->epsilon = 1.e-6;
+ self->target_internal_open_gap_score = 0;
+ self->target_internal_extend_gap_score = 0;
+ self->query_internal_open_gap_score = 0;
+ self->query_internal_extend_gap_score = 0;
+ self->target_left_open_gap_score = 0;
+ self->target_left_extend_gap_score = 0;
+ self->target_right_open_gap_score = 0;
+ self->target_right_extend_gap_score = 0;
+ self->query_left_open_gap_score = 0;
+ self->query_left_extend_gap_score = 0;
+ self->query_right_open_gap_score = 0;
+ self->query_right_extend_gap_score = 0;
+ self->target_gap_function = NULL;
+ self->query_gap_function = NULL;
+ self->substitution_matrix.obj = NULL;
+ self->substitution_matrix.buf = NULL;
+ self->algorithm = Unknown;
+ self->alphabet = NULL;
+ self->mapping = NULL;
+ self->wildcard = -1;
+ return 0;
+}
+
+static void
+Aligner_dealloc(Aligner* self)
+{ Py_XDECREF(self->target_gap_function);
+ Py_XDECREF(self->query_gap_function);
+ if (self->substitution_matrix.obj) PyBuffer_Release(&self->substitution_matrix);
+ Py_XDECREF(self->alphabet);
+    /* mapping is a PyMem_Malloc'ed int array, not a Python object */
+    if (self->mapping) PyMem_Free(self->mapping);
+ Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+static PyObject*
+Aligner_repr(Aligner* self)
+{
+ const char text[] = "Pairwise aligner, implementing the Needleman-Wunsch, Smith-Waterman, Gotoh, and Waterman-Smith-Beyer global and local alignment algorithms";
+ return PyUnicode_FromString(text);
+}
+
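+/* Build a human-readable summary of the aligner parameters. The summary is
+ * assembled with sprintf; doubled %% conversions survive as %R / %U and are
+ * expanded afterwards by PyUnicode_FromFormat with up to three deferred
+ * arguments. */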
+static PyObject*
+Aligner_str(Aligner* self)
+{
+ char text[1024];
+ char* p = text;
+ PyObject* substitution_matrix = self->substitution_matrix.obj;
+    void* args[3] = {NULL, NULL, NULL}; /* unused slots must not be garbage */
+ int n = 0;
+ PyObject* wildcard = NULL;
+ PyObject* s;
+
+ p += sprintf(p, "Pairwise sequence aligner with parameters\n");
+ if (substitution_matrix) {
+ p += sprintf(p, " substitution_matrix: <%s object at %p>\n",
+ Py_TYPE(substitution_matrix)->tp_name,
+ substitution_matrix);
+ } else {
+ if (self->wildcard == -1) {
+ p += sprintf(p, " wildcard: None\n");
+ }
+ else {
+ wildcard = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
+ &self->wildcard, 1);
+ if (!wildcard) return NULL;
+ p += sprintf(p, " wildcard: '%%U'\n");
+ args[n++] = wildcard;
+ }
+ p += sprintf(p, " match_score: %f\n", self->match);
+ p += sprintf(p, " mismatch_score: %f\n", self->mismatch);
+ }
+ if (self->target_gap_function) {
+ p += sprintf(p, " target_gap_function: %%R\n");
+ args[n++] = self->target_gap_function;
+ }
+ else {
+ p += sprintf(p, " target_internal_open_gap_score: %f\n",
+ self->target_internal_open_gap_score);
+ p += sprintf(p, " target_internal_extend_gap_score: %f\n",
+ self->target_internal_extend_gap_score);
+ p += sprintf(p, " target_left_open_gap_score: %f\n",
+ self->target_left_open_gap_score);
+ p += sprintf(p, " target_left_extend_gap_score: %f\n",
+ self->target_left_extend_gap_score);
+ p += sprintf(p, " target_right_open_gap_score: %f\n",
+ self->target_right_open_gap_score);
+ p += sprintf(p, " target_right_extend_gap_score: %f\n",
+ self->target_right_extend_gap_score);
+ }
+ if (self->query_gap_function) {
+ p += sprintf(p, " query_gap_function: %%R\n");
+ args[n++] = self->query_gap_function;
+ }
+ else {
+ p += sprintf(p, " query_internal_open_gap_score: %f\n",
+ self->query_internal_open_gap_score);
+ p += sprintf(p, " query_internal_extend_gap_score: %f\n",
+ self->query_internal_extend_gap_score);
+ p += sprintf(p, " query_left_open_gap_score: %f\n",
+ self->query_left_open_gap_score);
+ p += sprintf(p, " query_left_extend_gap_score: %f\n",
+ self->query_left_extend_gap_score);
+ p += sprintf(p, " query_right_open_gap_score: %f\n",
+ self->query_right_open_gap_score);
+ p += sprintf(p, " query_right_extend_gap_score: %f\n",
+ self->query_right_extend_gap_score);
+ }
+ switch (self->mode) {
+ case Global: sprintf(p, " mode: global\n"); break;
+ case Local: sprintf(p, " mode: local\n"); break;
+ }
+ s = PyUnicode_FromFormat(text, args[0], args[1], args[2]);
+ Py_XDECREF(wildcard);
+ return s;
+}
+
+static char Aligner_mode__doc__[] = "alignment mode ('global' or 'local')";
+
+static PyObject*
+Aligner_get_mode(Aligner* self, void* closure)
+{ const char* message = NULL;
+ switch (self->mode) {
+ case Global: message = "global"; break;
+ case Local: message = "local"; break;
+ }
+ return PyUnicode_FromString(message);
+}
+
+static int
+Aligner_set_mode(Aligner* self, PyObject* value, void* closure)
+{
+ if (PyUnicode_Check(value)) {
+ if (PyUnicode_CompareWithASCIIString(value, "global") == 0) {
+ self->mode = Global;
+ return 0;
+ }
+ if (PyUnicode_CompareWithASCIIString(value, "local") == 0) {
+ self->mode = Local;
+ return 0;
+ }
+ }
+ PyErr_SetString(PyExc_ValueError,
+                    "invalid mode (expected 'global' or 'local')");
+ return -1;
+}
+
+static char Aligner_match_score__doc__[] = "match score";
+
+static PyObject*
+Aligner_get_match_score(Aligner* self, void* closure)
+{ if (self->substitution_matrix.obj) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ return PyFloat_FromDouble(self->match);
+}
+
+static int
+Aligner_set_match_score(Aligner* self, PyObject* value, void* closure)
+{
+ const double match = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError, "invalid match score");
+ return -1;
+ }
+ if (self->substitution_matrix.obj) {
+ if (set_alphabet(self, Py_None) < 0) return -1;
+ PyBuffer_Release(&self->substitution_matrix);
+ }
+ self->match = match;
+ return 0;
+}
+
+static char Aligner_mismatch_score__doc__[] = "mismatch score";
+
+static PyObject*
+Aligner_get_mismatch_score(Aligner* self, void* closure)
+{ if (self->substitution_matrix.obj) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ return PyFloat_FromDouble(self->mismatch);
+}
+
+static int
+Aligner_set_mismatch_score(Aligner* self, PyObject* value, void* closure)
+{
+ const double mismatch = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError, "invalid mismatch score");
+ return -1;
+ }
+ if (self->substitution_matrix.obj) {
+ if (set_alphabet(self, Py_None) < 0) return -1;
+ PyBuffer_Release(&self->substitution_matrix);
+ }
+ self->mismatch = mismatch;
+ return 0;
+}
+
+static char Aligner_substitution_matrix__doc__[] = "substitution_matrix";
+
+static PyObject*
+Aligner_get_substitution_matrix(Aligner* self, void* closure)
+{ PyObject* object = self->substitution_matrix.obj;
+ if (!object) object = Py_None;
+ Py_INCREF(object);
+ return object;
+}
+
+static int
+Aligner_set_substitution_matrix(Aligner* self, PyObject* values, void* closure)
+{
+ PyObject* alphabet;
+ Py_ssize_t size = -1;
+ Py_buffer view;
+ const int flag = PyBUF_FORMAT | PyBUF_ND;
+ if (values == Py_None) {
+ if (self->substitution_matrix.obj)
+ PyBuffer_Release(&self->substitution_matrix);
+ return 0;
+ }
+ if (PyObject_GetBuffer(values, &view, flag) != 0) {
+ PyErr_SetString(PyExc_ValueError, "expected a matrix");
+ return -1;
+ }
+ if (view.ndim != 2) {
+ PyErr_Format(PyExc_ValueError,
+                     "substitution matrix has incorrect rank (%d, expected 2)",
+ view.ndim);
+ PyBuffer_Release(&view);
+ return -1;
+ }
+ if (view.len == 0) {
+ PyErr_SetString(PyExc_ValueError, "substitution matrix has zero size");
+ PyBuffer_Release(&view);
+ return -1;
+ }
+ if (strcmp(view.format, "d") != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "substitution matrix should contain float values");
+ PyBuffer_Release(&view);
+ return -1;
+ }
+ if (view.itemsize != sizeof(double)) {
+ PyErr_Format(PyExc_RuntimeError,
+ "substitution matrix has unexpected item byte size "
+ "(%zd, expected %zd)", view.itemsize, sizeof(double));
+ PyBuffer_Release(&view);
+ return -1;
+ }
+ if (view.shape[0] != view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "substitution matrix should be square "
+ "(found a %zd x %zd matrix)",
+ view.shape[0], view.shape[1]);
+ PyBuffer_Release(&view);
+ return -1;
+ }
+ alphabet = PyObject_GetAttrString(values, "alphabet");
+ if (alphabet) {
+ size = set_alphabet(self, alphabet);
+ Py_DECREF(alphabet);
+ } else {
+ /* Set a substitution matrix without setting an alphabet; useful
+ * when aligning integers. */
+ PyErr_Clear();
+ size = set_alphabet(self, Py_None);
+ }
+ if (size < 0) {
+ PyBuffer_Release(&view);
+ return -1;
+ }
+ if (self->substitution_matrix.obj) PyBuffer_Release(&self->substitution_matrix);
+ self->substitution_matrix = view;
+ return 0;
+}
+
+static char Aligner_alphabet__doc__[] = "alphabet";
+
+static PyObject*
+Aligner_get_alphabet(Aligner* self, void* closure)
+{ PyObject* object = self->alphabet;
+ if (!object) object = Py_None;
+ Py_INCREF(object);
+ return object;
+}
+
+static int
+Aligner_set_alphabet(Aligner* self, PyObject* alphabet, void* closure)
+{
+ if (self->substitution_matrix.obj) {
+ PyErr_SetString(PyExc_AttributeError,
+ "can't set alphabet if a substitution matrix is used");
+ return -1;
+ }
+ if (set_alphabet(self, alphabet) < 0) return -1;
+ return 0;
+}
+
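+/* The gap score properties below follow a common pattern: each getter
+ * returns the shared value, raising ValueError if the component scores
+ * differ or if a gap score function is in use; each setter assigns every
+ * component score it covers and resets self->algorithm to Unknown so the
+ * algorithm is selected anew. */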
+static char Aligner_gap_score__doc__[] = "gap score";
+
+static PyObject*
+Aligner_get_gap_score(Aligner* self, void* closure)
+{
+ if (self->target_gap_function || self->query_gap_function) {
+ if (self->target_gap_function != self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ Py_INCREF(self->target_gap_function);
+ return self->target_gap_function;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->target_internal_extend_gap_score
+ || score != self->target_left_open_gap_score
+ || score != self->target_left_extend_gap_score
+ || score != self->target_right_open_gap_score
+ || score != self->target_right_extend_gap_score
+ || score != self->query_internal_open_gap_score
+ || score != self->query_internal_extend_gap_score
+ || score != self->query_left_open_gap_score
+ || score != self->query_left_extend_gap_score
+ || score != self->query_right_open_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_gap_score(Aligner* self, PyObject* value, void* closure)
+{ if (PyCallable_Check(value)) {
+ Py_XDECREF(self->target_gap_function);
+ Py_XDECREF(self->query_gap_function);
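+        /* take two references: value is stored in both fields below */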
+ Py_INCREF(value);
+ Py_INCREF(value);
+ self->target_gap_function = value;
+ self->query_gap_function = value;
+ }
+ else {
+ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_internal_open_gap_score = score;
+ self->target_internal_extend_gap_score = score;
+ self->target_left_open_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->target_right_open_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ self->query_internal_open_gap_score = score;
+ self->query_internal_extend_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_open_gap_score__doc__[] = "internal and end open gap score";
+
+static PyObject*
+Aligner_get_open_gap_score(Aligner* self, void* closure)
+{
+ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->target_left_open_gap_score
+ || score != self->target_right_open_gap_score
+ || score != self->query_internal_open_gap_score
+ || score != self->query_left_open_gap_score
+ || score != self->query_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_internal_open_gap_score = score;
+ self->target_left_open_gap_score = score;
+ self->target_right_open_gap_score = score;
+ self->query_internal_open_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_extend_gap_score__doc__[] = "extend gap score";
+
+static PyObject*
+Aligner_get_extend_gap_score(Aligner* self, void* closure)
+{
+ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_extend_gap_score;
+ if (score != self->target_left_extend_gap_score
+ || score != self->target_right_extend_gap_score
+ || score != self->query_internal_extend_gap_score
+ || score != self->query_left_extend_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_internal_extend_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ self->query_internal_extend_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_internal_gap_score__doc__[] = "internal gap score";
+
+static PyObject*
+Aligner_get_internal_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->target_internal_extend_gap_score
+ || score != self->query_internal_open_gap_score
+ || score != self->query_internal_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_internal_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_internal_open_gap_score = score;
+ self->target_internal_extend_gap_score = score;
+ self->query_internal_open_gap_score = score;
+ self->query_internal_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_internal_open_gap_score__doc__[] = "internal open gap score";
+
+static PyObject*
+Aligner_get_internal_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->query_internal_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_internal_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_internal_open_gap_score = score;
+ self->query_internal_open_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_internal_extend_gap_score__doc__[] = "internal extend gap score";
+
+static PyObject*
+Aligner_get_internal_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_extend_gap_score;
+ if (score != self->query_internal_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_internal_extend_gap_score(Aligner* self, PyObject* value,
+ void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_internal_extend_gap_score = score;
+ self->query_internal_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_end_gap_score__doc__[] = "end gap score";
+
+static PyObject*
+Aligner_get_end_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->target_left_extend_gap_score
+ || score != self->target_right_open_gap_score
+ || score != self->target_right_extend_gap_score
+ || score != self->query_left_open_gap_score
+ || score != self->query_left_extend_gap_score
+ || score != self->query_right_open_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_end_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_left_open_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->target_right_open_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_end_open_gap_score__doc__[] = "end open gap score";
+
+static PyObject*
+Aligner_get_end_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->target_right_open_gap_score
+ || score != self->query_left_open_gap_score
+ || score != self->query_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_end_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_left_open_gap_score = score;
+ self->target_right_open_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_end_extend_gap_score__doc__[] = "end extend gap score";
+
+static PyObject*
+Aligner_get_end_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_extend_gap_score;
+ if (score != self->target_right_extend_gap_score
+ || score != self->query_left_extend_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_end_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_left_extend_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_left_gap_score__doc__[] = "left gap score";
+
+static PyObject*
+Aligner_get_left_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->target_left_extend_gap_score
+ || score != self->query_left_open_gap_score
+ || score != self->query_left_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_left_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_left_open_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_right_gap_score__doc__[] = "right gap score";
+
+static PyObject*
+Aligner_get_right_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_right_open_gap_score;
+ if (score != self->target_right_extend_gap_score
+ || score != self->query_right_open_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_right_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_right_open_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_left_open_gap_score__doc__[] = "left open gap score";
+
+static PyObject*
+Aligner_get_left_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->query_left_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_left_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_left_open_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_left_extend_gap_score__doc__[] = "left extend gap score";
+
+static PyObject*
+Aligner_get_left_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_extend_gap_score;
+ if (score != self->query_left_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_left_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_left_extend_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_right_open_gap_score__doc__[] = "right open gap score";
+
+static PyObject*
+Aligner_get_right_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_right_open_gap_score;
+ if (score != self->query_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_right_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_right_open_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_right_extend_gap_score__doc__[] = "right extend gap score";
+
+static PyObject*
+Aligner_get_right_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function || self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_right_extend_gap_score;
+ if (score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_right_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->target_right_extend_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_open_gap_score__doc__[] = "target open gap score";
+
+static PyObject*
+Aligner_get_target_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->target_left_open_gap_score
+ || score != self->target_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_internal_open_gap_score = score;
+ self->target_left_open_gap_score = score;
+ self->target_right_open_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_extend_gap_score__doc__[] = "target extend gap score";
+
+static PyObject*
+Aligner_get_target_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_extend_gap_score;
+ if (score != self->target_left_extend_gap_score
+ || score != self->target_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_internal_extend_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_gap_score__doc__[] = "target gap score";
+
+static PyObject*
+Aligner_get_target_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ Py_INCREF(self->target_gap_function);
+ return self->target_gap_function;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->target_internal_extend_gap_score
+ || score != self->target_left_open_gap_score
+ || score != self->target_left_extend_gap_score
+ || score != self->target_right_open_gap_score
+ || score != self->target_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_gap_score(Aligner* self, PyObject* value, void* closure)
+{
+ if (PyCallable_Check(value)) {
+ Py_XDECREF(self->target_gap_function);
+ Py_INCREF(value);
+ self->target_gap_function = value;
+ }
+ else {
+ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "gap score should be numerical or callable");
+ return -1;
+ }
+ self->target_internal_open_gap_score = score;
+ self->target_internal_extend_gap_score = score;
+ self->target_left_open_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->target_right_open_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_open_gap_score__doc__[] = "query open gap score";
+
+static PyObject*
+Aligner_get_query_open_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_internal_open_gap_score;
+ if (score != self->query_left_open_gap_score
+ || score != self->query_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_internal_open_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_right_open_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_extend_gap_score__doc__[] = "query extend gap score";
+
+static PyObject*
+Aligner_get_query_extend_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_internal_extend_gap_score;
+ if (score != self->query_left_extend_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_internal_extend_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_gap_score__doc__[] = "query gap score";
+
+static PyObject*
+Aligner_get_query_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ Py_INCREF(self->query_gap_function);
+ return self->query_gap_function;
+ }
+ else {
+ const double score = self->query_internal_open_gap_score;
+ if (score != self->query_left_open_gap_score
+ || score != self->query_right_open_gap_score
+ || score != self->query_internal_extend_gap_score
+ || score != self->query_left_extend_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_gap_score(Aligner* self, PyObject* value, void* closure)
+{ if (PyCallable_Check(value)) {
+ Py_XDECREF(self->query_gap_function);
+ Py_INCREF(value);
+ self->query_gap_function = value;
+ }
+ else {
+ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "gap score should be numerical or callable");
+ return -1;
+ }
+ self->query_internal_open_gap_score = score;
+ self->query_internal_extend_gap_score = score;
+ self->query_left_open_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_internal_open_gap_score__doc__[] = "target internal open gap score";
+
+static PyObject*
+Aligner_get_target_internal_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->target_internal_open_gap_score);
+}
+
+static int
+Aligner_set_target_internal_open_gap_score(Aligner* self,
+ PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_internal_open_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_internal_extend_gap_score__doc__[] = "target internal extend gap score";
+
+static PyObject*
+Aligner_get_target_internal_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->target_internal_extend_gap_score);
+}
+
+static int
+Aligner_set_target_internal_extend_gap_score(Aligner* self,
+ PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_internal_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_internal_gap_score__doc__[] = "target internal gap score";
+
+static PyObject*
+Aligner_get_target_internal_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_internal_open_gap_score;
+ if (score != self->target_internal_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_internal_gap_score(Aligner* self, PyObject* value,
+ void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_internal_open_gap_score = score;
+ self->target_internal_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_end_gap_score__doc__[] = "target end gap score";
+
+static PyObject*
+Aligner_get_target_end_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->target_left_extend_gap_score
+ || score != self->target_right_open_gap_score
+ || score != self->target_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_end_gap_score(Aligner* self, PyObject* value, void* closure) {
+ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_left_open_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ self->target_right_open_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_end_open_gap_score__doc__[] = "target end open gap score";
+
+static PyObject*
+Aligner_get_target_end_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->target_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_end_open_gap_score(Aligner* self, PyObject* value,
+ void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_left_open_gap_score = score;
+ self->target_right_open_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_end_extend_gap_score__doc__[] = "target end extend gap score";
+
+static PyObject*
+Aligner_get_target_end_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_extend_gap_score;
+ if (score != self->target_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_end_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_left_extend_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_left_open_gap_score__doc__[] = "target left open gap score";
+
+static PyObject*
+Aligner_get_target_left_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->target_left_open_gap_score);
+}
+
+static int
+Aligner_set_target_left_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_left_open_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_left_extend_gap_score__doc__[] = "target left extend gap score";
+
+static PyObject*
+Aligner_get_target_left_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->target_left_extend_gap_score);
+}
+
+static int
+Aligner_set_target_left_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_left_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_left_gap_score__doc__[] = "target left gap score";
+
+static PyObject*
+Aligner_get_target_left_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_left_open_gap_score;
+ if (score != self->target_left_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_left_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_left_open_gap_score = score;
+ self->target_left_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_right_open_gap_score__doc__[] = "target right open gap score";
+
+static PyObject*
+Aligner_get_target_right_open_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->target_right_open_gap_score);
+}
+
+static int
+Aligner_set_target_right_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_right_open_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_right_extend_gap_score__doc__[] = "target right extend gap score";
+
+static PyObject*
+Aligner_get_target_right_extend_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->target_right_extend_gap_score);
+}
+
+static int
+Aligner_set_target_right_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_right_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_target_right_gap_score__doc__[] = "target right gap score";
+
+static PyObject*
+Aligner_get_target_right_gap_score(Aligner* self, void* closure)
+{ if (self->target_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->target_right_open_gap_score;
+ if (score != self->target_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_target_right_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->target_right_open_gap_score = score;
+ self->target_right_extend_gap_score = score;
+ if (self->target_gap_function) {
+ Py_DECREF(self->target_gap_function);
+ self->target_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_end_gap_score__doc__[] = "query end gap score";
+
+static PyObject*
+Aligner_get_query_end_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_left_open_gap_score;
+ if (score != self->query_left_extend_gap_score
+ || score != self->query_right_open_gap_score
+ || score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_end_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_left_open_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ self->query_right_open_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_end_open_gap_score__doc__[] = "query end open gap score";
+
+static PyObject*
+Aligner_get_query_end_open_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_left_open_gap_score;
+ if (score != self->query_right_open_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_end_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_left_open_gap_score = score;
+ self->query_right_open_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_end_extend_gap_score__doc__[] = "query end extend gap score";
+
+static PyObject*
+Aligner_get_query_end_extend_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_left_extend_gap_score;
+ if (score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_end_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_left_extend_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_internal_open_gap_score__doc__[] = "query internal open gap score";
+
+static PyObject*
+Aligner_get_query_internal_open_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->query_internal_open_gap_score);
+}
+
+static int
+Aligner_set_query_internal_open_gap_score(Aligner* self, PyObject* value,
+ void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_internal_open_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_internal_extend_gap_score__doc__[] = "query internal extend gap score";
+
+static PyObject*
+Aligner_get_query_internal_extend_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->query_internal_extend_gap_score);
+}
+
+static int
+Aligner_set_query_internal_extend_gap_score(Aligner* self, PyObject* value,
+ void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_internal_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_internal_gap_score__doc__[] = "query internal gap score";
+
+static PyObject*
+Aligner_get_query_internal_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_internal_open_gap_score;
+ if (score != self->query_internal_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_internal_gap_score(Aligner* self, PyObject* value,
+ void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_internal_open_gap_score = score;
+ self->query_internal_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_left_open_gap_score__doc__[] = "query left open gap score";
+
+static PyObject*
+Aligner_get_query_left_open_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->query_left_open_gap_score);
+}
+
+static int
+Aligner_set_query_left_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_left_open_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_left_extend_gap_score__doc__[] = "query left extend gap score";
+
+static PyObject*
+Aligner_get_query_left_extend_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->query_left_extend_gap_score);
+}
+
+static int
+Aligner_set_query_left_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_left_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_left_gap_score__doc__[] = "query left gap score";
+
+static PyObject*
+Aligner_get_query_left_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_left_open_gap_score;
+ if (score != self->query_left_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_left_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_left_open_gap_score = score;
+ self->query_left_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_right_open_gap_score__doc__[] = "query right open gap score";
+
+static PyObject*
+Aligner_get_query_right_open_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->query_right_open_gap_score);
+}
+
+static int
+Aligner_set_query_right_open_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_right_open_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_right_extend_gap_score__doc__[] = "query right extend gap score";
+
+static PyObject*
+Aligner_get_query_right_extend_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ return PyFloat_FromDouble(self->query_right_extend_gap_score);
+}
+
+static int
+Aligner_set_query_right_extend_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_right_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_query_right_gap_score__doc__[] = "query right gap score";
+
+static PyObject*
+Aligner_get_query_right_gap_score(Aligner* self, void* closure)
+{ if (self->query_gap_function) {
+ PyErr_SetString(PyExc_ValueError, "using a gap score function");
+ return NULL;
+ }
+ else {
+ const double score = self->query_right_open_gap_score;
+ if (score != self->query_right_extend_gap_score) {
+ PyErr_SetString(PyExc_ValueError, "gap scores are different");
+ return NULL;
+ }
+ return PyFloat_FromDouble(score);
+ }
+}
+
+static int
+Aligner_set_query_right_gap_score(Aligner* self, PyObject* value, void* closure)
+{ const double score = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->query_right_open_gap_score = score;
+ self->query_right_extend_gap_score = score;
+ if (self->query_gap_function) {
+ Py_DECREF(self->query_gap_function);
+ self->query_gap_function = NULL;
+ }
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static char Aligner_epsilon__doc__[] = "roundoff epsilon";
+
+static PyObject*
+Aligner_get_epsilon(Aligner* self, void* closure)
+{ return PyFloat_FromDouble(self->epsilon);
+}
+
+static int
+Aligner_set_epsilon(Aligner* self, PyObject* value, void* closure)
+{ const double epsilon = PyFloat_AsDouble(value);
+ if (PyErr_Occurred()) return -1;
+ self->epsilon = epsilon;
+ self->algorithm = Unknown;
+ return 0;
+}
+
+static PyObject*
+Aligner_get_wildcard(Aligner* self, void* closure)
+{
+ if (self->wildcard == -1) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ else {
+ return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &self->wildcard, 1);
+ }
+}
+
+static int
+Aligner_set_wildcard(Aligner* self, PyObject* value, void* closure)
+{
+ if (value == Py_None) {
+ self->wildcard = -1;
+ return 0;
+ }
+ if (!PyUnicode_Check(value)) {
+ PyErr_SetString(PyExc_TypeError,
+ "wildcard should be a single character, or None");
+ return -1;
+ }
+ if (PyUnicode_READY(value) == -1) return -1;
+ if (PyUnicode_GET_LENGTH(value) != 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "wildcard should be a single character, or None");
+ return -1;
+ }
+ self->wildcard = PyUnicode_READ_CHAR(value, 0);
+ return 0;
+}
+
+static char Aligner_wildcard__doc__[] = "wildcard character";
+
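+/* Resolve the cached algorithm lazily. A gap-score callable on either
+ * sequence forces Waterman-Smith-Beyer; otherwise, if every open score
+ * equals the corresponding extend score, the gap penalty is linear and the
+ * plain Needleman-Wunsch/Smith-Waterman recurrence suffices; any other
+ * affine combination selects Gotoh. Every setter above resets
+ * self->algorithm to Unknown, so the choice is recomputed on demand. */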
+static Algorithm _get_algorithm(Aligner* self)
+{
+ Algorithm algorithm = self->algorithm;
+ if (algorithm == Unknown) {
+ const double target_gap_open = self->target_internal_open_gap_score;
+ const double query_gap_open = self->query_internal_open_gap_score;
+ const double target_gap_extend = self->target_internal_extend_gap_score;
+ const double query_gap_extend = self->query_internal_extend_gap_score;
+ const double target_left_open = self->target_left_open_gap_score;
+ const double target_left_extend = self->target_left_extend_gap_score;
+ const double query_left_open = self->query_left_open_gap_score;
+ const double target_right_open = self->target_right_open_gap_score;
+ const double query_right_open = self->query_right_open_gap_score;
+ const double target_right_extend = self->target_right_extend_gap_score;
+ const double query_left_extend = self->query_left_extend_gap_score;
+ const double query_right_extend = self->query_right_extend_gap_score;
+ if (self->target_gap_function || self->query_gap_function)
+ algorithm = WatermanSmithBeyer;
+ else if (target_gap_open == target_gap_extend
+ && query_gap_open == query_gap_extend
+ && target_left_open == target_left_extend
+ && target_right_open == target_right_extend
+ && query_left_open == query_left_extend
+ && query_right_open == query_right_extend)
+ algorithm = NeedlemanWunschSmithWaterman;
+ else
+ algorithm = Gotoh;
+ self->algorithm = algorithm;
+ }
+ return algorithm;
+}
+
+
+static char Aligner_algorithm__doc__[] = "alignment algorithm";
+
+static PyObject*
+Aligner_get_algorithm(Aligner* self, void* closure)
+{
+ const char* s = NULL;
+ const Mode mode = self->mode;
+ const Algorithm algorithm = _get_algorithm(self);
+ switch (algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ switch (mode) {
+ case Global:
+ s = "Needleman-Wunsch";
+ break;
+ case Local:
+ s = "Smith-Waterman";
+ break;
+ }
+ break;
+ case Gotoh:
+ switch (mode) {
+ case Global:
+ s = "Gotoh global alignment algorithm";
+ break;
+ case Local:
+ s = "Gotoh local alignment algorithm";
+ break;
+ }
+ break;
+ case WatermanSmithBeyer:
+ switch (mode) {
+ case Global:
+ s = "Waterman-Smith-Beyer global alignment algorithm";
+ break;
+ case Local:
+ s = "Waterman-Smith-Beyer local alignment algorithm";
+ break;
+ }
+ break;
+ case Unknown:
+ default:
+ break;
+ }
+    if (s == NULL) s = "Unknown";  /* defensive: PyUnicode_FromString(NULL) is undefined */
+    return PyUnicode_FromString(s);
+}
+
+static PyGetSetDef Aligner_getset[] = {
+ {"mode",
+ (getter)Aligner_get_mode,
+ (setter)Aligner_set_mode,
+ Aligner_mode__doc__, NULL},
+ {"match_score",
+ (getter)Aligner_get_match_score,
+ (setter)Aligner_set_match_score,
+ Aligner_match_score__doc__, NULL},
+ {"mismatch_score",
+ (getter)Aligner_get_mismatch_score,
+ (setter)Aligner_set_mismatch_score,
+ Aligner_mismatch_score__doc__, NULL},
+ {"match", /* synonym for match_score */
+ (getter)Aligner_get_match_score,
+ (setter)Aligner_set_match_score,
+ Aligner_match_score__doc__, NULL},
+ {"mismatch", /* synonym for mismatch_score */
+ (getter)Aligner_get_mismatch_score,
+ (setter)Aligner_set_mismatch_score,
+ Aligner_mismatch_score__doc__, NULL},
+ {"substitution_matrix",
+ (getter)Aligner_get_substitution_matrix,
+ (setter)Aligner_set_substitution_matrix,
+ Aligner_substitution_matrix__doc__, NULL},
+ {"alphabet",
+ (getter)Aligner_get_alphabet,
+ (setter)Aligner_set_alphabet,
+ Aligner_alphabet__doc__, NULL},
+ {"gap_score",
+ (getter)Aligner_get_gap_score,
+ (setter)Aligner_set_gap_score,
+ Aligner_gap_score__doc__, NULL},
+ {"open_gap_score",
+ (getter)Aligner_get_open_gap_score,
+ (setter)Aligner_set_open_gap_score,
+ Aligner_open_gap_score__doc__, NULL},
+ {"extend_gap_score",
+ (getter)Aligner_get_extend_gap_score,
+ (setter)Aligner_set_extend_gap_score,
+ Aligner_extend_gap_score__doc__, NULL},
+ {"internal_gap_score",
+ (getter)Aligner_get_internal_gap_score,
+ (setter)Aligner_set_internal_gap_score,
+ Aligner_internal_gap_score__doc__, NULL},
+ {"internal_open_gap_score",
+ (getter)Aligner_get_internal_open_gap_score,
+ (setter)Aligner_set_internal_open_gap_score,
+ Aligner_internal_open_gap_score__doc__, NULL},
+ {"internal_extend_gap_score",
+ (getter)Aligner_get_internal_extend_gap_score,
+ (setter)Aligner_set_internal_extend_gap_score,
+ Aligner_internal_extend_gap_score__doc__, NULL},
+ {"end_gap_score",
+ (getter)Aligner_get_end_gap_score,
+ (setter)Aligner_set_end_gap_score,
+ Aligner_end_gap_score__doc__, NULL},
+ {"end_open_gap_score",
+ (getter)Aligner_get_end_open_gap_score,
+ (setter)Aligner_set_end_open_gap_score,
+ Aligner_end_open_gap_score__doc__, NULL},
+ {"end_extend_gap_score",
+ (getter)Aligner_get_end_extend_gap_score,
+ (setter)Aligner_set_end_extend_gap_score,
+ Aligner_end_extend_gap_score__doc__, NULL},
+ {"left_gap_score",
+ (getter)Aligner_get_left_gap_score,
+ (setter)Aligner_set_left_gap_score,
+ Aligner_left_gap_score__doc__, NULL},
+ {"left_open_gap_score",
+ (getter)Aligner_get_left_open_gap_score,
+ (setter)Aligner_set_left_open_gap_score,
+ Aligner_left_open_gap_score__doc__, NULL},
+ {"left_extend_gap_score",
+ (getter)Aligner_get_left_extend_gap_score,
+ (setter)Aligner_set_left_extend_gap_score,
+ Aligner_left_extend_gap_score__doc__, NULL},
+ {"right_gap_score",
+ (getter)Aligner_get_right_gap_score,
+ (setter)Aligner_set_right_gap_score,
+ Aligner_right_gap_score__doc__, NULL},
+ {"right_open_gap_score",
+ (getter)Aligner_get_right_open_gap_score,
+ (setter)Aligner_set_right_open_gap_score,
+ Aligner_right_open_gap_score__doc__, NULL},
+ {"right_extend_gap_score",
+ (getter)Aligner_get_right_extend_gap_score,
+ (setter)Aligner_set_right_extend_gap_score,
+ Aligner_right_extend_gap_score__doc__, NULL},
+ {"target_open_gap_score",
+ (getter)Aligner_get_target_open_gap_score,
+ (setter)Aligner_set_target_open_gap_score,
+ Aligner_target_open_gap_score__doc__, NULL},
+ {"target_extend_gap_score",
+ (getter)Aligner_get_target_extend_gap_score,
+ (setter)Aligner_set_target_extend_gap_score,
+ Aligner_target_extend_gap_score__doc__, NULL},
+ {"target_gap_score",
+ (getter)Aligner_get_target_gap_score,
+ (setter)Aligner_set_target_gap_score,
+ Aligner_target_gap_score__doc__, NULL},
+ {"query_open_gap_score",
+ (getter)Aligner_get_query_open_gap_score,
+ (setter)Aligner_set_query_open_gap_score,
+ Aligner_query_open_gap_score__doc__, NULL},
+ {"query_extend_gap_score",
+ (getter)Aligner_get_query_extend_gap_score,
+ (setter)Aligner_set_query_extend_gap_score,
+ Aligner_query_extend_gap_score__doc__, NULL},
+ {"query_gap_score",
+ (getter)Aligner_get_query_gap_score,
+ (setter)Aligner_set_query_gap_score,
+ Aligner_query_gap_score__doc__, NULL},
+ {"target_end_gap_score",
+ (getter)Aligner_get_target_end_gap_score,
+ (setter)Aligner_set_target_end_gap_score,
+ Aligner_target_end_gap_score__doc__, NULL},
+ {"target_end_open_gap_score",
+ (getter)Aligner_get_target_end_open_gap_score,
+ (setter)Aligner_set_target_end_open_gap_score,
+ Aligner_target_end_open_gap_score__doc__, NULL},
+ {"target_end_extend_gap_score",
+ (getter)Aligner_get_target_end_extend_gap_score,
+ (setter)Aligner_set_target_end_extend_gap_score,
+ Aligner_target_end_extend_gap_score__doc__, NULL},
+ {"target_internal_open_gap_score",
+ (getter)Aligner_get_target_internal_open_gap_score,
+ (setter)Aligner_set_target_internal_open_gap_score,
+ Aligner_target_internal_open_gap_score__doc__, NULL},
+ {"target_internal_extend_gap_score",
+ (getter)Aligner_get_target_internal_extend_gap_score,
+ (setter)Aligner_set_target_internal_extend_gap_score,
+ Aligner_target_internal_extend_gap_score__doc__, NULL},
+ {"target_internal_gap_score",
+ (getter)Aligner_get_target_internal_gap_score,
+ (setter)Aligner_set_target_internal_gap_score,
+ Aligner_target_internal_gap_score__doc__, NULL},
+ {"target_left_open_gap_score",
+ (getter)Aligner_get_target_left_open_gap_score,
+ (setter)Aligner_set_target_left_open_gap_score,
+ Aligner_target_left_open_gap_score__doc__, NULL},
+ {"target_left_extend_gap_score",
+ (getter)Aligner_get_target_left_extend_gap_score,
+ (setter)Aligner_set_target_left_extend_gap_score,
+ Aligner_target_left_extend_gap_score__doc__, NULL},
+ {"target_left_gap_score",
+ (getter)Aligner_get_target_left_gap_score,
+ (setter)Aligner_set_target_left_gap_score,
+ Aligner_target_left_gap_score__doc__, NULL},
+ {"target_right_open_gap_score",
+ (getter)Aligner_get_target_right_open_gap_score,
+ (setter)Aligner_set_target_right_open_gap_score,
+     Aligner_target_right_open_gap_score__doc__, NULL},
+ {"target_right_extend_gap_score",
+ (getter)Aligner_get_target_right_extend_gap_score,
+ (setter)Aligner_set_target_right_extend_gap_score,
+ Aligner_target_right_extend_gap_score__doc__, NULL},
+ {"target_right_gap_score",
+ (getter)Aligner_get_target_right_gap_score,
+ (setter)Aligner_set_target_right_gap_score,
+ Aligner_target_right_gap_score__doc__, NULL},
+ {"query_end_gap_score",
+ (getter)Aligner_get_query_end_gap_score,
+ (setter)Aligner_set_query_end_gap_score,
+ Aligner_query_end_gap_score__doc__, NULL},
+ {"query_end_open_gap_score",
+ (getter)Aligner_get_query_end_open_gap_score,
+ (setter)Aligner_set_query_end_open_gap_score,
+ Aligner_query_end_open_gap_score__doc__, NULL},
+ {"query_end_extend_gap_score",
+ (getter)Aligner_get_query_end_extend_gap_score,
+ (setter)Aligner_set_query_end_extend_gap_score,
+ Aligner_query_end_extend_gap_score__doc__, NULL},
+ {"query_internal_open_gap_score",
+ (getter)Aligner_get_query_internal_open_gap_score,
+ (setter)Aligner_set_query_internal_open_gap_score,
+ Aligner_query_internal_open_gap_score__doc__, NULL},
+ {"query_internal_extend_gap_score",
+ (getter)Aligner_get_query_internal_extend_gap_score,
+ (setter)Aligner_set_query_internal_extend_gap_score,
+ Aligner_query_internal_extend_gap_score__doc__, NULL},
+ {"query_internal_gap_score",
+ (getter)Aligner_get_query_internal_gap_score,
+ (setter)Aligner_set_query_internal_gap_score,
+ Aligner_query_internal_gap_score__doc__, NULL},
+ {"query_left_open_gap_score",
+ (getter)Aligner_get_query_left_open_gap_score,
+ (setter)Aligner_set_query_left_open_gap_score,
+ Aligner_query_left_open_gap_score__doc__, NULL},
+ {"query_left_extend_gap_score",
+ (getter)Aligner_get_query_left_extend_gap_score,
+ (setter)Aligner_set_query_left_extend_gap_score,
+ Aligner_query_left_extend_gap_score__doc__, NULL},
+ {"query_left_gap_score",
+ (getter)Aligner_get_query_left_gap_score,
+ (setter)Aligner_set_query_left_gap_score,
+ Aligner_query_left_gap_score__doc__, NULL},
+ {"query_right_open_gap_score",
+ (getter)Aligner_get_query_right_open_gap_score,
+ (setter)Aligner_set_query_right_open_gap_score,
+ Aligner_query_right_open_gap_score__doc__, NULL},
+ {"query_right_extend_gap_score",
+ (getter)Aligner_get_query_right_extend_gap_score,
+ (setter)Aligner_set_query_right_extend_gap_score,
+ Aligner_query_right_extend_gap_score__doc__, NULL},
+ {"query_right_gap_score",
+ (getter)Aligner_get_query_right_gap_score,
+ (setter)Aligner_set_query_right_gap_score,
+ Aligner_query_right_gap_score__doc__, NULL},
+ {"epsilon",
+ (getter)Aligner_get_epsilon,
+ (setter)Aligner_set_epsilon,
+ Aligner_epsilon__doc__, NULL},
+ {"wildcard",
+ (getter)Aligner_get_wildcard,
+ (setter)Aligner_set_wildcard,
+ Aligner_wildcard__doc__, NULL},
+ {"algorithm",
+ (getter)Aligner_get_algorithm,
+ (setter)NULL,
+ Aligner_algorithm__doc__, NULL},
+ {NULL} /* Sentinel */
+};
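+
+/* Python-level usage (a minimal sketch; it assumes this extension type is
+ * exposed to Python as an `aligner` object, as in Biopython's
+ * PairwiseAligner -- the property names are those registered in
+ * Aligner_getset above):
+ *
+ *     aligner.open_gap_score = -10.0      # all open components
+ *     aligner.extend_gap_score = -0.5     # all extend components
+ *     aligner.query_left_gap_score = 0.0  # free leading gap in the query
+ *     aligner.target_gap_score = my_fn    # callable -> Waterman-Smith-Beyer
+ *
+ * Reading a composite property such as gap_score raises ValueError when the
+ * underlying component scores differ or when a gap function is in use. */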
+
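+/* The SELECT_SCORE_* macros below compute the running maximum of their
+ * candidate scores into `score`, using `temp` as scratch. The _LOCAL
+ * variants additionally clamp negative scores to zero and keep the best
+ * cell seen so far in `maximum`, as required for local alignment. */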
+#define SELECT_SCORE_GLOBAL(score1, score2, score3) \
+ score = score1; \
+ temp = score2; \
+ if (temp > score) score = temp; \
+ temp = score3; \
+ if (temp > score) score = temp;
+
+#define SELECT_SCORE_WATERMAN_SMITH_BEYER(score1, score2) \
+ temp = score1 + gapscore; \
+ if (temp > score) score = temp; \
+ temp = score2 + gapscore; \
+ if (temp > score) score = temp;
+
+#define SELECT_SCORE_GOTOH_LOCAL_ALIGN(score1, score2, score3, score4) \
+ score = score1; \
+ temp = score2; \
+ if (temp > score) score = temp; \
+ temp = score3; \
+ if (temp > score) score = temp; \
+ score += score4; \
+ if (score < 0) score = 0; \
+ else if (score > maximum) maximum = score;
+
+#define SELECT_SCORE_LOCAL3(score1, score2, score3) \
+ score = score1; \
+ temp = score2; \
+ if (temp > score) score = temp; \
+ temp = score3; \
+ if (temp > score) score = temp; \
+ if (score < 0) score = 0; \
+ else if (score > maximum) maximum = score;
+
+#define SELECT_SCORE_LOCAL1(score1) \
+ score = score1; \
+ if (score < 0) score = 0; \
+ else if (score > maximum) maximum = score;
+
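+/* The SELECT_TRACE_* macros record, for each cell, which predecessors are
+ * optimal. A candidate that beats the current score by more than `epsilon`
+ * replaces the trace; one within +/- epsilon OR-s its direction bit into
+ * it. Keeping all near-ties lets the PathGenerator enumerate every
+ * co-optimal alignment instead of a single arbitrary one. */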
+#define SELECT_TRACE_NEEDLEMAN_WUNSCH(hgap, vgap, align_score) \
+ score = temp + (align_score); \
+ trace = DIAGONAL; \
+ temp = row[j-1] + hgap; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = HORIZONTAL; \
+ } \
+ else if (temp > score - epsilon) trace |= HORIZONTAL; \
+ temp = row[j] + vgap; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = VERTICAL; \
+ } \
+ else if (temp > score - epsilon) trace |= VERTICAL; \
+ temp = row[j]; \
+ row[j] = score; \
+ M[i][j].trace = trace;
+
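+/* In the local (Smith-Waterman) trace macros, `maximum` is the best score
+ * seen so far and (im, jm) the cell where it was last attained. When a
+ * strictly higher score appears, the ENDPOINT bit is stripped from every
+ * cell between (im, jm) and the current cell, so that after the sweep only
+ * cells achieving the final maximum remain marked as alignment ends. */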
+#define SELECT_TRACE_SMITH_WATERMAN_HVD(align_score) \
+ trace = DIAGONAL; \
+ score = temp + (align_score); \
+ temp = row[j-1] + gap_extend_A; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = HORIZONTAL; \
+ } \
+ else if (temp > score - epsilon) trace |= HORIZONTAL; \
+ temp = row[j] + gap_extend_B; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = VERTICAL; \
+ } \
+ else if (temp > score - epsilon) trace |= VERTICAL; \
+ if (score < epsilon) { \
+ score = 0; \
+ trace = STARTPOINT; \
+ } \
+ else if (trace & DIAGONAL && score > maximum - epsilon) { \
+ if (score > maximum + epsilon) { \
+ for ( ; im < i; im++, jm = 0) \
+ for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ im = i; \
+ jm = j; \
+ } \
+ trace |= ENDPOINT; \
+ } \
+ M[i][j].trace = trace; \
+ if (score > maximum) maximum = score; \
+ temp = row[j]; \
+ row[j] = score;
+
+#define SELECT_TRACE_SMITH_WATERMAN_D(align_score) \
+ score = temp + (align_score); \
+ trace = DIAGONAL; \
+ if (score < epsilon) { \
+ score = 0; \
+ } \
+ else if (trace & DIAGONAL && score > maximum - epsilon) { \
+ if (score > maximum + epsilon) { \
+ for ( ; im < i; im++, jm = 0) \
+ for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ im = i; \
+ jm = j; \
+ } \
+ trace |= ENDPOINT; \
+ } \
+ M[i][j].trace = trace; \
+ if (score > maximum) maximum = score; \
+ temp = row[j]; \
+    row[j] = score;
+
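+/* The Gotoh and Waterman-Smith-Beyer recurrences track three states per
+ * cell: M (the two letters are aligned), Ix (the alignment ends with a gap
+ * in sequence B, extended with the *_B gap scores), and Iy (a gap in
+ * sequence A, extended with the *_A gap scores). The macros below select
+ * the best predecessor state and store its matrix bit for traceback. */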
+#define SELECT_TRACE_GOTOH_GLOBAL_GAP(matrix, score1, score2, score3) \
+ trace = M_MATRIX; \
+ score = score1; \
+ temp = score2; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Ix_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Ix_MATRIX; \
+ temp = score3; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Iy_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Iy_MATRIX; \
+ gaps[i][j].matrix = trace;
+
+#define SELECT_TRACE_GOTOH_GLOBAL_ALIGN \
+ trace = M_MATRIX; \
+ score = M_temp; \
+ temp = Ix_temp; \
+ if (temp > score + epsilon) { \
+ score = Ix_temp; \
+ trace = Ix_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Ix_MATRIX; \
+ temp = Iy_temp; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Iy_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Iy_MATRIX; \
+ M[i][j].trace = trace;
+
+#define SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \
+ trace = M_MATRIX; \
+ score = M_temp; \
+ if (Ix_temp > score + epsilon) { \
+ score = Ix_temp; \
+ trace = Ix_MATRIX; \
+ } \
+ else if (Ix_temp > score - epsilon) trace |= Ix_MATRIX; \
+ if (Iy_temp > score + epsilon) { \
+ score = Iy_temp; \
+ trace = Iy_MATRIX; \
+ } \
+ else if (Iy_temp > score - epsilon) trace |= Iy_MATRIX; \
+ score += (align_score); \
+ if (score < epsilon) { \
+ score = 0; \
+ trace = STARTPOINT; \
+ } \
+ else if (score > maximum - epsilon) { \
+ if (score > maximum + epsilon) { \
+ maximum = score; \
+ for ( ; im < i; im++, jm = 0) \
+ for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ im = i; \
+ jm = j; \
+ } \
+ trace |= ENDPOINT; \
+ } \
+ M[i][j].trace = trace;
+
+#define SELECT_TRACE_GOTOH_LOCAL_GAP(matrix, score1, score2, score3) \
+ trace = M_MATRIX; \
+ score = score1; \
+ temp = score2; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Ix_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Ix_MATRIX; \
+ temp = score3; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Iy_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Iy_MATRIX; \
+ if (score < epsilon) { \
+ score = -DBL_MAX; \
+ trace = 0; \
+ } \
+ gaps[i][j].matrix = trace;
+
+#define SELECT_TRACE_WATERMAN_SMITH_BEYER_GLOBAL_ALIGN(score4) \
+ trace = M_MATRIX; \
+ score = M_row[i-1][j-1]; \
+ temp = Ix_row[i-1][j-1]; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Ix_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Ix_MATRIX; \
+ temp = Iy_row[i-1][j-1]; \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ trace = Iy_MATRIX; \
+ } \
+ else if (temp > score - epsilon) trace |= Iy_MATRIX; \
+ M_row[i][j] = score + score4; \
+ M[i][j].trace = trace;
+
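+/* For the Waterman-Smith-Beyer gap recurrence the optimal gap length is not
+ * unique in general. Gap lengths scoring within epsilon of the running
+ * optimum are appended to gapM (coming from the M state) or gapXY (from the
+ * other gap state); a strictly better score resets both lists (nm = ng = 0).
+ * The traceback later explores every recorded gap length. */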
+#define SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(score1, score2) \
+ temp = score1 + gapscore; \
+ if (temp > score - epsilon) { \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ nm = 0; \
+ ng = 0; \
+ } \
+ gapM[nm] = gap; \
+ nm++; \
+ } \
+ temp = score2 + gapscore; \
+ if (temp > score - epsilon) { \
+ if (temp > score + epsilon) { \
+ score = temp; \
+ nm = 0; \
+ ng = 0; \
+ } \
+ gapXY[ng] = gap; \
+ ng++; \
+ }
+
+#define SELECT_TRACE_WATERMAN_SMITH_BEYER_ALIGN(score1, score2, score3, score4) \
+ trace = M_MATRIX; \
+ score = score1; \
+ if (score2 > score + epsilon) { \
+ score = score2; \
+ trace = Ix_MATRIX; \
+ } \
+ else if (score2 > score - epsilon) trace |= Ix_MATRIX; \
+ if (score3 > score + epsilon) { \
+ score = score3; \
+ trace = Iy_MATRIX; \
+ } \
+ else if (score3 > score - epsilon) trace |= Iy_MATRIX; \
+ score += score4; \
+ if (score < epsilon) { \
+ score = 0; \
+ trace = STARTPOINT; \
+ } \
+ else if (score > maximum - epsilon) { \
+ if (score > maximum + epsilon) { \
+ maximum = score; \
+ for ( ; im < i; im++, jm = 0) \
+ for ( ; jm <= nB; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ for ( ; jm < j; jm++) M[im][jm].trace &= ~ENDPOINT; \
+ im = i; \
+ jm = j; \
+ } \
+ trace |= ENDPOINT; \
+ } \
+ M_row[i][j] = score; \
+ M[i][j].trace = trace;
+
+/* ----------------- alignment algorithms ----------------- */
+
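+/* Score-only variants keep a single DP row: row[j] holds the current row
+ * and `temp` carries the diagonal (previous row, previous column) value
+ * before it is overwritten, so memory use is O(nB). The first/last row and
+ * column use the left/right end-gap scores, swapped when strand is '-'. */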
+#define NEEDLEMANWUNSCH_SCORE(align_score) \
+ int i; \
+ int j; \
+ int kA; \
+ int kB; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ double score; \
+ double temp; \
+ double* row; \
+ double left_gap_extend_A; \
+ double right_gap_extend_A; \
+ double left_gap_extend_B; \
+ double right_gap_extend_B; \
+ switch (strand) { \
+ case '+': \
+ left_gap_extend_A = self->target_left_extend_gap_score; \
+ right_gap_extend_A = self->target_right_extend_gap_score; \
+ left_gap_extend_B = self->query_left_extend_gap_score; \
+ right_gap_extend_B = self->query_right_extend_gap_score; \
+ break; \
+ case '-': \
+ left_gap_extend_A = self->target_right_extend_gap_score; \
+ right_gap_extend_A = self->target_left_extend_gap_score; \
+ left_gap_extend_B = self->query_right_extend_gap_score; \
+ right_gap_extend_B = self->query_left_extend_gap_score; \
+ break; \
+ default: \
+ PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \
+ return NULL; \
+ } \
+\
+ /* Needleman-Wunsch algorithm */ \
+ row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!row) return PyErr_NoMemory(); \
+\
+ /* The top row of the score matrix is a special case, \
+ * as there are no previously aligned characters. \
+ */ \
+ row[0] = 0.0; \
+ for (j = 1; j <= nB; j++) row[j] = j * left_gap_extend_A; \
+ for (i = 1; i < nA; i++) { \
+ kA = sA[i-1]; \
+ temp = row[0]; \
+ row[0] = i * left_gap_extend_B; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GLOBAL(temp + (align_score), \
+ row[j] + gap_extend_B, \
+ row[j-1] + gap_extend_A); \
+ temp = row[j]; \
+ row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_SCORE_GLOBAL(temp + (align_score), \
+ row[nB] + right_gap_extend_B, \
+ row[nB-1] + gap_extend_A); \
+ temp = row[nB]; \
+ row[nB] = score; \
+ } \
+ kA = sA[nA-1]; \
+ temp = row[0]; \
+    row[0] = nA * left_gap_extend_B; /* column 0 is still a left-end gap in B */ \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GLOBAL(temp + (align_score), \
+ row[j] + gap_extend_B, \
+ row[j-1] + right_gap_extend_A); \
+ temp = row[j]; \
+ row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_SCORE_GLOBAL(temp + (align_score), \
+ row[nB] + right_gap_extend_B, \
+ row[nB-1] + right_gap_extend_A); \
+ PyMem_Free(row); \
+ return PyFloat_FromDouble(score);
+
+
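+/* Local score: every cell is clamped at zero (an alignment may start
+ * anywhere) and `maximum` tracks the best cell in the whole matrix. In the
+ * last row and column only the diagonal term is considered, since a local
+ * alignment is never extended by a gap at the matrix border. */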
+#define SMITHWATERMAN_SCORE(align_score) \
+ int i; \
+ int j; \
+ int kA; \
+ int kB; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ double score; \
+ double* row; \
+ double temp; \
+ double maximum = 0; \
+\
+ /* Smith-Waterman algorithm */ \
+ row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!row) return PyErr_NoMemory(); \
+\
+ /* The top row of the score matrix is a special case, \
+ * as there are no previously aligned characters. \
+ */ \
+ for (j = 0; j <= nB; j++) \
+ row[j] = 0; \
+ for (i = 1; i < nA; i++) { \
+ kA = sA[i-1]; \
+ temp = 0; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_LOCAL3(temp + (align_score), \
+ row[j] + gap_extend_B, \
+ row[j-1] + gap_extend_A); \
+ temp = row[j]; \
+ row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_SCORE_LOCAL1(temp + (align_score)); \
+ temp = row[nB]; \
+ row[nB] = score; \
+ } \
+ kA = sA[nA-1]; \
+ temp = 0; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_LOCAL1(temp + (align_score)); \
+ temp = row[j]; \
+ row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_SCORE_LOCAL1(temp + (align_score)); \
+ PyMem_Free(row); \
+ return PyFloat_FromDouble(maximum);
+
+
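+/* The *_ALIGN variants repeat the *_SCORE recurrences but additionally
+ * store direction bits in paths->M through the SELECT_TRACE_* macros, and
+ * return a (score, PathGenerator) pair via Py_BuildValue("fN", ...) so the
+ * caller can enumerate all optimal alignments. */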
+#define NEEDLEMANWUNSCH_ALIGN(align_score) \
+ int i; \
+ int j; \
+ int kA; \
+ int kB; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ const double epsilon = self->epsilon; \
+ Trace** M; \
+ double score; \
+ int trace; \
+ double temp; \
+ double* row = NULL; \
+ PathGenerator* paths; \
+ double left_gap_extend_A; \
+ double right_gap_extend_A; \
+ double left_gap_extend_B; \
+ double right_gap_extend_B; \
+ switch (strand) { \
+ case '+': \
+ left_gap_extend_A = self->target_left_extend_gap_score; \
+ right_gap_extend_A = self->target_right_extend_gap_score; \
+ left_gap_extend_B = self->query_left_extend_gap_score; \
+ right_gap_extend_B = self->query_right_extend_gap_score; \
+ break; \
+ case '-': \
+ left_gap_extend_A = self->target_right_extend_gap_score; \
+ right_gap_extend_A = self->target_left_extend_gap_score; \
+ left_gap_extend_B = self->query_right_extend_gap_score; \
+ right_gap_extend_B = self->query_left_extend_gap_score; \
+ break; \
+ default: \
+ PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \
+ return NULL; \
+ } \
+\
+ /* Needleman-Wunsch algorithm */ \
+ paths = PathGenerator_create_NWSW(nA, nB, Global, strand); \
+ if (!paths) return NULL; \
+ row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!row) { \
+ Py_DECREF(paths); \
+ return PyErr_NoMemory(); \
+ } \
+ M = paths->M; \
+ row[0] = 0; \
+ for (j = 1; j <= nB; j++) row[j] = j * left_gap_extend_A; \
+ for (i = 1; i < nA; i++) { \
+ temp = row[0]; \
+ row[0] = i * left_gap_extend_B; \
+ kA = sA[i-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_NEEDLEMAN_WUNSCH(gap_extend_A, gap_extend_B, align_score); \
+ } \
+ kB = sB[j-1]; \
+ SELECT_TRACE_NEEDLEMAN_WUNSCH(gap_extend_A, right_gap_extend_B, align_score); \
+ } \
+ temp = row[0]; \
+ row[0] = i * left_gap_extend_B; \
+ kA = sA[nA-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_NEEDLEMAN_WUNSCH(right_gap_extend_A, gap_extend_B, align_score); \
+ } \
+ kB = sB[j-1]; \
+ SELECT_TRACE_NEEDLEMAN_WUNSCH(right_gap_extend_A, right_gap_extend_B, align_score); \
+ PyMem_Free(row); \
+ M[nA][nB].path = 0; \
+ return Py_BuildValue("fN", score, paths);
+
+
+#define SMITHWATERMAN_ALIGN(align_score) \
+ int i; \
+ int j; \
+ int im = nA; \
+ int jm = nB; \
+ int kA; \
+ int kB; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ const double epsilon = self->epsilon; \
+ Trace** M = NULL; \
+ double maximum = 0; \
+ double score = 0; \
+ double* row = NULL; \
+ double temp; \
+ int trace; \
+ PathGenerator* paths = NULL; \
+\
+ /* Smith-Waterman algorithm */ \
+ paths = PathGenerator_create_NWSW(nA, nB, Local, strand); \
+ if (!paths) return NULL; \
+ row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!row) { \
+ Py_DECREF(paths); \
+ return PyErr_NoMemory(); \
+ } \
+ M = paths->M; \
+ for (j = 0; j <= nB; j++) row[j] = 0; \
+ for (i = 1; i < nA; i++) { \
+ temp = 0; \
+ kA = sA[i-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_SMITH_WATERMAN_HVD(align_score); \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_TRACE_SMITH_WATERMAN_D(align_score); \
+ } \
+ temp = 0; \
+ kA = sA[nA-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_SMITH_WATERMAN_D(align_score); \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_TRACE_SMITH_WATERMAN_D(align_score); \
+ PyMem_Free(row); \
+\
+ /* As we don't allow zero-score extensions to alignments, \
+ * we need to remove all traces towards an ENDPOINT. \
+ * In addition, some points then won't have any path to a STARTPOINT. \
+ * Here, use path as a temporary variable to indicate if the point \
+ * is reachable from a STARTPOINT. If it is unreachable, remove all \
+ * traces from it, and don't allow it to be an ENDPOINT. It may still \
+ * be a valid STARTPOINT. */ \
+ for (j = 0; j <= nB; j++) M[0][j].path = 1; \
+ for (i = 1; i <= nA; i++) { \
+ M[i][0].path = 1; \
+ for (j = 1; j <= nB; j++) { \
+ trace = M[i][j].trace; \
+ /* Remove traces to unreachable points. */ \
+ if (!M[i-1][j-1].path) trace &= ~DIAGONAL; \
+ if (!M[i][j-1].path) trace &= ~HORIZONTAL; \
+ if (!M[i-1][j].path) trace &= ~VERTICAL; \
+ if (trace & (STARTPOINT | HORIZONTAL | VERTICAL | DIAGONAL)) { \
+ /* The point is reachable. */ \
+ if (trace & ENDPOINT) M[i][j].path = 0; /* no extensions after ENDPOINT */ \
+ else M[i][j].path = 1; \
+ } \
+ else { \
+ /* The point is not reachable. Then it is not a STARTPOINT, \
+ * all traces from it can be removed, and it cannot act as \
+ * an ENDPOINT. */ \
+ M[i][j].path = 0; \
+ trace = 0; \
+ } \
+ M[i][j].trace = trace; \
+ } \
+ } \
+ if (maximum == 0) M[0][0].path = NONE; \
+ else M[0][0].path = 0; \
+ return Py_BuildValue("fN", maximum, paths);
+
+
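+/* Global Gotoh with three rolling rows (M_row, Ix_row, Iy_row); the *_temp
+ * variables hold the diagonal predecessors. Impossible states are seeded
+ * with -DBL_MAX (e.g. M at column 0 of a non-empty row), while the borders
+ * of Ix and Iy accumulate the left end-gap open/extend scores. */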
+#define GOTOH_GLOBAL_SCORE(align_score) \
+ int i; \
+ int j; \
+ int kA; \
+ int kB; \
+ const double gap_open_A = self->target_internal_open_gap_score; \
+ const double gap_open_B = self->query_internal_open_gap_score; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ double left_gap_open_A; \
+ double left_gap_open_B; \
+ double left_gap_extend_A; \
+ double left_gap_extend_B; \
+ double right_gap_open_A; \
+ double right_gap_open_B; \
+ double right_gap_extend_A; \
+ double right_gap_extend_B; \
+ double* M_row = NULL; \
+ double* Ix_row = NULL; \
+ double* Iy_row = NULL; \
+ double score; \
+ double temp; \
+ double M_temp; \
+ double Ix_temp; \
+ double Iy_temp; \
+ switch (strand) { \
+ case '+': \
+ left_gap_open_A = self->target_left_open_gap_score; \
+ left_gap_open_B = self->query_left_open_gap_score; \
+ left_gap_extend_A = self->target_left_extend_gap_score; \
+ left_gap_extend_B = self->query_left_extend_gap_score; \
+ right_gap_open_A = self->target_right_open_gap_score; \
+ right_gap_open_B = self->query_right_open_gap_score; \
+ right_gap_extend_A = self->target_right_extend_gap_score; \
+ right_gap_extend_B = self->query_right_extend_gap_score; \
+ break; \
+ case '-': \
+ left_gap_open_A = self->target_right_open_gap_score; \
+ left_gap_open_B = self->query_right_open_gap_score; \
+ left_gap_extend_A = self->target_right_extend_gap_score; \
+ left_gap_extend_B = self->query_right_extend_gap_score; \
+ right_gap_open_A = self->target_left_open_gap_score; \
+ right_gap_open_B = self->query_left_open_gap_score; \
+ right_gap_extend_A = self->target_left_extend_gap_score; \
+ right_gap_extend_B = self->query_left_extend_gap_score; \
+ break; \
+ default: \
+ PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \
+ return NULL; \
+ } \
+\
+ /* Gotoh algorithm with three states */ \
+ M_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!M_row) goto exit; \
+ Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Ix_row) goto exit; \
+ Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Iy_row) goto exit; \
+\
+ /* The top row of the score matrix is a special case, \
+ * as there are no previously aligned characters. \
+ */ \
+ M_row[0] = 0; \
+ Ix_row[0] = -DBL_MAX; \
+ Iy_row[0] = -DBL_MAX; \
+ for (j = 1; j <= nB; j++) { \
+ M_row[j] = -DBL_MAX; \
+ Ix_row[j] = -DBL_MAX; \
+ Iy_row[j] = left_gap_open_A + left_gap_extend_A * (j-1); \
+ } \
+\
+ for (i = 1; i < nA; i++) { \
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+ M_row[0] = -DBL_MAX; \
+ Ix_row[0] = left_gap_open_B + left_gap_extend_B * (i-1); \
+ Iy_row[0] = -DBL_MAX; \
+ kA = sA[i-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GLOBAL(M_temp, \
+ Ix_temp, \
+ Iy_temp); \
+ M_temp = M_row[j]; \
+ M_row[j] = score + (align_score); \
+ SELECT_SCORE_GLOBAL(M_temp + gap_open_B, \
+ Ix_row[j] + gap_extend_B, \
+ Iy_row[j] + gap_open_B); \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = score; \
+ SELECT_SCORE_GLOBAL(M_row[j-1] + gap_open_A, \
+ Ix_row[j-1] + gap_open_A, \
+ Iy_row[j-1] + gap_extend_A); \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_SCORE_GLOBAL(M_temp, \
+ Ix_temp, \
+ Iy_temp); \
+ M_temp = M_row[nB]; \
+ M_row[nB] = score + (align_score); \
+ SELECT_SCORE_GLOBAL(M_temp + right_gap_open_B, \
+ Ix_row[nB] + right_gap_extend_B, \
+ Iy_row[nB] + right_gap_open_B); \
+ Ix_row[nB] = score; \
+ SELECT_SCORE_GLOBAL(M_row[nB-1] + gap_open_A, \
+ Iy_row[nB-1] + gap_extend_A, \
+ Ix_row[nB-1] + gap_open_A); \
+ Iy_row[nB] = score; \
+ } \
+\
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+ M_row[0] = -DBL_MAX; \
+ Ix_row[0] = left_gap_open_B + left_gap_extend_B * (i-1); \
+ Iy_row[0] = -DBL_MAX; \
+ kA = sA[nA-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GLOBAL(M_temp, \
+ Ix_temp, \
+ Iy_temp); \
+ M_temp = M_row[j]; \
+ M_row[j] = score + (align_score); \
+ SELECT_SCORE_GLOBAL(M_temp + gap_open_B, \
+ Ix_row[j] + gap_extend_B, \
+ Iy_row[j] + gap_open_B); \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = score; \
+        SELECT_SCORE_GLOBAL(M_row[j-1] + right_gap_open_A, \
+                            Ix_row[j-1] + right_gap_open_A, \
+                            Iy_row[j-1] + right_gap_extend_A); \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = score; \
+ } \
+\
+ kB = sB[nB-1]; \
+ SELECT_SCORE_GLOBAL(M_temp, \
+ Ix_temp, \
+ Iy_temp); \
+ M_temp = M_row[nB]; \
+ M_row[nB] = score + (align_score); \
+ SELECT_SCORE_GLOBAL(M_temp + right_gap_open_B, \
+ Ix_row[nB] + right_gap_extend_B, \
+ Iy_row[nB] + right_gap_open_B); \
+ Ix_temp = Ix_row[nB]; \
+ Ix_row[nB] = score; \
+ SELECT_SCORE_GLOBAL(M_row[nB-1] + right_gap_open_A, \
+ Ix_row[nB-1] + right_gap_open_A, \
+ Iy_row[nB-1] + right_gap_extend_A); \
+ Iy_temp = Iy_row[nB]; \
+ Iy_row[nB] = score; \
+\
+ SELECT_SCORE_GLOBAL(M_row[nB], Ix_row[nB], Iy_row[nB]); \
+ PyMem_Free(M_row); \
+ PyMem_Free(Ix_row); \
+ PyMem_Free(Iy_row); \
+ return PyFloat_FromDouble(score); \
+\
+exit: \
+ if (M_row) PyMem_Free(M_row); \
+ if (Ix_row) PyMem_Free(Ix_row); \
+ if (Iy_row) PyMem_Free(Iy_row); \
+ return PyErr_NoMemory(); \
+
+
+#define GOTOH_LOCAL_SCORE(align_score) \
+ int i; \
+ int j; \
+ int kA; \
+ int kB; \
+ const double gap_open_A = self->target_internal_open_gap_score; \
+ const double gap_open_B = self->query_internal_open_gap_score; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ double* M_row = NULL; \
+ double* Ix_row = NULL; \
+ double* Iy_row = NULL; \
+ double score; \
+ double temp; \
+ double M_temp; \
+ double Ix_temp; \
+ double Iy_temp; \
+ double maximum = 0.0; \
+\
+ /* Gotoh algorithm with three states */ \
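+    /* Local variant: the SELECT_SCORE_*LOCAL* helpers floor cell scores \
+     * at zero, so an alignment may start anywhere; the running best is \
+     * kept in maximum, which is the value returned below. */ \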
+ M_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!M_row) goto exit; \
+ Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Ix_row) goto exit; \
+ Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Iy_row) goto exit; \
+ \
+ /* The top row of the score matrix is a special case, \
+ * as there are no previously aligned characters. \
+ */ \
+ M_row[0] = 0; \
+ Ix_row[0] = -DBL_MAX; \
+ Iy_row[0] = -DBL_MAX; \
+ for (j = 1; j <= nB; j++) { \
+        M_row[j] = 0; \
+        Ix_row[j] = -DBL_MAX; \
+        Iy_row[j] = -DBL_MAX; \
+ } \
+ for (i = 1; i < nA; i++) { \
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+        M_row[0] = 0; \
+        Ix_row[0] = -DBL_MAX; \
+        Iy_row[0] = -DBL_MAX; \
+ kA = sA[i-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \
+ Ix_temp, \
+ Iy_temp, \
+ (align_score)); \
+ M_temp = M_row[j]; \
+ M_row[j] = score; \
+ SELECT_SCORE_LOCAL3(M_temp + gap_open_B, \
+ Ix_row[j] + gap_extend_B, \
+ Iy_row[j] + gap_open_B); \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = score; \
+ SELECT_SCORE_LOCAL3(M_row[j-1] + gap_open_A, \
+ Ix_row[j-1] + gap_open_A, \
+ Iy_row[j-1] + gap_extend_A); \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ Ix_row[nB] = 0; \
+ Iy_row[nB] = 0; \
+ SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \
+ Ix_temp, \
+ Iy_temp, \
+ (align_score)); \
+ M_temp = M_row[nB]; \
+ M_row[nB] = score; \
+ } \
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+    M_row[0] = 0; \
+    Ix_row[0] = -DBL_MAX; \
+    Iy_row[0] = -DBL_MAX; \
+ kA = sA[nA-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \
+ Ix_temp, \
+ Iy_temp, \
+ (align_score)); \
+ M_temp = M_row[j]; \
+ M_row[j] = score; \
+ Ix_temp = Ix_row[j]; \
+ Iy_temp = Iy_row[j]; \
+ Ix_row[j] = 0; \
+ Iy_row[j] = 0; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_SCORE_GOTOH_LOCAL_ALIGN(M_temp, \
+ Ix_temp, \
+ Iy_temp, \
+ (align_score)); \
+ PyMem_Free(M_row); \
+ PyMem_Free(Ix_row); \
+ PyMem_Free(Iy_row); \
+ return PyFloat_FromDouble(maximum); \
+exit: \
+ if (M_row) PyMem_Free(M_row); \
+ if (Ix_row) PyMem_Free(Ix_row); \
+ if (Iy_row) PyMem_Free(Iy_row); \
+ return PyErr_NoMemory(); \
+
+
+#define GOTOH_GLOBAL_ALIGN(align_score) \
+ int i; \
+ int j; \
+ int kA; \
+ int kB; \
+ const double gap_open_A = self->target_internal_open_gap_score; \
+ const double gap_open_B = self->query_internal_open_gap_score; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ double left_gap_open_A; \
+ double left_gap_open_B; \
+ double left_gap_extend_A; \
+ double left_gap_extend_B; \
+ double right_gap_open_A; \
+ double right_gap_open_B; \
+ double right_gap_extend_A; \
+ double right_gap_extend_B; \
+ const double epsilon = self->epsilon; \
+ TraceGapsGotoh** gaps = NULL; \
+ Trace** M = NULL; \
+ double* M_row = NULL; \
+ double* Ix_row = NULL; \
+ double* Iy_row = NULL; \
+ double score; \
+ int trace; \
+ double temp; \
+ double M_temp; \
+ double Ix_temp; \
+ double Iy_temp; \
+ PathGenerator* paths; \
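+    /* On the reverse strand the query is reversed, so the scores for \
+     * left and right end gaps trade places. */ \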
+ switch (strand) { \
+ case '+': \
+ left_gap_open_A = self->target_left_open_gap_score; \
+ left_gap_open_B = self->query_left_open_gap_score; \
+ left_gap_extend_A = self->target_left_extend_gap_score; \
+ left_gap_extend_B = self->query_left_extend_gap_score; \
+ right_gap_open_A = self->target_right_open_gap_score; \
+ right_gap_open_B = self->query_right_open_gap_score; \
+ right_gap_extend_A = self->target_right_extend_gap_score; \
+ right_gap_extend_B = self->query_right_extend_gap_score; \
+ break; \
+ case '-': \
+ left_gap_open_A = self->target_right_open_gap_score; \
+ left_gap_open_B = self->query_right_open_gap_score; \
+ left_gap_extend_A = self->target_right_extend_gap_score; \
+ left_gap_extend_B = self->query_right_extend_gap_score; \
+ right_gap_open_A = self->target_left_open_gap_score; \
+ right_gap_open_B = self->query_left_open_gap_score; \
+ right_gap_extend_A = self->target_left_extend_gap_score; \
+ right_gap_extend_B = self->query_left_extend_gap_score; \
+ break; \
+ default: \
+ PyErr_SetString(PyExc_RuntimeError, "strand was neither '+' nor '-'"); \
+ return NULL; \
+ } \
+\
+ /* Gotoh algorithm with three states */ \
+ paths = PathGenerator_create_Gotoh(nA, nB, Global, strand); \
+ if (!paths) return NULL; \
+ M_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!M_row) goto exit; \
+ Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Ix_row) goto exit; \
+ Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Iy_row) goto exit; \
+ M = paths->M; \
+ gaps = paths->gaps.gotoh; \
+ \
+    /* The top row of the score matrix is a special case, \
+     * as there are no previously aligned characters. \
+     */ \
+ M_row[0] = 0; \
+ Ix_row[0] = -DBL_MAX; \
+ Iy_row[0] = -DBL_MAX; \
+ for (j = 1; j <= nB; j++) { \
+ M_row[j] = -DBL_MAX; \
+ Ix_row[j] = -DBL_MAX; \
+ Iy_row[j] = left_gap_open_A + left_gap_extend_A * (j-1); \
+ } \
+ for (i = 1; i < nA; i++) { \
+ kA = sA[i-1]; \
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+ M_row[0] = -DBL_MAX; \
+ Ix_row[0] = left_gap_open_B + left_gap_extend_B * (i-1); \
+ Iy_row[0] = -DBL_MAX; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \
+ M_temp = M_row[j]; \
+ M_row[j] = score + (align_score); \
+ SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \
+ M_temp + gap_open_B, \
+ Ix_row[j] + gap_extend_B, \
+ Iy_row[j] + gap_open_B); \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = score; \
+ SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \
+ M_row[j-1] + gap_open_A, \
+ Ix_row[j-1] + gap_open_A, \
+ Iy_row[j-1] + gap_extend_A); \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \
+ M_temp = M_row[nB]; \
+ M_row[nB] = score + (align_score); \
+ SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \
+ M_temp + right_gap_open_B, \
+ Ix_row[nB] + right_gap_extend_B, \
+ Iy_row[nB] + right_gap_open_B); \
+ Ix_temp = Ix_row[nB]; \
+ Ix_row[nB] = score; \
+ SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \
+ M_row[nB-1] + gap_open_A, \
+ Ix_row[nB-1] + gap_open_A, \
+ Iy_row[nB-1] + gap_extend_A); \
+ Iy_temp = Iy_row[nB]; \
+ Iy_row[nB] = score; \
+ } \
+ kA = sA[nA-1]; \
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+ M_row[0] = -DBL_MAX; \
+ Ix_row[0] = left_gap_open_B + left_gap_extend_B * (nA-1); \
+ Iy_row[0] = -DBL_MAX; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \
+ M_temp = M_row[j]; \
+ M_row[j] = score + (align_score); \
+ SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \
+ M_temp + gap_open_B, \
+ Ix_row[j] + gap_extend_B, \
+ Iy_row[j] + gap_open_B); \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = score; \
+ SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \
+ M_row[j-1] + right_gap_open_A, \
+ Ix_row[j-1] + right_gap_open_A, \
+ Iy_row[j-1] + right_gap_extend_A); \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_TRACE_GOTOH_GLOBAL_ALIGN; \
+    M_temp = M_row[nB]; \
+    M_row[nB] = score + (align_score); \
+    SELECT_TRACE_GOTOH_GLOBAL_GAP(Ix, \
+                                  M_temp + right_gap_open_B, \
+                                  Ix_row[nB] + right_gap_extend_B, \
+                                  Iy_row[nB] + right_gap_open_B); \
+    Ix_row[nB] = score; \
+    SELECT_TRACE_GOTOH_GLOBAL_GAP(Iy, \
+                                  M_row[nB-1] + right_gap_open_A, \
+                                  Ix_row[nB-1] + right_gap_open_A, \
+                                  Iy_row[nB-1] + right_gap_extend_A); \
+    Iy_row[nB] = score; \
+ M[nA][nB].path = 0; \
+ \
+ /* traceback */ \
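+    /* Disable the traceback through any state that does not reach the \
+     * optimal score (within epsilon) in the bottom-right corner. */ \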
+ SELECT_SCORE_GLOBAL(M_row[nB], Ix_row[nB], Iy_row[nB]); \
+ if (M_row[nB] < score - epsilon) M[nA][nB].trace = 0; \
+ if (Ix_row[nB] < score - epsilon) gaps[nA][nB].Ix = 0; \
+ if (Iy_row[nB] < score - epsilon) gaps[nA][nB].Iy = 0; \
+ return Py_BuildValue("fN", score, paths); \
+exit: \
+ Py_DECREF(paths); \
+ if (M_row) PyMem_Free(M_row); \
+ if (Ix_row) PyMem_Free(Ix_row); \
+ if (Iy_row) PyMem_Free(Iy_row); \
+ return PyErr_NoMemory(); \
+
+
+#define GOTOH_LOCAL_ALIGN(align_score) \
+ int i; \
+ int j; \
+ int im = nA; \
+ int jm = nB; \
+ int kA; \
+ int kB; \
+ const double gap_open_A = self->target_internal_open_gap_score; \
+ const double gap_open_B = self->query_internal_open_gap_score; \
+ const double gap_extend_A = self->target_internal_extend_gap_score; \
+ const double gap_extend_B = self->query_internal_extend_gap_score; \
+ const double epsilon = self->epsilon; \
+ Trace** M = NULL; \
+ TraceGapsGotoh** gaps = NULL; \
+ double* M_row = NULL; \
+ double* Ix_row = NULL; \
+ double* Iy_row = NULL; \
+ double score; \
+ int trace; \
+ double temp; \
+ double M_temp; \
+ double Ix_temp; \
+ double Iy_temp; \
+ double maximum = 0.0; \
+ PathGenerator* paths; \
+ \
+ /* Gotoh algorithm with three states */ \
+ paths = PathGenerator_create_Gotoh(nA, nB, Local, strand); \
+ if (!paths) return NULL; \
+ M = paths->M; \
+ gaps = paths->gaps.gotoh; \
+ M_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!M_row) goto exit; \
+ Ix_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Ix_row) goto exit; \
+ Iy_row = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Iy_row) goto exit; \
+ M_row[0] = 0; \
+ Ix_row[0] = -DBL_MAX; \
+ Iy_row[0] = -DBL_MAX; \
+ for (j = 1; j <= nB; j++) { \
+ M_row[j] = 0; \
+ Ix_row[j] = -DBL_MAX; \
+ Iy_row[j] = -DBL_MAX; \
+ } \
+ for (i = 1; i < nA; i++) { \
+ M_temp = M_row[0]; \
+ Ix_temp = Ix_row[0]; \
+ Iy_temp = Iy_row[0]; \
+ M_row[0] = 0; \
+ Ix_row[0] = -DBL_MAX; \
+ Iy_row[0] = -DBL_MAX; \
+ kA = sA[i-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \
+ M_temp = M_row[j]; \
+ M_row[j] = score; \
+ SELECT_TRACE_GOTOH_LOCAL_GAP(Ix, \
+ M_temp + gap_open_B, \
+ Ix_row[j] + gap_extend_B, \
+ Iy_row[j] + gap_open_B); \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = score; \
+ SELECT_TRACE_GOTOH_LOCAL_GAP(Iy, \
+ M_row[j-1] + gap_open_A, \
+ Ix_row[j-1] + gap_open_A, \
+ Iy_row[j-1] + gap_extend_A); \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = score; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \
+        M_temp = M_row[nB]; \
+        M_row[nB] = score; \
+ Ix_temp = Ix_row[nB]; \
+ Ix_row[nB] = 0; \
+ gaps[i][nB].Ix = 0; \
+ Iy_temp = Iy_row[nB]; \
+ Iy_row[nB] = 0; \
+ gaps[i][nB].Iy = 0; \
+ } \
+ M_temp = M_row[0]; \
+ M_row[0] = 0; \
+ M[nA][0].trace = 0; \
+ Ix_temp = Ix_row[0]; \
+ Ix_row[0] = -DBL_MAX; \
+ gaps[nA][0].Ix = 0; \
+ gaps[nA][0].Iy = 0; \
+ Iy_temp = Iy_row[0]; \
+ Iy_row[0] = -DBL_MAX; \
+ kA = sA[nA-1]; \
+ for (j = 1; j < nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \
+ M_temp = M_row[j]; \
+ M_row[j] = score; \
+ Ix_temp = Ix_row[j]; \
+ Ix_row[j] = 0; \
+ gaps[nA][j].Ix = 0; \
+ Iy_temp = Iy_row[j]; \
+ Iy_row[j] = 0; \
+ gaps[nA][j].Iy = 0; \
+ } \
+ kB = sB[nB-1]; \
+ SELECT_TRACE_GOTOH_LOCAL_ALIGN(align_score) \
+ gaps[nA][nB].Ix = 0; \
+ gaps[nA][nB].Iy = 0; \
+\
+ PyMem_Free(M_row); \
+ PyMem_Free(Ix_row); \
+ PyMem_Free(Iy_row); \
+\
+ /* As we don't allow zero-score extensions to alignments, \
+ * we need to remove all traces towards an ENDPOINT. \
+ * In addition, some points then won't have any path to a STARTPOINT. \
+ * Here, use path as a temporary variable to indicate if the point \
+ * is reachable from a STARTPOINT. If it is unreachable, remove all \
+ * traces from it, and don't allow it to be an ENDPOINT. It may still \
+ * be a valid STARTPOINT. */ \
+ for (j = 0; j <= nB; j++) M[0][j].path = M_MATRIX; \
+ for (i = 1; i <= nA; i++) { \
+ M[i][0].path = M_MATRIX; \
+ for (j = 1; j <= nB; j++) { \
+ /* Remove traces to unreachable points. */ \
+ trace = M[i][j].trace; \
+ if (!(M[i-1][j-1].path & M_MATRIX)) trace &= ~M_MATRIX; \
+ if (!(M[i-1][j-1].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \
+ if (!(M[i-1][j-1].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \
+ if (trace & (STARTPOINT | M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \
+ /* The point is reachable. */ \
+ if (trace & ENDPOINT) M[i][j].path = 0; /* no extensions after ENDPOINT */ \
+ else M[i][j].path |= M_MATRIX; \
+ } \
+ else { \
+ /* The point is not reachable. Then it is not a STARTPOINT, \
+ * all traces from it can be removed, and it cannot act as \
+ * an ENDPOINT. */ \
+ M[i][j].path &= ~M_MATRIX; \
+ trace = 0; \
+ } \
+ M[i][j].trace = trace; \
+ trace = gaps[i][j].Ix; \
+ if (!(M[i-1][j].path & M_MATRIX)) trace &= ~M_MATRIX; \
+ if (!(M[i-1][j].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \
+ if (!(M[i-1][j].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \
+ if (trace & (M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \
+ /* The point is reachable. */ \
+ M[i][j].path |= Ix_MATRIX; \
+ } \
+ else { \
+ /* The point is not reachable. Then \
+ * all traces from it can be removed. */ \
+ M[i][j].path &= ~Ix_MATRIX; \
+ trace = 0; \
+ } \
+ gaps[i][j].Ix = trace; \
+ trace = gaps[i][j].Iy; \
+ if (!(M[i][j-1].path & M_MATRIX)) trace &= ~M_MATRIX; \
+ if (!(M[i][j-1].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \
+ if (!(M[i][j-1].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \
+ if (trace & (M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \
+ /* The point is reachable. */ \
+ M[i][j].path |= Iy_MATRIX; \
+ } \
+ else { \
+ /* The point is not reachable. Then \
+ * all traces from it can be removed. */ \
+ M[i][j].path &= ~Iy_MATRIX; \
+ trace = 0; \
+ } \
+ gaps[i][j].Iy = trace; \
+ } \
+ } \
+\
+ /* traceback */ \
+ if (maximum == 0) M[0][0].path = DONE; \
+ else M[0][0].path = 0; \
+ return Py_BuildValue("fN", maximum, paths); \
+\
+exit: \
+ Py_DECREF(paths); \
+ if (M_row) PyMem_Free(M_row); \
+ if (Ix_row) PyMem_Free(Ix_row); \
+ if (Iy_row) PyMem_Free(Iy_row); \
+ return PyErr_NoMemory(); \
+
+
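+/* The Waterman-Smith-Beyer algorithm scores a gap of length k through an
+ * arbitrary, possibly user-supplied gap function instead of affine
+ * open/extend penalties; each cell therefore scans every shorter gap in
+ * its row and column, requiring the full matrices and cubic time. */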
+#define WATERMANSMITHBEYER_ENTER_SCORE \
+ int i; \
+ int j = 0; \
+ int k; \
+ int kA; \
+ int kB; \
+ double** M = NULL; \
+ double** Ix = NULL; \
+ double** Iy = NULL; \
+ double score = 0.0; \
+ double gapscore = 0.0; \
+ double temp; \
+ int ok = 1; \
+ PyObject* result = NULL; \
+\
+ /* Waterman-Smith-Beyer algorithm */ \
+ M = PyMem_Malloc((nA+1)*sizeof(double*)); \
+ if (!M) goto exit; \
+ Ix = PyMem_Malloc((nA+1)*sizeof(double*)); \
+ if (!Ix) goto exit; \
+ Iy = PyMem_Malloc((nA+1)*sizeof(double*)); \
+ if (!Iy) goto exit; \
+ for (i = 0; i <= nA; i++) { \
+ M[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!M[i]) goto exit; \
+ Ix[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Ix[i]) goto exit; \
+ Iy[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Iy[i]) goto exit; \
+ } \
+
+
+#define WATERMANSMITHBEYER_GLOBAL_SCORE(align_score, query_gap_start) \
+ /* The top row of the score matrix is a special case, \
+ * as there are no previously aligned characters. \
+ */ \
+ M[0][0] = 0; \
+ Ix[0][0] = -DBL_MAX; \
+ Iy[0][0] = -DBL_MAX; \
+ for (i = 1; i <= nA; i++) { \
+ M[i][0] = -DBL_MAX; \
+ Iy[i][0] = -DBL_MAX; \
+ ok = _call_query_gap_function(self, query_gap_start, i, &score); \
+ if (!ok) goto exit; \
+ Ix[i][0] = score; \
+ } \
+ for (j = 1; j <= nB; j++) { \
+ M[0][j] = -DBL_MAX; \
+ Ix[0][j] = -DBL_MAX; \
+ ok = _call_target_gap_function(self, 0, j, &score); \
+ if (!ok) goto exit; \
+ Iy[0][j] = score; \
+ } \
+ for (i = 1; i <= nA; i++) { \
+ kA = sA[i-1]; \
+ for (j = 1; j <= nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GLOBAL(M[i-1][j-1], Ix[i-1][j-1], Iy[i-1][j-1]); \
+ M[i][j] = score + (align_score); \
+ score = -DBL_MAX; \
+ for (k = 1; k <= i; k++) { \
+ ok = _call_query_gap_function(self, query_gap_start, k, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i-k][j], Iy[i-k][j]); \
+ } \
+ Ix[i][j] = score; \
+ score = -DBL_MAX; \
+ for (k = 1; k <= j; k++) { \
+ ok = _call_target_gap_function(self, i, k, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i][j-k], Ix[i][j-k]); \
+ } \
+ Iy[i][j] = score; \
+ } \
+ } \
+ SELECT_SCORE_GLOBAL(M[nA][nB], Ix[nA][nB], Iy[nA][nB]); \
+\
+ result = PyFloat_FromDouble(score); \
+
+
+#define WATERMANSMITHBEYER_LOCAL_SCORE(align_score, query_gap_start) \
+ /* The top row of the score matrix is a special case, \
+ * as there are no previously aligned characters. \
+ */ \
+ M[0][0] = 0; \
+ Ix[0][0] = -DBL_MAX; \
+ Iy[0][0] = -DBL_MAX; \
+ for (i = 1; i <= nA; i++) { \
+ M[i][0] = -DBL_MAX; \
+ Ix[i][0] = 0; \
+ Iy[i][0] = -DBL_MAX; \
+ } \
+ for (j = 1; j <= nB; j++) { \
+ M[0][j] = -DBL_MAX; \
+ Ix[0][j] = -DBL_MAX; \
+ Iy[0][j] = 0; \
+ } \
+ for (i = 1; i <= nA; i++) { \
+ kA = sA[i-1]; \
+ for (j = 1; j <= nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_SCORE_GOTOH_LOCAL_ALIGN(M[i-1][j-1], \
+ Ix[i-1][j-1], \
+ Iy[i-1][j-1], \
+ (align_score)); \
+ M[i][j] = score; \
+ if (i == nA || j == nB) { \
+ Ix[i][j] = 0; \
+ Iy[i][j] = 0; \
+ continue; \
+ } \
+ score = 0.0; \
+ for (k = 1; k <= i; k++) { \
+                ok = _call_query_gap_function(self, query_gap_start, k, &gapscore); \
+                if (!ok) goto exit; \
+                SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i-k][j], Iy[i-k][j]); \
+ } \
+ if (score > maximum) maximum = score; \
+ Ix[i][j] = score; \
+ score = 0.0; \
+ for (k = 1; k <= j; k++) { \
+ ok = _call_target_gap_function(self, i, k, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_SCORE_WATERMAN_SMITH_BEYER(M[i][j-k], Ix[i][j-k]); \
+ } \
+ if (score > maximum) maximum = score; \
+ Iy[i][j] = score; \
+ } \
+ } \
+ SELECT_SCORE_GLOBAL(M[nA][nB], Ix[nA][nB], Iy[nA][nB]); \
+ if (score > maximum) maximum = score; \
+ result = PyFloat_FromDouble(maximum); \
+
+
+#define WATERMANSMITHBEYER_EXIT_SCORE \
+exit: \
+ if (M) { \
+ /* If M is NULL, then Ix is also NULL. */ \
+ if (Ix) { \
+ /* If Ix is NULL, then Iy is also NULL. */ \
+ if (Iy) { \
+ /* If Iy is NULL, then M[i], Ix[i], and Iy[i] are \
+ * also NULL. */ \
+ for (i = 0; i <= nA; i++) { \
+ if (!M[i]) break; \
+ PyMem_Free(M[i]); \
+ if (!Ix[i]) break; \
+ PyMem_Free(Ix[i]); \
+ if (!Iy[i]) break; \
+ PyMem_Free(Iy[i]); \
+ } \
+ PyMem_Free(Iy); \
+ } \
+ PyMem_Free(Ix); \
+ } \
+ PyMem_Free(M); \
+ } \
+ if (!ok) return NULL; \
+ if (!result) return PyErr_NoMemory(); \
+ return result; \
+
+
+#define WATERMANSMITHBEYER_ENTER_ALIGN(mode) \
+ int i; \
+ int j = 0; \
+ int gap; \
+ int kA; \
+ int kB; \
+ const double epsilon = self->epsilon; \
+ Trace** M; \
+ TraceGapsWatermanSmithBeyer** gaps; \
+ double** M_row; \
+ double** Ix_row; \
+ double** Iy_row; \
+ int ng; \
+ int nm; \
+ double score; \
+ double gapscore; \
+ double temp; \
+ int trace; \
+ int* gapM; \
+ int* gapXY; \
+ int ok = 1; \
+ PathGenerator* paths = NULL; \
+ \
+ /* Waterman-Smith-Beyer algorithm */ \
+ paths = PathGenerator_create_WSB(nA, nB, mode, strand); \
+ if (!paths) return NULL; \
+ M = paths->M; \
+ gaps = paths->gaps.waterman_smith_beyer; \
+ M_row = PyMem_Malloc((nA+1)*sizeof(double*)); \
+ if (!M_row) goto exit; \
+ Ix_row = PyMem_Malloc((nA+1)*sizeof(double*)); \
+ if (!Ix_row) goto exit; \
+ Iy_row = PyMem_Malloc((nA+1)*sizeof(double*)); \
+ if (!Iy_row) goto exit; \
+ for (i = 0; i <= nA; i++) { \
+ M_row[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!M_row[i]) goto exit; \
+ Ix_row[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Ix_row[i]) goto exit; \
+ Iy_row[i] = PyMem_Malloc((nB+1)*sizeof(double)); \
+ if (!Iy_row[i]) goto exit; \
+ } \
+
+
+#define WATERMANSMITHBEYER_GLOBAL_ALIGN(align_score, query_gap_start) \
+ M_row[0][0] = 0; \
+ Ix_row[0][0] = -DBL_MAX; \
+ Iy_row[0][0] = -DBL_MAX; \
+ for (i = 1; i <= nA; i++) { \
+ M_row[i][0] = -DBL_MAX; \
+ Iy_row[i][0] = -DBL_MAX; \
+ ok = _call_query_gap_function(self, query_gap_start, i, &score); \
+ if (!ok) goto exit; \
+ Ix_row[i][0] = score; \
+ } \
+ for (j = 1; j <= nB; j++) { \
+ M_row[0][j] = -DBL_MAX; \
+ Ix_row[0][j] = -DBL_MAX; \
+        ok = _call_target_gap_function(self, 0, j, &score); \
+ if (!ok) goto exit; \
+ Iy_row[0][j] = score; \
+ } \
+ for (i = 1; i <= nA; i++) { \
+ kA = sA[i-1]; \
+ for (j = 1; j <= nB; j++) { \
+ kB = sB[j-1]; \
+ SELECT_TRACE_WATERMAN_SMITH_BEYER_GLOBAL_ALIGN((align_score)); \
+ gapM = PyMem_Malloc((i+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIx = gapM; \
+ gapXY = PyMem_Malloc((i+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IyIx = gapXY; \
+ nm = 0; \
+ ng = 0; \
+ score = -DBL_MAX; \
+ for (gap = 1; gap <= i; gap++) { \
+ ok = _call_query_gap_function(self, query_gap_start, gap, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i-gap][j], \
+ Iy_row[i-gap][j]); \
+ } \
+ gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIx = gapM; \
+ gapM[nm] = 0; \
+ gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gapXY[ng] = 0; \
+ gaps[i][j].IyIx = gapXY; \
+ Ix_row[i][j] = score; \
+ gapM = PyMem_Malloc((j+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIy = gapM; \
+ gapXY = PyMem_Malloc((j+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IxIy = gapXY; \
+ nm = 0; \
+ ng = 0; \
+ score = -DBL_MAX; \
+ for (gap = 1; gap <= j; gap++) { \
+ ok = _call_target_gap_function(self, i, gap, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i][j-gap], \
+ Ix_row[i][j-gap]); \
+ } \
+ Iy_row[i][j] = score; \
+ gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIy = gapM; \
+ gapM[nm] = 0; \
+ gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IxIy = gapXY; \
+ gapXY[ng] = 0; \
+ } \
+ } \
+ /* traceback */ \
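+    /* Disable the traceback through any state that does not reach the \
+     * optimal score (within epsilon) in the bottom-right corner. */ \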
+ SELECT_SCORE_GLOBAL(M_row[nA][nB], Ix_row[nA][nB], Iy_row[nA][nB]); \
+ M[nA][nB].path = 0; \
+ if (M_row[nA][nB] < score - epsilon) M[nA][nB].trace = 0; \
+ if (Ix_row[nA][nB] < score - epsilon) { \
+ gapM = PyMem_Realloc(gaps[nA][nB].MIx, sizeof(int)); \
+ if (!gapM) goto exit; \
+ gapM[0] = 0; \
+ gaps[nA][nB].MIx = gapM; \
+ gapXY = PyMem_Realloc(gaps[nA][nB].IyIx, sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gapXY[0] = 0; \
+ gaps[nA][nB].IyIx = gapXY; \
+ } \
+ if (Iy_row[nA][nB] < score - epsilon) { \
+ gapM = PyMem_Realloc(gaps[nA][nB].MIy, sizeof(int)); \
+ if (!gapM) goto exit; \
+ gapM[0] = 0; \
+ gaps[nA][nB].MIy = gapM; \
+ gapXY = PyMem_Realloc(gaps[nA][nB].IxIy, sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gapXY[0] = 0; \
+ gaps[nA][nB].IxIy = gapXY; \
+ } \
+ for (i = 0; i <= nA; i++) { \
+ PyMem_Free(M_row[i]); \
+ PyMem_Free(Ix_row[i]); \
+ PyMem_Free(Iy_row[i]); \
+ } \
+ PyMem_Free(M_row); \
+ PyMem_Free(Ix_row); \
+ PyMem_Free(Iy_row); \
+ return Py_BuildValue("fN", score, paths); \
+
+
+#define WATERMANSMITHBEYER_LOCAL_ALIGN(align_score, query_gap_start) \
+ M_row[0][0] = 0; \
+ Ix_row[0][0] = -DBL_MAX; \
+ Iy_row[0][0] = -DBL_MAX; \
+ for (i = 1; i <= nA; i++) { \
+ M_row[i][0] = 0; \
+ Ix_row[i][0] = -DBL_MAX; \
+ Iy_row[i][0] = -DBL_MAX; \
+ } \
+ for (i = 1; i <= nB; i++) { \
+ M_row[0][i] = 0; \
+ Ix_row[0][i] = -DBL_MAX; \
+ Iy_row[0][i] = -DBL_MAX; \
+ } \
+ for (i = 1; i <= nA; i++) { \
+ kA = sA[i-1]; \
+ for (j = 1; j <= nB; j++) { \
+ kB = sB[j-1]; \
+ nm = 0; \
+ ng = 0; \
+ SELECT_TRACE_WATERMAN_SMITH_BEYER_ALIGN( \
+ M_row[i-1][j-1], \
+ Ix_row[i-1][j-1], \
+ Iy_row[i-1][j-1], \
+ (align_score)); \
+ M[i][j].path = 0; \
+ if (i == nA || j == nB) { \
+ Ix_row[i][j] = score; \
+ gaps[i][j].MIx = NULL; \
+ gaps[i][j].IyIx = NULL; \
+ gaps[i][j].MIy = NULL; \
+ gaps[i][j].IxIy = NULL; \
+ Iy_row[i][j] = score; \
+ continue; \
+ } \
+ gapM = PyMem_Malloc((i+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIx = gapM; \
+ gapXY = PyMem_Malloc((i+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IyIx = gapXY; \
+ score = -DBL_MAX; \
+ for (gap = 1; gap <= i; gap++) { \
+ ok = _call_query_gap_function(self, query_gap_start, gap, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i-gap][j], \
+ Iy_row[i-gap][j]); \
+ } \
+ if (score < epsilon) { \
+ score = -DBL_MAX; \
+ nm = 0; \
+ ng = 0; \
+ } \
+ else if (score > maximum) maximum = score; \
+ gapM[nm] = 0; \
+ gapXY[ng] = 0; \
+ Ix_row[i][j] = score; \
+ M[i][j].path = 0; \
+ gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIx = gapM; \
+ gapM[nm] = 0; \
+ gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IyIx = gapXY; \
+ gapXY[ng] = 0; \
+ gapM = PyMem_Malloc((j+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIy = gapM; \
+ gapXY = PyMem_Malloc((j+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IxIy = gapXY; \
+ nm = 0; \
+ ng = 0; \
+ score = -DBL_MAX; \
+ gapM[0] = 0; \
+ for (gap = 1; gap <= j; gap++) { \
+ ok = _call_target_gap_function(self, i, gap, &gapscore); \
+ if (!ok) goto exit; \
+ SELECT_TRACE_WATERMAN_SMITH_BEYER_GAP(M_row[i][j-gap], \
+ Ix_row[i][j-gap]); \
+ } \
+ if (score < epsilon) { \
+ score = -DBL_MAX; \
+ nm = 0; \
+ ng = 0; \
+ } \
+ else if (score > maximum) maximum = score; \
+ gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gaps[i][j].MIy = gapM; \
+ gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gaps[i][j].IxIy = gapXY; \
+ gapM[nm] = 0; \
+ gapXY[ng] = 0; \
+ Iy_row[i][j] = score; \
+ M[i][j].path = 0; \
+ } \
+ } \
+ for (i = 0; i <= nA; i++) PyMem_Free(M_row[i]); \
+ PyMem_Free(M_row); \
+ for (i = 0; i <= nA; i++) PyMem_Free(Ix_row[i]); \
+ PyMem_Free(Ix_row); \
+ for (i = 0; i <= nA; i++) PyMem_Free(Iy_row[i]); \
+ PyMem_Free(Iy_row); \
+\
+ /* As we don't allow zero-score extensions to alignments, \
+ * we need to remove all traces towards an ENDPOINT. \
+ * In addition, some points then won't have any path to a STARTPOINT. \
+ * Here, use path as a temporary variable to indicate if the point \
+ * is reachable from a STARTPOINT. If it is unreachable, remove all \
+ * traces from it, and don't allow it to be an ENDPOINT. It may still \
+ * be a valid STARTPOINT. */ \
+ for (j = 0; j <= nB; j++) M[0][j].path = M_MATRIX; \
+ for (i = 1; i <= nA; i++) { \
+ M[i][0].path = M_MATRIX; \
+ for (j = 1; j <= nB; j++) { \
+ /* Remove traces to unreachable points. */ \
+ trace = M[i][j].trace; \
+ if (!(M[i-1][j-1].path & M_MATRIX)) trace &= ~M_MATRIX; \
+ if (!(M[i-1][j-1].path & Ix_MATRIX)) trace &= ~Ix_MATRIX; \
+ if (!(M[i-1][j-1].path & Iy_MATRIX)) trace &= ~Iy_MATRIX; \
+ if (trace & (STARTPOINT | M_MATRIX | Ix_MATRIX | Iy_MATRIX)) { \
+ /* The point is reachable. */ \
+ if (trace & ENDPOINT) M[i][j].path = 0; /* no extensions after ENDPOINT */ \
+ else M[i][j].path |= M_MATRIX; \
+ } \
+ else { \
+ /* The point is not reachable. Then it is not a STARTPOINT, \
+ * all traces from it can be removed, and it cannot act as \
+ * an ENDPOINT. */ \
+ M[i][j].path &= ~M_MATRIX; \
+ trace = 0; \
+ } \
+ M[i][j].trace = trace; \
+ if (i == nA || j == nB) continue; \
+ gapM = gaps[i][j].MIx; \
+ gapXY = gaps[i][j].IyIx; \
+ nm = 0; \
+ ng = 0; \
+ for (im = 0; (gap = gapM[im]); im++) \
+ if (M[i-gap][j].path & M_MATRIX) gapM[nm++] = gap; \
+ gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gapM[nm] = 0; \
+ gaps[i][j].MIx = gapM; \
+ for (im = 0; (gap = gapXY[im]); im++) \
+ if (M[i-gap][j].path & Iy_MATRIX) gapXY[ng++] = gap; \
+ gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gapXY[ng] = 0; \
+ gaps[i][j].IyIx = gapXY; \
+ if (nm==0 && ng==0) M[i][j].path &= ~Ix_MATRIX; /* not reachable */ \
+ else M[i][j].path |= Ix_MATRIX; /* reachable */ \
+ gapM = gaps[i][j].MIy; \
+ gapXY = gaps[i][j].IxIy; \
+ nm = 0; \
+ ng = 0; \
+ for (im = 0; (gap = gapM[im]); im++) \
+ if (M[i][j-gap].path & M_MATRIX) gapM[nm++] = gap; \
+ gapM = PyMem_Realloc(gapM, (nm+1)*sizeof(int)); \
+ if (!gapM) goto exit; \
+ gapM[nm] = 0; \
+ gaps[i][j].MIy = gapM; \
+ for (im = 0; (gap = gapXY[im]); im++) \
+ if (M[i][j-gap].path & Ix_MATRIX) gapXY[ng++] = gap; \
+ gapXY = PyMem_Realloc(gapXY, (ng+1)*sizeof(int)); \
+ if (!gapXY) goto exit; \
+ gapXY[ng] = 0; \
+ gaps[i][j].IxIy = gapXY; \
+ if (nm==0 && ng==0) M[i][j].path &= ~Iy_MATRIX; /* not reachable */ \
+ else M[i][j].path |= Iy_MATRIX; /* reachable */ \
+ } \
+ } \
+ /* traceback */ \
+ if (maximum == 0) M[0][0].path = DONE; \
+ else M[0][0].path = 0; \
+ return Py_BuildValue("fN", maximum, paths); \
+
+
+#define WATERMANSMITHBEYER_EXIT_ALIGN \
+exit: \
+ if (ok) /* otherwise, an exception was already set */ \
+ PyErr_SetNone(PyExc_MemoryError); \
+ Py_DECREF(paths); \
+ if (M_row) { \
+        /* If M_row is NULL, then Ix_row is also NULL. */ \
+        if (Ix_row) { \
+            /* If Ix_row is NULL, then Iy_row is also NULL. */ \
+            if (Iy_row) { \
+                /* If Iy_row is NULL, then M_row[i], Ix_row[i], and Iy_row[i] are also NULL. */ \
+ for (i = 0; i <= nA; i++) { \
+ if (!M_row[i]) break; \
+ PyMem_Free(M_row[i]); \
+ if (!Ix_row[i]) break; \
+ PyMem_Free(Ix_row[i]); \
+ if (!Iy_row[i]) break; \
+ PyMem_Free(Iy_row[i]); \
+ } \
+ PyMem_Free(Iy_row); \
+ } \
+ PyMem_Free(Ix_row); \
+ } \
+ PyMem_Free(M_row); \
+ } \
+ return NULL; \
+
+
+/* -------------- allocation & deallocation ------------- */
+
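+/* A PathGenerator holds the trace matrices filled in by the align
+ * functions; iterating over it reconstructs the optimal alignment paths
+ * one by one. */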
+static PathGenerator*
+PathGenerator_create_NWSW(Py_ssize_t nA, Py_ssize_t nB, Mode mode, unsigned char strand)
+{
+ int i;
+ unsigned char trace = 0;
+ Trace** M;
+ PathGenerator* paths;
+
+ paths = (PathGenerator*)PyType_GenericAlloc(&PathGenerator_Type, 0);
+ if (!paths) return NULL;
+
+ paths->iA = 0;
+ paths->iB = 0;
+ paths->nA = nA;
+ paths->nB = nB;
+ paths->M = NULL;
+ paths->gaps.gotoh = NULL;
+ paths->gaps.waterman_smith_beyer = NULL;
+ paths->algorithm = NeedlemanWunschSmithWaterman;
+ paths->mode = mode;
+ paths->length = 0;
+ paths->strand = strand;
+
+ M = PyMem_Malloc((nA+1)*sizeof(Trace*));
+ paths->M = M;
+ if (!M) goto exit;
+ switch (mode) {
+ case Global: trace = VERTICAL; break;
+ case Local: trace = STARTPOINT; break;
+ }
+ for (i = 0; i <= nA; i++) {
+ M[i] = PyMem_Malloc((nB+1)*sizeof(Trace));
+ if (!M[i]) goto exit;
+ M[i][0].trace = trace;
+ }
+ if (mode == Global) {
+ M[0][0].trace = 0;
+ trace = HORIZONTAL;
+ }
+ for (i = 1; i <= nB; i++) M[0][i].trace = trace;
+ M[0][0].path = 0;
+ return paths;
+exit:
+ Py_DECREF(paths);
+ PyErr_SetNone(PyExc_MemoryError);
+ return NULL;
+}
+
+static PathGenerator*
+PathGenerator_create_Gotoh(Py_ssize_t nA, Py_ssize_t nB, Mode mode, unsigned char strand)
+{
+ int i;
+ unsigned char trace;
+ Trace** M;
+ TraceGapsGotoh** gaps;
+ PathGenerator* paths;
+
+ switch (mode) {
+ case Global: trace = 0; break;
+ case Local: trace = STARTPOINT; break;
+ default:
+ /* Should not happen, but the compiler has no way of knowing that,
+ * as the enum Mode does not restrict the value of mode, which can
+ * be any integer. Include default: here to prevent compiler
+ * warnings.
+ */
+ PyErr_Format(PyExc_RuntimeError,
+ "mode has unexpected value %d", mode);
+ return NULL;
+ }
+
+ paths = (PathGenerator*)PyType_GenericAlloc(&PathGenerator_Type, 0);
+ if (!paths) return NULL;
+
+ paths->iA = 0;
+ paths->iB = 0;
+ paths->nA = nA;
+ paths->nB = nB;
+ paths->M = NULL;
+ paths->gaps.gotoh = NULL;
+ paths->algorithm = Gotoh;
+ paths->mode = mode;
+ paths->length = 0;
+ paths->strand = strand;
+
+ M = PyMem_Malloc((nA+1)*sizeof(Trace*));
+ if (!M) goto exit;
+ paths->M = M;
+ for (i = 0; i <= nA; i++) {
+ M[i] = PyMem_Malloc((nB+1)*sizeof(Trace));
+ if (!M[i]) goto exit;
+ M[i][0].trace = trace;
+ }
+ gaps = PyMem_Malloc((nA+1)*sizeof(TraceGapsGotoh*));
+ if (!gaps) goto exit;
+ paths->gaps.gotoh = gaps;
+ for (i = 0; i <= nA; i++) {
+ gaps[i] = PyMem_Malloc((nB+1)*sizeof(TraceGapsGotoh));
+ if (!gaps[i]) goto exit;
+ }
+
+ gaps[0][0].Ix = 0;
+ gaps[0][0].Iy = 0;
+ if (mode == Global) {
+ for (i = 1; i <= nA; i++) {
+ gaps[i][0].Ix = Ix_MATRIX;
+ gaps[i][0].Iy = 0;
+ }
+ gaps[1][0].Ix = M_MATRIX;
+ for (i = 1; i <= nB; i++) {
+ M[0][i].trace = 0;
+ gaps[0][i].Ix = 0;
+ gaps[0][i].Iy = Iy_MATRIX;
+ }
+ gaps[0][1].Iy = M_MATRIX;
+ }
+ else if (mode == Local) {
+ for (i = 1; i < nA; i++) {
+ gaps[i][0].Ix = 0;
+ gaps[i][0].Iy = 0;
+ }
+ for (i = 1; i <= nB; i++) {
+ M[0][i].trace = trace;
+ gaps[0][i].Ix = 0;
+ gaps[0][i].Iy = 0;
+ }
+ }
+ M[0][0].path = 0;
+
+ return paths;
+exit:
+ Py_DECREF(paths);
+ PyErr_SetNone(PyExc_MemoryError);
+ return NULL;
+}
+
+static PathGenerator*
+PathGenerator_create_WSB(Py_ssize_t nA, Py_ssize_t nB, Mode mode, unsigned char strand)
+{
+ int i, j;
+ int* trace;
+ Trace** M = NULL;
+ TraceGapsWatermanSmithBeyer** gaps = NULL;
+ PathGenerator* paths;
+
+ paths = (PathGenerator*)PyType_GenericAlloc(&PathGenerator_Type, 0);
+ if (!paths) return NULL;
+
+ paths->iA = 0;
+ paths->iB = 0;
+ paths->nA = nA;
+ paths->nB = nB;
+ paths->M = NULL;
+ paths->gaps.waterman_smith_beyer = NULL;
+ paths->algorithm = WatermanSmithBeyer;
+ paths->mode = mode;
+ paths->length = 0;
+ paths->strand = strand;
+
+ M = PyMem_Malloc((nA+1)*sizeof(Trace*));
+ if (!M) goto exit;
+ paths->M = M;
+ for (i = 0; i <= nA; i++) {
+ M[i] = PyMem_Malloc((nB+1)*sizeof(Trace));
+ if (!M[i]) goto exit;
+ }
+ gaps = PyMem_Malloc((nA+1)*sizeof(TraceGapsWatermanSmithBeyer*));
+ if (!gaps) goto exit;
+ paths->gaps.waterman_smith_beyer = gaps;
+ for (i = 0; i <= nA; i++) gaps[i] = NULL;
+ for (i = 0; i <= nA; i++) {
+ gaps[i] = PyMem_Malloc((nB+1)*sizeof(TraceGapsWatermanSmithBeyer));
+ if (!gaps[i]) goto exit;
+ for (j = 0; j <= nB; j++) {
+ gaps[i][j].MIx = NULL;
+ gaps[i][j].IyIx = NULL;
+ gaps[i][j].MIy = NULL;
+ gaps[i][j].IxIy = NULL;
+ }
+ M[i][0].path = 0;
+ switch (mode) {
+ case Global:
+ M[i][0].trace = 0;
+ trace = PyMem_Malloc(2*sizeof(int));
+ if (!trace) goto exit;
+ gaps[i][0].MIx = trace;
+ trace[0] = i;
+ trace[1] = 0;
+ trace = PyMem_Malloc(sizeof(int));
+ if (!trace) goto exit;
+ gaps[i][0].IyIx = trace;
+ trace[0] = 0;
+ break;
+ case Local:
+ M[i][0].trace = STARTPOINT;
+ break;
+ }
+ }
+ for (i = 1; i <= nB; i++) {
+ switch (mode) {
+ case Global:
+ M[0][i].trace = 0;
+ trace = PyMem_Malloc(2*sizeof(int));
+ if (!trace) goto exit;
+ gaps[0][i].MIy = trace;
+ trace[0] = i;
+ trace[1] = 0;
+ trace = PyMem_Malloc(sizeof(int));
+ if (!trace) goto exit;
+ gaps[0][i].IxIy = trace;
+ trace[0] = 0;
+ break;
+ case Local:
+ M[0][i].trace = STARTPOINT;
+ break;
+ }
+ }
+ M[0][0].path = 0;
+ return paths;
+exit:
+ Py_DECREF(paths);
+ PyErr_SetNone(PyExc_MemoryError);
+ return NULL;
+}
+
+/* ----------------- alignment algorithms ----------------- */
+
+#define MATRIX_SCORE scores[kA*n+kB]
+#define COMPARE_SCORE (kA == wildcard || kB == wildcard) ? 0 : (kA == kB) ? match : mismatch
+
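+/* Each algorithm is instantiated twice: with COMPARE_SCORE (flat
+ * match/mismatch scoring, where the wildcard letter matches anything at
+ * score zero) and with MATRIX_SCORE (a lookup in the n x n substitution
+ * matrix indexed by the mapped letters kA and kB). */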
+
+static PyObject*
+Aligner_needlemanwunsch_score_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ NEEDLEMANWUNSCH_SCORE(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_needlemanwunsch_score_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ NEEDLEMANWUNSCH_SCORE(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_smithwaterman_score_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ SMITHWATERMAN_SCORE(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_smithwaterman_score_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ SMITHWATERMAN_SCORE(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_needlemanwunsch_align_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ NEEDLEMANWUNSCH_ALIGN(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_needlemanwunsch_align_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ NEEDLEMANWUNSCH_ALIGN(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_smithwaterman_align_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ SMITHWATERMAN_ALIGN(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_smithwaterman_align_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ SMITHWATERMAN_ALIGN(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_global_score_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ GOTOH_GLOBAL_SCORE(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_global_score_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ GOTOH_GLOBAL_SCORE(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_local_score_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ GOTOH_LOCAL_SCORE(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_local_score_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ GOTOH_LOCAL_SCORE(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_global_align_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ GOTOH_GLOBAL_ALIGN(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_global_align_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ GOTOH_GLOBAL_ALIGN(MATRIX_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_local_align_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ GOTOH_LOCAL_ALIGN(COMPARE_SCORE);
+}
+
+static PyObject*
+Aligner_gotoh_local_align_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ GOTOH_LOCAL_ALIGN(MATRIX_SCORE);
+}
+
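+/* Evaluate the score of a gap of length j at position i. Without a
+ * Python gap function, fall back to the affine open + (j-1) * extend
+ * formula; return 0 (failure) if the Python callback raised an error. */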
+static int
+_call_query_gap_function(Aligner* aligner, int i, int j, double* score)
+{
+ double value;
+ PyObject* result;
+ PyObject* function = aligner->query_gap_function;
+ if (!function)
+ value = aligner->query_internal_open_gap_score
+ + (j-1) * aligner->query_internal_extend_gap_score;
+ else {
+ result = PyObject_CallFunction(function, "ii", i, j);
+ if (result == NULL) return 0;
+ value = PyFloat_AsDouble(result);
+ Py_DECREF(result);
+ if (value == -1.0 && PyErr_Occurred()) return 0;
+ }
+ *score = value;
+ return 1;
+}
+
+static int
+_call_target_gap_function(Aligner* aligner, int i, int j, double* score)
+{
+ double value;
+ PyObject* result;
+ PyObject* function = aligner->target_gap_function;
+ if (!function)
+ value = aligner->target_internal_open_gap_score
+ + (j-1) * aligner->target_internal_extend_gap_score;
+ else {
+ result = PyObject_CallFunction(function, "ii", i, j);
+ if (result == NULL) return 0;
+ value = PyFloat_AsDouble(result);
+ Py_DECREF(result);
+ if (value == -1.0 && PyErr_Occurred()) return 0;
+ }
+ *score = value;
+ return 1;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_global_score_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ WATERMANSMITHBEYER_ENTER_SCORE;
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_GLOBAL_SCORE(COMPARE_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_GLOBAL_SCORE(COMPARE_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_SCORE;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_global_score_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ WATERMANSMITHBEYER_ENTER_SCORE;
+ switch (strand) {
+ case '+':
+ WATERMANSMITHBEYER_GLOBAL_SCORE(MATRIX_SCORE, j);
+ break;
+ case '-':
+ WATERMANSMITHBEYER_GLOBAL_SCORE(MATRIX_SCORE, nB-j);
+ break;
+ }
+ WATERMANSMITHBEYER_EXIT_SCORE;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_local_score_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ double maximum = 0.0;
+ WATERMANSMITHBEYER_ENTER_SCORE;
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_LOCAL_SCORE(COMPARE_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_LOCAL_SCORE(COMPARE_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_SCORE;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_local_score_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ double maximum = 0.0;
+ WATERMANSMITHBEYER_ENTER_SCORE;
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_LOCAL_SCORE(MATRIX_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_LOCAL_SCORE(MATRIX_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_SCORE;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_global_align_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ WATERMANSMITHBEYER_ENTER_ALIGN(Global);
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_GLOBAL_ALIGN(COMPARE_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_GLOBAL_ALIGN(COMPARE_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_ALIGN;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_global_align_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ WATERMANSMITHBEYER_ENTER_ALIGN(Global);
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_GLOBAL_ALIGN(MATRIX_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_GLOBAL_ALIGN(MATRIX_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_ALIGN;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_local_align_compare(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const double match = self->match;
+ const double mismatch = self->mismatch;
+ const int wildcard = self->wildcard;
+ int im = nA;
+ int jm = nB;
+ double maximum = 0;
+ WATERMANSMITHBEYER_ENTER_ALIGN(Local);
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_LOCAL_ALIGN(COMPARE_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_LOCAL_ALIGN(COMPARE_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_ALIGN;
+}
+
+static PyObject*
+Aligner_watermansmithbeyer_local_align_matrix(Aligner* self,
+ const int* sA, Py_ssize_t nA,
+ const int* sB, Py_ssize_t nB,
+ unsigned char strand)
+{
+ const Py_ssize_t n = self->substitution_matrix.shape[0];
+ const double* scores = self->substitution_matrix.buf;
+ int im = nA;
+ int jm = nB;
+ double maximum = 0;
+ WATERMANSMITHBEYER_ENTER_ALIGN(Local);
+ switch (strand) {
+ case '+': {
+ WATERMANSMITHBEYER_LOCAL_ALIGN(MATRIX_SCORE, j);
+ break;
+ }
+ case '-': {
+ WATERMANSMITHBEYER_LOCAL_ALIGN(MATRIX_SCORE, nB-j);
+ break;
+ }
+ }
+ WATERMANSMITHBEYER_EXIT_ALIGN;
+}
+
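+/* The convert_*_to_ints helpers translate sequence letters into indices,
+ * either verbatim (mapping == NULL) or through the aligner's alphabet
+ * mapping, rejecting letters that are not in the alphabet. */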
+static int*
+convert_1bytes_to_ints(const int mapping[], Py_ssize_t n, const unsigned char s[])
+{
+ unsigned char c;
+ Py_ssize_t i;
+ int index;
+ int* indices;
+ if (n == 0) {
+ PyErr_SetString(PyExc_ValueError, "sequence has zero length");
+ return NULL;
+ }
+ indices = PyMem_Malloc(n*sizeof(int));
+ if (!indices) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ if (!mapping) for (i = 0; i < n; i++) indices[i] = s[i];
+ else {
+ for (i = 0; i < n; i++) {
+ c = s[i];
+ index = mapping[(int)c];
+ if (index == MISSING_LETTER) {
+ PyErr_SetString(PyExc_ValueError,
+ "sequence contains letters not in the alphabet");
+ PyMem_Free(indices);
+ return NULL;
+ }
+ indices[i] = index;
+ }
+ }
+ return indices;
+}
+
+static int*
+convert_2bytes_to_ints(const int mapping[], Py_ssize_t n, const Py_UCS2 s[])
+{
+    Py_UCS2 c;
+ Py_ssize_t i;
+ int index;
+ int* indices;
+ if (n == 0) {
+ PyErr_SetString(PyExc_ValueError, "sequence has zero length");
+ return NULL;
+ }
+ indices = PyMem_Malloc(n*sizeof(int));
+ if (!indices) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ if (!mapping) for (i = 0; i < n; i++) indices[i] = s[i];
+ else {
+ for (i = 0; i < n; i++) {
+ c = s[i];
+ index = mapping[(int)c];
+ if (index == MISSING_LETTER) {
+ PyErr_SetString(PyExc_ValueError,
+ "sequence contains letters not in the alphabet");
+ PyMem_Free(indices);
+ return NULL;
+ }
+ indices[i] = index;
+ }
+ }
+ return indices;
+}
+
+static int*
+convert_4bytes_to_ints(const int mapping[], Py_ssize_t n, const Py_UCS4 s[])
+{
+    Py_UCS4 c;
+ Py_ssize_t i;
+ int index;
+ int* indices;
+ if (n == 0) {
+ PyErr_SetString(PyExc_ValueError, "sequence has zero length");
+ return NULL;
+ }
+ indices = PyMem_Malloc(n*sizeof(int));
+ if (!indices) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ if (!mapping) for (i = 0; i < n; i++) indices[i] = s[i];
+ else {
+ for (i = 0; i < n; i++) {
+ c = s[i];
+ index = mapping[(int)c];
+ if (index == MISSING_LETTER) {
+ PyErr_SetString(PyExc_ValueError,
+ "sequence contains letters not in the alphabet");
+ PyMem_Free(indices);
+ return NULL;
+ }
+ indices[i] = index;
+ }
+ }
+ return indices;
+}
+
+static int
+convert_objects_to_ints(Py_buffer* view, PyObject* alphabet, PyObject* sequence)
+{
+ Py_ssize_t i, j;
+ Py_ssize_t n;
+ Py_ssize_t m;
+ int* indices = NULL;
+ PyObject *obj1, *obj2;
+ int equal;
+
+ view->buf = NULL;
+ sequence = PySequence_Fast(sequence,
+ "argument should support the sequence protocol");
+ if (!sequence) return 0;
+ if (!alphabet) {
+ PyErr_SetString(PyExc_ValueError,
+ "alphabet is None; cannot interpret sequence");
+ goto exit;
+ }
+ alphabet = PySequence_Fast(alphabet, NULL); /* should never fail */
+ n = PySequence_Size(sequence);
+ m = PySequence_Size(alphabet);
+ indices = PyMem_Malloc(n*sizeof(int));
+ if (!indices) {
+ PyErr_NoMemory();
+ goto exit;
+ }
+ for (i = 0; i < n; i++) {
+ obj1 = PySequence_Fast_GET_ITEM(sequence, i);
+ for (j = 0; j < m; j++) {
+ obj2 = PySequence_Fast_GET_ITEM(alphabet, j);
+ equal = PyObject_RichCompareBool(obj1, obj2, Py_EQ);
+ if (equal == 1) /* obj1 == obj2 */ {
+ indices[i] = j;
+ break;
+ }
+ else if (equal == -1) /* error */ {
+ PyMem_Del(indices);
+ goto exit;
+ }
+ /* else (equal == 0) continue; */ /* not equal */
+ }
+        if (j == m) {
+            PyErr_SetString(PyExc_ValueError, "failed to find object in alphabet");
+            PyMem_Del(indices);
+            goto exit;
+        }
+ }
+ view->buf = indices;
+ view->itemsize = 1;
+ view->len = n;
+exit:
+ Py_DECREF(sequence);
+ Py_XDECREF(alphabet);
+ if (view->buf) return 1;
+ return 0;
+}
+
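+/* O& converter for PyArg_ParseTupleAndKeywords: accepts buffers, str
+ * objects, and generic sequences, leaving an array of int indices in
+ * view->buf. Returning Py_CLEANUP_SUPPORTED lets the argument parser call
+ * the converter again with argument == NULL on failure; score() and
+ * align() below also call it that way to release the array after use. */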
+static int
+sequence_converter(PyObject* argument, void* pointer)
+{
+ Py_buffer* view = pointer;
+ Py_ssize_t i;
+ Py_ssize_t n;
+ int index;
+ int* indices;
+ const int flag = PyBUF_FORMAT | PyBUF_C_CONTIGUOUS;
+ Aligner* aligner;
+ int* mapping;
+
+ if (argument == NULL) {
+ if (view->obj) PyBuffer_Release(view);
+ else {
+ indices = view->buf;
+ PyMem_Free(indices);
+ }
+ return 1;
+ }
+
+ aligner = (Aligner*)view->obj;
+ view->obj = NULL;
+
+ if (PyObject_GetBuffer(argument, view, flag) == 0) {
+ if (view->ndim != 1) {
+ PyErr_Format(PyExc_ValueError,
+ "sequence has incorrect rank (%d expected 1)", view->ndim);
+ return 0;
+ }
+ n = view->len / view->itemsize;
+ if (n == 0) {
+ PyErr_SetString(PyExc_ValueError, "sequence has zero length");
+ return 0;
+ }
+ if (strcmp(view->format, "c") == 0 || strcmp(view->format, "B") == 0) {
+ if (view->itemsize != sizeof(char)) {
+ PyErr_Format(PyExc_ValueError,
+ "sequence has unexpected item byte size "
+ "(%ld, expected %ld)", view->itemsize, sizeof(char));
+ return 0;
+ }
+ indices = convert_1bytes_to_ints(aligner->mapping, n, view->buf);
+ if (!indices) return 0;
+ PyBuffer_Release(view);
+ view->itemsize = 1;
+ view->len = n;
+ view->buf = indices;
+ return Py_CLEANUP_SUPPORTED;
+ }
+ if (strcmp(view->format, "i") == 0 || strcmp(view->format, "l") == 0) {
+ if (view->itemsize != sizeof(int)) {
+ PyErr_Format(PyExc_ValueError,
+ "sequence has unexpected item byte size "
+ "(%ld, expected %ld)", view->itemsize, sizeof(int));
+ return 0;
+ }
+ indices = view->buf;
+ if (aligner->substitution_matrix.obj) {
+ const Py_ssize_t m = aligner->substitution_matrix.shape[0];
+ for (i = 0; i < n; i++) {
+ index = indices[i];
+ if (index < 0) {
+ PyErr_Format(PyExc_ValueError,
+ "sequence item %zd is negative (%d)",
+ i, index);
+ return 0;
+ }
+ if (index >= m) {
+ PyErr_Format(PyExc_ValueError,
+ "sequence item %zd is out of bound"
+ " (%d, should be < %zd)", i, index, m);
+ return 0;
+ }
+ }
+ }
+ return Py_CLEANUP_SUPPORTED;
+ }
+ PyErr_Format(PyExc_ValueError,
+ "sequence has incorrect data type '%s'", view->format);
+ return 0;
+ }
+ PyErr_Clear(); /* To clear the exception raised by PyObject_GetBuffer */
+ mapping = aligner->mapping;
+ if (PyUnicode_Check(argument)) {
+ if (PyUnicode_READY(argument) == -1) return 0;
+ n = PyUnicode_GET_LENGTH(argument);
+ switch (PyUnicode_KIND(argument)) {
+ case PyUnicode_1BYTE_KIND: {
+ Py_UCS1* s = PyUnicode_1BYTE_DATA(argument);
+ indices = convert_1bytes_to_ints(mapping, n, (unsigned char*)s);
+ break;
+ }
+ case PyUnicode_2BYTE_KIND: {
+ Py_UCS2* s = PyUnicode_2BYTE_DATA(argument);
+ indices = convert_2bytes_to_ints(mapping, n, s);
+ break;
+ }
+ case PyUnicode_4BYTE_KIND: {
+ Py_UCS4* s = PyUnicode_4BYTE_DATA(argument);
+ indices = convert_4bytes_to_ints(mapping, n, s);
+ break;
+ }
+ case PyUnicode_WCHAR_KIND:
+ default:
+ PyErr_SetString(PyExc_ValueError, "could not interpret unicode data");
+ return 0;
+ }
+ if (!indices) return 0;
+ view->buf = indices;
+ view->itemsize = 1;
+ view->len = n;
+ return Py_CLEANUP_SUPPORTED;
+ }
+
+ if (!mapping) {
+ if (!convert_objects_to_ints(view, aligner->alphabet, argument)) return 0;
+ return Py_CLEANUP_SUPPORTED;
+ }
+
+ PyErr_SetString(PyExc_ValueError, "sequence has unexpected format");
+ return 0;
+}
+
+static int
+strand_converter(PyObject* argument, void* pointer)
+{
+ if (!PyUnicode_Check(argument)) goto error;
+ if (PyUnicode_READY(argument) == -1) return 0;
+ if (PyUnicode_GET_LENGTH(argument) == 1) {
+ const Py_UCS4 ch = PyUnicode_READ_CHAR(argument, 0);
+ if (ch < 128) {
+ const char c = ch;
+ if (ch == '+' || ch == '-') {
+ *((char*)pointer) = c;
+ return 1;
+ }
+ }
+ }
+error:
+ PyErr_SetString(PyExc_ValueError, "strand must be '+' or '-'");
+ return 0;
+}
+
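+/* Python-side usage (a sketch; the exact module path and construction of
+ * the aligner depend on how the PairwiseAligner type defined below is
+ * packaged):
+ *
+ *     score = aligner.score("GAACT", "GAT")            # strand defaults to '+'
+ *     alignments = aligner.align("GAACT", "GAT", strand="-")
+ */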
+static const char Aligner_score__doc__[] = "calculates the alignment score";
+
+static PyObject*
+Aligner_score(Aligner* self, PyObject* args, PyObject* keywords)
+{
+ const int* sA;
+ const int* sB;
+ Py_ssize_t nA;
+ Py_ssize_t nB;
+ Py_buffer bA = {0};
+ Py_buffer bB = {0};
+ const Mode mode = self->mode;
+ const Algorithm algorithm = _get_algorithm(self);
+ char strand = '+';
+ PyObject* result = NULL;
+ PyObject* substitution_matrix = self->substitution_matrix.obj;
+
+ static char *kwlist[] = {"sequenceA", "sequenceB", "strand", NULL};
+
+ bA.obj = (PyObject*)self;
+ bB.obj = (PyObject*)self;
+    if(!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&|O&", kwlist,
+ sequence_converter, &bA,
+ sequence_converter, &bB,
+ strand_converter, &strand))
+ return NULL;
+
+ sA = bA.buf;
+ nA = bA.len / bA.itemsize;
+ sB = bB.buf;
+ nB = bB.len / bB.itemsize;
+
+ switch (algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ switch (mode) {
+ case Global:
+ if (substitution_matrix)
+ result = Aligner_needlemanwunsch_score_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_needlemanwunsch_score_compare(self, sA, nA, sB, nB, strand);
+ break;
+ case Local:
+ if (substitution_matrix)
+ result = Aligner_smithwaterman_score_matrix(self, sA, nA, sB, nB);
+ else
+ result = Aligner_smithwaterman_score_compare(self, sA, nA, sB, nB);
+ break;
+ }
+ break;
+ case Gotoh:
+ switch (mode) {
+ case Global:
+ if (substitution_matrix)
+ result = Aligner_gotoh_global_score_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_gotoh_global_score_compare(self, sA, nA, sB, nB, strand);
+ break;
+ case Local:
+ if (substitution_matrix)
+ result = Aligner_gotoh_local_score_matrix(self, sA, nA, sB, nB);
+ else
+ result = Aligner_gotoh_local_score_compare(self, sA, nA, sB, nB);
+ break;
+ }
+ break;
+ case WatermanSmithBeyer:
+ switch (mode) {
+ case Global:
+ if (substitution_matrix)
+ result = Aligner_watermansmithbeyer_global_score_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_watermansmithbeyer_global_score_compare(self, sA, nA, sB, nB, strand);
+ break;
+ case Local:
+ if (substitution_matrix)
+ result = Aligner_watermansmithbeyer_local_score_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_watermansmithbeyer_local_score_compare(self, sA, nA, sB, nB, strand);
+ break;
+ }
+ break;
+ case Unknown:
+ default:
+ PyErr_SetString(PyExc_RuntimeError, "unknown algorithm");
+ break;
+ }
+
+ sequence_converter(NULL, &bA);
+ sequence_converter(NULL, &bB);
+
+ return result;
+}
+
+static const char Aligner_align__doc__[] = "align two sequences";
+
+static PyObject*
+Aligner_align(Aligner* self, PyObject* args, PyObject* keywords)
+{
+ const int* sA;
+ const int* sB;
+ Py_ssize_t nA;
+ Py_ssize_t nB;
+ Py_buffer bA = {0};
+ Py_buffer bB = {0};
+ const Mode mode = self->mode;
+ const Algorithm algorithm = _get_algorithm(self);
+ char strand = '+';
+ PyObject* result = NULL;
+ PyObject* substitution_matrix = self->substitution_matrix.obj;
+
+ static char *kwlist[] = {"sequenceA", "sequenceB", "strand", NULL};
+
+ bA.obj = (PyObject*)self;
+ bB.obj = (PyObject*)self;
+    if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&", kwlist,
+ sequence_converter, &bA,
+ sequence_converter, &bB,
+ strand_converter, &strand))
+ return NULL;
+
+ sA = bA.buf;
+ nA = bA.len / bA.itemsize;
+ sB = bB.buf;
+ nB = bB.len / bB.itemsize;
+
+ switch (algorithm) {
+ case NeedlemanWunschSmithWaterman:
+ switch (mode) {
+ case Global:
+ if (substitution_matrix)
+ result = Aligner_needlemanwunsch_align_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_needlemanwunsch_align_compare(self, sA, nA, sB, nB, strand);
+ break;
+ case Local:
+ if (substitution_matrix)
+ result = Aligner_smithwaterman_align_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_smithwaterman_align_compare(self, sA, nA, sB, nB, strand);
+ break;
+ }
+ break;
+ case Gotoh:
+ switch (mode) {
+ case Global:
+ if (substitution_matrix)
+ result = Aligner_gotoh_global_align_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_gotoh_global_align_compare(self, sA, nA, sB, nB, strand);
+ break;
+ case Local:
+ if (substitution_matrix)
+ result = Aligner_gotoh_local_align_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_gotoh_local_align_compare(self, sA, nA, sB, nB, strand);
+ break;
+ }
+ break;
+ case WatermanSmithBeyer:
+ switch (mode) {
+ case Global:
+ if (substitution_matrix)
+ result = Aligner_watermansmithbeyer_global_align_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_watermansmithbeyer_global_align_compare(self, sA, nA, sB, nB, strand);
+ break;
+ case Local:
+ if (substitution_matrix)
+ result = Aligner_watermansmithbeyer_local_align_matrix(self, sA, nA, sB, nB, strand);
+ else
+ result = Aligner_watermansmithbeyer_local_align_compare(self, sA, nA, sB, nB, strand);
+ break;
+ }
+ break;
+ case Unknown:
+ default:
+ PyErr_SetString(PyExc_RuntimeError, "unknown algorithm");
+ break;
+ }
+
+ sequence_converter(NULL, &bA);
+ sequence_converter(NULL, &bB);
+
+ return result;
+}
+
+static char Aligner_doc[] =
+"Aligner.\n";
+
+static PyMethodDef Aligner_methods[] = {
+ {"score",
+ (PyCFunction)Aligner_score,
+ METH_VARARGS | METH_KEYWORDS,
+ Aligner_score__doc__
+ },
+ {"align",
+ (PyCFunction)Aligner_align,
+ METH_VARARGS | METH_KEYWORDS,
+ Aligner_align__doc__
+ },
+ {NULL} /* Sentinel */
+};
+
+static PyTypeObject AlignerType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "_algorithms.PairwiseAligner", /* tp_name */
+ sizeof(Aligner), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)Aligner_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ (reprfunc)Aligner_repr, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ (reprfunc)Aligner_str, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ Aligner_doc, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ Aligner_methods, /* tp_methods */
+ 0, /* tp_members */
+ Aligner_getset, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)Aligner_init, /* tp_init */
+};
+
+
+/* Module definition */
+
+static char _aligners__doc__[] =
+"C extension module implementing pairwise alignment algorithms";
+
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_aligners",
+ _aligners__doc__,
+ -1,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+
+PyMODINIT_FUNC
+PyInit__aligners(void)
+{
+ PyObject* module;
+ AlignerType.tp_new = PyType_GenericNew;
+
+ if (PyType_Ready(&AlignerType) < 0 || PyType_Ready(&PathGenerator_Type) < 0)
+ return NULL;
+
+ module = PyModule_Create(&moduledef);
+ if (!module) return NULL;
+
+ Py_INCREF(&AlignerType);
+ /* Reference to AlignerType will be stolen by PyModule_AddObject
+ * only if it is successful. */
+ if (PyModule_AddObject(module,
+ "PairwiseAligner", (PyObject*) &AlignerType) < 0) {
+ Py_DECREF(&AlignerType);
+ Py_DECREF(module);
+ return NULL;
+ }
+
+ return module;
+}
diff --git a/code/lib/Bio/Align/_aligners.cp37-win_amd64.pyd b/code/lib/Bio/Align/_aligners.cp37-win_amd64.pyd
new file mode 100644
index 0000000..26d918c
Binary files /dev/null and b/code/lib/Bio/Align/_aligners.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/Align/substitution_matrices/__init__.py b/code/lib/Bio/Align/substitution_matrices/__init__.py
new file mode 100644
index 0000000..5d49ac6
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/__init__.py
@@ -0,0 +1,514 @@
+# Copyright 2019 by Michiel de Hoon.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Substitution matrices."""
+
+import os
+import string
+import numpy
+
+
+class Array(numpy.ndarray):
+ """numpy array subclass indexed by integers and by letters."""
+
+ def __new__(cls, alphabet=None, dims=None, data=None, dtype=float):
+ """Create a new Array instance."""
+ if isinstance(data, dict):
+ if alphabet is not None:
+ raise ValueError("alphabet should be None if data is a dict")
+ if dims is not None:
+ raise ValueError("dims should be None if data is a dict")
+            alphabet = []
+            single_letters = True
+ for key in data:
+ if isinstance(key, str):
+ if dims is None:
+ dims = 1
+ elif dims != 1:
+ raise ValueError("inconsistent dimensions in data")
+                    if len(key) > 1:
+                        single_letters = False
+                    alphabet.append(key)
+            elif isinstance(key, tuple):
+ if dims is None:
+ dims = len(key)
+ elif dims != len(key):
+ raise ValueError("inconsistent dimensions in data")
+                    if dims == 1:
+                        (key,) = key  # a 1-tuple wraps the single letter
+                        if not isinstance(key, str):
+                            raise ValueError("expected string")
+ if len(key) > 1:
+ single_letters = False
+ alphabet.append(key)
+ elif dims == 2:
+ for letter in key:
+ if not isinstance(letter, str):
+ raise ValueError("expected string")
+ if len(letter) > 1:
+ single_letters = False
+ alphabet.append(letter)
+                else:
+                    raise ValueError(
+                        "data array should be 1- or 2-dimensional "
+                        "(found %d dimensions in key)" % dims
+                    )
+ alphabet = sorted(set(alphabet))
+ if single_letters:
+ alphabet = "".join(alphabet)
+ else:
+ alphabet = tuple(alphabet)
+ n = len(alphabet)
+ if dims == 1:
+ shape = (n,)
+ elif dims == 2:
+ shape = (n, n)
+ else: # dims is None
+ raise ValueError("data is an empty dictionary")
+ obj = super().__new__(cls, shape, dtype)
+ if dims == 1:
+ for i, key in enumerate(alphabet):
+                    obj[i] = data.get(key, 0.0)
+ elif dims == 2:
+ for i1, letter1 in enumerate(alphabet):
+ for i2, letter2 in enumerate(alphabet):
+ key = (letter1, letter2)
+ value = data.get(key, 0.0)
+ obj[i1, i2] = value
+ obj._alphabet = alphabet
+ return obj
+ if alphabet is None:
+ alphabet = string.ascii_uppercase
+ elif not (isinstance(alphabet, str) or isinstance(alphabet, tuple)):
+ raise ValueError("alphabet should be a string or a tuple")
+ n = len(alphabet)
+ if data is None:
+ if dims is None:
+ dims = 1
+ elif dims not in (1, 2):
+ raise ValueError("dims should be 1 or 2 (found %s)" % dims)
+ shape = (n,) * dims
+ else:
+ if dims is None:
+ shape = data.shape
+ dims = len(shape)
+ if dims == 1:
+ pass
+ elif dims == 2:
+ if shape[0] != shape[1]:
+ raise ValueError("data array is not square")
+                else:
+                    raise ValueError(
+                        "data array should be 1- or 2-dimensional "
+                        "(found %d dimensions)" % dims
+                    )
+ else:
+ shape = (n,) * dims
+                if data.shape != shape:
+                    raise ValueError(
+                        "data array has an inconsistent shape (expected %s, found %s)"
+                        % (shape, data.shape)
+                    )
+ obj = super().__new__(cls, shape, dtype)
+ if data is None:
+ obj[:] = 0.0
+ else:
+ obj[:] = data
+ obj._alphabet = alphabet
+ return obj
+
+ def __array_finalize__(self, obj):
+ if obj is None:
+ return
+ self._alphabet = getattr(obj, "_alphabet", None)
+
+ def _convert_key(self, key):
+ if isinstance(key, tuple):
+ indices = []
+ for index in key:
+ if isinstance(index, str):
+ try:
+ index = self._alphabet.index(index)
+ except ValueError:
+ raise IndexError("'%s'" % index) from None
+ indices.append(index)
+ key = tuple(indices)
+ elif isinstance(key, str):
+ try:
+ key = self._alphabet.index(key)
+ except ValueError:
+ raise IndexError("'%s'" % key) from None
+ return key
+
+ def __getitem__(self, key):
+ key = self._convert_key(key)
+ value = numpy.ndarray.__getitem__(self, key)
+ if value.ndim == 2:
+ if self.ndim == 2:
+ if value.shape != self.shape:
+ raise IndexError("Requesting truncated array")
+ elif self.ndim == 1:
+ length = self.shape[0]
+ if value.shape[0] == length and value.shape[1] == 1:
+ pass
+ elif value.shape[0] == 1 and value.shape[1] == length:
+ pass
+ else:
+ raise IndexError("Requesting truncated array")
+ elif value.ndim == 1:
+ if value.shape[0] != self.shape[0]:
+ value._alphabet = self.alphabet[key]
+ return value.view(Array)
+
+ def __setitem__(self, key, value):
+ key = self._convert_key(key)
+ numpy.ndarray.__setitem__(self, key, value)
+
+ def __contains__(self, key):
+ # Follow dict definition of __contains__
+ return key in self.keys()
+
+ def __array_prepare__(self, out_arr, context=None):
+ # needed for numpy older than 1.13.0
+ ufunc, inputs, i = context
+ alphabet = self.alphabet
+ for arg in inputs:
+ if isinstance(arg, Array):
+ if arg.alphabet != alphabet:
+ raise ValueError("alphabets are inconsistent")
+ return numpy.ndarray.__array_prepare__(self, out_arr, context)
+
+ def __array_wrap__(self, out_arr, context=None):
+ if len(out_arr) == 1:
+ return out_arr[0]
+ return numpy.ndarray.__array_wrap__(self, out_arr, context)
+
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
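+        # Strip the Array view from every input (checking that all
+        # alphabets agree), run the ufunc on plain ndarrays, and re-wrap
+        # array-valued results as Array objects carrying this alphabet.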
+ args = []
+ alphabet = self._alphabet
+ for arg in inputs:
+ if isinstance(arg, Array):
+ if arg.alphabet != alphabet:
+ raise ValueError("alphabets are inconsistent")
+ args.append(arg.view(numpy.ndarray))
+ else:
+ args.append(arg)
+
+ outputs = kwargs.pop("out", None)
+ if outputs:
+ out_args = []
+ for arg in outputs:
+ if isinstance(arg, Array):
+ if arg.alphabet != alphabet:
+ raise ValueError("alphabets are inconsistent")
+ out_args.append(arg.view(numpy.ndarray))
+ else:
+ out_args.append(arg)
+ kwargs["out"] = tuple(out_args)
+ else:
+ outputs = (None,) * ufunc.nout
+
+ raw_results = super().__array_ufunc__(ufunc, method, *args, **kwargs)
+ if raw_results is NotImplemented:
+ return NotImplemented
+
+ if method == "at":
+ return
+
+ if ufunc.nout == 1:
+ raw_results = (raw_results,)
+
+ results = []
+ for raw_result, output in zip(raw_results, outputs):
+ if raw_result.ndim == 0:
+ result = raw_result
+ elif output is None:
+ result = numpy.asarray(raw_result).view(Array)
+ result._alphabet = self._alphabet
+ else:
+ result = output
+ result._alphabet = self._alphabet
+ results.append(result)
+
+ return results[0] if len(results) == 1 else results
+
+ def __reduce__(self):
+ import pickle
+
+ values = numpy.array(self)
+ state = pickle.dumps(values)
+ alphabet = self._alphabet
+ dims = len(self.shape)
+ dtype = self.dtype
+ arguments = (Array, alphabet, dims, None, dtype)
+ return (Array.__new__, arguments, state)
+
+    def __setstate__(self, state):
+        import pickle
+
+        # Use Ellipsis so that both 1D and 2D arrays can be restored.
+        self[...] = pickle.loads(state)
+
+ def transpose(self, axes=None):
+ """Transpose the array."""
+ other = numpy.ndarray.transpose(self, axes)
+ other._alphabet = self._alphabet
+ return other
+
+ @property
+ def alphabet(self):
+ """Return the alphabet property."""
+ return self._alphabet
+
+ def copy(self):
+ """Create and return a copy of the array."""
+ other = Array(alphabet=self._alphabet, data=self)
+ return other
+
+ def get(self, key, value=None):
+ """Return the value of the key if found; return value otherwise."""
+ try:
+ return self[key]
+ except IndexError:
+ return value
+
+ def items(self):
+ """Return an iterator of (key, value) pairs in the array."""
+ dims = len(self.shape)
+ if dims == 1:
+ for index, key in enumerate(self._alphabet):
+ value = numpy.ndarray.__getitem__(self, index)
+ yield key, value
+ elif dims == 2:
+ for i1, c1 in enumerate(self._alphabet):
+ for i2, c2 in enumerate(self._alphabet):
+ key = (c1, c2)
+ value = numpy.ndarray.__getitem__(self, (i1, i2))
+ yield key, value
+ else:
+ raise RuntimeError("array has unexpected shape %s" % self.shape)
+
+ def keys(self):
+ """Return a tuple with the keys associated with the array."""
+ dims = len(self.shape)
+ alphabet = self._alphabet
+ if dims == 1:
+ return tuple(alphabet)
+ elif dims == 2:
+            return tuple((c1, c2) for c1 in alphabet for c2 in alphabet)
+ else:
+ raise RuntimeError("array has unexpected shape %s" % self.shape)
+
+ def values(self):
+ """Return a tuple with the values stored in the array."""
+ dims = len(self.shape)
+ alphabet = self._alphabet
+ if dims == 1:
+ return tuple(self)
+ elif dims == 2:
+ n1, n2 = self.shape
+ return tuple(
+ numpy.ndarray.__getitem__(self, (i1, i2))
+                for i1 in range(n1)
+                for i2 in range(n2)
+ )
+ else:
+ raise RuntimeError("array has unexpected shape %s" % self.shape)
+
+ def update(self, E=None, **F):
+ """Update the array from dict/iterable E and F."""
+ if E is not None:
+ try:
+ alphabet = E.keys()
+ except AttributeError:
+ for key, value in E:
+ self[key] = value
+ else:
+ for key in E:
+ self[key] = E[key]
+ for key in F:
+ self[key] = F[key]
+
+ def select(self, alphabet):
+ """Subset the array by selecting the letters from the specified alphabet."""
+ ii = []
+ jj = []
+ for i, key in enumerate(alphabet):
+ try:
+ j = self._alphabet.index(key)
+ except ValueError:
+ continue
+ ii.append(i)
+ jj.append(j)
+ dims = len(self.shape)
+ a = Array(alphabet, dims=dims)
+ ii = numpy.ix_(*[ii] * dims)
+ jj = numpy.ix_(*[jj] * dims)
+ a[ii] = numpy.ndarray.__getitem__(self, jj)
+ return a
+
+ def _format_1D(self, fmt):
+ _alphabet = self._alphabet
+ n = len(_alphabet)
+ words = [None] * n
+ lines = []
+ try:
+ header = self.header
+ except AttributeError:
+ pass
+ else:
+ for line in header:
+ line = "# %s\n" % line
+ lines.append(line)
+ maxwidth = 0
+ for i, key in enumerate(_alphabet):
+ value = self[key]
+ word = fmt % value
+ width = len(word)
+ if width > maxwidth:
+ maxwidth = width
+ words[i] = word
+ fmt2 = " %" + str(maxwidth) + "s"
+ for letter, word in zip(_alphabet, words):
+ word = fmt2 % word
+ line = letter + word + "\n"
+ lines.append(line)
+ text = "".join(lines)
+ return text
+
+ def _format_2D(self, fmt):
+ alphabet = self.alphabet
+ n = len(alphabet)
+ words = [[None] * n for _ in range(n)]
+ lines = []
+ try:
+ header = self.header
+ except AttributeError:
+ pass
+ else:
+ for line in header:
+ line = "# %s\n" % line
+ lines.append(line)
+ width = max(len(c) for c in alphabet)
+ line = " " * width
+ for j, c2 in enumerate(alphabet):
+ maxwidth = 0
+ for i, c1 in enumerate(alphabet):
+ key = (c1, c2)
+ value = self[key]
+ word = fmt % value
+ width = len(word)
+ if width > maxwidth:
+ maxwidth = width
+ words[i][j] = word
+ fmt2 = " %" + str(maxwidth) + "s"
+ word = fmt2 % c2
+ line += word
+ for i, c1 in enumerate(alphabet):
+ word = words[i][j]
+ words[i][j] = fmt2 % word
+ line = line.rstrip() + "\n"
+ lines.append(line)
+ for letter, row in zip(alphabet, words):
+ line = letter + "".join(row) + "\n"
+ lines.append(line)
+ text = "".join(lines)
+ return text
+
+ def __format__(self, fmt):
+ return self.format(fmt)
+
+ def format(self, fmt=""):
+ """Return a string representation of the array.
+
+ The argument ``fmt`` specifies the number format to be used.
+ By default, the number format is "%i" if the array contains integer
+ numbers, and "%.1f" otherwise.
+
+ """
+ if fmt == "":
+ if numpy.issubdtype(self.dtype, numpy.integer):
+ fmt = "%i"
+ else:
+ fmt = "%.1f"
+ n = len(self.shape)
+ if n == 1:
+ return self._format_1D(fmt)
+ elif n == 2:
+ return self._format_2D(fmt)
+ else:
+ raise RuntimeError("Array has unexpected rank %d" % n)
+
+ def __str__(self):
+ return self.format()
+
+ def __repr__(self):
+ text = numpy.ndarray.__repr__(self)
+ alphabet = self._alphabet
+ if isinstance(alphabet, str):
+ assert text.endswith(")")
+ text = text[:-1] + ",\n alphabet='%s')" % self._alphabet
+ return text
+
+
+def read(handle, dtype=float):
+ """Parse the file and return an Array object."""
+    try:
+        fp = open(handle)
+    except TypeError:
+        # handle is not a path name; assume it is an open file-like object
+        fp = handle
+    try:
+        lines = fp.readlines()
+    finally:
+        fp.close()
+ header = []
+ for i, line in enumerate(lines):
+ if not line.startswith("#"):
+ break
+ header.append(line[1:].strip())
+ rows = [line.split() for line in lines[i:]]
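+    # If the first rows have two columns, the file is a 1D (letter, value)
+    # listing; otherwise the first row is the alphabet of a square 2D matrix.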
+ if len(rows[0]) == len(rows[1]) == 2:
+ alphabet = [key for key, value in rows]
+ for key in alphabet:
+ if len(key) > 1:
+ alphabet = tuple(alphabet)
+ break
+ else:
+ alphabet = "".join(alphabet)
+ matrix = Array(alphabet=alphabet, dims=1, dtype=dtype)
+ matrix.update(rows)
+ else:
+ alphabet = rows.pop(0)
+ for key in alphabet:
+ if len(key) > 1:
+ alphabet = tuple(alphabet)
+ break
+ else:
+ alphabet = "".join(alphabet)
+ matrix = Array(alphabet=alphabet, dims=2, dtype=dtype)
+ for letter1, row in zip(alphabet, rows):
+ assert letter1 == row.pop(0)
+ for letter2, word in zip(alphabet, row):
+ matrix[letter1, letter2] = float(word)
+ matrix.header = header
+ return matrix
+
+
+def load(name=None):
+ """Load and return a precalculated substitution matrix.
+
+ >>> from Bio.Align import substitution_matrices
+ >>> names = substitution_matrices.load()
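+    >>> "BLOSUM62" in names
+    True
+
+    Passing one of the returned names loads that matrix from the bundled
+    ``data`` directory (BLOSUM62 is among the files added here):
+
+    >>> blosum62 = substitution_matrices.load("BLOSUM62")
+    >>> print(blosum62.alphabet)
+    ARNDCQEGHILKMFPSTWYVBZX*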
+ """
+ path = os.path.realpath(__file__)
+ directory = os.path.dirname(path)
+ subdirectory = os.path.join(directory, "data")
+ if name is None:
+ filenames = os.listdir(subdirectory)
+ return sorted(filenames)
+ path = os.path.join(subdirectory, name)
+ matrix = read(path)
+ return matrix
diff --git a/code/lib/Bio/Align/substitution_matrices/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Align/substitution_matrices/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..23f023b
Binary files /dev/null and b/code/lib/Bio/Align/substitution_matrices/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BENNER22 b/code/lib/Bio/Align/substitution_matrices/data/BENNER22
new file mode 100644
index 0000000..49ba457
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BENNER22
@@ -0,0 +1,27 @@
+# S.A. Benner, M.A. Cohen, G.H. Gonnet:
+# "Amino acid substitution during functionally constrained divergent evolution
+# of protein sequences".
+# Protein Engineering 7(11): 1323-1332 (1994).
+# Figure 3B.
+# PMID 7700864
+ A C D E F G H I K L M N P Q R S T V W Y
+A 2.5 -1.2 -0.2 -0.3 -3.1 0.8 -1.6 -0.4 -1.0 -1.7 -0.8 0.0 0.8 -0.9 -1.2 1.3 1.4 0.4 -5.5 -3.5
+C -1.2 12.6 -3.7 -4.3 -0.1 -1.7 -1.5 -2.4 -3.3 -2.6 -2.5 -1.9 -3.1 -3.3 -1.6 0.3 -1.1 -1.7 0.5 0.6
+D -0.2 -3.7 4.8 3.9 -5.4 0.7 0.3 -4.0 0.2 -4.9 -3.9 2.4 -1.8 0.6 -1.0 0.1 -0.7 -3.0 -6.4 -3.0
+E -0.3 -4.3 3.9 4.6 -5.7 0.5 -0.2 -3.6 1.0 -4.4 -3.4 1.2 -1.7 1.7 -0.1 -0.5 -0.9 -2.7 -6.3 -4.0
+F -3.1 -0.1 -5.4 -5.7 7.7 -5.8 0.3 0.5 -5.1 2.2 0.7 -3.5 -3.4 -3.6 -4.3 -2.2 -2.6 -0.1 0.5 5.9
+G 0.8 -1.7 0.7 0.5 -5.8 6.2 -2.0 -3.8 -1.0 -4.9 -3.8 0.4 -1.8 -1.4 -0.7 0.6 -0.7 -2.5 -4.5 -4.8
+H -1.6 -1.5 0.3 -0.2 0.3 -2.0 6.1 -3.2 0.8 -2.1 -2.4 1.4 -0.4 2.4 1.5 -0.5 -1.1 -3.0 -2.7 3.7
+I -0.4 -2.4 -4.0 -3.6 0.5 -3.8 -3.2 4.2 -3.0 2.7 3.1 -2.7 -2.3 -2.7 -3.2 -1.4 0.3 3.6 -4.4 -2.2
+K -1.0 -3.3 0.2 1.0 -5.1 -1.0 0.8 -3.0 4.4 -3.3 -2.0 1.0 -1.6 2.2 3.9 -0.4 -0.4 -2.7 -3.7 -3.6
+L -1.7 -2.6 -4.9 -4.4 2.2 -4.9 -2.1 2.7 -3.3 4.6 3.2 -3.5 -1.3 -2.0 -2.9 -2.1 -1.0 2.0 -1.8 -0.7
+M -0.8 -2.5 -3.9 -3.4 0.7 -3.8 -2.4 3.1 -2.0 3.2 4.9 -2.6 -2.0 -1.7 -2.1 -1.5 0.1 2.5 -2.8 -1.8
+N 0.0 -1.9 2.4 1.2 -3.5 0.4 1.4 -2.7 1.0 -3.5 -2.6 3.3 -1.1 0.5 0.4 1.1 0.5 -2.3 -5.2 -1.2
+P 0.8 -3.1 -1.8 -1.7 -3.4 -1.8 -0.4 -2.3 -1.6 -1.3 -2.0 -1.1 7.0 -0.1 -1.2 1.1 0.4 -1.7 -5.8 -3.5
+Q -0.9 -3.3 0.6 1.7 -3.6 -1.4 2.4 -2.7 2.2 -2.0 -1.7 0.5 -0.1 4.2 2.2 -0.6 -0.7 -2.4 -3.3 -1.9
+R -1.2 -1.6 -1.0 -0.1 -4.3 -0.7 1.5 -3.2 3.9 -2.9 -2.1 0.4 -1.2 2.2 5.0 -0.5 -0.7 -2.9 -1.1 -2.7
+S 1.3 0.3 0.1 -0.5 -2.2 0.6 -0.5 -1.4 -0.4 -2.1 -1.5 1.1 1.1 -0.6 -0.5 2.0 1.5 -0.9 -3.9 -1.9
+T 1.4 -1.1 -0.7 -0.9 -2.6 -0.7 -1.1 0.3 -0.4 -1.0 0.1 0.5 0.4 -0.7 -0.7 1.5 2.5 0.4 -4.5 -3.0
+V 0.4 -1.7 -3.0 -2.7 -0.1 -2.5 -3.0 3.6 -2.7 2.0 2.5 -2.3 -1.7 -2.4 -2.9 -0.9 0.4 3.7 -4.5 -2.6
+W -5.5 0.5 -6.4 -6.3 0.5 -4.5 -2.7 -4.4 -3.7 -1.8 -2.8 -5.2 -5.8 -3.3 -1.1 -3.9 -4.5 -4.5 15.7 1.5
+Y -3.5 0.6 -3.0 -4.0 5.9 -4.8 3.7 -2.2 -3.6 -0.7 -1.8 -1.2 -3.5 -1.9 -2.7 -1.9 -3.0 -2.6 1.5 9.0
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BENNER6 b/code/lib/Bio/Align/substitution_matrices/data/BENNER6
new file mode 100644
index 0000000..4849b30
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BENNER6
@@ -0,0 +1,27 @@
+# S.A. Benner, M.A. Cohen, G.H. Gonnet:
+# "Amino acid substitution during functionally constrained divergent evolution
+# of protein sequences".
+# Protein Engineering 7(11): 1323-1332 (1994).
+# Figure 3A.
+# PMID 7700864
+ A C D E F G H I K L M N P Q R S T V W Y
+A 2.5 -1.7 -0.6 -0.7 -3.2 0.8 -2.1 0.1 -1.9 -1.3 -0.2 0.0 1.1 -1.7 -1.7 1.4 1.7 0.7 -4.3 -4.0
+C -1.7 12.1 -3.7 -4.7 -0.1 -1.3 -1.2 -3.6 -2.8 -3.8 -3.7 -1.6 -2.7 -3.2 -0.4 0.9 -1.5 -3.1 1.6 2.6
+D -0.6 -3.7 5.2 4.4 -5.7 0.8 0.1 -4.2 -0.2 -5.3 -4.3 2.5 -2.8 0.6 -1.5 -0.4 -1.2 -3.3 -6.3 -2.3
+E -0.7 -4.7 4.4 5.2 -6.7 0.5 -0.2 -4.1 0.9 -5.0 -4.1 1.1 -2.6 2.1 -0.4 -1.2 -1.6 -3.0 -5.6 -4.1
+F -3.2 -0.1 -5.7 -6.7 8.3 -5.7 0.1 0.0 -6.3 2.4 -0.1 -3.5 -3.2 -4.4 -4.9 -1.8 -2.4 -0.5 -1.6 5.6
+G 0.8 -1.3 0.8 0.5 -5.7 5.8 -2.1 -3.4 -1.4 -4.6 -3.7 -0.1 -1.7 -1.6 -0.1 0.8 -0.5 -2.3 -1.7 -4.9
+H -2.1 -1.2 0.1 -0.2 0.1 -2.1 6.1 -3.7 0.9 -2.2 -3.4 1.4 -0.4 3.2 1.8 -0.9 -1.7 -3.8 -2.8 4.4
+I 0.1 -3.6 -4.2 -4.1 0.0 -3.4 -3.7 4.4 -3.8 2.4 4.0 -2.5 -2.0 -3.8 -3.8 -1.2 0.7 3.9 -5.0 -3.3
+K -1.9 -2.8 -0.2 0.9 -6.3 -1.4 0.9 -3.8 5.6 -4.1 -2.9 1.0 -2.3 2.5 4.3 -1.2 -1.1 -3.8 -1.4 -4.0
+L -1.3 -3.8 -5.3 -5.0 2.4 -4.6 -2.2 2.4 -4.1 4.8 -2.9 -3.4 -0.2 -2.4 -3.2 -1.5 -0.4 1.9 -3.0 -1.6
+M -0.2 -3.7 -4.3 -4.1 -0.1 -3.7 -3.4 4.0 -2.9 -2.9 4.8 -2.5 -1.8 -3.1 -3.0 -1.3 0.6 3.3 -4.4 -3.6
+N 0.0 -1.6 2.5 1.1 -3.5 -0.1 1.4 -2.5 1.0 -3.4 -2.5 3.6 -1.1 0.1 -0.1 1.2 0.5 -2.4 -4.4 -0.9
+P 1.1 -2.7 -2.8 -2.6 -3.2 -1.7 -0.4 -2.0 -2.3 -0.2 -1.8 -1.1 6.5 0.1 -1.3 1.4 0.6 -1.6 -4.8 -3.8
+Q -1.7 -3.2 0.6 2.1 -4.4 -1.6 3.2 -3.8 2.5 -2.4 -3.1 0.1 0.1 5.3 2.5 -1.4 -1.7 -3.5 -2.6 -1.4
+R -1.7 -0.4 -1.5 -0.4 -4.9 -0.1 1.8 -3.8 4.3 -3.2 -3.0 -0.1 -1.3 2.5 5.1 -0.9 -1.3 -3.7 2.0 -2.6
+S 1.4 0.9 -0.4 -1.2 -1.8 0.8 -0.9 -1.2 -1.2 -1.5 -1.3 1.2 1.4 -1.4 -0.9 2.1 1.5 -0.9 -2.9 -1.8
+T 1.7 -1.5 -1.2 -1.6 -2.4 -0.5 -1.7 0.7 -1.1 -0.4 0.6 0.5 0.6 -1.7 -1.3 1.5 2.4 0.6 -2.6 -3.4
+V 0.7 -3.1 -3.3 -3.0 -0.5 -2.3 -3.8 3.9 -3.8 1.9 3.3 -2.4 -1.6 -3.5 -3.7 -0.9 0.6 4.0 -4.8 -3.8
+W -4.3 1.6 -6.3 -5.6 -1.6 -1.7 -2.8 -5.0 -1.4 -3.0 -4.4 -4.4 -4.8 -2.6 2.0 -2.9 -2.6 -4.8 14.7 -0.3
+Y -4.0 2.6 -2.3 -4.1 5.6 -4.9 4.4 -3.3 -4.0 -1.6 -3.6 -0.9 -3.8 -1.4 -2.6 -1.8 -3.4 -3.8 -0.3 9.5
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BENNER74 b/code/lib/Bio/Align/substitution_matrices/data/BENNER74
new file mode 100644
index 0000000..62000b1
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BENNER74
@@ -0,0 +1,27 @@
+# S.A. Benner, M.A. Cohen, G.H. Gonnet:
+# "Amino acid substitution during functionally constrained divergent evolution
+# of protein sequences".
+# Protein Engineering 7(11): 1323-1332 (1994).
+# Figure 3C.
+# PMID 7700864
+ A C D E F G H I K L M N P Q R S T V W Y
+A 2.4 0.3 -0.3 -0.1 -2.6 0.6 -1.0 -0.8 -0.4 -1.4 -0.8 -0.2 0.4 -0.3 -0.8 1.1 0.7 0.1 -4.1 -2.6
+C 0.3 11.8 -3.2 -3.2 -0.7 -2.0 -1.3 -1.2 -2.9 -1.6 -1.2 -1.8 -3.1 -2.6 -2.2 0.1 -0.6 -0.2 -0.9 -0.4
+D -0.3 -3.2 4.8 2.9 -4.7 0.2 0.4 -3.9 0.4 -4.2 -3.2 2.2 -1.0 0.8 -0.5 0.4 -0.2 -2.9 -5.5 -2.8
+E -0.1 -3.2 2.9 3.7 -4.3 -0.5 0.2 -2.9 1.2 -3.1 -2.2 1.0 -0.7 1.7 0.3 0.1 -0.2 -2.1 -4.7 -3.0
+F -2.6 -0.7 -4.7 -4.3 7.2 -5.4 0.0 0.9 -3.6 2.1 1.3 -3.2 -3.8 -2.8 -3.5 -2.6 -2.2 0.1 3.0 5.3
+G 0.6 -2.0 0.2 -0.5 -5.4 6.6 -1.6 -4.3 -1.1 -4.6 -3.5 0.4 -1.7 -1.1 -1.0 0.4 -1.0 -3.1 -4.1 -4.3
+H -1.0 -1.3 0.4 0.2 0.0 -1.6 6.1 -2.3 0.6 -1.9 -1.5 1.2 -1.0 1.4 1.0 -0.3 -0.5 -2.1 -1.0 2.5
+I -0.8 -1.2 -3.9 -2.9 0.9 -4.3 -2.3 4.0 -2.3 2.8 2.6 -2.8 -2.6 -2.0 -2.6 -1.8 -0.3 3.2 -2.3 -1.0
+K -0.4 -2.9 0.4 1.2 -3.6 -1.1 0.6 -2.3 3.4 -2.4 -1.5 0.9 -0.8 1.7 2.9 0.0 0.1 -1.9 -3.6 -2.4
+L -1.4 -1.6 -4.2 -3.1 2.1 -4.6 -1.9 2.8 -2.4 4.2 2.9 -3.1 -2.2 -1.7 -2.4 -2.2 -1.1 1.9 -0.9 -0.1
+M -0.8 -1.2 -3.2 -2.2 1.3 -3.5 -1.5 2.6 -1.5 2.9 4.5 -2.2 -2.4 -1.0 -1.8 -1.4 -0.4 1.8 -1.3 -0.5
+N -0.2 -1.8 2.2 1.0 -3.2 0.4 1.2 -2.8 0.9 -3.1 -2.2 3.6 -1.0 0.7 0.3 0.9 0.4 -2.2 -4.0 -1.4
+P 0.4 -3.1 -1.0 -0.7 -3.8 -1.7 -1.0 -2.6 -0.8 -2.2 -2.4 -1.0 7.5 -0.2 -0.1 0.5 0.1 -1.9 -5.2 -3.4
+Q -0.3 -2.6 0.8 1.7 -2.8 -1.1 1.4 -2.0 1.7 -1.7 -1.0 0.7 -0.2 3.0 1.6 0.1 -0.1 -1.7 -2.8 -1.8
+R -0.8 -2.2 -0.5 0.3 -3.5 -1.0 1.0 -2.6 2.9 -2.4 -1.8 0.3 -0.1 1.6 4.8 -0.2 -0.3 -2.2 -1.6 -2.0
+S 1.1 0.1 0.4 0.1 -2.6 0.4 -0.3 -1.8 0.0 -2.2 -1.4 0.9 0.5 0.1 -0.2 2.1 1.4 -1.0 -3.4 -1.9
+T 0.7 -0.6 -0.2 -0.2 -2.2 -1.0 -0.5 -0.3 0.1 -1.1 -0.4 0.4 0.1 -0.1 -0.3 1.4 2.5 0.2 -3.7 -2.1
+V 0.1 -0.2 -2.9 -2.1 0.1 -3.1 -2.1 3.2 -1.9 1.9 1.8 -2.2 -1.9 -1.7 -2.2 -1.0 0.2 3.4 -2.9 -1.4
+W -4.1 -0.9 -5.5 -4.7 3.0 -4.1 -1.0 -2.3 -3.6 -0.9 -1.3 -4.0 -5.2 -2.8 -1.6 -3.4 -3.7 -2.9 14.7 3.6
+Y -2.6 -0.4 -2.8 -3.0 5.3 -4.3 2.5 -1.0 -2.4 -0.1 -0.5 -1.4 -3.4 -1.8 -2.0 -1.9 -2.1 -1.4 3.6 8.1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM45 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM45
new file mode 100644
index 0000000..18c3323
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM45
@@ -0,0 +1,31 @@
+# Matrix made by matblas from blosum45.iij
+# * column uses minimum score
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 45
+# Entropy = 0.3795, Expected = -0.2789
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 -1 0 -5
+R -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -1 0 -1 -5
+N -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 4 0 -1 -5
+D -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 5 1 -1 -5
+C -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -2 -3 -2 -5
+Q -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 4 -1 -5
+E -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 1 4 -1 -5
+G 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -5
+H -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 -1 -5
+I -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 -3 -1 -5
+L -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -3 -2 -1 -5
+K -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 1 -1 -5
+M -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 -1 -1 -5
+F -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -3 -3 -1 -5
+P -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -2 -1 -1 -5
+S 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0 -5
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 -1 0 -5
+W -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -4 -2 -2 -5
+Y -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -2 -2 -1 -5
+V 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -3 -3 -1 -5
+B -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 4 2 -1 -5
+Z -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 2 4 -1 -5
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 -1 -1 -1 -5
+* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM50 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM50
new file mode 100644
index 0000000..3f62e3c
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM50
@@ -0,0 +1,31 @@
+# Matrix made by matblas from blosum50.iij
+# * column uses minimum score
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 50
+# Entropy = 0.4808, Expected = -0.3573
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 -2 -1 -1 -5
+R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 -1 0 -1 -5
+N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 4 0 -1 -5
+D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 5 1 -1 -5
+C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -3 -2 -5
+Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 0 4 -1 -5
+E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 1 5 -1 -5
+G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 -1 -2 -2 -5
+H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 0 0 -1 -5
+I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 -4 -3 -1 -5
+L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 -4 -3 -1 -5
+K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 0 1 -1 -5
+M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 -3 -1 -1 -5
+F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 -4 -4 -2 -5
+P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -1 -2 -5
+S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 0 0 -1 -5
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 0 -1 0 -5
+W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -3 15 2 -3 -5 -2 -3 -5
+Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 -3 -2 -1 -5
+V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -4 -3 -1 -5
+B -2 -1 4 5 -3 0 1 -1 0 -4 -4 0 -3 -4 -2 0 0 -5 -3 -4 5 2 -1 -5
+Z -1 0 0 1 -3 4 5 -2 0 -3 -3 1 -1 -4 -1 0 -1 -2 -2 -3 2 5 -1 -5
+X -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -1 -1 -1 -2 -2 -1 0 -3 -1 -1 -1 -1 -1 -5
+* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM62 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM62
new file mode 100644
index 0000000..205f139
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM62
@@ -0,0 +1,31 @@
+# Matrix made by matblas from blosum62.iij
+# * column uses minimum score
+# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 62
+# Entropy = 0.6979, Expected = -0.5209
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
+R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
+N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
+D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
+C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
+Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
+E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
+G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
+H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
+I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
+L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
+K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
+M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
+F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
+S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
+Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
+V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
+B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
+Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM80 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM80
new file mode 100644
index 0000000..78172a3
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM80
@@ -0,0 +1,31 @@
+# Matrix made by matblas from blosum80_3.iij
+# * column uses minimum score
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 80
+# Entropy = 0.9868, Expected = -0.7442
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -8
+R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -8
+N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -8
+D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -8
+C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -8
+Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -8
+E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -8
+G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -8
+H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -8
+I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -8
+L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -8
+K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -8
+M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -8
+F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -8
+P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -8
+S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -8
+T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -8
+W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -8
+Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -8
+V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -8
+B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -8
+Z -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -8
+X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -8
+* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/BLOSUM90 b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM90
new file mode 100644
index 0000000..71441b5
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/BLOSUM90
@@ -0,0 +1,31 @@
+# Matrix made by matblas from blosum90.iij
+# * column uses minimum score
+# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 90
+# Entropy = 1.1806, Expected = -0.8887
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 5 -2 -2 -3 -1 -1 -1 0 -2 -2 -2 -1 -2 -3 -1 1 0 -4 -3 -1 -2 -1 -1 -6
+R -2 6 -1 -3 -5 1 -1 -3 0 -4 -3 2 -2 -4 -3 -1 -2 -4 -3 -3 -2 0 -2 -6
+N -2 -1 7 1 -4 0 -1 -1 0 -4 -4 0 -3 -4 -3 0 0 -5 -3 -4 4 -1 -2 -6
+D -3 -3 1 7 -5 -1 1 -2 -2 -5 -5 -1 -4 -5 -3 -1 -2 -6 -4 -5 4 0 -2 -6
+C -1 -5 -4 -5 9 -4 -6 -4 -5 -2 -2 -4 -2 -3 -4 -2 -2 -4 -4 -2 -4 -5 -3 -6
+Q -1 1 0 -1 -4 7 2 -3 1 -4 -3 1 0 -4 -2 -1 -1 -3 -3 -3 -1 4 -1 -6
+E -1 -1 -1 1 -6 2 6 -3 -1 -4 -4 0 -3 -5 -2 -1 -1 -5 -4 -3 0 4 -2 -6
+G 0 -3 -1 -2 -4 -3 -3 6 -3 -5 -5 -2 -4 -5 -3 -1 -3 -4 -5 -5 -2 -3 -2 -6
+H -2 0 0 -2 -5 1 -1 -3 8 -4 -4 -1 -3 -2 -3 -2 -2 -3 1 -4 -1 0 -2 -6
+I -2 -4 -4 -5 -2 -4 -4 -5 -4 5 1 -4 1 -1 -4 -3 -1 -4 -2 3 -5 -4 -2 -6
+L -2 -3 -4 -5 -2 -3 -4 -5 -4 1 5 -3 2 0 -4 -3 -2 -3 -2 0 -5 -4 -2 -6
+K -1 2 0 -1 -4 1 0 -2 -1 -4 -3 6 -2 -4 -2 -1 -1 -5 -3 -3 -1 1 -1 -6
+M -2 -2 -3 -4 -2 0 -3 -4 -3 1 2 -2 7 -1 -3 -2 -1 -2 -2 0 -4 -2 -1 -6
+F -3 -4 -4 -5 -3 -4 -5 -5 -2 -1 0 -4 -1 7 -4 -3 -3 0 3 -2 -4 -4 -2 -6
+P -1 -3 -3 -3 -4 -2 -2 -3 -3 -4 -4 -2 -3 -4 8 -2 -2 -5 -4 -3 -3 -2 -2 -6
+S 1 -1 0 -1 -2 -1 -1 -1 -2 -3 -3 -1 -2 -3 -2 5 1 -4 -3 -2 0 -1 -1 -6
+T 0 -2 0 -2 -2 -1 -1 -3 -2 -1 -2 -1 -1 -3 -2 1 6 -4 -2 -1 -1 -1 -1 -6
+W -4 -4 -5 -6 -4 -3 -5 -4 -3 -4 -3 -5 -2 0 -5 -4 -4 11 2 -3 -6 -4 -3 -6
+Y -3 -3 -3 -4 -4 -3 -4 -5 1 -2 -2 -3 -2 3 -4 -3 -2 2 8 -3 -4 -3 -2 -6
+V -1 -3 -4 -5 -2 -3 -3 -5 -4 3 0 -3 0 -2 -3 -2 -1 -3 -3 5 -4 -3 -2 -6
+B -2 -2 4 4 -4 -1 0 -2 -1 -5 -5 -1 -4 -4 -3 0 -1 -6 -4 -4 4 0 -2 -6
+Z -1 0 -1 0 -5 4 4 -3 0 -4 -4 1 -2 -4 -2 -1 -1 -4 -3 -3 0 4 -1 -6
+X -1 -2 -2 -2 -3 -1 -2 -2 -2 -2 -2 -1 -1 -2 -2 -1 -1 -3 -2 -2 -2 -1 -2 -6
+* -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 -6 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/DAYHOFF b/code/lib/Bio/Align/substitution_matrices/data/DAYHOFF
new file mode 100644
index 0000000..e8aecac
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/DAYHOFF
@@ -0,0 +1,27 @@
+# M.O. Dayhoff, R.M. Schwartz, and B.C. Orcutt:
+# "A Model of Evolutionary Change in Proteins."
+# Margaret O. Dayhoff: Atlas of Protein Sequence and Structure,
+# Volume 5, Supplement 3, 1978, pages 345-352.
+# The National Biomedical Research Foundation, 1979.
+# Figure 84, page 352.
+ A C D E F G H I K L M N P Q R S T V W Y
+A 0.2 -0.2 0.0 0.0 -0.4 0.1 -0.1 -0.1 -0.1 -0.2 -0.1 0.0 0.1 0.0 -0.2 0.1 0.1 0.0 -0.6 -0.3
+C -0.2 1.2 -0.5 -0.5 -0.4 -0.3 -0.3 -0.2 -0.5 -0.6 -0.5 -0.4 -0.3 -0.5 -0.4 0.0 -0.2 -0.2 -0.8 0.0
+D 0.0 -0.5 0.4 0.3 -0.6 0.1 0.1 -0.2 0.0 -0.4 -0.3 0.2 -0.1 0.2 -0.1 0.0 0.0 -0.2 -0.7 -0.4
+E 0.0 -0.5 0.3 0.4 -0.5 0.0 0.1 -0.2 0.0 -0.3 -0.2 0.1 -0.1 0.2 -0.1 0.0 0.0 -0.2 -0.7 -0.4
+F -0.4 -0.4 -0.6 -0.5 0.9 -0.5 -0.2 0.1 -0.5 0.2 0.0 -0.4 -0.5 -0.5 -0.4 -0.3 -0.3 -0.1 0.0 0.7
+G 0.1 -0.3 0.1 0.0 -0.5 0.5 -0.2 -0.3 -0.2 -0.4 -0.3 0.0 -0.1 -0.1 -0.3 0.1 0.0 -0.1 -0.7 -0.5
+H -0.1 -0.3 0.1 0.1 -0.2 -0.2 0.6 -0.2 0.0 -0.2 -0.2 0.2 0.0 0.3 0.2 -0.1 -0.1 -0.2 -0.3 0.0
+I -0.1 -0.2 -0.2 -0.2 0.1 -0.3 -0.2 0.5 -0.2 0.2 0.2 -0.2 -0.2 -0.2 -0.2 -0.1 0.0 0.4 -0.5 -0.1
+K -0.1 -0.5 0.0 0.0 -0.5 -0.2 0.0 -0.2 0.5 -0.3 0.0 0.1 -0.1 0.1 0.3 0.0 0.0 -0.2 -0.3 -0.4
+L -0.2 -0.6 -0.4 -0.3 0.2 -0.4 -0.2 0.2 -0.3 0.6 0.4 -0.3 -0.3 -0.2 -0.3 -0.3 -0.2 0.2 -0.2 -0.1
+M -0.1 -0.5 -0.3 -0.2 0.0 -0.3 -0.2 0.2 0.0 0.4 0.6 -0.2 -0.2 -0.1 0.0 -0.2 -0.1 0.2 -0.4 -0.2
+N 0.0 -0.4 0.2 0.1 -0.4 0.0 0.2 -0.2 0.1 -0.3 -0.2 0.2 -0.1 0.1 0.0 0.1 0.0 -0.2 -0.4 -0.2
+P 0.1 -0.3 -0.1 -0.1 -0.5 -0.1 0.0 -0.2 -0.1 -0.3 -0.2 -0.1 0.6 0.0 0.0 0.1 0.0 -0.1 -0.6 -0.5
+Q 0.0 -0.5 0.2 0.2 -0.5 -0.1 0.3 -0.2 0.1 -0.2 -0.1 0.1 0.0 0.4 0.1 -0.1 -0.1 -0.2 -0.5 -0.4
+R -0.2 -0.4 -0.1 -0.1 -0.4 -0.3 0.2 -0.2 0.3 -0.3 0.0 0.0 0.0 0.1 0.6 0.0 -0.1 -0.2 0.2 -0.4
+S 0.1 0.0 0.0 0.0 -0.3 0.1 -0.1 -0.1 0.0 -0.3 -0.2 0.1 0.1 -0.1 0.0 0.2 0.1 -0.1 -0.2 -0.3
+T 0.1 -0.2 0.0 0.0 -0.3 0.0 -0.1 0.0 0.0 -0.2 -0.1 0.0 0.0 -0.1 -0.1 0.1 0.3 0.0 -0.5 -0.3
+V 0.0 -0.2 -0.2 -0.2 -0.1 -0.1 -0.2 0.4 -0.2 0.2 0.2 -0.2 -0.1 -0.2 -0.2 -0.1 0.0 0.4 -0.6 -0.2
+W -0.6 -0.8 -0.7 -0.7 0.0 -0.7 -0.3 -0.5 -0.3 -0.2 -0.4 -0.4 -0.6 -0.5 0.2 -0.2 -0.5 -0.6 1.7 0.0
+Y -0.3 0.0 -0.4 -0.4 0.7 -0.5 0.0 -0.1 -0.4 -0.1 -0.2 -0.2 -0.5 -0.4 -0.4 -0.3 -0.3 -0.2 0.0 1.0
diff --git a/code/lib/Bio/Align/substitution_matrices/data/FENG b/code/lib/Bio/Align/substitution_matrices/data/FENG
new file mode 100644
index 0000000..ebd5c2d
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/FENG
@@ -0,0 +1,26 @@
+# D.F. Feng, M.S. Johnson, R.F. Doolittle:
+# "Aligning amino acid sequences: Comparison of commonly used methods."
+# Journal of Molecular Evolution 21(2): 112-125 (1985).
+# Table 1, upper triangle.
+# PMID 6100188
+ A C D E F G H I K L M N P Q R S T V W Y
+A 6 2 4 4 2 5 2 2 3 2 2 3 5 3 2 5 5 5 2 2
+C 2 6 1 0 3 3 2 2 0 2 2 2 2 1 2 4 2 2 3 3
+D 4 1 6 5 1 4 3 1 3 1 0 5 2 4 2 3 2 3 0 2
+E 4 0 5 6 0 4 2 1 4 1 1 3 3 4 2 3 3 4 1 1
+F 2 3 1 0 6 1 2 4 0 4 2 1 2 1 1 3 1 4 3 5
+G 5 3 4 4 1 6 1 2 2 2 1 3 3 2 3 5 2 4 3 2
+H 2 2 3 2 2 1 6 1 3 3 1 4 3 4 4 3 2 1 1 3
+I 2 2 1 1 4 2 1 6 2 5 4 2 2 1 2 2 3 5 2 3
+K 3 0 3 4 0 2 3 2 6 2 2 4 2 4 5 3 4 3 1 1
+L 2 2 1 1 4 2 3 5 2 6 5 1 3 2 2 2 2 5 4 3
+M 2 2 0 1 2 1 1 4 2 5 6 1 2 2 2 1 3 4 3 2
+N 3 2 5 3 1 3 4 2 4 1 1 6 2 3 2 5 4 2 0 3
+P 5 2 2 3 2 3 3 2 2 3 2 2 6 3 3 4 4 3 2 2
+Q 3 1 4 4 1 2 4 1 4 2 2 3 3 6 3 3 3 2 1 2
+R 2 2 2 2 1 3 4 2 5 2 2 2 3 3 6 3 3 2 2 1
+S 5 4 3 3 3 5 3 2 3 2 1 5 4 3 3 6 5 2 2 3
+T 5 2 2 3 1 2 2 3 4 2 3 4 4 3 3 5 6 3 1 2
+V 5 2 3 4 4 4 1 5 3 5 4 2 3 2 2 2 3 6 3 3
+W 2 3 0 1 3 3 1 2 1 4 3 0 2 1 2 2 1 3 6 3
+Y 2 3 2 1 5 2 3 3 1 3 2 3 2 2 1 3 2 3 3 6
diff --git a/code/lib/Bio/Align/substitution_matrices/data/GENETIC b/code/lib/Bio/Align/substitution_matrices/data/GENETIC
new file mode 100644
index 0000000..79fc69b
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/GENETIC
@@ -0,0 +1,27 @@
+# S.A. Benner, M.A. Cohen, G.H. Gonnet:
+# "Amino acid substitution during functionally constrained divergent evolution
+# of protein sequences."
+# Protein Engineering 7(11): 1323-1332 (1994).
+# Figure 5.
+# PMID 7700864
+ A C D E F G H I K L M N P Q R S T V W Y
+A 4.0 -1.9 1.0 1.3 -2.4 1.2 -2.1 -1.8 -1.9 -2.3 -2.0 -1.7 0.8 -2.1 -1.6 0.1 0.9 1.0 -2.2 -2.4
+C -1.9 5.5 -1.6 -3.0 1.8 1.0 -1.6 -1.9 -3.2 -1.3 -2.7 -1.5 -1.9 -3.1 0.7 1.5 -1.9 -2.2 4.1 2.6
+D 1.0 -1.6 4.8 3.8 -1.7 1.1 1.7 -2.1 0.3 -2.4 -2.5 1.7 -2.2 0.3 -2.3 -2.1 -2.1 1.0 -2.9 2.3
+E 1.3 -3.0 3.8 5.7 -2.9 1.4 0.3 -2.3 2.0 -2.5 -1.8 0.3 -2.1 2.0 -2.0 -2.8 -2.1 1.3 -3.2 -0.9
+F -2.4 1.8 -1.7 -2.9 4.5 -1.9 -1.1 1.3 -2.8 2.2 0.5 -1.3 -1.8 -2.1 -1.5 0.0 -2.1 1.0 0.0 2.0
+G 1.2 1.0 1.1 1.4 -1.9 4.2 -2.2 -2.5 -2.2 -2.2 -2.3 -2.6 -1.8 -2.1 0.8 -0.6 -2.1 1.1 1.4 -1.8
+H -2.1 -1.6 1.7 0.3 -1.1 -2.2 4.7 -1.8 0.6 -0.1 -1.8 1.8 0.7 3.6 3.6 -1.6 -1.8 -2.1 -2.1 2.3
+I -1.8 -1.9 -2.1 -2.3 1.3 -2.5 -1.8 4.1 0.7 1.2 3.3 0.9 -1.6 -1.9 -1.2 -0.5 0.8 1.0 -2.2 -1.6
+K -1.9 -3.2 0.3 2.0 -2.8 -2.2 0.6 0.7 5.6 -2.0 1.6 3.5 -1.5 2.2 -0.2 -1.5 1.0 -2.1 -3.0 -0.8
+L -2.3 -1.3 -2.4 -2.5 2.2 -2.2 -0.1 1.2 -2.0 3.4 1.5 -2.2 0.0 0.1 -0.4 -1.2 -1.9 1.1 -0.3 -1.6
+M -2.0 -2.7 -2.5 -1.8 0.5 -2.3 -1.8 3.3 1.6 1.5 5.4 0.1 -1.4 -1.2 -0.4 -1.3 0.7 1.0 -2.0 -2.9
+N -1.7 -1.5 1.7 0.3 -1.3 -2.6 1.8 0.9 3.5 -2.2 0.1 4.7 -1.6 0.4 -1.5 -0.3 0.9 -2.2 -3.0 2.5
+P 0.8 -1.9 -2.2 -2.1 -1.8 -1.8 0.7 -1.6 -1.5 0.0 -1.4 -1.6 3.8 1.0 0.3 0.4 1.1 -2.1 -1.6 -2.3
+Q -2.1 -3.1 0.3 2.0 -2.1 -2.1 3.6 -1.9 2.2 0.1 -1.2 0.4 1.0 5.5 0.3 -2.3 -1.7 -2.0 -2.3 -0.8
+R -1.6 0.7 -2.3 -2.0 -1.5 0.8 3.6 -1.2 -0.2 -0.4 -0.4 -1.5 0.3 0.3 2.9 0.3 -0.6 -2.1 1.8 -1.9
+S 0.1 1.5 -2.1 -2.8 0.0 -0.6 -1.6 -0.5 -1.5 -1.2 -1.3 -0.3 0.4 -2.3 0.3 2.6 1.0 -2.2 0.8 0.3
+T 0.9 -1.9 -2.1 -2.1 -2.1 -2.1 -1.8 0.8 1.0 -1.9 0.7 0.9 1.1 -1.7 -0.6 1.0 4.0 -2.2 -2.2 -2.1
+V 1.0 -2.2 1.0 1.3 1.0 1.1 -2.1 1.0 -2.1 1.1 1.0 -2.2 -2.1 -2.0 -2.1 -2.2 -2.2 4.1 -2.1 -2.2
+W -2.2 4.1 -2.9 -3.2 0.0 1.4 -2.1 -2.2 -3.0 -0.3 -2.0 -3.0 -1.6 -2.3 1.8 0.8 -2.2 -2.1 7.5 -0.5
+Y -2.4 2.6 2.3 -0.9 2.0 -1.8 2.3 -1.6 -0.8 -1.6 -2.9 2.5 -2.3 -0.8 -1.9 0.3 -2.1 -2.2 -0.5 6.5
diff --git a/code/lib/Bio/Align/substitution_matrices/data/GONNET1992 b/code/lib/Bio/Align/substitution_matrices/data/GONNET1992
new file mode 100644
index 0000000..ac4e821
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/GONNET1992
@@ -0,0 +1,26 @@
+# Gaston H. Gonnet, Mark A. Cohen, Steven A. Benner:
+# "Exhaustive matching of the entire protein sequence database."
+# Science 256(5062): 1443-1445 (1992).
+# Figure 2.
+# PMID 1604319
+ A C D E F G H I K L M N P Q R S T V W Y
+A 2.4 0.5 -0.3 0.0 -2.3 0.5 -0.8 -0.8 -0.4 -1.2 -0.7 -0.3 0.3 -0.2 -0.6 1.1 0.6 0.1 -3.6 -2.2
+C 0.5 11.5 -3.2 -3.0 -0.8 -2.0 -1.3 -1.1 -2.8 -1.5 -0.9 -1.8 -3.1 -2.4 -2.2 0.1 -0.5 0.0 -1.0 -0.5
+D -0.3 -3.2 4.7 2.7 -4.5 0.1 0.4 -3.8 0.5 -4.0 -3.0 2.2 -0.7 0.9 -0.3 0.5 0.0 -2.9 -5.2 -2.8
+E 0.0 -3.0 2.7 3.6 -3.9 -0.8 0.4 -2.7 1.2 -2.8 -2.0 0.9 -0.5 1.7 0.4 0.2 -0.1 -1.9 -4.3 -2.7
+F -2.3 -0.8 -4.5 -3.9 7.0 -5.2 -0.1 1.0 -3.3 2.0 1.6 -3.1 -3.8 -2.6 -3.2 -2.8 -2.2 0.1 3.6 5.1
+G 0.5 -2.0 0.1 -0.8 -5.2 6.6 -1.4 -4.5 -1.1 -4.4 -3.5 0.4 -1.6 -1.0 -1.0 0.4 -1.1 -3.3 -4.0 -4.0
+H -0.8 -1.3 0.4 0.4 -0.1 -1.4 6.0 -2.2 0.6 -1.9 -1.3 1.2 -1.1 1.2 0.6 -0.2 -0.3 -2.0 -0.8 2.2
+I -0.8 -1.1 -3.8 -2.7 1.0 -4.5 -2.2 4.0 -2.1 2.8 2.5 -2.8 -2.6 -1.9 -2.4 -1.8 -0.6 3.1 -1.8 -0.7
+K -0.4 -2.8 0.5 1.2 -3.3 -1.1 0.6 -2.1 3.2 -2.1 -1.4 0.8 -0.6 1.5 2.7 0.1 0.1 -1.7 -3.5 -2.1
+L -1.2 -1.5 -4.0 -2.8 2.0 -4.4 -1.9 2.8 -2.1 4.0 2.8 -3.0 -2.3 -1.6 -2.2 -2.1 -1.3 1.8 -0.7 0.0
+M -0.7 -0.9 -3.0 -2.0 1.6 -3.5 -1.3 2.5 -1.4 2.8 4.3 -2.2 -2.4 -1.0 -1.7 -1.4 -0.6 1.6 -1.0 -0.2
+N -0.3 -1.8 2.2 0.9 -3.1 0.4 1.2 -2.8 0.8 -3.0 -2.2 3.8 -0.9 0.7 0.3 0.9 0.5 -2.2 -3.6 -1.4
+P 0.3 -3.1 -0.7 -0.5 -3.8 -1.6 -1.1 -2.6 -0.6 -2.3 -2.4 -0.9 7.6 -0.2 -0.9 0.4 0.1 -1.8 -5.0 -3.1
+Q -0.2 -2.4 0.9 1.7 -2.6 -1.0 1.2 -1.9 1.5 -1.6 -1.0 0.7 -0.2 2.7 1.5 0.2 0.0 -1.5 -2.7 -1.7
+R -0.6 -2.2 -0.3 0.4 -3.2 -1.0 0.6 -2.4 2.7 -2.2 -1.7 0.3 -0.9 1.5 4.7 -0.2 -0.2 -2.0 -1.6 -1.8
+S 1.1 0.1 0.5 0.2 -2.8 0.4 -0.2 -1.8 0.1 -2.1 -1.4 0.9 0.4 0.2 -0.2 2.2 1.5 -1.0 -3.3 -1.9
+T 0.6 -0.5 0.0 -0.1 -2.2 -1.1 -0.3 -0.6 0.1 -1.3 -0.6 0.5 0.1 0.0 -0.2 1.5 2.5 0.0 -3.5 -1.9
+V 0.1 0.0 -2.9 -1.9 0.1 -3.3 -2.0 3.1 -1.7 1.8 1.6 -2.2 -1.8 -1.5 -2.0 -1.0 0.0 3.4 -2.6 -1.1
+W -3.6 -1.0 -5.2 -4.3 3.6 -4.0 -0.8 -1.8 -3.5 -0.7 -1.0 -3.6 -5.0 -2.7 -1.6 -3.3 -3.5 -2.6 14.2 4.1
+Y -2.2 -0.5 -2.8 -2.7 5.1 -4.0 2.2 -0.7 -2.1 0.0 -0.2 -1.4 -3.1 -1.7 -1.8 -1.9 -1.9 -1.1 4.1 7.8
diff --git a/code/lib/Bio/Align/substitution_matrices/data/HOXD70 b/code/lib/Bio/Align/substitution_matrices/data/HOXD70
new file mode 100644
index 0000000..4cbd0f6
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/HOXD70
@@ -0,0 +1,9 @@
+# F. Chiaromonte, V.B. Yap, W. Miller:
+# "Scoring pairwise genomic sequence alignments"
+# Pacific Symposium on Biocomputing 2002: 115-26 (2002).
+# PMID 11928468
+ A C G T
+A 91 -114 -31 -123
+C -114 100 -125 -31
+G -31 -125 100 -114
+T -123 -31 -114 91
diff --git a/code/lib/Bio/Align/substitution_matrices/data/JOHNSON b/code/lib/Bio/Align/substitution_matrices/data/JOHNSON
new file mode 100644
index 0000000..7d30964
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/JOHNSON
@@ -0,0 +1,27 @@
+# Mark S. Johnson and John P. Overington:
+# "A structural basis for sequence comparisons. An evaluation of scoring
+# methodologies."
+# Journal of Molecular Biology 233(4): 716-738 (1993).
+# Table 3, upper triangle.
+# PMID 8411177
+ A C D E F G H I K L M N P Q R S T V W Y
+A 0.60 -0.34 -0.16 -0.07 -0.32 -0.05 -0.31 -0.22 -0.09 -0.33 -0.15 -0.14 -0.10 -0.06 -0.16 0.00 -0.08 -0.05 -0.58 -0.40
+C -0.34 1.61 -0.97 -0.69 -0.44 -0.82 -0.82 -0.77 -0.87 -0.87 -0.44 -0.76 -0.89 -0.69 -0.56 -0.77 -0.60 -0.48 -0.91 -0.77
+D -0.16 -0.97 0.85 0.24 -0.70 -0.21 -0.07 -0.48 -0.15 -0.80 -0.59 0.26 -0.10 -0.11 -0.34 -0.02 -0.18 -0.52 -0.60 -0.38
+E -0.07 -0.69 0.24 0.86 -0.64 -0.25 -0.23 -0.48 0.11 -0.56 -0.28 -0.07 -0.15 0.24 -0.02 -0.22 -0.05 -0.42 -0.76 -0.37
+F -0.32 -0.44 -0.70 -0.64 1.04 -0.86 -0.17 0.05 -0.56 0.18 -0.06 -0.38 -0.50 -0.64 -0.60 -0.48 -0.50 -0.13 0.34 0.34
+G -0.05 -0.82 -0.21 -0.25 -0.86 0.80 -0.32 -0.55 -0.35 -0.72 -0.52 -0.14 -0.25 -0.28 -0.28 -0.13 -0.38 -0.56 -0.63 -0.54
+H -0.31 -0.82 -0.07 -0.23 -0.17 -0.32 1.27 -0.51 0.01 -0.42 -0.23 0.17 -0.43 0.14 0.01 -0.26 -0.30 -0.39 -0.40 -0.04
+I -0.22 -0.77 -0.48 -0.48 0.05 -0.55 -0.51 0.81 -0.47 0.26 0.26 -0.47 -0.57 -0.70 -0.54 -0.47 -0.32 0.39 -0.33 -0.25
+K -0.09 -0.87 -0.15 0.11 -0.56 -0.35 0.01 -0.47 0.76 -0.34 -0.19 0.01 -0.06 0.11 0.32 -0.15 -0.02 -0.37 -0.54 -0.37
+L -0.33 -0.87 -0.80 -0.56 0.18 -0.72 -0.42 0.26 -0.34 0.73 0.44 -0.48 -0.28 -0.44 -0.37 -0.52 -0.46 0.18 -0.10 -0.24
+M -0.15 -0.44 -0.59 -0.28 -0.06 -0.52 -0.23 0.26 -0.19 0.44 1.12 -0.37 -0.98 -0.06 -0.42 -0.48 -0.32 0.07 -0.09 -0.13
+N -0.14 -0.76 0.26 -0.07 -0.38 -0.14 0.17 -0.47 0.01 -0.48 -0.37 0.80 -0.24 -0.08 -0.15 0.10 0.01 -0.57 -0.61 -0.13
+P -0.10 -0.89 -0.10 -0.15 -0.50 -0.25 -0.43 -0.57 -0.06 -0.28 -0.98 -0.24 1.03 -0.36 -0.36 -0.10 -0.20 -0.52 -0.74 -0.70
+Q -0.06 -0.69 -0.11 0.24 -0.64 -0.28 0.14 -0.70 0.11 -0.44 -0.06 -0.08 -0.36 0.90 0.21 -0.12 -0.04 -0.36 -0.82 -0.51
+R -0.16 -0.56 -0.34 -0.02 -0.60 -0.28 0.01 -0.54 0.32 -0.37 -0.42 -0.15 -0.36 0.21 1.00 -0.06 -0.14 -0.49 -0.38 -0.21
+S 0.00 -0.77 -0.02 -0.22 -0.48 -0.13 -0.26 -0.47 -0.15 -0.52 -0.48 0.10 -0.10 -0.12 -0.06 0.58 0.20 -0.43 -0.62 -0.34
+T -0.08 -0.60 -0.18 -0.05 -0.50 -0.38 -0.30 -0.32 -0.02 -0.46 -0.32 0.01 -0.20 -0.04 -0.14 0.20 0.68 -0.19 -0.93 -0.27
+V -0.05 -0.48 -0.52 -0.42 -0.13 -0.56 -0.39 0.39 -0.37 0.18 0.07 -0.57 -0.52 -0.36 -0.49 -0.43 -0.19 0.70 -0.49 -0.18
+W -0.58 -0.91 -0.60 -0.76 0.34 -0.63 -0.40 -0.33 -0.54 -0.10 -0.09 -0.61 -0.74 -0.82 -0.38 -0.62 -0.93 -0.49 1.52 0.23
+Y -0.40 -0.77 -0.38 -0.37 0.34 -0.54 -0.04 -0.25 -0.37 -0.24 -0.13 -0.13 -0.70 -0.51 -0.21 -0.34 -0.27 -0.18 0.23 1.05
diff --git a/code/lib/Bio/Align/substitution_matrices/data/JONES b/code/lib/Bio/Align/substitution_matrices/data/JONES
new file mode 100644
index 0000000..daed995
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/JONES
@@ -0,0 +1,26 @@
+# David T. Jones, William R. Taylor, Janet M. Thornton:
+# "The rapid generation of mutation data matrices from protein sequences."
+# Computer Applications in the Biosciences: CABIOS 8(3): 275-282 (1992).
+# Table I, lower triangle.
+# PMID 1633570
+ A R N D C Q E G H I L K M F P S T W Y V
+A 0.2 -0.1 0.0 0.0 -0.1 -0.1 -0.1 0.1 -0.2 0.0 -0.1 -0.1 -0.1 -0.3 0.1 0.1 0.2 -0.4 -0.3 0.1
+R -0.1 0.5 0.0 -0.1 -0.1 0.2 0.0 0.0 0.2 -0.3 -0.3 0.4 -0.2 -0.4 -0.1 -0.1 -0.1 0.0 -0.2 -0.3
+N 0.0 0.0 0.3 0.2 -0.1 0.0 0.1 0.0 0.1 -0.2 -0.3 0.1 -0.2 -0.3 -0.1 0.1 0.1 -0.5 -0.1 -0.2
+D 0.0 -0.1 0.2 0.5 -0.3 0.1 0.4 0.1 0.0 -0.3 -0.4 0.0 -0.3 -0.5 -0.2 0.0 -0.1 -0.5 -0.2 -0.2
+C -0.1 -0.1 -0.1 -0.3 1.1 -0.3 -0.4 -0.1 0.0 -0.2 -0.3 -0.3 -0.2 0.0 -0.2 0.1 -0.1 0.1 0.2 -0.2
+Q -0.1 0.2 0.0 0.1 -0.3 0.5 0.2 -0.1 0.2 -0.3 -0.2 0.2 -0.2 -0.4 0.0 -0.1 -0.1 -0.3 -0.2 -0.3
+E -0.1 0.0 0.1 0.4 -0.4 0.2 0.5 0.0 0.0 -0.3 -0.4 0.1 -0.3 -0.5 -0.2 -0.1 -0.1 -0.5 -0.4 -0.2
+G 0.1 0.0 0.0 0.1 -0.1 -0.1 0.0 0.5 -0.2 -0.3 -0.4 -0.1 -0.3 -0.5 -0.1 0.1 -0.1 -0.2 -0.4 -0.2
+H -0.2 0.2 0.1 0.0 0.0 0.2 0.0 -0.2 0.6 -0.3 -0.2 0.1 -0.2 0.0 0.0 -0.1 -0.1 -0.3 0.4 -0.3
+I 0.0 -0.3 -0.2 -0.3 -0.2 -0.3 -0.3 -0.3 -0.3 0.4 0.2 -0.3 0.3 0.0 -0.2 -0.1 0.1 -0.4 -0.2 0.4
+L -0.1 -0.3 -0.3 -0.4 -0.3 -0.2 -0.4 -0.4 -0.2 0.2 0.5 -0.3 0.3 0.2 0.0 -0.2 -0.1 -0.2 -0.1 0.2
+K -0.1 0.4 0.1 0.0 -0.3 0.2 0.1 -0.1 0.1 -0.3 -0.3 0.5 -0.2 -0.5 -0.2 -0.1 -0.1 -0.3 -0.3 -0.3
+M -0.1 -0.2 -0.2 -0.3 -0.2 -0.2 -0.3 -0.3 -0.2 0.3 0.3 -0.2 0.6 0.0 -0.2 -0.1 0.0 -0.3 -0.2 0.2
+F -0.3 -0.4 -0.3 -0.5 0.0 -0.4 -0.5 -0.5 0.0 0.0 0.2 -0.5 0.0 0.8 -0.3 -0.2 -0.2 -0.1 0.5 0.0
+P 0.1 -0.1 -0.1 -0.2 -0.2 0.0 -0.2 -0.1 0.0 -0.2 0.0 -0.2 -0.2 -0.3 0.6 0.1 0.1 -0.4 -0.3 -0.1
+S 0.1 -0.1 0.1 0.0 0.1 -0.1 -0.1 0.1 -0.1 -0.1 -0.2 -0.1 -0.1 -0.2 0.1 0.2 0.1 -0.3 -0.1 -0.1
+T 0.2 -0.1 0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 0.1 -0.1 -0.1 0.0 -0.2 0.1 0.1 0.2 -0.4 -0.3 0.0
+W -0.4 0.0 -0.5 -0.5 0.1 -0.3 -0.5 -0.2 -0.3 -0.4 -0.2 -0.3 -0.3 -0.1 -0.4 -0.3 -0.4 1.5 0.0 -0.3
+Y -0.3 -0.2 -0.1 -0.2 0.2 -0.2 -0.4 -0.4 0.4 -0.2 -0.1 -0.3 -0.2 0.5 -0.3 -0.1 -0.3 0.0 0.9 -0.3
+V 0.1 -0.3 -0.2 -0.2 -0.2 -0.3 -0.2 -0.2 -0.3 0.4 0.2 -0.3 0.2 0.0 -0.1 -0.1 0.0 -0.3 -0.3 0.4
diff --git a/code/lib/Bio/Align/substitution_matrices/data/LEVIN b/code/lib/Bio/Align/substitution_matrices/data/LEVIN
new file mode 100644
index 0000000..2f9c8c4
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/LEVIN
@@ -0,0 +1,27 @@
+# Jonathan M. Levin, Barry Robson, Jean Garnier:
+# "An algorithm for secondary structure determination in proteins based on
+# sequence similarity."
+# FEBS Letters 205(2): 303-308 (1986).
+# Figure 1.
+# PMID 3743779
+ A C D E F G H I K L M N P Q R S T V W Y
+A 2 0 0 1 -1 0 0 0 0 0 0 0 -1 0 0 1 0 0 -1 -1
+C 0 2 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1
+D 0 0 2 1 -1 0 0 -1 0 -1 -1 1 0 0 0 0 0 -1 -1 -1
+E 1 0 1 2 -1 0 0 -1 0 -1 -1 0 -1 1 0 0 0 -1 -1 -1
+F -1 -1 -1 -1 2 -1 -1 1 -1 0 0 -1 -1 -1 -1 -1 -1 0 0 1
+G 0 0 0 0 -1 2 0 -1 0 -1 -1 0 0 0 0 0 0 -1 -1 -1
+H 0 0 0 0 -1 0 2 -1 0 -1 -1 0 0 0 0 0 0 -1 -1 0
+I 0 0 -1 -1 1 -1 -1 2 -1 0 0 -1 -1 -1 -1 -1 0 1 0 0
+K 0 0 0 0 -1 0 0 -1 2 -1 -1 1 0 0 1 0 0 -1 -1 -1
+L 0 0 -1 -1 0 -1 -1 0 -1 2 2 -1 -1 -1 -1 -1 0 1 0 0
+M 0 0 -1 -1 0 -1 -1 0 -1 2 2 -1 -1 -1 -1 -1 0 0 0 0
+N 0 0 1 0 -1 0 0 -1 1 -1 -1 3 0 1 0 0 0 -1 -1 -1
+P -1 0 0 -1 -1 0 0 -1 0 -1 -1 0 3 0 0 0 0 -1 -1 -1
+Q 0 0 0 1 -1 0 0 -1 0 -1 -1 1 0 2 0 0 0 -1 -1 -1
+R 0 0 0 0 -1 0 0 -1 1 -1 -1 0 0 0 2 0 0 -1 0 -1
+S 1 0 0 0 -1 0 0 -1 0 -1 -1 0 0 0 0 2 0 -1 -1 -1
+T 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 2 0 -1 -1
+V 0 0 -1 -1 0 -1 -1 1 -1 1 0 -1 -1 -1 -1 -1 0 2 0 0
+W -1 -1 -1 -1 0 -1 -1 0 -1 0 0 -1 -1 -1 0 -1 -1 0 2 0
+Y -1 -1 -1 -1 1 -1 0 0 -1 0 0 -1 -1 -1 -1 -1 -1 0 0 2
diff --git a/code/lib/Bio/Align/substitution_matrices/data/MCLACHLAN b/code/lib/Bio/Align/substitution_matrices/data/MCLACHLAN
new file mode 100644
index 0000000..adf81ce
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/MCLACHLAN
@@ -0,0 +1,27 @@
+# A.D. McLachlan:
+# "Tests for comparing related amino-acid sequences. Cytochrome c and
+# cytochrome c 551."
+# Journal of Molecular Biology 61(2): 409-424 (1971).
+# Figure 1.
+# PMID 5167087
+ A C D E F G H I K L M N P Q R S T V W Y
+A 8 1 3 4 1 3 3 2 3 2 3 3 4 3 2 4 3 3 1 1
+C 1 9 1 0 0 1 3 1 0 0 3 1 0 0 1 2 2 1 2 1
+D 3 1 8 5 1 3 4 1 3 1 2 5 3 4 1 3 3 1 0 1
+E 4 0 5 8 0 3 2 1 4 1 1 4 4 5 3 4 4 2 1 2
+F 1 0 1 0 9 0 4 3 0 5 5 0 1 0 1 2 1 3 6 6
+G 3 1 3 3 0 8 2 1 3 1 1 3 3 2 3 3 2 2 1 0
+H 3 3 4 2 4 2 8 2 4 2 3 4 3 4 5 3 4 2 3 4
+I 2 1 1 1 3 1 2 8 1 5 5 1 1 0 1 2 3 5 3 3
+K 3 0 3 4 0 3 4 1 8 2 1 4 3 4 5 3 3 2 1 1
+L 2 0 1 1 5 1 2 5 2 8 6 1 1 3 2 2 3 5 3 3
+M 3 3 2 1 5 1 3 5 1 6 8 2 1 3 1 2 3 4 1 2
+N 3 1 5 4 0 3 4 1 4 1 2 8 1 4 3 5 3 1 0 2
+P 4 0 3 4 1 3 3 1 3 1 1 1 8 3 3 3 3 2 0 0
+Q 3 0 4 5 0 2 4 0 4 3 3 4 3 8 5 4 3 2 2 1
+R 2 1 1 3 1 3 5 1 5 2 1 3 3 5 8 4 3 2 3 2
+S 4 2 3 4 2 3 3 2 3 2 2 5 3 4 4 8 5 2 3 3
+T 3 2 3 4 1 2 4 3 3 3 3 3 3 3 3 5 8 3 2 1
+V 3 1 1 2 3 2 2 5 2 5 4 1 2 2 2 2 3 8 2 3
+W 1 2 0 1 6 1 3 3 1 3 1 0 0 2 3 3 2 2 9 6
+Y 1 1 1 2 6 0 4 3 1 3 2 2 0 1 2 3 1 3 6 9
diff --git a/code/lib/Bio/Align/substitution_matrices/data/MDM78 b/code/lib/Bio/Align/substitution_matrices/data/MDM78
new file mode 100644
index 0000000..5d0b2ef
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/MDM78
@@ -0,0 +1,27 @@
+# R.M. Schwartz and M.O. Dayhoff:
+# "Matrices for Detecting Distant Relationships."
+# Margaret O. Dayhoff: Atlas of Protein Sequence and Structure,
+# Volume 5, Supplement 3, 1978, pages 353-358.
+# The National Biomedical Research Foundation, 1979.
+# Figure 85, page 354.
+ A R N D C Q E G H I L K M F P S T W Y V
+A 0.18 -0.15 0.02 0.03 -0.20 -0.04 0.03 0.13 -0.14 -0.05 -0.19 -0.12 -0.11 -0.35 0.11 0.11 0.12 -0.58 -0.35 0.02
+R -0.15 0.61 0.00 -0.13 -0.36 0.13 -0.11 -0.26 0.16 -0.20 -0.30 0.34 -0.04 -0.45 -0.02 -0.03 -0.09 0.22 -0.42 -0.25
+N 0.02 0.00 0.20 0.21 -0.36 0.08 0.14 0.03 0.16 -0.18 -0.29 0.10 -0.17 -0.35 -0.05 0.07 0.04 -0.42 -0.21 -0.17
+D 0.03 -0.13 0.21 0.39 -0.51 0.16 0.34 0.06 0.07 -0.24 -0.40 0.01 -0.26 -0.56 -0.10 0.03 -0.01 -0.68 -0.43 -0.21
+C -0.20 -0.36 -0.36 -0.51 1.19 -0.54 -0.53 -0.34 -0.34 -0.23 -0.60 -0.54 -0.52 -0.43 -0.28 0.00 -0.22 -0.78 0.03 -0.19
+Q -0.04 0.13 0.08 0.16 -0.54 0.40 0.25 -0.12 0.29 -0.20 -0.18 0.07 -0.10 -0.47 0.02 -0.05 -0.08 -0.48 -0.40 -0.19
+E 0.03 -0.11 0.14 0.34 -0.53 0.25 0.38 0.02 0.07 -0.20 -0.34 -0.01 -0.21 -0.54 -0.06 0.00 -0.04 -0.70 -0.43 -0.18
+G 0.13 -0.26 0.03 0.06 -0.34 -0.12 0.02 0.48 -0.21 -0.26 -0.41 -0.17 -0.28 -0.48 -0.05 0.11 0.00 -0.70 -0.52 -0.14
+H -0.14 0.16 0.16 0.07 -0.34 0.29 0.07 -0.21 0.65 -0.24 -0.21 0.00 -0.21 -0.18 -0.02 -0.08 -0.13 -0.28 -0.01 -0.22
+I -0.05 -0.20 -0.18 -0.24 -0.23 -0.20 -0.20 -0.26 -0.24 0.45 0.24 -0.19 0.22 0.10 -0.20 -0.14 0.01 -0.51 -0.09 0.37
+L -0.19 -0.30 -0.29 -0.40 -0.60 -0.18 -0.34 -0.41 -0.21 0.24 0.59 -0.29 0.37 0.18 -0.25 -0.28 -0.17 -0.18 -0.09 0.19
+K -0.12 0.34 0.10 0.01 -0.54 0.07 -0.01 -0.17 0.00 -0.19 -0.29 0.47 0.04 -0.53 -0.11 -0.02 0.00 -0.35 -0.44 -0.24
+M -0.11 -0.04 -0.17 -0.26 -0.52 -0.10 -0.21 -0.28 -0.21 0.22 0.37 0.04 0.64 0.02 -0.21 -0.16 -0.06 -0.42 -0.24 0.18
+F -0.35 -0.45 -0.35 -0.56 -0.43 -0.47 -0.54 -0.48 -0.18 0.10 0.18 -0.53 0.02 0.91 -0.46 -0.32 -0.31 0.04 0.70 -0.12
+P 0.11 -0.02 -0.05 -0.10 -0.28 0.02 -0.06 -0.05 -0.02 -0.20 -0.25 -0.11 -0.21 -0.46 0.59 0.09 0.03 -0.56 -0.49 -0.12
+S 0.11 -0.03 0.07 0.03 0.00 -0.05 0.00 0.11 -0.08 -0.14 -0.28 -0.02 -0.16 -0.32 0.09 0.16 0.13 -0.25 -0.28 -0.10
+T 0.12 -0.09 0.04 -0.01 -0.22 -0.08 -0.04 0.00 -0.13 0.01 -0.17 0.00 -0.06 -0.31 0.03 0.13 0.26 -0.52 -0.27 0.03
+W -0.58 0.22 -0.42 -0.68 -0.78 -0.48 -0.70 -0.70 -0.28 -0.51 -0.18 -0.35 -0.42 0.04 -0.56 -0.25 -0.52 1.73 -0.02 -0.62
+Y -0.35 -0.42 -0.21 -0.43 0.03 -0.40 -0.43 -0.52 -0.01 -0.09 -0.09 -0.44 -0.24 0.70 -0.49 -0.28 -0.27 -0.02 1.01 -0.25
+V 0.02 -0.25 -0.17 -0.21 -0.19 -0.19 -0.18 -0.14 -0.22 0.37 0.19 -0.24 0.18 -0.12 -0.12 -0.10 0.03 -0.62 -0.25 0.43
diff --git a/code/lib/Bio/Align/substitution_matrices/data/NUC.4.4 b/code/lib/Bio/Align/substitution_matrices/data/NUC.4.4
new file mode 100644
index 0000000..6fb12d2
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/NUC.4.4
@@ -0,0 +1,25 @@
+#
+# This matrix was created by Todd Lowe 12/10/92
+#
+# Uses ambiguous nucleotide codes, probabilities rounded to
+# nearest integer
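+# (IUPAC ambiguity codes: e.g. R = A/G, Y = C/T, S = G/C, W = A/T,
+# K = G/T, M = A/C, N = any base)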
+#
+# Lowest score = -4, Highest score = 5
+#
+ A T G C S W R Y K M B V H D N
+A 5 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2
+T -4 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2
+G -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2
+C -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2
+S -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1
+W 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1
+R 1 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1
+Y -4 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1
+K -4 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1
+M 1 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1
+B -4 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1
+V -1 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1
+H -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1
+D -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1
+N -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
+
diff --git a/code/lib/Bio/Align/substitution_matrices/data/PAM250 b/code/lib/Bio/Align/substitution_matrices/data/PAM250
new file mode 100644
index 0000000..17e9e60
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/PAM250
@@ -0,0 +1,34 @@
+#
+# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93]
+#
+# PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049
+#
+# Expected score = -0.844, Entropy = 0.354 bits
+#
+# Lowest score = -8, Highest score = 17
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8
+R -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8
+N 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8
+D 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1 -8
+C -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3 -8
+Q 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1 -8
+E 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1 -8
+G 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1 -8
+H -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1 -8
+I -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1 -8
+L -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1 -8
+K -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1 -8
+M -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1 -8
+F -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2 -8
+P 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1 -8
+S 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0 -8
+T 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0 -8
+W -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4 -8
+Y -3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2 -8
+V 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8
+B 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8
+Z 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8
+X 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8
+* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/PAM30 b/code/lib/Bio/Align/substitution_matrices/data/PAM30
new file mode 100644
index 0000000..8a01c88
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/PAM30
@@ -0,0 +1,34 @@
+#
+# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93]
+#
+# PAM 30 substitution matrix, scale = ln(2)/2 = 0.346574
+#
+# Expected score = -5.06, Entropy = 2.57 bits
+#
+# Lowest score = -17, Highest score = 13
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 6 -7 -4 -3 -6 -4 -2 -2 -7 -5 -6 -7 -5 -8 -2 0 -1 -13 -8 -2 -3 -3 -3 -17
+R -7 8 -6 -10 -8 -2 -9 -9 -2 -5 -8 0 -4 -9 -4 -3 -6 -2 -10 -8 -7 -4 -6 -17
+N -4 -6 8 2 -11 -3 -2 -3 0 -5 -7 -1 -9 -9 -6 0 -2 -8 -4 -8 6 -3 -3 -17
+D -3 -10 2 8 -14 -2 2 -3 -4 -7 -12 -4 -11 -15 -8 -4 -5 -15 -11 -8 6 1 -5 -17
+C -6 -8 -11 -14 10 -14 -14 -9 -7 -6 -15 -14 -13 -13 -8 -3 -8 -15 -4 -6 -12 -14 -9 -17
+Q -4 -2 -3 -2 -14 8 1 -7 1 -8 -5 -3 -4 -13 -3 -5 -5 -13 -12 -7 -3 6 -5 -17
+E -2 -9 -2 2 -14 1 8 -4 -5 -5 -9 -4 -7 -14 -5 -4 -6 -17 -8 -6 1 6 -5 -17
+G -2 -9 -3 -3 -9 -7 -4 6 -9 -11 -10 -7 -8 -9 -6 -2 -6 -15 -14 -5 -3 -5 -5 -17
+H -7 -2 0 -4 -7 1 -5 -9 9 -9 -6 -6 -10 -6 -4 -6 -7 -7 -3 -6 -1 -1 -5 -17
+I -5 -5 -5 -7 -6 -8 -5 -11 -9 8 -1 -6 -1 -2 -8 -7 -2 -14 -6 2 -6 -6 -5 -17
+L -6 -8 -7 -12 -15 -5 -9 -10 -6 -1 7 -8 1 -3 -7 -8 -7 -6 -7 -2 -9 -7 -6 -17
+K -7 0 -1 -4 -14 -3 -4 -7 -6 -6 -8 7 -2 -14 -6 -4 -3 -12 -9 -9 -2 -4 -5 -17
+M -5 -4 -9 -11 -13 -4 -7 -8 -10 -1 1 -2 11 -4 -8 -5 -4 -13 -11 -1 -10 -5 -5 -17
+F -8 -9 -9 -15 -13 -13 -14 -9 -6 -2 -3 -14 -4 9 -10 -6 -9 -4 2 -8 -10 -13 -8 -17
+P -2 -4 -6 -8 -8 -3 -5 -6 -4 -8 -7 -6 -8 -10 8 -2 -4 -14 -13 -6 -7 -4 -5 -17
+S 0 -3 0 -4 -3 -5 -4 -2 -6 -7 -8 -4 -5 -6 -2 6 0 -5 -7 -6 -1 -5 -3 -17
+T -1 -6 -2 -5 -8 -5 -6 -6 -7 -2 -7 -3 -4 -9 -4 0 7 -13 -6 -3 -3 -6 -4 -17
+W -13 -2 -8 -15 -15 -13 -17 -15 -7 -14 -6 -12 -13 -4 -14 -5 -13 13 -5 -15 -10 -14 -11 -17
+Y -8 -10 -4 -11 -4 -12 -8 -14 -3 -6 -7 -9 -11 2 -13 -7 -6 -5 10 -7 -6 -9 -7 -17
+V -2 -8 -8 -8 -6 -7 -6 -5 -6 2 -2 -9 -1 -8 -6 -6 -3 -15 -7 7 -8 -6 -5 -17
+B -3 -7 6 6 -12 -3 1 -3 -1 -6 -9 -2 -10 -10 -7 -1 -3 -10 -6 -8 6 0 -5 -17
+Z -3 -4 -3 1 -14 6 6 -5 -1 -6 -7 -4 -5 -13 -4 -5 -6 -14 -9 -6 0 6 -5 -17
+X -3 -6 -3 -5 -9 -5 -5 -5 -5 -5 -6 -5 -5 -8 -5 -3 -4 -11 -7 -5 -5 -5 -5 -17
+* -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 -17 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/PAM70 b/code/lib/Bio/Align/substitution_matrices/data/PAM70
new file mode 100644
index 0000000..b20cdf0
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/PAM70
@@ -0,0 +1,34 @@
+#
+# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93]
+#
+# PAM 70 substitution matrix, scale = ln(2)/2 = 0.346574
+#
+# Expected score = -2.77, Entropy = 1.60 bits
+#
+# Lowest score = -11, Highest score = 13
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 5 -4 -2 -1 -4 -2 -1 0 -4 -2 -4 -4 -3 -6 0 1 1 -9 -5 -1 -1 -1 -2 -11
+R -4 8 -3 -6 -5 0 -5 -6 0 -3 -6 2 -2 -7 -2 -1 -4 0 -7 -5 -4 -2 -3 -11
+N -2 -3 6 3 -7 -1 0 -1 1 -3 -5 0 -5 -6 -3 1 0 -6 -3 -5 5 -1 -2 -11
+D -1 -6 3 6 -9 0 3 -1 -1 -5 -8 -2 -7 -10 -4 -1 -2 -10 -7 -5 5 2 -3 -11
+C -4 -5 -7 -9 9 -9 -9 -6 -5 -4 -10 -9 -9 -8 -5 -1 -5 -11 -2 -4 -8 -9 -6 -11
+Q -2 0 -1 0 -9 7 2 -4 2 -5 -3 -1 -2 -9 -1 -3 -3 -8 -8 -4 -1 5 -2 -11
+E -1 -5 0 3 -9 2 6 -2 -2 -4 -6 -2 -4 -9 -3 -2 -3 -11 -6 -4 2 5 -3 -11
+G 0 -6 -1 -1 -6 -4 -2 6 -6 -6 -7 -5 -6 -7 -3 0 -3 -10 -9 -3 -1 -3 -3 -11
+H -4 0 1 -1 -5 2 -2 -6 8 -6 -4 -3 -6 -4 -2 -3 -4 -5 -1 -4 0 1 -3 -11
+I -2 -3 -3 -5 -4 -5 -4 -6 -6 7 1 -4 1 0 -5 -4 -1 -9 -4 3 -4 -4 -3 -11
+L -4 -6 -5 -8 -10 -3 -6 -7 -4 1 6 -5 2 -1 -5 -6 -4 -4 -4 0 -6 -4 -4 -11
+K -4 2 0 -2 -9 -1 -2 -5 -3 -4 -5 6 0 -9 -4 -2 -1 -7 -7 -6 -1 -2 -3 -11
+M -3 -2 -5 -7 -9 -2 -4 -6 -6 1 2 0 10 -2 -5 -3 -2 -8 -7 0 -6 -3 -3 -11
+F -6 -7 -6 -10 -8 -9 -9 -7 -4 0 -1 -9 -2 8 -7 -4 -6 -2 4 -5 -7 -9 -5 -11
+P 0 -2 -3 -4 -5 -1 -3 -3 -2 -5 -5 -4 -5 -7 7 0 -2 -9 -9 -3 -4 -2 -3 -11
+S 1 -1 1 -1 -1 -3 -2 0 -3 -4 -6 -2 -3 -4 0 5 2 -3 -5 -3 0 -2 -1 -11
+T 1 -4 0 -2 -5 -3 -3 -3 -4 -1 -4 -1 -2 -6 -2 2 6 -8 -4 -1 -1 -3 -2 -11
+W -9 0 -6 -10 -11 -8 -11 -10 -5 -9 -4 -7 -8 -2 -9 -3 -8 13 -3 -10 -7 -10 -7 -11
+Y -5 -7 -3 -7 -2 -8 -6 -9 -1 -4 -4 -7 -7 4 -9 -5 -4 -3 9 -5 -4 -7 -5 -11
+V -1 -5 -5 -5 -4 -4 -4 -3 -4 3 0 -6 0 -5 -3 -3 -1 -10 -5 6 -5 -4 -2 -11
+B -1 -4 5 5 -8 -1 2 -1 0 -4 -6 -1 -6 -7 -4 0 -1 -7 -4 -5 5 1 -2 -11
+Z -1 -2 -1 2 -9 5 5 -3 1 -4 -4 -2 -3 -9 -2 -2 -3 -10 -7 -4 1 5 -3 -11
+X -2 -3 -2 -3 -6 -2 -3 -3 -3 -3 -4 -3 -3 -5 -3 -1 -2 -7 -5 -2 -2 -3 -3 -11
+* -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 -11 1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/RAO b/code/lib/Bio/Align/substitution_matrices/data/RAO
new file mode 100644
index 0000000..f3ef1c0
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/RAO
@@ -0,0 +1,27 @@
+# J.K. Mohana Rao:
+# "New scoring matrix for amino acid residue exchanges based on residue
+# characteristic physical parameters."
+# International Journal of Peptide and Protein Research: 29(2): 276-281 (1987).
+# Figure 1, lower triangle.
+# PMID 3570667
+ A C D E F G H I K L M N P Q R S T V W Y
+A 16 11 9 10 10 8 11 9 10 11 11 9 6 11 8 10 10 9 11 9
+C 11 16 8 9 10 8 10 8 9 11 10 9 7 10 8 10 10 8 11 10
+D 9 8 16 11 4 9 9 3 11 6 5 11 8 11 10 10 9 3 6 7
+E 10 9 11 16 6 6 11 4 11 7 8 10 5 11 9 9 8 4 7 6
+F 10 10 4 6 16 7 9 12 6 11 10 6 4 7 5 8 10 11 11 10
+G 8 8 9 6 7 16 7 6 7 6 4 10 11 8 7 11 10 6 8 10
+H 11 10 9 11 9 7 16 8 11 10 10 10 5 11 10 10 10 9 10 9
+I 9 8 3 4 12 6 8 16 4 10 9 5 3 6 4 8 10 12 11 10
+K 10 9 11 11 6 7 11 4 16 7 8 11 6 12 11 10 9 5 7 7
+L 11 11 6 7 11 6 10 10 7 16 11 7 4 9 6 8 9 10 11 9
+M 11 10 5 8 10 4 10 9 8 11 16 6 2 9 6 7 8 9 10 8
+N 9 9 11 10 6 10 10 5 11 7 6 16 9 11 10 11 10 5 8 8
+P 6 7 8 5 4 11 5 3 6 4 2 9 16 7 6 10 8 3 6 8
+Q 11 10 11 11 7 8 11 6 12 9 9 11 7 16 10 10 10 6 9 8
+R 8 8 10 9 5 7 10 4 11 6 6 10 6 10 16 9 9 5 7 7
+S 10 10 10 9 8 11 10 8 10 8 7 11 10 10 9 16 11 8 10 11
+T 10 10 9 8 10 10 10 10 9 9 8 10 8 10 9 11 16 10 11 11
+V 9 8 3 4 11 6 9 12 5 10 9 5 3 6 5 8 10 16 11 10
+W 11 11 6 7 11 8 10 11 7 11 10 8 6 9 7 10 11 11 16 11
+Y 9 10 7 6 10 10 9 10 7 9 8 8 8 8 7 11 11 10 11 16
diff --git a/code/lib/Bio/Align/substitution_matrices/data/RISLER b/code/lib/Bio/Align/substitution_matrices/data/RISLER
new file mode 100644
index 0000000..438b601
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/RISLER
@@ -0,0 +1,27 @@
+# J.L. Risler, M.O. Delorme, H. Delacroix, A. Henaut:
+# "Amino acid substitutions in structurally related proteins. A pattern
+# recognition approach. Determination of a new and efficient scoring matrix."
+# Journal of Molecular Biology 204(4): 1019-1029 (1988).
+# Figure 5.
+# PMID 3221397
+ A C D E F G H I K L M N P Q R S T V W Y
+A 2.2 -1.5 0.2 1.7 0.6 0.6 -0.6 1.7 1.4 1.3 1.0 1.3 -0.2 1.8 1.5 2.0 1.9 2.0 -0.9 0.2
+C -1.5 2.2 -1.7 -1.5 -1.6 -1.7 -1.8 -1.6 -1.6 -1.5 -1.6 -1.6 -1.8 -1.4 -1.5 -1.3 -1.4 -1.4 -1.8 -1.1
+D 0.2 -1.7 2.2 1.0 -0.3 -0.4 -1.3 0.0 0.1 -0.2 -0.5 0.8 -1.2 0.6 -0.1 0.7 0.0 0.0 -1.4 -0.4
+E 1.7 -1.5 1.0 2.2 0.6 0.3 -0.6 1.5 1.4 0.9 0.6 1.4 -0.1 2.1 1.9 1.8 1.6 1.6 -1.0 0.2
+F 0.6 -1.6 -0.3 0.6 2.2 -0.4 -1.1 1.0 0.1 1.0 -0.2 0.4 -1.1 0.7 0.4 0.5 0.3 0.8 -0.9 2.0
+G 0.6 -1.7 -0.4 0.3 -0.4 2.2 -1.2 0.0 -0.1 -0.2 -0.4 0.2 -1.2 0.2 0.1 0.7 0.2 0.1 -1.3 -0.2
+H -0.6 -1.8 -1.3 -0.6 -1.1 -1.2 2.2 -0.8 -1.0 -0.9 -1.2 -0.3 -1.6 -0.5 -0.4 -0.4 -0.9 -0.7 -1.7 -0.8
+I 1.7 -1.6 0.0 1.5 1.0 0.0 -0.8 2.2 1.0 2.1 0.9 0.9 -0.6 1.4 1.4 1.6 1.6 2.2 -0.7 0.4
+K 1.4 -1.6 0.1 1.4 0.1 -0.1 -1.0 1.0 2.2 0.7 0.4 1.0 -0.7 1.7 2.1 1.4 1.2 1.2 -1.1 0.5
+L 1.3 -1.5 -0.2 0.9 1.0 -0.2 -0.9 2.1 0.7 2.2 1.8 0.8 -0.8 1.1 1.2 1.3 1.2 2.0 -0.8 0.5
+M 1.0 -1.6 -0.5 0.6 -0.2 -0.4 -1.2 0.9 0.4 1.8 2.2 0.0 -1.2 1.2 1.1 0.6 0.8 0.8 -1.3 -0.2
+N 1.3 -1.6 0.8 1.4 0.4 0.2 -0.3 0.9 1.0 0.8 0.0 2.2 -1.0 1.6 1.2 1.9 1.1 1.1 -1.1 -0.1
+P -0.2 -1.8 -1.2 -0.1 -1.1 -1.2 -1.6 -0.6 -0.7 -0.8 -1.2 -1.0 2.2 -0.6 -0.3 -0.3 -0.5 -0.6 -1.6 -1.2
+Q 1.8 -1.4 0.6 2.1 0.7 0.2 -0.5 1.4 1.7 1.1 1.2 1.6 -0.6 2.2 2.0 1.8 1.7 1.5 -1.0 0.5
+R 1.5 -1.5 -0.1 1.9 0.4 0.1 -0.4 1.4 2.1 1.2 1.1 1.2 -0.3 2.0 2.2 2.0 1.9 1.5 -0.8 0.8
+S 2.0 -1.3 0.7 1.8 0.5 0.7 -0.4 1.6 1.4 1.3 0.6 1.9 -0.3 1.8 2.0 2.2 2.1 1.8 -0.8 0.4
+T 1.9 -1.4 0.0 1.6 0.3 0.2 -0.9 1.6 1.2 1.2 0.8 1.1 -0.5 1.7 1.9 2.1 2.2 1.6 -1.0 0.3
+V 2.0 -1.4 0.0 1.6 0.8 0.1 -0.7 2.2 1.2 2.0 0.8 1.1 -0.6 1.5 1.5 1.8 1.6 2.2 -0.7 0.3
+W -0.9 -1.8 -1.4 -1.0 -0.9 -1.3 -1.7 -0.7 -1.1 -0.8 -1.3 -1.1 -1.6 -1.0 -0.8 -0.8 -1.0 -0.7 2.2 -0.6
+Y 0.2 -1.1 -0.4 0.2 2.0 -0.2 -0.8 0.4 0.5 0.5 -0.2 -0.1 -1.2 0.5 0.8 0.4 0.3 0.3 -0.6 2.2
diff --git a/code/lib/Bio/Align/substitution_matrices/data/SCHNEIDER b/code/lib/Bio/Align/substitution_matrices/data/SCHNEIDER
new file mode 100644
index 0000000..0384fa9
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/SCHNEIDER
@@ -0,0 +1,70 @@
+# Adrian Schneider, Gina M. Cannarozzi, and Gaston H. Gonnet:
+# "Empirical codon substitution matrix."
+# BMC Bioinformatics 6:134 (2005).
+# Additional File 3.
+# PMID 15927081
+ AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC CAG CAT CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT GCA GCC GCG GCT GGA GGC GGG GGT GTA GTC GTG GTT TAA TAC TAG TAT TCA TCC TCG TCT TGA TGC TGG TGT TTA TTC TTG TTT
+AAA 11.6 -2.7 9.7 -1.7 -2.7 -6.4 -3.9 -5.6 5.1 -5.0 3.6 -4.2 -6.3 -13.0 -7.1 -11.5 0.4 -6.0 -1.9 -5.3 -8.5 -11.2 -8.9 -10.8 2.1 0.0 1.4 0.2 -10.2 -13.5 -13.0 -12.5 -2.6 -8.5 -5.0 -8.1 -6.3 -9.9 -7.5 -9.0 -7.1 -10.2 -8.2 -9.2 -8.2 -12.5 -11.1 -11.4 -50.0 -14.8 -50.0 -13.8 -7.3 -10.1 -8.4 -9.1 -50.0 -13.0 -13.5 -12.4 -10.7 -18.1 -11.8 -17.2
+AAC -2.7 13.0 -3.3 10.9 -3.5 -0.4 -3.3 -1.8 -5.4 4.6 -5.5 3.0 -10.2 -7.9 -9.9 -9.6 -5.0 0.5 -5.5 -1.0 -10.3 -8.1 -9.4 -9.6 -8.1 -5.0 -7.3 -6.3 -13.4 -11.3 -14.4 -12.9 -6.3 0.8 -6.4 -1.1 -7.4 -5.0 -6.2 -6.5 -5.6 -1.6 -4.7 -3.0 -10.8 -8.7 -11.9 -10.0 -50.0 -6.2 -50.0 -7.5 -6.3 -4.3 -6.2 -5.4 -50.0 -7.0 -16.3 -8.2 -13.2 -12.3 -13.1 -13.3
+AAG 9.7 -3.3 11.6 -2.8 -4.5 -6.7 -3.1 -6.9 3.3 -5.5 4.8 -5.1 -8.9 -13.2 -5.7 -12.6 -1.5 -6.1 -0.6 -6.1 -10.0 -11.8 -8.6 -11.9 1.2 0.5 2.2 0.1 -11.8 -14.0 -11.9 -13.4 -4.9 -9.1 -3.4 -8.9 -8.0 -10.0 -7.2 -10.0 -9.1 -10.2 -7.1 -9.9 -10.1 -13.0 -10.6 -12.7 -50.0 -14.9 -50.0 -14.4 -8.9 -10.8 -8.9 -10.4 -50.0 -13.1 -11.8 -13.0 -12.4 -19.4 -11.5 -17.8
+AAT -1.7 10.9 -2.8 12.9 -2.7 -2.2 -2.8 0.2 -4.8 2.9 -5.2 5.2 -9.1 -9.5 -9.0 -7.0 -4.0 -1.0 -5.0 1.0 -9.0 -8.8 -8.9 -7.7 -7.0 -6.8 -7.1 -4.3 -12.4 -12.9 -13.9 -10.0 -4.9 -0.7 -5.6 1.4 -6.0 -6.5 -6.2 -4.8 -5.1 -3.2 -5.0 -1.2 -9.8 -9.9 -11.3 -8.2 -50.0 -7.6 -50.0 -5.1 -5.5 -5.8 -5.9 -4.2 -50.0 -8.3 -15.5 -5.9 -11.6 -14.2 -12.0 -11.6
+ACA -2.7 -3.5 -4.5 -2.7 11.7 9.0 10.6 9.6 -3.2 -0.7 -5.1 -0.4 0.7 -4.6 -0.4 -3.6 -4.8 -8.9 -6.5 -7.8 -1.4 -4.1 -2.9 -3.3 -8.0 -9.1 -8.0 -8.2 -6.0 -9.3 -7.7 -8.5 -6.4 -10.1 -7.5 -8.6 2.8 -0.7 0.9 0.1 -5.9 -7.3 -6.5 -6.5 -0.4 -3.9 -2.6 -3.1 -50.0 -14.6 -50.0 -12.6 2.9 0.1 1.8 1.0 -50.0 -9.2 -13.6 -7.7 -4.8 -12.2 -6.2 -11.0
+ACC -6.4 -0.4 -6.7 -2.2 9.0 12.3 9.8 9.6 -7.1 2.4 -7.2 0.3 -2.9 -1.2 -3.0 -3.3 -7.6 -6.7 -8.1 -7.9 -4.9 -1.9 -4.5 -3.8 -9.5 -7.8 -9.3 -8.9 -9.4 -7.1 -9.0 -9.0 -9.4 -7.5 -9.3 -9.2 -0.7 2.6 0.2 -0.1 -8.0 -4.6 -7.7 -6.1 -3.6 -1.0 -4.1 -3.1 -50.0 -11.6 -50.0 -11.6 0.2 2.3 0.4 0.5 -50.0 -6.5 -15.3 -7.8 -8.4 -9.5 -8.3 -11.1
+ACG -3.9 -3.3 -3.1 -2.8 10.6 9.8 12.2 9.8 -4.9 0.1 -3.6 -0.2 -1.7 -4.1 0.9 -3.5 -5.9 -8.9 -5.6 -8.5 -3.9 -3.9 -2.2 -4.5 -7.3 -7.5 -6.4 -8.5 -7.1 -8.2 -6.7 -7.8 -7.9 -9.1 -7.1 -9.4 0.9 -0.2 2.5 -0.3 -6.9 -6.5 -5.5 -6.8 -1.7 -3.4 -1.7 -2.8 -50.0 -13.0 -50.0 -11.5 1.5 0.7 2.7 0.6 -50.0 -8.9 -12.1 -7.4 -6.1 -11.5 -5.4 -10.4
+ACT -5.6 -1.8 -6.9 0.2 9.6 9.6 9.8 11.6 -6.6 0.9 -7.2 2.5 -2.3 -3.4 -2.3 -0.6 -6.5 -8.2 -8.0 -6.3 -3.7 -3.5 -3.9 -1.6 -9.9 -8.9 -9.6 -7.0 -8.6 -8.9 -8.8 -6.8 -8.6 -8.2 -8.8 -6.8 0.3 0.1 0.2 2.4 -7.4 -6.2 -6.9 -4.4 -2.9 -2.9 -3.5 -0.9 -50.0 -12.2 -50.0 -10.1 1.2 0.6 1.2 2.4 -50.0 -7.6 -16.1 -5.8 -7.2 -10.8 -7.4 -9.0
+AGA 5.1 -5.4 3.3 -4.8 -3.2 -7.1 -4.9 -6.6 13.3 -2.8 11.2 -1.9 -5.7 -12.5 -7.2 -11.6 -0.7 -4.8 -3.1 -4.3 -10.0 -11.6 -8.8 -11.7 10.5 7.7 9.1 8.5 -9.8 -12.7 -11.7 -11.8 -6.3 -11.4 -8.8 -11.0 -7.5 -10.3 -8.3 -9.9 -1.9 -6.7 -4.1 -6.4 -7.9 -12.5 -11.4 -11.7 -50.0 -14.0 -50.0 -13.2 -8.5 -10.4 -9.1 -9.6 -50.0 -9.8 -7.7 -8.8 -10.3 -17.9 -11.8 -16.0
+AGC -5.0 4.6 -5.5 2.9 -0.7 2.4 0.1 0.9 -2.8 12.8 -2.6 11.0 -8.4 -6.3 -8.3 -7.7 -6.1 -2.9 -6.5 -4.2 -8.5 -5.9 -7.5 -7.7 -5.5 -2.4 -5.5 -4.1 -12.9 -10.3 -12.7 -11.3 -7.4 -3.1 -7.4 -4.9 -4.4 -1.8 -3.5 -3.5 -1.6 3.0 -0.8 0.9 -8.4 -6.3 -9.6 -7.3 -50.0 -9.0 -50.0 -9.7 -2.3 -0.2 -1.2 -1.5 -50.0 -0.7 -13.3 -2.0 -11.8 -11.9 -12.3 -12.7
+AGG 3.6 -5.5 4.8 -5.2 -5.1 -7.2 -3.6 -7.2 11.2 -2.6 13.4 -2.1 -7.7 -12.5 -5.3 -11.9 -2.2 -4.4 -1.9 -4.8 -10.2 -11.5 -8.1 -12.0 9.3 8.2 10.0 8.0 -10.8 -11.9 -10.7 -12.4 -8.2 -11.8 -6.9 -11.5 -8.4 -9.9 -7.1 -10.6 -4.7 -6.8 -1.2 -7.0 -9.8 -11.6 -9.6 -12.1 -50.0 -14.1 -50.0 -12.7 -10.0 -10.8 -8.8 -10.5 -50.0 -9.2 -4.2 -9.3 -11.0 -18.2 -11.1 -16.1
+AGT -4.2 3.0 -5.1 5.2 -0.4 0.3 -0.2 2.5 -1.9 11.0 -2.1 13.2 -7.6 -8.5 -8.0 -5.3 -5.6 -4.5 -6.8 -2.4 -7.8 -7.9 -7.9 -6.2 -5.7 -5.1 -5.4 -2.1 -12.4 -12.3 -13.1 -10.1 -6.6 -4.4 -6.9 -2.6 -3.6 -3.8 -3.7 -2.0 -1.4 0.8 -0.9 3.0 -8.2 -8.1 -9.2 -5.8 -50.0 -10.1 -50.0 -7.4 -1.7 -2.0 -1.5 -0.5 -50.0 -2.1 -12.6 -0.4 -11.6 -13.9 -11.4 -11.1
+ATA -6.3 -10.2 -8.9 -9.1 0.7 -2.9 -1.7 -2.3 -5.7 -8.4 -7.7 -7.6 13.2 9.6 3.5 9.7 -8.7 -12.4 -10.7 -10.7 -7.8 -10.2 -8.8 -9.6 -9.5 -11.9 -10.9 -10.5 2.3 -0.4 -0.2 -0.3 -9.7 -15.5 -11.4 -13.7 -3.0 -6.4 -4.1 -5.4 -9.6 -11.9 -10.5 -11.8 6.2 3.3 3.7 3.6 -50.0 -13.6 -50.0 -11.9 -5.6 -8.8 -7.2 -8.9 -50.0 -12.4 -14.1 -11.6 2.8 -6.4 0.5 -5.2
+ATC -13.0 -7.9 -13.2 -9.5 -4.6 -1.2 -4.1 -3.4 -12.5 -6.3 -12.5 -8.5 9.6 12.7 0.2 10.5 -12.4 -11.6 -13.2 -12.0 -12.4 -10.1 -11.6 -12.0 -15.1 -13.1 -14.3 -13.4 -1.4 1.4 -1.6 -0.5 -14.9 -14.3 -15.2 -16.8 -7.7 -4.7 -6.3 -6.6 -14.6 -10.8 -12.8 -13.0 2.3 6.0 2.5 3.6 -50.0 -11.2 -50.0 -11.9 -10.5 -9.2 -10.5 -10.5 -50.0 -10.3 -16.2 -11.3 -1.7 -3.7 -2.5 -5.4
+ATG -7.1 -9.9 -5.7 -9.0 -0.4 -3.0 0.9 -2.3 -7.2 -8.3 -5.3 -8.0 3.5 0.2 14.3 1.0 -7.3 -11.2 -6.9 -9.7 -8.5 -10.2 -7.7 -9.8 -10.0 -10.4 -8.4 -10.2 1.0 -0.6 1.7 -0.5 -11.1 -14.8 -9.7 -14.5 -3.7 -5.8 -3.1 -5.6 -10.4 -11.5 -8.2 -11.1 0.6 -1.6 1.6 -1.1 -50.0 -12.4 -50.0 -11.5 -5.5 -8.0 -4.9 -7.2 -50.0 -12.4 -10.3 -11.6 0.7 -6.8 2.4 -6.0
+ATT -11.5 -9.6 -12.6 -7.0 -3.6 -3.3 -3.5 -0.6 -11.6 -7.7 -11.9 -5.3 9.7 10.5 1.0 12.6 -12.0 -12.2 -12.5 -9.7 -11.1 -11.2 -11.9 -9.4 -12.9 -14.3 -14.0 -11.9 -1.0 -0.4 -1.6 1.2 -13.9 -15.4 -13.7 -12.6 -6.2 -6.7 -6.3 -4.4 -13.0 -12.2 -13.0 -10.4 2.9 3.8 2.5 5.8 -50.0 -12.5 -50.0 -10.2 -9.6 -9.7 -9.6 -8.3 -50.0 -11.6 -15.4 -9.4 -1.1 -5.4 -1.6 -3.3
+CAA 0.4 -5.0 -1.5 -4.0 -4.8 -7.6 -5.9 -6.5 -0.7 -6.1 -2.2 -5.6 -8.7 -12.4 -7.3 -12.0 12.8 2.3 10.2 3.0 0.0 -3.4 -0.7 -3.2 2.5 -0.8 0.9 0.2 -3.0 -7.2 -5.7 -6.1 -0.2 -6.6 -1.8 -6.0 -5.3 -8.2 -5.7 -7.6 -6.7 -9.6 -7.5 -9.3 -7.1 -10.5 -9.6 -9.4 -50.0 -8.1 -50.0 -7.2 -4.3 -6.8 -5.7 -6.3 -50.0 -9.5 -9.6 -8.9 -6.2 -12.8 -6.8 -11.8
+CAC -6.0 0.5 -6.1 -1.0 -8.9 -6.7 -8.9 -8.2 -4.8 -2.9 -4.4 -4.5 -12.4 -11.6 -11.2 -12.2 2.3 14.6 1.9 12.9 -5.5 -2.6 -4.1 -4.7 -1.2 3.0 -1.1 1.6 -8.0 -4.7 -8.7 -6.0 -8.2 -4.7 -7.8 -6.7 -10.0 -8.3 -8.9 -9.9 -10.8 -7.1 -10.0 -9.2 -11.6 -10.1 -12.3 -11.9 -50.0 2.2 -50.0 0.9 -7.9 -5.9 -7.4 -6.9 -50.0 -4.6 -11.6 -5.6 -9.7 -5.9 -9.9 -7.3
+CAG -1.9 -5.5 -0.6 -5.0 -6.5 -8.1 -5.6 -8.0 -3.1 -6.5 -1.9 -6.8 -10.7 -13.2 -6.9 -12.5 10.2 1.9 11.9 2.1 -2.3 -4.1 -0.5 -4.6 -0.1 -0.9 2.4 -0.8 -5.1 -6.9 -4.8 -6.7 -2.3 -7.1 -0.8 -7.2 -7.2 -8.0 -5.6 -8.5 -9.4 -9.7 -7.2 -9.8 -9.2 -11.1 -9.5 -10.7 -50.0 -8.7 -50.0 -8.6 -6.1 -7.5 -5.5 -7.7 -50.0 -10.4 -7.7 -10.0 -7.3 -13.6 -6.4 -13.3
+CAT -5.3 -1.0 -6.1 1.0 -7.8 -7.9 -8.5 -6.3 -4.3 -4.2 -4.8 -2.4 -10.7 -12.0 -9.7 -9.7 3.0 12.9 2.1 14.7 -4.9 -3.8 -4.0 -2.4 -1.0 0.8 -1.3 3.4 -7.6 -5.7 -8.1 -3.5 -6.8 -5.8 -7.2 -4.2 -8.9 -10.2 -8.4 -8.2 -9.9 -8.4 -9.7 -6.4 -11.1 -11.1 -11.8 -10.0 -50.0 0.4 -50.0 2.6 -7.2 -7.0 -7.4 -5.1 -50.0 -5.7 -9.8 -3.2 -8.4 -7.4 -8.9 -5.9
+CCA -8.5 -10.3 -10.0 -9.0 -1.4 -4.9 -3.9 -3.7 -10.0 -8.5 -10.2 -7.8 -7.8 -12.4 -8.5 -11.1 0.0 -5.5 -2.3 -4.9 12.6 10.0 11.1 10.5 -6.0 -8.3 -6.9 -7.5 -2.2 -7.4 -5.3 -5.9 -8.9 -12.5 -9.4 -11.6 -0.7 -3.9 -2.3 -3.0 -9.1 -9.8 -9.2 -9.5 -5.6 -9.0 -8.0 -8.5 -50.0 -15.3 -50.0 -14.1 2.3 -1.4 0.3 -0.5 -50.0 -13.6 -14.3 -11.5 -5.3 -13.7 -6.1 -12.7
+CCC -11.2 -8.1 -11.8 -8.8 -4.1 -1.9 -3.9 -3.5 -11.6 -5.9 -11.5 -7.9 -10.2 -10.1 -10.2 -11.2 -3.4 -2.6 -4.1 -3.8 10.0 13.1 10.7 10.6 -8.1 -5.2 -8.0 -6.7 -6.1 -3.7 -7.5 -5.6 -11.1 -10.3 -10.6 -11.9 -3.2 -1.0 -2.2 -2.8 -10.5 -8.1 -9.4 -9.5 -8.3 -6.9 -9.4 -8.8 -50.0 -11.3 -50.0 -12.8 -0.6 2.2 -0.3 0.1 -50.0 -10.1 -17.5 -11.1 -8.6 -9.9 -8.6 -11.9
+CCG -8.9 -9.4 -8.6 -8.9 -2.9 -4.5 -2.2 -3.9 -8.8 -7.5 -8.1 -7.9 -8.8 -11.6 -7.7 -11.9 -0.7 -4.1 -0.5 -4.0 11.1 10.7 13.2 10.4 -5.7 -5.8 -3.5 -6.1 -3.6 -6.5 -3.3 -5.5 -9.4 -11.0 -8.7 -11.2 -1.8 -2.7 0.7 -3.0 -9.1 -8.2 -7.3 -9.4 -7.2 -8.2 -7.2 -8.2 -50.0 -13.2 -50.0 -13.3 0.5 -0.9 1.8 -0.6 -50.0 -11.4 -11.1 -10.5 -6.1 -12.9 -5.0 -11.9
+CCT -10.8 -9.6 -11.9 -7.7 -3.3 -3.8 -4.5 -1.6 -11.7 -7.7 -12.0 -6.2 -9.6 -12.0 -9.8 -9.4 -3.2 -4.7 -4.6 -2.4 10.5 10.6 10.4 12.6 -8.2 -7.9 -9.0 -4.7 -6.1 -6.0 -7.5 -2.9 -10.9 -12.2 -11.2 -10.9 -2.4 -2.9 -2.7 -0.9 -10.4 -9.7 -10.2 -8.0 -8.4 -9.1 -9.3 -6.5 -50.0 -13.3 -50.0 -11.0 -0.0 -0.4 -0.4 2.3 -50.0 -10.9 -17.2 -8.3 -8.3 -12.7 -7.8 -9.3
+CGA 2.1 -8.1 1.2 -7.0 -8.0 -9.5 -7.3 -9.9 10.5 -5.5 9.3 -5.7 -9.5 -15.1 -10.0 -12.9 2.5 -1.2 -0.1 -1.0 -6.0 -8.1 -5.7 -8.2 13.8 11.3 11.8 12.1 -6.0 -9.1 -9.0 -9.5 -8.9 -13.4 -10.0 -13.1 -9.9 -11.7 -9.8 -12.8 -5.9 -9.4 -6.5 -8.1 -10.7 -14.0 -13.3 -13.9 -50.0 -11.9 -50.0 -9.7 -8.9 -10.8 -9.0 -11.1 -50.0 -6.5 -5.3 -6.3 -10.5 -16.2 -10.4 -15.2
+CGC 0.0 -5.0 0.5 -6.8 -9.1 -7.8 -7.5 -8.9 7.7 -2.4 8.2 -5.1 -11.9 -13.1 -10.4 -14.3 -0.8 3.0 -0.9 0.8 -8.3 -5.2 -5.8 -7.9 11.3 15.0 11.2 12.8 -9.7 -5.3 -9.2 -7.8 -11.7 -10.6 -10.1 -12.2 -11.7 -8.8 -8.9 -11.8 -9.3 -5.0 -8.0 -8.2 -13.6 -11.4 -12.8 -13.0 -50.0 -7.2 -50.0 -8.4 -11.0 -7.7 -8.7 -10.2 -50.0 -1.5 -7.1 -4.9 -11.2 -11.6 -11.6 -15.0
+CGG 1.4 -7.3 2.2 -7.1 -8.0 -9.3 -6.4 -9.6 9.1 -5.5 10.0 -5.4 -10.9 -14.3 -8.4 -14.0 0.9 -1.1 2.4 -1.3 -6.9 -8.0 -3.5 -9.0 11.8 11.2 13.4 11.4 -7.3 -8.8 -6.5 -8.5 -10.0 -12.5 -7.8 -12.5 -9.6 -11.0 -7.1 -10.7 -7.9 -8.0 -4.3 -9.4 -11.4 -12.3 -10.9 -12.9 -50.0 -11.8 -50.0 -11.2 -9.4 -9.8 -7.3 -10.5 -50.0 -6.2 -2.2 -6.7 -9.8 -15.2 -8.5 -14.9
+CGT 0.2 -6.3 0.1 -4.3 -8.2 -8.9 -8.5 -7.0 8.5 -4.1 8.0 -2.1 -10.5 -13.4 -10.2 -11.9 0.2 1.6 -0.8 3.4 -7.5 -6.7 -6.1 -4.7 12.1 12.8 11.4 14.7 -8.7 -7.4 -9.4 -5.3 -9.9 -11.8 -10.5 -10.3 -9.9 -10.1 -9.3 -9.3 -8.1 -7.2 -8.0 -5.2 -12.0 -12.1 -12.6 -10.9 -50.0 -7.9 -50.0 -5.3 -9.3 -8.8 -8.9 -7.0 -50.0 -3.6 -7.1 -1.2 -10.2 -12.7 -9.9 -11.0
+CTA -10.2 -13.4 -11.8 -12.4 -6.0 -9.4 -7.1 -8.6 -9.8 -12.9 -10.8 -12.4 2.3 -1.4 1.0 -1.0 -3.0 -8.0 -5.1 -7.6 -2.2 -6.1 -3.6 -6.1 -6.0 -9.7 -7.3 -8.7 11.2 7.9 8.9 8.1 -12.1 -17.4 -13.2 -16.8 -6.3 -9.5 -7.5 -8.8 -12.7 -15.2 -13.1 -14.2 0.0 -3.1 -1.8 -3.1 -50.0 -10.7 -50.0 -9.3 -4.6 -8.9 -5.9 -8.3 -50.0 -12.8 -9.6 -10.6 9.5 -3.2 8.2 -2.8
+CTC -13.5 -11.3 -14.0 -12.9 -9.3 -7.1 -8.2 -8.9 -12.7 -10.3 -11.9 -12.3 -0.4 1.4 -0.6 -0.4 -7.2 -4.7 -6.9 -5.7 -7.4 -3.7 -6.5 -6.0 -9.1 -5.3 -8.8 -7.4 7.9 11.9 7.8 9.3 -15.2 -15.1 -14.6 -17.2 -9.7 -7.3 -8.2 -9.2 -15.0 -12.8 -14.1 -14.9 -2.8 0.2 -2.9 -2.3 -50.0 -7.4 -50.0 -8.7 -9.4 -7.3 -9.4 -9.4 -50.0 -8.0 -11.9 -9.7 6.5 0.5 6.4 -1.8
+CTG -13.0 -14.4 -11.9 -13.9 -7.7 -9.0 -6.7 -8.8 -11.7 -12.7 -10.7 -13.1 -0.2 -1.6 1.7 -1.6 -5.7 -8.7 -4.8 -8.1 -5.3 -7.5 -3.3 -7.5 -9.0 -9.2 -6.5 -9.4 8.9 7.8 10.1 7.8 -14.3 -17.7 -13.0 -17.8 -8.0 -9.4 -6.2 -9.2 -15.1 -14.9 -12.2 -14.9 -1.8 -3.2 -0.8 -3.2 -50.0 -10.8 -50.0 -9.9 -7.3 -9.6 -6.4 -9.4 -50.0 -12.3 -8.7 -11.3 7.6 -3.3 8.7 -2.9
+CTT -12.5 -12.9 -13.4 -10.0 -8.5 -9.0 -7.8 -6.8 -11.8 -11.3 -12.4 -10.1 -0.3 -0.5 -0.5 1.2 -6.1 -6.0 -6.7 -3.5 -5.9 -5.6 -5.5 -2.9 -9.5 -7.8 -8.5 -5.3 8.1 9.3 7.8 11.8 -14.4 -15.7 -14.5 -14.6 -8.4 -8.7 -7.9 -7.1 -14.3 -13.8 -13.8 -12.3 -2.4 -2.2 -2.9 -0.2 -50.0 -8.9 -50.0 -7.1 -8.1 -9.1 -8.7 -6.6 -50.0 -9.7 -11.4 -8.0 6.8 -1.8 6.9 0.4
+GAA -2.6 -6.3 -4.9 -4.9 -6.4 -9.4 -7.9 -8.6 -6.3 -7.4 -8.2 -6.6 -9.7 -14.9 -11.1 -13.9 -0.2 -8.2 -2.3 -6.8 -8.9 -11.1 -9.4 -10.9 -8.9 -11.7 -10.0 -9.9 -12.1 -15.2 -14.3 -14.4 11.1 2.9 9.2 3.5 -3.0 -7.0 -4.3 -6.2 -2.2 -6.6 -3.7 -5.7 -6.0 -10.4 -8.6 -9.5 -50.0 -15.5 -50.0 -13.9 -7.8 -10.5 -8.7 -9.6 -50.0 -16.5 -17.1 -14.8 -12.1 -18.9 -12.4 -17.4
+GAC -8.5 0.8 -9.1 -0.7 -10.1 -7.5 -9.1 -8.2 -11.4 -3.1 -11.8 -4.4 -15.5 -14.3 -14.8 -15.4 -6.6 -4.7 -7.1 -5.8 -12.5 -10.3 -11.0 -12.2 -13.4 -10.6 -12.5 -11.8 -17.4 -15.1 -17.7 -15.7 2.9 12.5 3.1 10.3 -7.7 -5.2 -6.2 -7.3 -5.4 -1.7 -5.4 -3.7 -11.6 -9.1 -13.0 -10.8 -50.0 -10.3 -50.0 -11.2 -9.8 -8.3 -9.1 -9.7 -50.0 -12.7 -20.6 -13.1 -16.0 -16.6 -17.0 -18.4
+GAG -5.0 -6.4 -3.4 -5.6 -7.5 -9.3 -7.1 -8.8 -8.8 -7.4 -6.9 -6.9 -11.4 -15.2 -9.7 -13.7 -1.8 -7.8 -0.8 -7.2 -9.4 -10.6 -8.7 -11.2 -10.0 -10.1 -7.8 -10.5 -13.2 -14.6 -13.0 -14.5 9.2 3.1 10.8 3.0 -4.5 -6.7 -2.7 -6.7 -5.0 -6.8 -2.5 -6.7 -7.5 -10.4 -7.4 -10.2 -50.0 -15.5 -50.0 -14.2 -8.9 -10.2 -8.5 -10.0 -50.0 -15.6 -15.3 -15.5 -13.0 -18.8 -12.6 -18.3
+GAT -8.1 -1.1 -8.9 1.4 -8.6 -9.2 -9.4 -6.8 -11.0 -4.9 -11.5 -2.6 -13.7 -16.8 -14.5 -12.6 -6.0 -6.7 -7.2 -4.2 -11.6 -11.9 -11.2 -10.9 -13.1 -12.2 -12.5 -10.3 -16.8 -17.2 -17.8 -14.6 3.5 10.3 3.0 12.4 -6.9 -7.5 -6.8 -4.9 -5.4 -4.2 -5.4 -1.3 -10.8 -11.5 -12.4 -8.2 -50.0 -12.5 -50.0 -8.9 -9.2 -10.0 -9.4 -7.9 -50.0 -14.3 -19.9 -11.4 -16.2 -19.2 -15.8 -15.9
+GCA -6.3 -7.4 -8.0 -6.0 2.8 -0.7 0.9 0.3 -7.5 -4.4 -8.4 -3.6 -3.0 -7.7 -3.7 -6.2 -5.3 -10.0 -7.2 -8.9 -0.7 -3.2 -1.8 -2.4 -9.9 -11.7 -9.6 -9.9 -6.3 -9.7 -8.0 -8.4 -3.0 -7.7 -4.5 -6.9 11.3 8.2 9.4 9.1 -1.0 -3.3 -1.8 -2.4 1.5 -2.1 -0.4 -1.1 -50.0 -14.5 -50.0 -12.9 2.8 -0.0 1.4 0.7 -50.0 -8.7 -14.1 -7.5 -5.2 -12.5 -6.5 -10.7
+GCC -9.9 -5.0 -10.0 -6.5 -0.7 2.6 -0.2 0.1 -10.3 -1.8 -9.9 -3.8 -6.4 -4.7 -5.8 -6.7 -8.2 -8.3 -8.0 -10.2 -3.9 -1.0 -2.7 -2.9 -11.7 -8.8 -11.0 -10.1 -9.5 -7.3 -9.4 -8.7 -7.0 -5.2 -6.7 -7.5 8.2 11.6 8.8 9.1 -3.7 -0.9 -3.1 -3.0 -2.0 1.2 -2.3 -1.3 -50.0 -12.1 -50.0 -12.6 0.1 2.5 0.7 0.6 -50.0 -6.0 -14.8 -7.5 -8.6 -9.4 -8.5 -11.2
+GCG -7.5 -6.2 -7.2 -6.2 0.9 0.2 2.5 0.2 -8.3 -3.5 -7.1 -3.7 -4.1 -6.3 -3.1 -6.3 -5.7 -8.9 -5.6 -8.4 -2.3 -2.2 0.7 -2.7 -9.8 -8.9 -7.1 -9.3 -7.5 -8.2 -6.2 -7.9 -4.3 -6.2 -2.7 -6.8 9.4 8.8 12.1 8.8 -2.1 -2.0 0.1 -2.4 0.0 -1.1 1.3 -0.8 -50.0 -11.9 -50.0 -12.6 1.2 0.7 3.5 0.8 -50.0 -7.1 -12.4 -7.1 -7.1 -10.6 -5.0 -10.6
+GCT -9.0 -6.5 -10.0 -4.8 0.1 -0.1 -0.3 2.4 -9.9 -3.5 -10.6 -2.0 -5.4 -6.6 -5.6 -4.4 -7.6 -9.9 -8.5 -8.2 -3.0 -2.8 -3.0 -0.9 -12.8 -11.8 -10.7 -9.3 -8.8 -9.2 -9.2 -7.1 -6.2 -7.3 -6.7 -4.9 9.1 9.1 8.8 11.2 -3.1 -3.0 -3.3 -1.0 -1.1 -1.4 -2.0 1.2 -50.0 -12.8 -50.0 -10.9 0.8 0.5 0.7 2.3 -50.0 -7.5 -16.5 -5.9 -7.8 -11.1 -7.8 -9.0
+GGA -7.1 -5.6 -9.1 -5.1 -5.9 -8.0 -6.9 -7.4 -1.9 -1.6 -4.7 -1.4 -9.6 -14.6 -10.4 -13.0 -6.7 -10.8 -9.4 -9.9 -9.1 -10.5 -9.1 -10.4 -5.9 -9.3 -7.9 -8.1 -12.7 -15.0 -15.1 -14.3 -2.2 -5.4 -5.0 -5.4 -1.0 -3.7 -2.1 -3.1 12.8 9.6 11.1 10.1 -4.8 -9.4 -8.1 -8.6 -50.0 -18.3 -50.0 -15.4 -5.9 -7.5 -6.6 -7.1 -50.0 -9.5 -11.7 -8.4 -12.5 -17.3 -13.2 -15.4
+GGC -10.2 -1.6 -10.2 -3.2 -7.3 -4.6 -6.5 -6.2 -6.7 3.0 -6.8 0.8 -11.9 -10.8 -11.5 -12.2 -9.6 -7.1 -9.7 -8.4 -9.8 -8.1 -8.2 -9.7 -9.4 -5.0 -8.0 -7.2 -15.2 -12.8 -14.9 -13.8 -6.6 -1.7 -6.8 -4.2 -3.3 -0.9 -2.0 -3.0 9.6 12.8 9.7 10.5 -8.5 -5.6 -9.3 -7.8 -50.0 -12.7 -50.0 -13.8 -6.8 -5.1 -5.3 -6.5 -50.0 -4.1 -12.8 -5.8 -14.4 -14.1 -13.6 -14.8
+GGG -8.2 -4.7 -7.1 -5.0 -6.5 -7.7 -5.5 -6.9 -4.1 -0.8 -1.2 -0.9 -10.5 -12.8 -8.2 -13.0 -7.5 -10.0 -7.2 -9.7 -9.2 -9.4 -7.3 -10.2 -6.5 -8.0 -4.3 -8.0 -13.1 -14.1 -12.2 -13.8 -3.7 -5.4 -2.5 -5.4 -1.8 -3.1 0.1 -3.3 11.1 9.7 12.9 9.8 -5.5 -8.1 -5.4 -7.8 -50.0 -16.4 -50.0 -14.8 -5.9 -7.1 -4.7 -7.2 -50.0 -8.4 -6.4 -8.1 -12.1 -16.2 -10.3 -15.8
+GGT -9.2 -3.0 -9.9 -1.2 -6.5 -6.1 -6.8 -4.4 -6.4 0.9 -7.0 3.0 -11.8 -13.0 -11.1 -10.4 -9.3 -9.2 -9.8 -6.4 -9.5 -9.5 -9.4 -8.0 -8.1 -8.2 -9.4 -5.2 -14.2 -14.9 -14.9 -12.3 -5.7 -3.7 -6.7 -1.3 -2.4 -3.0 -2.4 -1.0 10.1 10.5 9.8 13.1 -7.2 -8.0 -8.4 -5.0 -50.0 -14.7 -50.0 -11.4 -6.3 -6.7 -5.8 -5.0 -50.0 -6.1 -13.4 -3.6 -13.0 -16.0 -13.7 -13.3
+GTA -8.2 -10.8 -10.1 -9.8 -0.4 -3.6 -1.7 -2.9 -7.9 -8.4 -9.8 -8.2 6.2 2.3 0.6 2.9 -7.1 -11.6 -9.2 -11.1 -5.6 -8.3 -7.2 -8.4 -10.7 -13.6 -11.4 -12.0 0.0 -2.8 -1.8 -2.4 -6.0 -11.6 -7.5 -10.8 1.5 -2.0 0.0 -1.1 -4.8 -8.5 -5.5 -7.2 11.9 8.6 10.0 9.0 -50.0 -14.2 -50.0 -12.8 -4.1 -7.4 -5.7 -6.5 -50.0 -11.0 -14.3 -9.9 1.4 -7.9 -0.7 -6.8
+GTC -12.5 -8.7 -13.0 -9.9 -3.9 -1.0 -3.4 -2.9 -12.5 -6.3 -11.6 -8.1 3.3 6.0 -1.6 3.8 -10.5 -10.1 -11.1 -11.1 -9.0 -6.9 -8.2 -9.1 -14.0 -11.4 -12.3 -12.1 -3.1 0.2 -3.2 -2.2 -10.4 -9.1 -10.4 -11.5 -2.1 1.2 -1.1 -1.4 -9.4 -5.6 -8.1 -8.0 8.6 12.4 8.6 9.5 -50.0 -10.4 -50.0 -11.6 -7.2 -5.7 -7.5 -7.4 -50.0 -8.6 -15.7 -9.7 -3.0 -3.7 -3.2 -5.9
+GTG -11.1 -11.9 -10.6 -11.3 -2.6 -4.1 -1.7 -3.5 -11.4 -9.6 -9.6 -9.2 3.7 2.5 1.6 2.5 -9.6 -12.3 -9.5 -11.8 -8.0 -9.4 -7.2 -9.3 -13.3 -12.8 -10.9 -12.6 -1.8 -2.9 -0.8 -2.9 -8.6 -13.0 -7.4 -12.4 -0.4 -2.3 1.3 -2.0 -8.1 -9.3 -5.4 -8.4 10.0 8.6 11.4 8.9 -50.0 -13.7 -50.0 -13.2 -6.1 -8.1 -5.5 -7.5 -50.0 -11.3 -12.7 -10.1 -1.6 -7.8 -0.3 -7.2
+GTT -11.4 -10.0 -12.7 -8.2 -3.1 -3.1 -2.8 -0.9 -11.7 -7.3 -12.1 -5.8 3.6 3.6 -1.1 5.8 -9.4 -11.9 -10.7 -10.0 -8.5 -8.8 -8.2 -6.5 -13.9 -13.0 -12.9 -10.9 -3.1 -2.3 -3.2 -0.2 -9.5 -10.8 -10.2 -8.2 -1.1 -1.3 -0.8 1.2 -8.6 -7.8 -7.8 -5.0 9.0 9.5 8.9 12.0 -50.0 -13.0 -50.0 -10.3 -6.2 -7.0 -7.1 -5.4 -50.0 -9.3 -14.0 -8.2 -2.4 -6.3 -2.4 -3.9
+TAA -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 33.3 -50.0 30.6 -50.0 -50.0 -50.0 -50.0 -50.0 29.2 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0
+TAC -14.8 -6.2 -14.9 -7.6 -14.6 -11.6 -13.0 -12.2 -14.0 -9.0 -14.1 -10.1 -13.6 -11.2 -12.4 -12.5 -8.1 2.2 -8.7 0.4 -15.3 -11.3 -13.2 -13.3 -11.9 -7.2 -11.8 -7.9 -10.7 -7.4 -10.8 -8.9 -15.5 -10.3 -15.5 -12.5 -14.5 -12.1 -11.9 -12.8 -18.3 -12.7 -16.4 -14.7 -14.2 -10.4 -13.7 -13.0 -50.0 15.1 -50.0 13.3 -9.9 -5.8 -8.9 -7.6 -50.0 -1.5 -7.8 -3.0 -8.1 3.6 -9.4 2.0
+TAG -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 30.6 -50.0 35.2 -50.0 -50.0 -50.0 -50.0 -50.0 28.5 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0
+TAT -13.8 -7.5 -14.4 -5.1 -12.6 -11.6 -11.5 -10.1 -13.2 -9.7 -12.7 -7.4 -11.9 -11.9 -11.5 -10.2 -7.2 0.9 -8.6 2.6 -14.1 -12.8 -13.3 -11.0 -9.7 -8.4 -11.2 -5.3 -9.3 -8.7 -9.9 -7.1 -13.9 -11.2 -14.2 -8.9 -12.9 -12.6 -12.6 -10.9 -15.4 -13.8 -14.8 -11.4 -12.8 -11.6 -13.2 -10.3 -50.0 13.3 -50.0 15.2 -8.6 -7.0 -8.4 -4.8 -50.0 -3.0 -7.1 -0.1 -7.3 2.3 -8.7 3.9
+TCA -7.3 -6.3 -8.9 -5.5 2.9 0.2 1.5 1.2 -8.5 -2.3 -10.0 -1.7 -5.6 -10.5 -5.5 -9.6 -4.3 -7.9 -6.1 -7.2 2.3 -0.6 0.5 -0.0 -8.9 -11.0 -9.4 -9.3 -4.6 -9.4 -7.3 -8.1 -7.8 -9.8 -8.9 -9.2 2.8 0.1 1.2 0.8 -5.9 -6.8 -5.9 -6.3 -4.1 -7.2 -6.1 -6.2 -50.0 -9.9 -50.0 -8.6 12.5 9.4 11.0 9.8 -50.0 -4.8 -8.7 -3.5 -0.0 -8.6 -2.1 -7.0
+TCC -10.1 -4.3 -10.8 -5.8 0.1 2.3 0.7 0.6 -10.4 -0.2 -10.8 -2.0 -8.8 -9.2 -8.0 -9.7 -6.8 -5.9 -7.5 -7.0 -1.4 2.2 -0.9 -0.4 -10.8 -7.7 -9.8 -8.8 -8.9 -7.3 -9.6 -9.1 -10.5 -8.3 -10.2 -10.0 -0.0 2.5 0.7 0.5 -7.5 -5.1 -7.1 -6.7 -7.4 -5.7 -8.1 -7.0 -50.0 -5.8 -50.0 -7.0 9.4 12.7 10.1 10.0 -50.0 -1.5 -11.3 -3.3 -5.4 -4.2 -5.6 -6.8
+TCG -8.4 -6.2 -8.9 -5.9 1.8 0.4 2.7 1.2 -9.1 -1.2 -8.8 -1.5 -7.2 -10.5 -4.9 -9.6 -5.7 -7.4 -5.5 -7.4 0.3 -0.3 1.8 -0.4 -9.0 -8.7 -7.3 -8.9 -5.9 -9.4 -6.4 -8.7 -8.7 -9.1 -8.5 -9.4 1.4 0.7 3.5 0.7 -6.6 -5.3 -4.7 -5.8 -5.7 -7.5 -5.5 -7.1 -50.0 -8.9 -50.0 -8.4 11.0 10.1 13.2 10.2 -50.0 -4.2 -6.1 -3.6 -3.5 -8.4 -1.2 -6.8
+TCT -9.1 -5.4 -10.4 -4.2 1.0 0.5 0.6 2.4 -9.6 -1.5 -10.5 -0.5 -8.9 -10.5 -7.2 -8.3 -6.3 -6.9 -7.7 -5.1 -0.5 0.1 -0.6 2.3 -11.1 -10.2 -10.5 -7.0 -8.3 -9.4 -9.4 -6.6 -9.6 -9.7 -10.0 -7.9 0.7 0.6 0.8 2.3 -7.1 -6.5 -7.2 -5.0 -6.5 -7.4 -7.5 -5.4 -50.0 -7.6 -50.0 -4.8 9.8 10.0 10.2 12.1 -50.0 -3.1 -11.4 -0.7 -4.6 -6.6 -4.9 -3.9
+TGA -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 29.2 -50.0 28.5 -50.0 -50.0 -50.0 -50.0 -50.0 33.3 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0 -50.0
+TGC -13.0 -7.0 -13.1 -8.3 -9.2 -6.5 -8.9 -7.6 -9.8 -0.7 -9.2 -2.1 -12.4 -10.3 -12.4 -11.6 -9.5 -4.6 -10.4 -5.7 -13.6 -10.1 -11.4 -10.9 -6.5 -1.5 -6.2 -3.6 -12.8 -8.0 -12.3 -9.7 -16.5 -12.7 -15.6 -14.3 -8.7 -6.0 -7.1 -7.5 -9.5 -4.1 -8.4 -6.1 -11.0 -8.6 -11.3 -9.3 -50.0 -1.5 -50.0 -3.0 -4.8 -1.5 -4.2 -3.1 -50.0 16.4 -5.1 14.2 -10.1 -4.3 -10.0 -5.4
+TGG -13.5 -16.3 -11.8 -15.5 -13.6 -15.3 -12.1 -16.1 -7.7 -13.3 -4.2 -12.6 -14.1 -16.2 -10.3 -15.4 -9.6 -11.6 -7.7 -9.8 -14.3 -17.5 -11.1 -17.2 -5.3 -7.1 -2.2 -7.1 -9.6 -11.9 -8.7 -11.4 -17.1 -20.6 -15.3 -19.9 -14.1 -14.8 -12.4 -16.5 -11.7 -12.8 -6.4 -13.4 -14.3 -15.7 -12.7 -14.0 -50.0 -7.8 -50.0 -7.1 -8.7 -11.3 -6.1 -11.4 -50.0 -5.1 18.6 -4.8 -8.6 -8.1 -4.5 -7.3
+TGT -12.4 -8.2 -13.0 -5.9 -7.7 -7.8 -7.4 -5.8 -8.8 -2.0 -9.3 -0.4 -11.6 -11.3 -11.6 -9.4 -8.9 -5.6 -10.0 -3.2 -11.5 -11.1 -10.5 -8.3 -6.3 -4.9 -6.7 -1.2 -10.6 -9.7 -11.3 -8.0 -14.8 -13.1 -15.5 -11.4 -7.5 -7.5 -7.1 -5.9 -8.4 -5.8 -8.1 -3.6 -9.9 -9.7 -10.1 -8.2 -50.0 -3.0 -50.0 -0.1 -3.5 -3.3 -3.6 -0.7 -50.0 14.2 -4.8 16.4 -8.9 -5.8 -8.9 -3.4
+TTA -10.7 -13.2 -12.4 -11.6 -4.8 -8.4 -6.1 -7.2 -10.3 -11.8 -11.0 -11.6 2.8 -1.7 0.7 -1.1 -6.2 -9.7 -7.3 -8.4 -5.3 -8.6 -6.1 -8.3 -10.5 -11.2 -9.8 -10.2 9.5 6.5 7.6 6.8 -12.1 -16.0 -13.0 -16.2 -5.2 -8.6 -7.1 -7.8 -12.5 -14.4 -12.1 -13.0 1.4 -3.0 -1.6 -2.4 -50.0 -8.1 -50.0 -7.3 -0.0 -5.4 -3.5 -4.6 -50.0 -10.1 -8.6 -8.9 13.2 -0.9 9.7 -0.3
+TTC -18.1 -12.3 -19.4 -14.2 -12.2 -9.5 -11.5 -10.8 -17.9 -11.9 -18.2 -13.9 -6.4 -3.7 -6.8 -5.4 -12.8 -5.9 -13.6 -7.4 -13.7 -9.9 -12.9 -12.7 -16.2 -11.6 -15.2 -12.7 -3.2 0.5 -3.3 -1.8 -18.9 -16.6 -18.8 -19.2 -12.5 -9.4 -10.6 -11.1 -17.3 -14.1 -16.2 -16.0 -7.9 -3.7 -7.8 -6.3 -50.0 3.6 -50.0 2.3 -8.6 -4.2 -8.4 -6.6 -50.0 -4.3 -8.1 -5.8 -0.9 14.2 -1.8 11.6
+TTG -11.8 -13.1 -11.5 -12.0 -6.2 -8.3 -5.4 -7.4 -11.8 -12.3 -11.1 -11.4 0.5 -2.5 2.4 -1.6 -6.8 -9.9 -6.4 -8.9 -6.1 -8.6 -5.0 -7.8 -10.4 -11.6 -8.5 -9.9 8.2 6.4 8.7 6.9 -12.4 -17.0 -12.6 -15.8 -6.5 -8.5 -5.0 -7.8 -13.2 -13.6 -10.3 -13.7 -0.7 -3.2 -0.3 -2.4 -50.0 -9.4 -50.0 -8.7 -2.1 -5.6 -1.2 -4.9 -50.0 -10.0 -4.5 -8.9 9.7 -1.8 11.3 -0.9
+TTT -17.2 -13.3 -17.8 -11.6 -11.0 -11.1 -10.4 -9.0 -16.0 -12.7 -16.1 -11.1 -5.2 -5.4 -6.0 -3.3 -11.8 -7.3 -13.3 -5.9 -12.7 -11.9 -11.9 -9.3 -15.2 -15.0 -14.9 -11.0 -2.8 -1.8 -2.9 0.4 -17.4 -18.4 -18.3 -15.9 -10.7 -11.2 -10.6 -9.0 -15.4 -14.8 -15.8 -13.3 -6.8 -5.9 -7.2 -3.9 -50.0 2.0 -50.0 3.9 -7.0 -6.8 -6.8 -3.9 -50.0 -5.4 -7.3 -3.4 -0.3 11.6 -0.9 14.1
diff --git a/code/lib/Bio/Align/substitution_matrices/data/STR b/code/lib/Bio/Align/substitution_matrices/data/STR
new file mode 100644
index 0000000..23189c3
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/STR
@@ -0,0 +1,26 @@
+# Steven Henikoff and Jorja G. Henikoff:
+# "Performance evaluation of amino acid substitution matrices."
+# Proteins: Structure, Function, and Genetics: 17(1): 49-61 (1993).
+# Figure 1, lower triangle.
+# PMID 8234244
+ A C D E F G H I K L M N P Q R S T V W Y
+A 4 -2 -1 0 -3 0 -2 -2 -1 -2 0 -1 -1 0 -1 0 -1 0 -3 -3
+C -2 11 -7 -3 -2 -6 -6 -4 -4 -6 -5 -6 -8 -3 -2 -4 -5 -4 -6 -6
+D -1 -7 6 2 -5 -1 0 -3 -1 -6 -4 2 -1 0 -2 0 -1 -4 -6 -3
+E 0 -3 2 5 -4 -2 -2 -3 1 -4 -2 0 -1 2 0 -1 0 -2 -6 -2
+F -3 -2 -5 -4 7 -6 -2 1 -3 2 0 -3 -5 -4 -4 -3 -3 -1 2 3
+G 0 -6 -1 -2 -6 5 -3 -5 -3 -5 -4 -1 -2 -2 -2 -1 -3 -4 -4 -3
+H -2 -6 0 -2 -2 -3 8 -5 0 -3 -2 2 -3 0 0 -2 -2 -2 -3 0
+I -2 -4 -3 -3 1 -5 -5 6 -3 2 1 -3 -4 -5 -3 -3 -2 2 -2 -1
+K -1 -4 -1 1 -3 -3 0 -3 5 -2 -1 0 -1 1 2 -1 0 -3 -3 -2
+L -2 -6 -6 -4 2 -5 -3 2 -2 5 3 -3 -3 -3 -3 -4 -3 1 -1 -2
+M 0 -5 -4 -2 0 -4 -2 1 -1 3 8 -2 -6 1 -4 -4 -2 0 -2 -1
+N -1 -6 2 0 -3 -1 2 -3 0 -3 -2 5 -2 0 -1 0 0 -4 -5 -1
+P -1 -8 -1 -1 -5 -2 -3 -4 -1 -3 -6 -2 7 -2 -2 -1 -1 -4 -4 -6
+Q 0 -3 0 2 -4 -2 0 -5 1 -3 1 0 -2 6 1 -1 0 -2 -5 -3
+R -1 -2 -2 0 -4 -2 0 -3 2 -3 -4 -1 -2 1 7 0 -1 -3 -2 -1
+S 0 -4 0 -1 -3 -1 -2 -3 -1 -4 -4 0 -1 -1 0 4 1 -3 -5 -2
+T -1 -5 -1 0 -3 -3 -2 -2 0 -3 -2 0 -1 0 -1 1 5 -1 -5 -2
+V 0 -4 -4 -2 -1 -4 -2 2 -3 1 0 -4 -4 -2 -3 -3 -1 5 -4 -1
+W -3 -6 -6 -6 2 -4 -3 -2 -3 -1 -2 -5 -4 -5 -2 -5 -5 -4 10 2
+Y -3 -6 -3 -2 3 -3 0 -1 -2 -2 -1 -1 -6 -3 -1 -2 -2 -1 2 7
diff --git a/code/lib/Bio/Align/substitution_matrices/data/TRANS b/code/lib/Bio/Align/substitution_matrices/data/TRANS
new file mode 100644
index 0000000..611e6b9
--- /dev/null
+++ b/code/lib/Bio/Align/substitution_matrices/data/TRANS
@@ -0,0 +1,12 @@
+# David Wheeler,
+# Department of Cell Biology, Baylor College of Medicine, Houston, Texas:
+# "Weight matrices for sequence similarity scoring."
+# Version 2.0, May 1996.
+# David Wheeler defined the Transition/Transversion Matrix as a penalty
+# matrix; the matrix below is a similarity matrix where
+# similarity = 5 - penalty.
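+# For example, an identity (penalty 0) scores 5, a transition such as
+# A<->G (penalty 1) scores 4, and a transversion such as A<->T
+# (penalty 5) scores 0, matching the rows below.
+# (Such data files can be loaded by name via Bio.Align.substitution_matrices.load.)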
+ A T C G
+A 5 0 0 4
+T 0 5 4 0
+C 0 4 5 0
+G 4 0 0 5
diff --git a/code/lib/Bio/AlignIO/ClustalIO.py b/code/lib/Bio/AlignIO/ClustalIO.py
new file mode 100644
index 0000000..49fc51a
--- /dev/null
+++ b/code/lib/Bio/AlignIO/ClustalIO.py
@@ -0,0 +1,305 @@
+# Copyright 2006-2016 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
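+
+For example, a minimal parsing sketch (assuming a Clustal format file named
+"example.aln"; the filename is illustrative)::
+
+    from Bio import AlignIO
+
+    for alignment in AlignIO.parse("example.aln", "clustal"):
+        print("Alignment with %i rows of length %i"
+              % (len(alignment), alignment.get_alignment_length()))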
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.AlignIO.Interfaces import AlignmentIterator
+from Bio.AlignIO.Interfaces import SequentialAlignmentWriter
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+class ClustalWriter(SequentialAlignmentWriter):
+ """Clustalw alignment writer."""
+
+ def write_alignment(self, alignment):
+ """Use this to write (another) single alignment to an open file."""
+ if len(alignment) == 0:
+ raise ValueError("Must have at least one sequence")
+ if alignment.get_alignment_length() == 0:
+ # This doubles as a check for an alignment object
+ raise ValueError("Non-empty sequences are required")
+
+ # Old versions of the parser in Bio.Clustalw used a ._version property
+ try:
+ version = str(alignment._version)
+ except AttributeError:
+ version = ""
+ if not version:
+ version = "1.81"
+ if version.startswith("2."):
+ # e.g. 2.0.x
+ output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version
+ else:
+ # e.g. 1.81 or 1.83
+ output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version
+
+ cur_char = 0
+ max_length = len(alignment[0])
+
+ if max_length <= 0:
+ raise ValueError("Non-empty sequences are required")
+
+ if "clustal_consensus" in alignment.column_annotations:
+ star_info = alignment.column_annotations["clustal_consensus"]
+ else:
+ try:
+ # This was originally stored by Bio.Clustalw as ._star_info
+ star_info = alignment._star_info
+ except AttributeError:
+ star_info = None
+
+ # keep displaying sequences until we reach the end
+ while cur_char != max_length:
+ # calculate the number of sequences to show, which will
+ # be less if we are at the end of the sequence
+ if (cur_char + 50) > max_length:
+ show_num = max_length - cur_char
+ else:
+ show_num = 50
+
+ # go through all of the records and print out the sequences
+            # when we output, we produce a neat 80-column layout, although
+            # this may result in truncation of the ids.
+ for record in alignment:
+ # Make sure we don't get any spaces in the record
+                # identifier when written to the file, by replacing
+ # them with underscores:
+ line = record.id[0:30].replace(" ", "_").ljust(36)
+ line += str(record.seq[cur_char : (cur_char + show_num)])
+ output += line + "\n"
+
+ # now we need to print out the star info, if we've got it
+ if star_info:
+ output += (
+ (" " * 36) + star_info[cur_char : (cur_char + show_num)] + "\n"
+ )
+
+ output += "\n"
+ cur_char += show_num
+
+ # Want a trailing blank new line in case the output is concatenated
+ self.handle.write(output + "\n")
+
+
+class ClustalIterator(AlignmentIterator):
+ """Clustalw alignment iterator."""
+
+ _header = None # for caching lines between __next__ calls
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+
+ if self._header is None:
+ line = handle.readline()
+ else:
+ # Header we saved from when we were parsing
+ # the previous alignment.
+ line = self._header
+ self._header = None
+
+ if not line:
+ raise StopIteration
+
+ # Whitelisted headers we know about
+ known_headers = ["CLUSTAL", "PROBCONS", "MUSCLE", "MSAPROBS", "Kalign"]
+ if line.strip().split()[0] not in known_headers:
+ raise ValueError(
+ "%s is not a known CLUSTAL header: %s"
+ % (line.strip().split()[0], ", ".join(known_headers))
+ )
+
+ # find the clustal version in the header line
+ version = None
+ for word in line.split():
+ if word[0] == "(" and word[-1] == ")":
+ word = word[1:-1]
+ if word[0] in "0123456789":
+ version = word
+ break
+
+ # There should be two blank lines after the header line
+ line = handle.readline()
+ while line.strip() == "":
+ line = handle.readline()
+
+ # If the alignment contains entries with the same sequence
+ # identifier (not a good idea - but seems possible), then this
+ # dictionary based parser will merge their sequences. Fix this?
+ ids = []
+ seqs = []
+ consensus = ""
+ seq_cols = None # Used to extract the consensus
+
+ # Use the first block to get the sequence identifiers
+ while True:
+ if line[0] != " " and line.strip() != "":
+ # Sequences identifier...
+ fields = line.rstrip().split()
+
+                # We expect two fields here; there can also be an optional
+                # third "sequence number" field containing the letter count.
+ if len(fields) < 2 or len(fields) > 3:
+ raise ValueError("Could not parse line:\n%s" % line)
+
+ ids.append(fields[0])
+ seqs.append(fields[1])
+
+ # Record the sequence position to get the consensus
+ if seq_cols is None:
+ start = len(fields[0]) + line[len(fields[0]) :].find(fields[1])
+ end = start + len(fields[1])
+ seq_cols = slice(start, end)
+ del start, end
+ assert fields[1] == line[seq_cols]
+
+ if len(fields) == 3:
+ # This MAY be an old style file with a letter count...
+ try:
+ letters = int(fields[2])
+ except ValueError:
+ raise ValueError(
+ "Could not parse line, bad sequence number:\n%s" % line
+ ) from None
+ if len(fields[1].replace("-", "")) != letters:
+ raise ValueError(
+ "Could not parse line, invalid sequence number:\n%s" % line
+ )
+ elif line[0] == " ":
+ # Sequence consensus line...
+ assert len(ids) == len(seqs)
+ assert len(ids) > 0
+ assert seq_cols is not None
+ consensus = line[seq_cols]
+ assert not line[: seq_cols.start].strip()
+ assert not line[seq_cols.stop :].strip()
+ # Check for blank line (or end of file)
+ line = handle.readline()
+ assert line.strip() == ""
+ break
+ else:
+ # No consensus
+ break
+ line = handle.readline()
+ if not line:
+ break # end of file
+
+ assert line.strip() == ""
+ assert seq_cols is not None
+
+ # Confirm all same length
+ for s in seqs:
+ assert len(s) == len(seqs[0])
+ if consensus:
+ assert len(consensus) == len(seqs[0])
+
+ # Loop over any remaining blocks...
+ done = False
+ while not done:
+ # There should be a blank line between each block.
+ # Also want to ignore any consensus line from the
+ # previous block.
+ while (not line) or line.strip() == "":
+ line = handle.readline()
+ if not line:
+ break # end of file
+ if not line:
+ break # end of file
+
+ if line.split(None, 1)[0] in known_headers:
+ # Found concatenated alignment.
+ self._header = line
+ break
+
+ for i in range(len(ids)):
+ if line[0] == " ":
+ raise ValueError("Unexpected line:\n%r" % line)
+ fields = line.rstrip().split()
+
+                # We expect two fields here; there can also be an optional
+                # third "sequence number" field containing the letter count.
+ if len(fields) < 2 or len(fields) > 3:
+ raise ValueError("Could not parse line:\n%r" % line)
+
+ if fields[0] != ids[i]:
+ raise ValueError(
+ "Identifiers out of order? Got '%s' but expected '%s'"
+ % (fields[0], ids[i])
+ )
+
+ if fields[1] != line[seq_cols]:
+ start = len(fields[0]) + line[len(fields[0]) :].find(fields[1])
+ if start != seq_cols.start:
+ raise ValueError("Old location %s -> %i:XX" % (seq_cols, start))
+ end = start + len(fields[1])
+ seq_cols = slice(start, end)
+ del start, end
+
+ # Append the sequence
+ seqs[i] += fields[1]
+ assert len(seqs[i]) == len(seqs[0])
+
+ if len(fields) == 3:
+ # This MAY be an old style file with a letter count...
+ try:
+ letters = int(fields[2])
+ except ValueError:
+ raise ValueError(
+ "Could not parse line, bad sequence number:\n%s" % line
+ ) from None
+ if len(seqs[i].replace("-", "")) != letters:
+ raise ValueError(
+ "Could not parse line, invalid sequence number:\n%s" % line
+ )
+
+ # Read in the next line
+ line = handle.readline()
+ # There should now be a consensus line
+ if consensus:
+ assert line[0] == " "
+ assert seq_cols is not None
+ consensus += line[seq_cols]
+ assert len(consensus) == len(seqs[0])
+ assert not line[: seq_cols.start].strip()
+ assert not line[seq_cols.stop :].strip()
+ # Read in the next line
+ line = handle.readline()
+
+ assert len(ids) == len(seqs)
+ if len(seqs) == 0 or len(seqs[0]) == 0:
+ raise StopIteration
+
+ if (
+ self.records_per_alignment is not None
+ and self.records_per_alignment != len(ids)
+ ):
+ raise ValueError(
+ "Found %i records in this alignment, told to expect %i"
+ % (len(ids), self.records_per_alignment)
+ )
+
+ records = (SeqRecord(Seq(s), id=i, description=i) for (i, s) in zip(ids, seqs))
+ alignment = MultipleSeqAlignment(records)
+ # TODO - Handle alignment annotation better, for now
+ # mimic the old parser in Bio.Clustalw
+ if version:
+ alignment._version = version
+ if consensus:
+ alignment_length = len(seqs[0])
+ if len(consensus) != alignment_length:
+ raise ValueError(
+ "Alignment length is %i, consensus length is %i, '%s'"
+ % (alignment_length, len(consensus), consensus)
+ )
+ alignment.column_annotations["clustal_consensus"] = consensus
+ # For backward compatibility prior to .column_annotations:
+ alignment._star_info = consensus
+ return alignment
diff --git a/code/lib/Bio/AlignIO/EmbossIO.py b/code/lib/Bio/AlignIO/EmbossIO.py
new file mode 100644
index 0000000..b1ebd4d
--- /dev/null
+++ b/code/lib/Bio/AlignIO/EmbossIO.py
@@ -0,0 +1,219 @@
+# Copyright 2008-2016 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for "emboss" alignment output from EMBOSS tools.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+This module contains a parser for the EMBOSS pairs/simple file format, for
+example from the alignret, water and needle tools.
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.AlignIO.Interfaces import AlignmentIterator
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+class EmbossIterator(AlignmentIterator):
+ """Emboss alignment iterator.
+
+ For reading the (pairwise) alignments from EMBOSS tools in what they
+ call the "pairs" and "simple" formats.
+ """
+
+ _header = None # for caching lines between __next__ calls
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+
+ if self._header is None:
+ line = handle.readline()
+ else:
+ # Header we saved from when we were parsing
+ # the previous alignment.
+ line = self._header
+ self._header = None
+
+ if not line:
+ raise StopIteration
+
+ while line.rstrip() != "#=======================================":
+ line = handle.readline()
+ if not line:
+ raise StopIteration
+
+ length_of_seqs = None
+ number_of_seqs = None
+ ids = []
+ header_dict = {}
+
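+        # The "#" header block parsed below typically looks like this
+        # (values illustrative, following EMBOSS needle/water output):
+        #   # Aligned_sequences: 2
+        #   # 1: IXI_234
+        #   # 2: IXI_235
+        #   # Identity:      18/25 (72.0%)
+        #   # Similarity:    21/25 (84.0%)
+        #   # Gaps:           2/25 ( 8.0%)
+        #   # Score: 79.0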
+ while line[0] == "#":
+ # Read in the rest of this alignment header,
+ # try and discover the number of records expected
+ # and their length
+ parts = line[1:].split(":", 1)
+ key = parts[0].lower().strip()
+ if key == "aligned_sequences":
+ number_of_seqs = int(parts[1].strip())
+ assert len(ids) == 0
+ # Should now expect the record identifiers...
+ for i in range(number_of_seqs):
+ line = handle.readline()
+ parts = line[1:].strip().split(":", 1)
+ assert i + 1 == int(parts[0].strip())
+ ids.append(parts[1].strip())
+ assert len(ids) == number_of_seqs
+ if key == "length":
+ length_of_seqs = int(parts[1].strip())
+
+ # Parse the rest of the header
+ if key == "identity":
+ header_dict["identity"] = int(parts[1].strip().split("/")[0])
+ if key == "similarity":
+ header_dict["similarity"] = int(parts[1].strip().split("/")[0])
+ if key == "gaps":
+ header_dict["gaps"] = int(parts[1].strip().split("/")[0])
+ if key == "score":
+ header_dict["score"] = float(parts[1].strip())
+
+ # And read in another line...
+ line = handle.readline()
+
+ if number_of_seqs is None:
+ raise ValueError("Number of sequences missing!")
+ if length_of_seqs is None:
+ raise ValueError("Length of sequences missing!")
+
+ if (
+ self.records_per_alignment is not None
+ and self.records_per_alignment != number_of_seqs
+ ):
+ raise ValueError(
+ "Found %i records in this alignment, told to expect %i"
+ % (number_of_seqs, self.records_per_alignment)
+ )
+
+ seqs = [""] * len(ids)
+ seq_starts = []
+ index = 0
+
+ # Parse the seqs
+ while line:
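+            # A typical sequence line looks like (illustrative):
+            #   "EMBOSS_001         1 MSKGEE-LFTGV     11"
+            # i.e. the identifier and start coordinate sit in the first 21
+            # columns, followed by the gapped fragment and its end coordinate.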
+ if len(line) > 21:
+ id_start = line[:21].strip().split(None, 1)
+ seq_end = line[21:].strip().split(None, 1)
+ if len(id_start) == 2 and len(seq_end) == 2:
+ # identifier, seq start position, seq, seq end position
+ # (an aligned seq is broken up into multiple lines)
+ id, start = id_start
+ seq, end = seq_end
+ if start >= end:
+ # Special case, either a single letter is present,
+ # or no letters at all.
+ if seq.replace("-", "") == "":
+ start = int(start)
+ end = int(end)
+ else:
+ start = int(start) - 1
+ end = int(end)
+ else:
+ assert seq.replace("-", "") != "", repr(line)
+ start = int(start) - 1 # python counting
+ end = int(end)
+
+ if index < 0 or index >= number_of_seqs:
+ raise ValueError(
+ "Expected index %i in range [0,%i)"
+ % (index, number_of_seqs)
+ )
+ # The identifier is truncated...
+ assert id == ids[index] or id == ids[index][: len(id)]
+
+ if len(seq_starts) == index:
+ # Record the start
+ seq_starts.append(start)
+
+ # Check the start...
+ if start >= end:
+ assert seq.replace("-", "") == "", line
+ elif start - seq_starts[index] != len(seqs[index].replace("-", "")):
+ raise ValueError(
+ "Found %i chars so far for sequence %i (%s, %r), line says start %i:\n%s"
+ % (
+ len(seqs[index].replace("-", "")),
+ index,
+ id,
+ seqs[index],
+ start,
+ line,
+ )
+ )
+ seqs[index] += seq
+
+ # Check the end ...
+ if end != seq_starts[index] + len(seqs[index].replace("-", "")):
+ raise ValueError(
+ "Found %i chars so far for sequence %i (%s, %r, start=%i), file says end %i:\n%s"
+ % (
+ len(seqs[index].replace("-", "")),
+ index,
+ id,
+ seqs[index],
+ seq_starts[index],
+ end,
+ line,
+ )
+ )
+
+ index += 1
+ if index >= number_of_seqs:
+ index = 0
+ else:
+                    # Just a start value; this is presumably alignment annotation.
+ # print("Skipping: " + line.rstrip())
+ pass
+ elif line.strip() == "":
+ # Just a spacer?
+ pass
+ else:
+ raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)
+
+ line = handle.readline()
+ if (
+ line.rstrip() == "#---------------------------------------"
+ or line.rstrip() == "#======================================="
+ ):
+ # End of alignment
+ self._header = line
+ break
+
+ assert index == 0
+
+ if (
+ self.records_per_alignment is not None
+ and self.records_per_alignment != len(ids)
+ ):
+ raise ValueError(
+ "Found %i records in this alignment, told to expect %i"
+ % (len(ids), self.records_per_alignment)
+ )
+
+ records = []
+ for id, seq in zip(ids, seqs):
+ if len(seq) != length_of_seqs:
+ # EMBOSS 2.9.0 is known to use spaces instead of minus signs
+ # for leading gaps, and thus fails to parse. This old version
+ # is still used as of Dec 2008 behind the EBI SOAP webservice:
+ # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
+ raise ValueError(
+ "Error parsing alignment - sequences of "
+ "different length? You could be using an "
+ "old version of EMBOSS."
+ )
+ records.append(SeqRecord(Seq(seq), id=id, description=id))
+ return MultipleSeqAlignment(records, annotations=header_dict)
diff --git a/code/lib/Bio/AlignIO/FastaIO.py b/code/lib/Bio/AlignIO/FastaIO.py
new file mode 100644
index 0000000..9816253
--- /dev/null
+++ b/code/lib/Bio/AlignIO/FastaIO.py
@@ -0,0 +1,344 @@
+# Copyright 2008-2016 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+This module contains a parser for the pairwise alignments produced by Bill
+Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
+referred to as the "fasta-m10" file format (as we only support the machine
+readable output format selected with the -m 10 command line option).
+
+This module does NOT cover the generic "fasta" file format originally
+developed as an input format to the FASTA tools. Both Bio.AlignIO and
+Bio.SeqIO use the Bio.SeqIO.FastaIO module to deal with these files,
+which can also be used to store multiple sequence alignments.
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+def _extract_alignment_region(alignment_seq_with_flanking, annotation):
+ """Extract alignment region (PRIVATE).
+
+ Helper function for the main parsing code.
+
+ To get the actual pairwise alignment sequences, we must first
+ translate the un-gapped sequence based coordinates into positions
+ in the gapped sequence (which may have a flanking region shown
+ using leading - characters). To date, I have never seen any
+ trailing flanking region shown in the m10 file, but the
+ following code should also cope with that.
+
+ Note that this code seems to work fine even when the "sq_offset"
+ entries are present as a result of using the -X command line option.
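+
+    A worked illustration with hypothetical values: given
+    alignment_seq_with_flanking "--ABCDEF" and annotation
+    {"al_display_start": "5", "al_start": "7", "al_stop": "9"}, the
+    flanking dashes are stripped to leave "ABCDEF", whose first letter is
+    position 5; residues 7..9 therefore map to the slice [2:5], so "CDE"
+    is returned.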
+ """
+ align_stripped = alignment_seq_with_flanking.strip("-")
+ display_start = int(annotation["al_display_start"])
+ if int(annotation["al_start"]) <= int(annotation["al_stop"]):
+ start = int(annotation["al_start"]) - display_start
+ end = int(annotation["al_stop"]) - display_start + 1
+ else:
+ # FASTA has flipped this sequence...
+ start = display_start - int(annotation["al_start"])
+ end = display_start - int(annotation["al_stop"]) + 1
+
+ end += align_stripped.count("-")
+ if start < 0 or start >= end or end > len(align_stripped):
+ raise ValueError(
+ "Problem with sequence start/stop,\n%s[%i:%i]\n%s"
+ % (alignment_seq_with_flanking, start, end, annotation)
+ )
+ return align_stripped[start:end]
+
+
+def FastaM10Iterator(handle, seq_count=None):
+ """Alignment iterator for the FASTA tool's pairwise alignment output.
+
+ This is for reading the pairwise alignments output by Bill Pearson's
+ FASTA program when called with the -m 10 command line option for machine
+ readable output. For more details about the FASTA tools, see the website
+ http://fasta.bioch.virginia.edu/ and the paper:
+
+ W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
+
+ This class is intended to be used via the Bio.AlignIO.parse() function
+ by specifying the format as "fasta-m10" as shown in the following code::
+
+ from Bio import AlignIO
+ handle = ...
+ for a in AlignIO.parse(handle, "fasta-m10"):
+ assert len(a) == 2, "Should be pairwise!"
+ print("Alignment length %i" % a.get_alignment_length())
+ for record in a:
+ print("%s %s %s" % (record.seq, record.name, record.id))
+
+    Note that this is not a full-blown parser for all the information
+ in the FASTA output - for example, most of the header and all of the
+ footer is ignored. Also, the alignments are not batched according to
+ the input queries.
+
+ Also note that there can be up to about 30 letters of flanking region
+ included in the raw FASTA output as contextual information. This is NOT
+ part of the alignment itself, and is not included in the resulting
+ MultipleSeqAlignment objects returned.
+ """
+ state_PREAMBLE = -1
+ state_NONE = 0
+ state_QUERY_HEADER = 1
+ state_ALIGN_HEADER = 2
+ state_ALIGN_QUERY = 3
+ state_ALIGN_MATCH = 4
+ state_ALIGN_CONS = 5
+
+ def build_hsp():
+ if not query_tags and not match_tags:
+ raise ValueError("No data for query %r, match %r" % (query_id, match_id))
+ assert query_tags, query_tags
+ assert match_tags, match_tags
+ evalue = align_tags.get("fa_expect")
+ tool = global_tags.get("tool", "").upper()
+
+ q = _extract_alignment_region(query_seq, query_tags)
+ if tool in ["TFASTX"] and len(match_seq) == len(q):
+ m = match_seq
+ # Quick hack until I can work out how -, * and / characters
+ # and the apparent mix of aa and bp coordinates works.
+ else:
+ m = _extract_alignment_region(match_seq, match_tags)
+ if len(q) != len(m):
+ raise ValueError(
+ f"""\
+Darn... amino acids vs nucleotide coordinates?
+tool: {tool}
+query_seq: {query_seq}
+query_tags: {query_tags}
+{q} length: {len(q)}
+match_seq: {match_seq}
+match_tags: {match_tags}
+{m} length: {len(m)}
+handle.name: {handle.name}
+"""
+ )
+
+ annotations = {}
+ records = []
+
+ # Want to record both the query header tags, and the alignment tags.
+ annotations.update(header_tags)
+ annotations.update(align_tags)
+
+ # Query
+ # =====
+ record = SeqRecord(
+ Seq(q),
+ id=query_id,
+ name="query",
+ description=query_descr,
+ annotations={"original_length": int(query_tags["sq_len"])},
+ )
+ # TODO - handle start/end coordinates properly. Short term hack for now:
+ record._al_start = int(query_tags["al_start"])
+ record._al_stop = int(query_tags["al_stop"])
+
+ # TODO - Can FASTA output RNA?
+ if "sq_type" in query_tags:
+ if query_tags["sq_type"] == "D":
+ record.annotations["molecule_type"] = "DNA"
+ elif query_tags["sq_type"] == "p":
+ record.annotations["molecule_type"] = "protein"
+
+ records.append(record)
+
+ # Match
+ # =====
+ record = SeqRecord(
+ Seq(m),
+ id=match_id,
+ name="match",
+ description=match_descr,
+ annotations={"original_length": int(match_tags["sq_len"])},
+ )
+ # TODO - handle start/end coordinates properly. Short term hack for now:
+ record._al_start = int(match_tags["al_start"])
+ record._al_stop = int(match_tags["al_stop"])
+
+ if "sq_type" in match_tags:
+ if match_tags["sq_type"] == "D":
+ record.annotations["molecule_type"] = "DNA"
+ elif match_tags["sq_type"] == "p":
+ record.annotations["molecule_type"] = "protein"
+
+ records.append(record)
+
+ return MultipleSeqAlignment(records, annotations=annotations)
+
+ state = state_PREAMBLE
+ query_id = None
+ match_id = None
+ query_descr = ""
+ match_descr = ""
+ global_tags = {}
+ header_tags = {}
+ align_tags = {}
+ query_tags = {}
+ match_tags = {}
+ query_seq = ""
+ match_seq = ""
+ cons_seq = ""
+ for line in handle:
+ if ">>>" in line and not line.startswith(">>>"):
+ if query_id and match_id:
+ # This happens on old FASTA output which lacked an end of
+ # query >>><<< marker line.
+ yield build_hsp()
+ state = state_NONE
+ query_descr = line[line.find(">>>") + 3 :].strip()
+ query_id = query_descr.split(None, 1)[0]
+ match_id = None
+ header_tags = {}
+ align_tags = {}
+ query_tags = {}
+ match_tags = {}
+ query_seq = ""
+ match_seq = ""
+ cons_seq = ""
+ elif line.startswith("!! No "):
+ # e.g.
+ # !! No library sequences with E() < 0.5
+ # or on more recent versions,
+ # No sequences with E() < 0.05
+ assert state == state_NONE
+ assert not header_tags
+ assert not align_tags
+ assert not match_tags
+ assert not query_tags
+ assert match_id is None
+ assert not query_seq
+ assert not match_seq
+ assert not cons_seq
+ query_id = None
+ elif line.strip() in [">>><<<", ">>>///"]:
+ # End of query, possible end of all queries
+ if query_id and match_id:
+ yield build_hsp()
+ state = state_NONE
+ query_id = None
+ match_id = None
+ header_tags = {}
+ align_tags = {}
+ query_tags = {}
+ match_tags = {}
+ query_seq = ""
+ match_seq = ""
+ cons_seq = ""
+ elif line.startswith(">>>"):
+ # Should be start of a match!
+ assert query_id is not None
+ assert line[3:].split(", ", 1)[0] == query_id, line
+ assert match_id is None
+ assert not header_tags
+ assert not align_tags
+ assert not query_tags
+ assert not match_tags
+ assert not match_seq
+ assert not query_seq
+ assert not cons_seq
+ state = state_QUERY_HEADER
+ elif line.startswith(">>"):
+ # Should now be at start of a match alignment!
+ if query_id and match_id:
+ yield build_hsp()
+ align_tags = {}
+ query_tags = {}
+ match_tags = {}
+ query_seq = ""
+ match_seq = ""
+ cons_seq = ""
+ match_descr = line[2:].strip()
+ match_id = match_descr.split(None, 1)[0]
+ state = state_ALIGN_HEADER
+ elif line.startswith(">--"):
+ # End of one HSP
+ assert query_id and match_id, line
+ yield build_hsp()
+ # Clean up read for next HSP
+ # but reuse header_tags
+ align_tags = {}
+ query_tags = {}
+ match_tags = {}
+ query_seq = ""
+ match_seq = ""
+ cons_seq = ""
+ state = state_ALIGN_HEADER
+ elif line.startswith(">"):
+ if state == state_ALIGN_HEADER:
+ # Should be start of query alignment seq...
+ assert query_id is not None, line
+ assert match_id is not None, line
+ assert query_id.startswith(line[1:].split(None, 1)[0]), line
+ state = state_ALIGN_QUERY
+ elif state == state_ALIGN_QUERY:
+ # Should be start of match alignment seq
+ assert query_id is not None, line
+ assert match_id is not None, line
+ assert match_id.startswith(line[1:].split(None, 1)[0]), line
+ state = state_ALIGN_MATCH
+ elif state == state_NONE:
+ # Can get > as the last line of a histogram
+ pass
+ else:
+ raise RuntimeError("state %i got %r" % (state, line))
+ elif line.startswith("; al_cons"):
+ assert state == state_ALIGN_MATCH, line
+ state = state_ALIGN_CONS
+ # Next line(s) should be consensus seq...
+ elif line.startswith("; "):
+ if ": " in line:
+ key, value = [s.strip() for s in line[2:].split(": ", 1)]
+ else:
+ import warnings
+ from Bio import BiopythonParserWarning
+
+ # Seen in lalign36, specifically version 36.3.4 Apr, 2011
+            # Fixed in version 36.3.5b Oct, 2011 (preload8)
+ warnings.warn(
+ "Missing colon in line: %r" % line, BiopythonParserWarning
+ )
+ try:
+ key, value = [s.strip() for s in line[2:].split(" ", 1)]
+ except ValueError:
+ raise ValueError("Bad line: %r" % line) from None
+ if state == state_QUERY_HEADER:
+ header_tags[key] = value
+ elif state == state_ALIGN_HEADER:
+ align_tags[key] = value
+ elif state == state_ALIGN_QUERY:
+ query_tags[key] = value
+ elif state == state_ALIGN_MATCH:
+ match_tags[key] = value
+ else:
+ raise RuntimeError("Unexpected state %r, %r" % (state, line))
+ elif state == state_ALIGN_QUERY:
+ query_seq += line.strip()
+ elif state == state_ALIGN_MATCH:
+ match_seq += line.strip()
+ elif state == state_ALIGN_CONS:
+ cons_seq += line.strip("\n")
+ elif state == state_PREAMBLE:
+ if line.startswith("#"):
+ global_tags["command"] = line[1:].strip()
+ elif line.startswith(" version "):
+ global_tags["version"] = line[9:].strip()
+ elif " compares a " in line:
+ global_tags["tool"] = line[: line.find(" compares a ")].strip()
+ elif " searches a " in line:
+ global_tags["tool"] = line[: line.find(" searches a ")].strip()
+ else:
+ pass
diff --git a/code/lib/Bio/AlignIO/Interfaces.py b/code/lib/Bio/AlignIO/Interfaces.py
new file mode 100644
index 0000000..b53de30
--- /dev/null
+++ b/code/lib/Bio/AlignIO/Interfaces.py
@@ -0,0 +1,160 @@
+# Copyright 2008-2018 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""AlignIO support module (not for general use).
+
+Unless you are writing a new parser or writer for Bio.AlignIO, you should not
+use this module. It provides base classes to try and simplify things.
+"""
+
+
+class AlignmentIterator:
+ """Base class for building MultipleSeqAlignment iterators.
+
+ You should write a next() method to return Alignment
+ objects. You may wish to redefine the __init__
+ method as well.
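+
+    A minimal subclass sketch (``OneLineIterator`` is hypothetical, and
+    treats each input line as a single-record alignment)::
+
+        from Bio.Align import MultipleSeqAlignment
+        from Bio.Seq import Seq
+        from Bio.SeqRecord import SeqRecord
+
+        class OneLineIterator(AlignmentIterator):
+            def __next__(self):
+                line = self.handle.readline()
+                if not line:
+                    raise StopIteration
+                record = SeqRecord(Seq(line.strip()), id="row")
+                return MultipleSeqAlignment([record])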
+ """
+
+ def __init__(self, handle, seq_count=None):
+ """Create an AlignmentIterator object.
+
+ Arguments:
+ - handle - input file
+        - seq_count - optional, expected number of records per alignment.
+          Recommended for the fasta file format.
+
+ Note when subclassing:
+        - there should be a single non-optional argument, the handle,
+          and the optional seq_count IN THAT ORDER.
+ - you can add additional optional arguments.
+
+ """
+ self.handle = handle
+ self.records_per_alignment = seq_count
+ #####################################################
+ # You may want to subclass this, for example #
+ # to read through the file to find the first record,#
+ # or if additional arguments are required. #
+ #####################################################
+
+ def __next__(self):
+ """Return the next alignment in the file.
+
+ This method should be replaced by any derived class to do something
+ useful.
+ """
+ raise NotImplementedError("This object should be subclassed")
+ #####################################################
+ # You SHOULD subclass this, to split the file up #
+ # into your individual alignments and convert these #
+ # into MultipleSeqAlignment objects. #
+ #####################################################
+
+ def __iter__(self):
+ """Iterate over the entries as MultipleSeqAlignment objects.
+
+ Example usage for (concatenated) PHYLIP files::
+
+ with open("many.phy","r") as myFile:
+ for alignment in PhylipIterator(myFile):
+ print("New alignment:")
+ for record in alignment:
+ print(record.id)
+ print(record.seq)
+
+ """
+ return iter(self.__next__, None)
+
+
+class AlignmentWriter:
+ """Base class for building MultipleSeqAlignment writers.
+
+ You should write a write_alignment() method.
+ You may wish to redefine the __init__ method as well.
+ """
+
+ def __init__(self, handle):
+ """Initialize the class."""
+ self.handle = handle
+
+ def write_file(self, alignments):
+ """Use this to write an entire file containing the given alignments.
+
+ Arguments:
+ - alignments - A list or iterator returning MultipleSeqAlignment objects
+
+ In general, this method can only be called once per file.
+
+ This method should be replaced by any derived class to do something
+        useful. It should return the number of alignments written.
+ """
+ raise NotImplementedError("This object should be subclassed")
+ #####################################################
+ # You SHOULD subclass this, to write the alignment #
+ # objects to the file handle #
+ #####################################################
+
+ def clean(self, text):
+ """Use this to avoid getting newlines in the output."""
+ return text.replace("\n", " ").replace("\r", " ")
+
+
+class SequentialAlignmentWriter(AlignmentWriter):
+ """Base class for building MultipleSeqAlignment writers.
+
+ This assumes each alignment can be simply appended to the file.
+ You should write a write_alignment() method.
+ You may wish to redefine the __init__ method as well.
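+
+    A minimal subclass sketch (``CountWriter`` is hypothetical, writing one
+    summary line per alignment)::
+
+        class CountWriter(SequentialAlignmentWriter):
+            def write_alignment(self, alignment):
+                self.handle.write(
+                    "%i rows x %i columns\n"
+                    % (len(alignment), alignment.get_alignment_length())
+                )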
+ """
+
+ def __init__(self, handle):
+ """Initialize the class."""
+ self.handle = handle
+
+ def write_file(self, alignments):
+ """Use this to write an entire file containing the given alignments.
+
+ Arguments:
+ - alignments - A list or iterator returning MultipleSeqAlignment objects
+
+ In general, this method can only be called once per file.
+ """
+ self.write_header()
+ count = 0
+ for alignment in alignments:
+ self.write_alignment(alignment)
+ count += 1
+ self.write_footer()
+ return count
+
+ def write_header(self):
+ """Use this to write any header.
+
+ This method should be replaced by any derived class to do something
+ useful.
+ """
+ pass
+
+ def write_footer(self):
+ """Use this to write any footer.
+
+ This method should be replaced by any derived class to do something
+ useful.
+ """
+ pass
+
+ def write_alignment(self, alignment):
+ """Use this to write a single alignment.
+
+ This method should be replaced by any derived class to do something
+ useful.
+ """
+ raise NotImplementedError("This object should be subclassed")
+ #####################################################
+ # You SHOULD subclass this, to write the alignment #
+ # objects to the file handle #
+ #####################################################
diff --git a/code/lib/Bio/AlignIO/MafIO.py b/code/lib/Bio/AlignIO/MafIO.py
new file mode 100644
index 0000000..787325e
--- /dev/null
+++ b/code/lib/Bio/AlignIO/MafIO.py
@@ -0,0 +1,833 @@
+# Copyright 2011, 2012 by Andrew Sczesnak. All rights reserved.
+# Revisions Copyright 2011, 2017 by Peter Cock. All rights reserved.
+# Revisions Copyright 2014, 2015 by Adam Novak. All rights reserved.
+# Revisions Copyright 2015, 2017 by Blaise Li. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for the "maf" multiple alignment format.
+
+The Multiple Alignment Format, described by UCSC, stores a series of
+multiple alignments in a single file. It is suitable for whole-genome
+to whole-genome alignments; metadata such as source chromosome, start
+position, size, and strand can be stored.
+
+See http://genome.ucsc.edu/FAQ/FAQformat.html#format5
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+Coordinates in the MAF format are defined in terms of zero-based start
+positions (like Python) and aligning region sizes.
+
+A minimal aligned region of length one, starting at the first position in
+the source sequence, would have ``start == 0`` and ``size == 1``.
+
+As we can see from this example, ``start + size`` gives one more than the
+zero-based end position. We can therefore use ``start`` and
+``start + size`` directly as Python list slice boundaries.
+
+For an inclusive end coordinate, we need to use ``end = start + size - 1``.
+A 1-column wide alignment would have ``start == end``.
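+
+For example (a worked illustration, not from a real MAF file), an aligned
+region with ``start == 5`` and ``size == 3`` covers the zero-based source
+positions 5, 6 and 7, i.e. ``seq[5:8]`` as a Python slice, and has the
+inclusive end coordinate ``7 == 5 + 3 - 1``.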
+"""
+import os
+
+from itertools import islice
+from sqlite3 import dbapi2
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import SequentialAlignmentWriter
+
+MAFINDEX_VERSION = 2
+
+
+class MafWriter(SequentialAlignmentWriter):
+ """Accepts a MultipleSeqAlignment object, writes a MAF file."""
+
+ def write_header(self):
+ """Write the MAF header."""
+ self.handle.write("##maf version=1 scoring=none\n")
+ self.handle.write("# generated by Biopython\n\n")
+
+ def _write_record(self, record):
+ """Write a single SeqRecord object to an 's' line in a MAF block (PRIVATE)."""
+ # convert biopython-style 1/-1 strand to MAF-style +/- strand
+ if record.annotations.get("strand") == 1:
+ strand = "+"
+ elif record.annotations.get("strand") == -1:
+ strand = "-"
+ else:
+ # TODO: issue warning?
+ strand = "+"
+
+ fields = [
+ "s",
+ # In the MAF file format, spaces are not allowed in the id
+ "%-40s" % record.id.replace(" ", "_"),
+ "%15s" % record.annotations.get("start", 0),
+ "%5s"
+ % record.annotations.get("size", len(str(record.seq).replace("-", ""))),
+ strand,
+ "%15s" % record.annotations.get("srcSize", 0),
+ str(record.seq),
+ ]
+ self.handle.write("%s\n" % " ".join(fields))
+
+ def write_alignment(self, alignment):
+ """Write a complete alignment to a MAF block.
+
+ Writes every SeqRecord in a MultipleSeqAlignment object to its own
+ MAF block (beginning with an 'a' line, containing 's' lines).
+ """
+ if not isinstance(alignment, MultipleSeqAlignment):
+ raise TypeError("Expected an alignment object")
+
+ if len({len(x) for x in alignment}) > 1:
+ raise ValueError("Sequences must all be the same length")
+
+ # We allow multiple sequences with the same IDs; for example, there may
+ # be a MAF aligning the + and - strands of the same sequence together.
+
+ # for now, use ._annotations private property, but restrict keys to those
+ # specifically supported by the MAF format, according to spec
+ try:
+ anno = " ".join(
+ [
+ "%s=%s" % (x, y)
+ for x, y in alignment._annotations.items()
+ if x in ("score", "pass")
+ ]
+ )
+ except AttributeError:
+ anno = "score=0.00"
+
+ self.handle.write("a %s\n" % (anno,))
+
+ recs_out = 0
+
+ for record in alignment:
+ self._write_record(record)
+
+ recs_out += 1
+
+ self.handle.write("\n")
+
+ return recs_out
+
+
+# Invalid function name according to pylint, but kept for compatibility
+# with Bio* conventions.
+def MafIterator(handle, seq_count=None):
+ """Iterate over a MAF file handle as MultipleSeqAlignment objects.
+
+ Iterates over lines in a MAF file-like object (handle), yielding
+ MultipleSeqAlignment objects. SeqRecord IDs generally correspond to
+ species names.
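+
+    Typically used via Bio.AlignIO.parse() with the "maf" format name, as
+    in this minimal sketch (the file name is hypothetical)::
+
+        from Bio import AlignIO
+        for alignment in AlignIO.parse("alignments.maf", "maf"):
+            print("Block with %i rows" % len(alignment))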
+ """
+ in_a_bundle = False
+
+ annotations = []
+ records = []
+
+ while True:
+ # allows parsing of the last bundle without duplicating code
+ try:
+ line = next(handle)
+ except StopIteration:
+ line = ""
+
+ if in_a_bundle:
+ if line.startswith("s"):
+ # add a SeqRecord to the bundle
+ line_split = line.strip().split()
+
+ if len(line_split) != 7:
+ raise ValueError(
+ "Error parsing alignment - 's' line must have 7 fields"
+ )
+
+ # convert MAF-style +/- strand to biopython-type 1/-1
+ if line_split[4] == "+":
+ strand = 1
+ elif line_split[4] == "-":
+ strand = -1
+ else:
+ # TODO: issue warning, set to 0?
+ strand = 1
+
+ # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
+ anno = {
+ "start": int(line_split[2]),
+ "size": int(line_split[3]),
+ "strand": strand,
+ "srcSize": int(line_split[5]),
+ }
+
+ sequence = line_split[6]
+
+ # interpret a dot/period to mean the same as the first sequence
+ if "." in sequence:
+ if not records:
+ raise ValueError(
+ "Found dot/period in first sequence of alignment"
+ )
+
+ ref = records[0].seq
+ new = []
+
+ for (letter, ref_letter) in zip(sequence, ref):
+ new.append(ref_letter if letter == "." else letter)
+
+ sequence = "".join(new)
+
+ records.append(
+ SeqRecord(
+ Seq(sequence),
+ id=line_split[1],
+ name=line_split[1],
+ description="",
+ annotations=anno,
+ )
+ )
+ elif line.startswith("i"):
+ # TODO: information about what is in the aligned species DNA before
+ # and after the immediately preceding "s" line
+ pass
+ elif line.startswith("e"):
+ # TODO: information about the size of the gap between the alignments
+ # that span the current block
+ pass
+ elif line.startswith("q"):
+ # TODO: quality of each aligned base for the species.
+ # Need to find documentation on this, looks like ASCII 0-9 or gap?
+ # Can then store in each SeqRecord's .letter_annotations dictionary,
+ # perhaps as the raw string or turned into integers / None for gap?
+ pass
+ elif line.startswith("#"):
+ # ignore comments
+ # (not sure whether comments
+ # are in the maf specification, though)
+ pass
+ elif not line.strip():
+ # end a bundle of records
+ if seq_count is not None:
+ assert len(records) == seq_count
+
+ alignment = MultipleSeqAlignment(records)
+ # TODO - Introduce an annotated alignment class?
+ # See also Bio/AlignIO/FastaIO.py for same requirement.
+ # For now, store the annotation a new private property:
+ alignment._annotations = annotations
+
+ yield alignment
+
+ in_a_bundle = False
+
+ annotations = []
+ records = []
+ else:
+ raise ValueError(
+ "Error parsing alignment - unexpected line:\n%s" % (line,)
+ )
+ elif line.startswith("a"):
+ # start a bundle of records
+ in_a_bundle = True
+ annot_strings = line.strip().split()[1:]
+ if len(annot_strings) != line.count("="):
+ raise ValueError("Error parsing alignment - invalid key in 'a' line")
+ annotations = dict(a_string.split("=") for a_string in annot_strings)
+ elif line.startswith("#"):
+ # ignore comments
+ pass
+ elif not line:
+ break
+
+
+class MafIndex:
+ """Index for a MAF file.
+
+ The index is a sqlite3 database that is built upon creation of the object
+ if necessary, and queried when methods *search* or *get_spliced* are
+ used.
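+
+    A minimal usage sketch (the file names and target sequence name here
+    are hypothetical)::
+
+        idx = MafIndex("chr10.mafindex", "chr10.maf", "mm9.chr10")
+        for alignment in idx.search([3009319], [3009392]):
+            print(alignment)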
+ """
+
+ def __init__(self, sqlite_file, maf_file, target_seqname):
+ """Indexes or loads the index of a MAF file."""
+ self._target_seqname = target_seqname
+ # example: Tests/MAF/ucsc_mm9_chr10.mafindex
+ self._index_filename = sqlite_file
+ # example: /home/bli/src/biopython/Tests/MAF
+ self._relative_path = os.path.abspath(os.path.dirname(sqlite_file))
+ # example: Tests/MAF/ucsc_mm9_chr10.maf
+ self._maf_file = maf_file
+
+ self._maf_fp = open(self._maf_file)
+
+ # if sqlite_file exists, use the existing db, otherwise index the file
+ if os.path.isfile(sqlite_file):
+ self._con = dbapi2.connect(sqlite_file)
+ self._record_count = self.__check_existing_db()
+ else:
+ self._con = dbapi2.connect(sqlite_file)
+ self._record_count = self.__make_new_index()
+
+ # lastly, setup a MafIterator pointing at the open maf_file
+ self._mafiter = MafIterator(self._maf_fp)
+
+ def __check_existing_db(self):
+ """Perform basic sanity checks upon loading an existing index (PRIVATE)."""
+ try:
+ idx_version = int(
+ self._con.execute(
+ "SELECT value FROM meta_data WHERE key = 'version'"
+ ).fetchone()[0]
+ )
+ if idx_version != MAFINDEX_VERSION:
+ msg = "\n".join(
+ [
+ "Index version (%s) incompatible with this version "
+ "of MafIndex" % idx_version,
+ "You might erase the existing index %s "
+ "for it to be rebuilt." % self._index_filename,
+ ]
+ )
+ raise ValueError(msg)
+
+ filename = self._con.execute(
+ "SELECT value FROM meta_data WHERE key = 'filename'"
+ ).fetchone()[0]
+ # Compute absolute path of the original maf file
+ if os.path.isabs(filename):
+ # It was already stored as absolute
+ tmp_mafpath = filename
+ else:
+ # It should otherwise have been stored as relative to the index
+ # Would be stored with Unix / path separator, so convert
+ # it to the local OS path separator here:
+ tmp_mafpath = os.path.join(
+ self._relative_path, filename.replace("/", os.path.sep)
+ )
+ if tmp_mafpath != os.path.abspath(self._maf_file):
+ # Original and given absolute paths differ.
+ raise ValueError(
+ "Index uses a different file (%s != %s)"
+ % (filename, self._maf_file)
+ )
+
+ db_target = self._con.execute(
+ "SELECT value FROM meta_data WHERE key = 'target_seqname'"
+ ).fetchone()[0]
+ if db_target != self._target_seqname:
+ raise ValueError(
+ "Provided database indexed for %s, expected %s"
+ % (db_target, self._target_seqname)
+ )
+
+ record_count = int(
+ self._con.execute(
+ "SELECT value FROM meta_data WHERE key = 'record_count'"
+ ).fetchone()[0]
+ )
+ if record_count == -1:
+ raise ValueError("Unfinished/partial database provided")
+
+ records_found = int(
+ self._con.execute("SELECT COUNT(*) FROM offset_data").fetchone()[0]
+ )
+ if records_found != record_count:
+ raise ValueError(
+ "Expected %s records, found %s. Corrupt index?"
+ % (record_count, records_found)
+ )
+
+ return records_found
+
+ except (dbapi2.OperationalError, dbapi2.DatabaseError) as err:
+ raise ValueError("Problem with SQLite database: %s" % err) from None
+
+ def __make_new_index(self):
+ """Read MAF file and generate SQLite index (PRIVATE)."""
+ # make the tables
+ self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
+ self._con.execute(
+ "INSERT INTO meta_data (key, value) VALUES ('version', %s);"
+ % MAFINDEX_VERSION
+ )
+ self._con.execute(
+ "INSERT INTO meta_data (key, value) VALUES ('record_count', -1);"
+ )
+ self._con.execute(
+ "INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');"
+ % (self._target_seqname,)
+ )
+ # Determine whether to store maf file as relative to the index or absolute
+ # See https://github.com/biopython/biopython/pull/381
+ if not os.path.isabs(self._maf_file) and not os.path.isabs(
+ self._index_filename
+ ):
+ # Since the user gave both maf file and index as relative paths,
+ # we will store the maf file relative to the index.
+ # Note for cross platform use (e.g. shared drive over SAMBA),
+ # convert any Windows slash into Unix style for rel paths.
+ # example: ucsc_mm9_chr10.maf
+ mafpath = os.path.relpath(self._maf_file, self._relative_path).replace(
+ os.path.sep, "/"
+ )
+ elif (
+ os.path.dirname(os.path.abspath(self._maf_file)) + os.path.sep
+ ).startswith(self._relative_path + os.path.sep):
+ # Since maf file is in same directory or sub directory,
+ # might as well make this into a relative path:
+ mafpath = os.path.relpath(self._maf_file, self._relative_path).replace(
+ os.path.sep, "/"
+ )
+ else:
+ # Default to storing as an absolute path
+ # example: /home/bli/src/biopython/Tests/MAF/ucsc_mm9_chr10.maf
+ mafpath = os.path.abspath(self._maf_file)
+ self._con.execute(
+ "INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % (mafpath,)
+ )
+ self._con.execute(
+ "CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);"
+ )
+
+ insert_count = 0
+
+ # iterate over the entire file and insert in batches
+ mafindex_func = self.__maf_indexer()
+
+ while True:
+ batch = list(islice(mafindex_func, 100))
+ if not batch:
+ break
+
+ # batch is made from self.__maf_indexer(),
+ # which yields zero-based "inclusive" start and end coordinates
+ self._con.executemany(
+ "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);",
+ batch,
+ )
+ self._con.commit()
+ insert_count += len(batch)
+
+ # then make indexes on the relevant fields
+ self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);")
+ self._con.execute(
+ "CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);"
+ )
+ self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);")
+
+ self._con.execute(
+ "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'"
+ % (insert_count,)
+ )
+
+ self._con.commit()
+
+ return insert_count
+
+ def __maf_indexer(self):
+ """Return index information for each bundle (PRIVATE).
+
+ Yields index information for each bundle in the form of
+ (bin, start, end, offset) tuples where start and end are
+ 0-based inclusive coordinates.
+ """
+ line = self._maf_fp.readline()
+
+ while line:
+ if line.startswith("a"):
+ # note the offset
+ offset = self._maf_fp.tell() - len(line)
+
+ # search the following lines for a match to target_seqname
+ while True:
+ line = self._maf_fp.readline()
+
+ if not line.strip() or line.startswith("a"):
+ # Empty line or new alignment record
+ raise ValueError(
+ "Target for indexing (%s) not found in this bundle"
+ % (self._target_seqname,)
+ )
+ elif line.startswith("s"):
+ # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
+ line_split = line.strip().split()
+
+ if line_split[1] == self._target_seqname:
+ start = int(line_split[2])
+ size = int(line_split[3])
+ if size != len(line_split[6].replace("-", "")):
+ raise ValueError(
+ "Invalid length for target coordinates "
+ "(expected %s, found %s)"
+ % (size, len(line_split[6].replace("-", "")))
+ )
+
+ # "inclusive" end position is start + length - 1
+ end = start + size - 1
+
+ # _ucscbin takes end-exclusive coordinates
+ yield (self._ucscbin(start, end + 1), start, end, offset)
+
+ break
+
+ line = self._maf_fp.readline()
+
+ # TODO: check coordinate correctness for the two bin-related static methods
+ @staticmethod
+ def _region2bin(start, end):
+ """Find bins that a region may belong to (PRIVATE).
+
+ Converts a region to a list of bins that it may belong to, including largest
+ and smallest bins.
+ """
+ bins = [0, 1]
+
+ bins.extend(range(1 + (start >> 26), 2 + ((end - 1) >> 26)))
+ bins.extend(range(9 + (start >> 23), 10 + ((end - 1) >> 23)))
+ bins.extend(range(73 + (start >> 20), 74 + ((end - 1) >> 20)))
+ bins.extend(range(585 + (start >> 17), 586 + ((end - 1) >> 17)))
+
+ return set(bins)
+
+ @staticmethod
+ def _ucscbin(start, end):
+ """Return the smallest bin a given region will fit into (PRIVATE).
+
+ Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
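+
+        Worked examples (for illustration, derived from the shift constants
+        below): ``_ucscbin(0, 131072)`` returns 585 because the region fits
+        in a single smallest (2**17 bp) bin, while ``_ucscbin(0, 131073)``
+        spans two smallest bins and returns the parent bin 73.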
+ """
+ bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]
+
+ _bin_first_shift = 17
+ _bin_next_shift = 3
+
+ start_bin = start
+ end_bin = end - 1
+
+ start_bin >>= _bin_first_shift
+ end_bin >>= _bin_first_shift
+
+ for bin_offset in bin_offsets:
+ if start_bin == end_bin:
+ return bin_offset + start_bin
+ start_bin >>= _bin_next_shift
+ end_bin >>= _bin_next_shift
+
+ return 0
+
+ def _get_record(self, offset):
+ """Retrieve a single MAF record located at the offset provided (PRIVATE)."""
+ self._maf_fp.seek(offset)
+ return next(self._mafiter)
+
+ def search(self, starts, ends):
+ """Search index database for MAF records overlapping ranges provided.
+
+ Returns *MultipleSeqAlignment* results in order by start, then end, then
+ internal offset field.
+
+ *starts* should be a list of 0-based start coordinates of segments in the reference.
+ *ends* should be the list of the corresponding segment ends
+ (in the half-open UCSC convention:
+ http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
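+
+        For example, ``search([0], [100])`` yields the indexed alignment
+        blocks overlapping the first 100 positions of the target sequence.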
+ """
+ # verify the provided exon coordinates
+ if len(starts) != len(ends):
+ raise ValueError("Every position in starts must have a match in ends")
+
+ # Could it be safer to sort the (exonstart, exonend) pairs?
+ for exonstart, exonend in zip(starts, ends):
+ exonlen = exonend - exonstart
+ if exonlen < 1:
+ raise ValueError(
+ "Exon coordinates (%d, %d) invalid: exon length (%d) < 1"
+ % (exonstart, exonend, exonlen)
+ )
+ con = self._con
+
+ # Keep track of what blocks have already been yielded
+ # in order to avoid duplicating them
+ # (see https://github.com/biopython/biopython/issues/1083)
+ yielded_rec_coords = set()
+ # search for every exon
+ for exonstart, exonend in zip(starts, ends):
+ try:
+ possible_bins = ", ".join(
+ map(str, self._region2bin(exonstart, exonend))
+ )
+ except TypeError:
+ raise TypeError(
+ "Exon coordinates must be integers "
+ "(start=%d, end=%d)" % (exonstart, exonend)
+ ) from None
+
+ # https://www.sqlite.org/lang_expr.html
+ # -----
+ # The BETWEEN operator
+ #
+ # The BETWEEN operator is logically equivalent to a pair of
+ # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z"
+ # except that with BETWEEN, the x expression is only evaluated
+ # once. The precedence of the BETWEEN operator is the same as the
+ # precedence as operators == and != and LIKE and groups left to
+ # right.
+ # -----
+
+ # We are testing overlap between the query segment and records in
+ # the index, using non-strict coordinates comparisons.
+ # The query segment end must be passed as end-inclusive
+ # The index should also have been build with end-inclusive
+ # end coordinates.
+ # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073
+
+ result = con.execute(
+ "SELECT DISTINCT start, end, offset FROM offset_data "
+ "WHERE bin IN (%s) "
+ "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) "
+ "ORDER BY start, end, offset ASC;"
+ % (possible_bins, exonstart, exonend - 1, exonend - 1)
+ )
+
+ rows = result.fetchall()
+
+ # rows come from the sqlite index,
+ # which should have been written using __make_new_index,
+ # so rec_start and rec_end should be zero-based "inclusive" coordinates
+ for rec_start, rec_end, offset in rows:
+ # Avoid yielding multiple time the same block
+ if (rec_start, rec_end) in yielded_rec_coords:
+ continue
+ else:
+ yielded_rec_coords.add((rec_start, rec_end))
+ # Iterate through hits, fetching alignments from the MAF file
+ # and checking to be sure we've retrieved the expected record.
+
+ fetched = self._get_record(int(offset))
+
+ for record in fetched:
+ if record.id == self._target_seqname:
+ # start and size come from the maf lines
+ start = record.annotations["start"]
+ # "inclusive" end is start + length - 1
+ end = start + record.annotations["size"] - 1
+
+ if not (start == rec_start and end == rec_end):
+ raise ValueError(
+ "Expected %s-%s @ offset %s, found %s-%s"
+ % (rec_start, rec_end, offset, start, end)
+ )
+
+ yield fetched
+
+ def get_spliced(self, starts, ends, strand=1):
+ """Return a multiple alignment of the exact sequence range provided.
+
+ Accepts two lists of start and end positions on target_seqname, representing
+ exons to be spliced in silico. Returns a *MultipleSeqAlignment* of the
+ desired sequences spliced together.
+
+ *starts* should be a list of 0-based start coordinates of segments in the reference.
+ *ends* should be the list of the corresponding segment ends
+ (in the half-open UCSC convention:
+ http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
+
+        To ask for the alignment portion corresponding to the first 100
+        nucleotides of the reference sequence, you would use
+        ``get_spliced([0], [100])``.
+ """
+ # validate strand
+ if strand not in (1, -1):
+ raise ValueError("Strand must be 1 or -1, got %s" % strand)
+
+ # pull all alignments that span the desired intervals
+ fetched = list(self.search(starts, ends))
+
+ # keep track of the expected letter count
+ # (sum of lengths of [start, end) segments,
+ # where [start, end) half-open)
+ expected_letters = sum(end - start for start, end in zip(starts, ends))
+
+ # if there's no alignment, return filler for the assembly of the length given
+ if len(fetched) == 0:
+ return MultipleSeqAlignment(
+ [SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)]
+ )
+
+ # find the union of all IDs in these alignments
+ all_seqnames = {sequence.id for multiseq in fetched for sequence in multiseq}
+
+ # split every record by base position
+ # key: sequence name
+ # value: dictionary
+ # key: position in the reference sequence
+ # value: letter(s) (including letters
+ # aligned to the "-" preceding the letter
+ # at the position in the reference, if any)
+ split_by_position = {seq_name: {} for seq_name in all_seqnames}
+
+ # keep track of what the total number of (unspliced) letters should be
+ total_rec_length = 0
+
+ # track first strand encountered on the target seqname
+ ref_first_strand = None
+
+ for multiseq in fetched:
+ # find the target_seqname in this MultipleSeqAlignment and use it to
+ # set the parameters for the rest of this iteration
+ for seqrec in multiseq:
+ if seqrec.id == self._target_seqname:
+ try:
+ if ref_first_strand is None:
+ ref_first_strand = seqrec.annotations["strand"]
+
+ if ref_first_strand not in (1, -1):
+ raise ValueError("Strand must be 1 or -1")
+ elif ref_first_strand != seqrec.annotations["strand"]:
+ raise ValueError(
+ "Encountered strand='%s' on target seqname, "
+ "expected '%s'"
+ % (seqrec.annotations["strand"], ref_first_strand)
+ )
+ except KeyError:
+ raise ValueError(
+ "No strand information for target seqname (%s)"
+ % self._target_seqname
+ ) from None
+ # length including gaps (i.e. alignment length)
+ rec_length = len(seqrec)
+ rec_start = seqrec.annotations["start"]
+ ungapped_length = seqrec.annotations["size"]
+ # inclusive end in zero-based coordinates of the reference
+ rec_end = rec_start + ungapped_length - 1
+ # This is length in terms of actual letters in the reference
+ total_rec_length += ungapped_length
+
+ # blank out these positions for every seqname
+ for seqrec in multiseq:
+ for pos in range(rec_start, rec_end + 1):
+ split_by_position[seqrec.id][pos] = ""
+
+ break
+ # http://psung.blogspot.fr/2007/12/for-else-in-python.html
+ # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops
+ else:
+ raise ValueError(
+ "Did not find %s in alignment bundle" % (self._target_seqname,)
+ )
+
+ # the true, chromosome/contig/etc position in the target seqname
+ real_pos = rec_start
+
+ # loop over the alignment to fill split_by_position
+ for gapped_pos in range(0, rec_length):
+ for seqrec in multiseq:
+ # keep track of this position's value for the target seqname
+ if seqrec.id == self._target_seqname:
+ track_val = seqrec.seq[gapped_pos]
+
+ # Here, a real_pos that corresponds to just after a series of "-"
+ # in the reference will "accumulate" the letters found in other sequences
+ # in front of the "-"s
+ split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos]
+
+ # increment the real_pos counter only when non-gaps are found in
+ # the target_seqname, and we haven't reached the end of the record
+ if track_val != "-" and real_pos < rec_end:
+ real_pos += 1
+
+ # make sure the number of bp entries equals the sum of the record lengths
+ if len(split_by_position[self._target_seqname]) != total_rec_length:
+ raise ValueError(
+ "Target seqname (%s) has %s records, expected %s"
+ % (
+ self._target_seqname,
+ len(split_by_position[self._target_seqname]),
+ total_rec_length,
+ )
+ )
+
+ # translates a position in the target_seqname sequence to its gapped length
+ realpos_to_len = {
+ pos: len(gapped_fragment)
+ for pos, gapped_fragment in split_by_position[self._target_seqname].items()
+ if len(gapped_fragment) > 1
+ }
+
+ # splice together the exons
+ subseq = {}
+
+ for seqid in all_seqnames:
+ seq_split = split_by_position[seqid]
+ seq_splice = []
+
+ filler_char = "N" if seqid == self._target_seqname else "-"
+
+ # iterate from start to end, taking bases from split_by_position when
+ # they exist, using N or - for gaps when there is no alignment.
+ append = seq_splice.append
+
+ for exonstart, exonend in zip(starts, ends):
+ # exonend is exclusive
+ for real_pos in range(exonstart, exonend):
+ # if this seqname has this position, add it
+ if real_pos in seq_split:
+ append(seq_split[real_pos])
+ # if not, but it's in the target_seqname, add length-matched filler
+ elif real_pos in realpos_to_len:
+ append(filler_char * realpos_to_len[real_pos])
+ # it's not in either, so add a single filler character
+ else:
+ append(filler_char)
+
+ subseq[seqid] = "".join(seq_splice)
+
+ # make sure we're returning the right number of letters
+ if len(subseq[self._target_seqname].replace("-", "")) != expected_letters:
+ raise ValueError(
+ "Returning %s letters for target seqname (%s), expected %s"
+ % (
+ len(subseq[self._target_seqname].replace("-", "")),
+ self._target_seqname,
+ expected_letters,
+ )
+ )
+
+ # check to make sure all sequences are the same length as the target seqname
+ ref_subseq_len = len(subseq[self._target_seqname])
+
+ for seqid, seq in subseq.items():
+ if len(seq) != ref_subseq_len:
+ raise ValueError(
+ "Returning length %s for %s, expected %s"
+ % (len(seq), seqid, ref_subseq_len)
+ )
+
+ # finally, build a MultipleSeqAlignment object for our final sequences
+ result_multiseq = []
+
+ for seqid, seq in subseq.items():
+ seq = Seq(seq)
+
+ seq = seq if strand == ref_first_strand else seq.reverse_complement()
+
+ result_multiseq.append(SeqRecord(seq, id=seqid, name=seqid, description=""))
+
+ return MultipleSeqAlignment(result_multiseq)
+
+ def __repr__(self):
+ """Return a string representation of the index."""
+ return "MafIO.MafIndex(%r, target_seqname=%r)" % (
+ self._maf_fp.name,
+ self._target_seqname,
+ )
+
+ def __len__(self):
+ """Return the number of records in the index."""
+ return self._record_count
diff --git a/code/lib/Bio/AlignIO/MauveIO.py b/code/lib/Bio/AlignIO/MauveIO.py
new file mode 100644
index 0000000..b5f597c
--- /dev/null
+++ b/code/lib/Bio/AlignIO/MauveIO.py
@@ -0,0 +1,349 @@
+# Copyright 2015-2015 by Eric Rasche. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+For example, consider a progressiveMauve alignment file containing the following::
+
+ #FormatVersion Mauve1
+ #Sequence1File a.fa
+ #Sequence1Entry 1
+ #Sequence1Format FastA
+ #Sequence2File b.fa
+ #Sequence2Entry 2
+ #Sequence2Format FastA
+ #Sequence3File c.fa
+ #Sequence3Entry 3
+ #Sequence3Format FastA
+ #BackboneFile three.xmfa.bbcols
+ > 1:0-0 + a.fa
+ --------------------------------------------------------------------------------
+ --------------------------------------------------------------------------------
+ --------------------------------------------------------------------------------
+ > 2:5417-5968 + b.fa
+ TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC-
+ --------------------------------------------------------------------------------
+ CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG
+ > 3:9476-10076 - c.fa
+ TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG
+ TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG
+ ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT
+ =
+ > 2:5969-6015 + b.fa
+ -----------------------
+ GGGCGAACGTATAAACCATTCTG
+ > 3:9429-9476 - c.fa
+ TTCGGTACCCTCCATGACCCACG
+ AAATGAGGGCCCAGGGTATGCTT
+
+This is a multiple sequence alignment with multiple aligned sections, so you
+would probably load this using the Bio.AlignIO.parse() function:
+
+ >>> from Bio import AlignIO
+ >>> align = AlignIO.parse("Mauve/simple_short.xmfa", "mauve")
+ >>> alignments = list(align)
+ >>> for aln in alignments:
+ ... print(aln)
+ ...
+ Alignment with 3 rows and 240 columns
+ --------------------------------------------...--- a.fa
+ TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG b.fa/5416-5968
+ TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT c.fa/9475-10076
+ Alignment with 2 rows and 46 columns
+ -----------------------GGGCGAACGTATAAACCATTCTG b.fa/5968-6015
+ TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT c.fa/9428-9476
+
+Additional information is extracted from the XMFA file and available through
+the annotation attribute of each record::
+
+ >>> for record in alignments[0]:
+ ... print(record.id, len(record))
+ ... print(" start: %d, end: %d, strand: %d" %(
+ ... record.annotations['start'], record.annotations['end'],
+ ... record.annotations['strand']))
+ ...
+ a.fa 240
+ start: 0, end: 0, strand: 1
+ b.fa/5416-5968 240
+ start: 5416, end: 5968, strand: 1
+ c.fa/9475-10076 240
+ start: 9475, end: 10076, strand: -1
+
+"""
+import re
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import AlignmentIterator
+from .Interfaces import SequentialAlignmentWriter
+
+
+XMFA_HEADER_REGEX = re.compile(
+    r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)"
+)
+XMFA_HEADER_REGEX_BIOPYTHON = re.compile(
+    r"> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)"
+)
+ID_LINE_FMT = "> {seq_name}:{start}-{end} {strand} {filename} # {ugly_hack}"
+
+
+def _identifier_split(identifier):
+ """Return (name, start, end) string tuple from an identifier (PRIVATE)."""
+ id, loc, strand = identifier.split(":")
+ start, end = map(int, loc.split("-"))
+ start -= 1
+ return id, start, end, strand
+
+
+class MauveWriter(SequentialAlignmentWriter):
+ """Mauve/XMFA alignment writer."""
+
+ def __init__(self, *args, **kwargs):
+ """Initialize the class."""
+ super().__init__(*args, **kwargs)
+ self._wrote_header = False
+ self._wrote_first = False
+
+ def write_alignment(self, alignment):
+ """Use this to write (another) single alignment to an open file.
+
+ Note that sequences and their annotation are recorded
+ together (rather than having a block of annotation followed
+ by a block of aligned sequences).
+ """
+ count = len(alignment)
+
+ self._length_of_sequences = alignment.get_alignment_length()
+
+ # NOTE - For now, the alignment object does not hold any per column
+ # or per alignment annotation - only per sequence.
+
+ if count == 0:
+ raise ValueError("Must have at least one sequence")
+ if self._length_of_sequences == 0:
+ raise ValueError("Non-empty sequences are required")
+
+ if not self._wrote_header:
+ self._wrote_header = True
+ self.handle.write("#FormatVersion Mauve1\n")
+ # There are some more headers, but we ignore those for now.
+ # Sequence1File unknown.fa
+ # Sequence1Entry 1
+ # Sequence1Format FastA
+ for i in range(1, count + 1):
+ self.handle.write("#Sequence%sEntry\t%s\n" % (i, i))
+
+ for idx, record in enumerate(alignment):
+ self._write_record(record, record_idx=idx)
+ self.handle.write("=\n")
+
+ def _write_record(self, record, record_idx=0):
+ """Write a single SeqRecord to the file (PRIVATE)."""
+ if self._length_of_sequences != len(record.seq):
+ raise ValueError("Sequences must all be the same length")
+
+ seq_name = record.name
+ try:
+ seq_name = str(int(record.name))
+ except ValueError:
+ seq_name = str(record_idx + 1)
+
+ # We remove the "/{start}-{end}" before writing, as it cannot be part
+ # of the produced XMFA file.
+ if "start" in record.annotations and "end" in record.annotations:
+ suffix0 = "/%s-%s" % (
+ record.annotations["start"],
+ record.annotations["end"],
+ )
+ suffix1 = "/%s-%s" % (
+ record.annotations["start"] + 1,
+ record.annotations["end"],
+ )
+ if seq_name[-len(suffix0) :] == suffix0:
+ seq_name = seq_name[: -len(suffix0)]
+ if seq_name[-len(suffix1) :] == suffix1:
+ seq_name = seq_name[: -len(suffix1)]
+
+ if (
+ "start" in record.annotations
+ and "end" in record.annotations
+ and "strand" in record.annotations
+ ):
+ id_line = ID_LINE_FMT.format(
+ seq_name=seq_name,
+ start=record.annotations["start"] + 1,
+ end=record.annotations["end"],
+ strand=("+" if record.annotations["strand"] == 1 else "-"),
+ filename=record.name + ".fa",
+ ugly_hack=record.id,
+ )
+ lacking_annotations = False
+ else:
+ id_line = ID_LINE_FMT.format(
+ seq_name=seq_name,
+ start=0,
+ end=0,
+ strand="+",
+ filename=record.name + ".fa",
+ ugly_hack=record.id,
+ )
+ lacking_annotations = True
+
+ # If the sequence is an empty one, skip writing it out
+ if (":0-0 " in id_line or ":1-0 " in id_line) and not lacking_annotations:
+ # Except in the first LCB
+ if not self._wrote_first:
+ self._wrote_first = True
+ # The first LCB we write out is special, and must list ALL
+ # sequences, for the Mauve GUI
+ # http://darlinglab.org/mauve/user-guide/files.html#non-standard-xmfa-formatting-used-by-the-mauve-gui
+ id_line = ID_LINE_FMT.format(
+ seq_name=seq_name,
+ start=0,
+ end=0,
+ strand="+",
+ filename=record.name + ".fa",
+ ugly_hack=record.id,
+ )
+ id_line = id_line.replace("\n", " ").replace("\r", " ")
+ self.handle.write(id_line + "\n\n")
+ # Alignments lacking a start/stop/strand were generated by
+ # Biopython on load, and shouldn't exist according to XMFA
+ else:
+ # In other blocks, we only write sequences if they exist in a given
+ # alignment.
+ id_line = id_line.replace("\n", " ").replace("\r", " ")
+ self.handle.write(id_line + "\n")
+ for i in range(0, len(record.seq), 80):
+ self.handle.write("%s\n" % record.seq[i : i + 80])
+
+
+class MauveIterator(AlignmentIterator):
+ """Mauve xmfa alignment iterator."""
+
+ _ids = [] # for caching IDs between __next__ calls
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+ line = handle.readline()
+
+ if not line:
+ raise StopIteration
+
+ # Strip out header comments
+ while line and line.strip().startswith("#"):
+ line = handle.readline()
+
+ seqs = {}
+ seq_regions = {}
+ passed_end_alignment = False
+
+ latest_id = None
+ while True:
+ if not line:
+ break # end of file
+ line = line.strip()
+
+ if line.startswith("="):
+ # There may be more data, but we've reached the end of this
+ # alignment
+ break
+ elif line.startswith(">"):
+ m = XMFA_HEADER_REGEX_BIOPYTHON.match(line)
+ if not m:
+ m = XMFA_HEADER_REGEX.match(line)
+ if not m:
+ raise ValueError("Malformed header line: %s", line)
+
+ parsed_id = m.group("id")
+ parsed_data = {}
+ for key in ("start", "end", "id", "strand", "name", "realname"):
+ try:
+ value = m.group(key)
+ if key == "start":
+ value = int(value)
+ # Convert to zero based counting
+ if value > 0:
+ value -= 1
+
+ if key == "end":
+ value = int(value)
+ parsed_data[key] = value
+ except IndexError:
+ # This will occur if we're asking for a group that
+ # doesn't exist. It's fine.
+ pass
+ seq_regions[parsed_id] = parsed_data
+
+ if parsed_id not in self._ids:
+ self._ids.append(parsed_id)
+
+ seqs.setdefault(parsed_id, "")
+ latest_id = parsed_id
+ else:
+ assert not passed_end_alignment
+ if latest_id is None:
+ raise ValueError("Saw sequence before definition line")
+ seqs[latest_id] += line
+ line = handle.readline()
+
+ assert len(seqs) <= len(self._ids)
+
+ self.ids = self._ids
+ self.sequences = seqs
+
+ if self._ids and seqs:
+ alignment_length = max(map(len, list(seqs.values())))
+ records = []
+ for id in self._ids:
+                if id not in seqs or len(seqs[id]) == 0:
+ seq = "-" * alignment_length
+ else:
+ seq = seqs[id]
+
+ if alignment_length != len(seq):
+ raise ValueError(
+ "Sequences have different lengths, or repeated identifier"
+ )
+
+ # Sometimes we don't see a particular sequence in the
+ # alignment, so we skip that record since it isn't present in
+ # that LCB/alignment
+ if id not in seq_regions:
+ continue
+
+ if seq_regions[id]["start"] != 0 or seq_regions[id]["end"] != 0:
+ suffix = "/{start}-{end}".format(**seq_regions[id])
+ if "realname" in seq_regions[id]:
+ corrected_id = seq_regions[id]["realname"]
+ else:
+ corrected_id = seq_regions[id]["name"]
+ if corrected_id.count(suffix) == 0:
+ corrected_id += suffix
+ else:
+ if "realname" in seq_regions[id]:
+ corrected_id = seq_regions[id]["realname"]
+ else:
+ corrected_id = seq_regions[id]["name"]
+
+ record = SeqRecord(Seq(seq), id=corrected_id, name=id)
+
+ record.annotations["start"] = seq_regions[id]["start"]
+ record.annotations["end"] = seq_regions[id]["end"]
+ record.annotations["strand"] = (
+ 1 if seq_regions[id]["strand"] == "+" else -1
+ )
+
+ records.append(record)
+ return MultipleSeqAlignment(records)
+ else:
+ raise StopIteration
diff --git a/code/lib/Bio/AlignIO/MsfIO.py b/code/lib/Bio/AlignIO/MsfIO.py
new file mode 100644
index 0000000..d620f1b
--- /dev/null
+++ b/code/lib/Bio/AlignIO/MsfIO.py
@@ -0,0 +1,331 @@
+# Copyright 2019, National Marrow Donor Program (NMPD). All rights reserved.
+# Written by Peter Cock, The James Hutton Institute, under contract to NMDP.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for GCG MSF format.
+
+The file format was produced by the GCG PileUp and LocalPileUp tools,
+and later tools such as T-COFFEE and MUSCLE support it as an optional
+output format.
+
+The original GCG tool would write gaps at the ends of each sequence
+(which could be missing data) as tildes (``~``), whereas internal gaps
+were periods (``.``). This parser replaces both with minus signs (``-``)
+for consistency with the rest of ``Bio.AlignIO``.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
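+
+A minimal usage sketch (the file name here is hypothetical)::
+
+    from Bio import AlignIO
+    for alignment in AlignIO.parse("example.msf", "msf"):
+        print("Alignment of %i sequences" % len(alignment))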
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import AlignmentIterator
+
+
+class MsfIterator(AlignmentIterator):
+ """GCG MSF alignment iterator."""
+
+ _header = None # for caching lines between __next__ calls
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+
+ if self._header is None:
+ line = handle.readline()
+ else:
+ # Header we saved from when we were parsing
+ # the previous alignment.
+ line = self._header
+ self._header = None
+
+ if not line:
+ raise StopIteration
+
+ # Whitelisted headers we know about.
+ known_headers = ["!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"]
+ # Examples in "Molecular Biology Software Training Manual GCG version 10"
+        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
+ # would often start as follows:
+ #
+ # !!AA_MUTIPLE_ALIGNMENT 1.0
+ # PileUp of: @/usr/users2/culhane/...
+ #
+ # etc with other seemingly free format text before getting to the
+ # MSF/Type/Check line and the following Name: lines block and // line.
+ #
+ # MUSCLE just has a line "PileUp", while other sources just use the line
+ # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
+ # (nucleotide).
+ if line.strip().split()[0] not in known_headers:
+ raise ValueError(
+ "%s is not a known GCG MSF header: %s"
+ % (line.strip().split()[0], ", ".join(known_headers))
+ )
+
+ while line and " MSF: " not in line:
+ line = handle.readline()
+
+ if not line:
+ raise ValueError("Reached end of file without MSF/Type/Check header line")
+
+ # Quoting from "Molecular Biology Software Training Manual GCG version 10"
+        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
+ # Page 31:
+ #
+ # "Header information is before a .. (double dot) in a GCG format file.
+ # The file will also have a checksum specific for that file."
+ #
+ # This was followed by a single non-aligned sequence, but this convention
+ # appears to also be used in the GCG MSF files. Quoting other examples in
+ # this reference, page 31:
+ #
+ # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 ..
+ #
+ # Except from page 148:
+ #
+ # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 ..
+ #
+ # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
+ #
+ # MSF: 689 Type: N Check: 0000 ..
+ #
+ # By observation, the MSF value is the column count, type is N (nucleotide)
+ # or P (protein / amino acid).
+ #
+ # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
+ #
+ # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
+ # !!NA_MULTIPLE_ALIGNMENT 1.0
+ #
+ # stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
+ #
+ # Name: G26680 Len: 633 Check: 4334 Weight: 1.00
+ # Name: G26685 Len: 633 Check: 3818 Weight: 1.00
+ # Name: G29385 Len: 633 Check: 391 Weight: 1.00
+ #
+ # //
+ #
+ parts = line.strip("\n").split()
+ offset = parts.index("MSF:")
+ if (
+ parts[offset + 2] != "Type:"
+ or parts[-3] not in ("Check:", "CompCheck:")
+ or parts[-1] != ".."
+ ):
+ raise ValueError(
+ "GCG MSF header line should be "
+ "' MSF: Type: Check: ..', "
+ " not: %r" % line
+ )
+ try:
+ aln_length = int(parts[offset + 1])
+ except ValueError:
+ aln_length = -1
+ if aln_length < 0:
+ raise ValueError(
+ "GCG MSF header line should have MDF: for column count, not %r"
+ % parts[offset + 1]
+ )
+ seq_type = parts[offset + 3]
+ if seq_type not in ["P", "N"]:
+ raise ValueError(
+ "GCG MSF header line should have 'Type: P' (protein) "
+ "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type
+ )
+
+ # There should be a blank line after that header line, then the Name: lines
+ #
+ # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
+ #
+ # PileUp
+ #
+ #
+ #
+ # MSF: 628 Type: P Check: 147 ..
+ #
+ # Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000
+ # Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000
+ #
+ # //
+ ids = []
+ lengths = []
+ checks = []
+ weights = []
+ line = handle.readline()
+ while line and line.strip() != "//":
+ line = handle.readline()
+ if line.strip().startswith("Name: "):
+ if " Len: " in line and " Check: " in line and " Weight: " in line:
+ rest = line[line.index("Name: ") + 6 :].strip()
+ name, rest = rest.split(" Len: ")
+ length, rest = rest.split(" Check: ")
+ check, weight = rest.split(" Weight: ")
+ name = name.strip()
+ if name.endswith(" oo"):
+ # T-COFFEE oddity, ignore this
+ name = name[:-3]
+ if name in ids:
+ raise ValueError("Duplicated ID of %r" % name)
+ if " " in name:
+ raise NotImplementedError("Space in ID %r" % name)
+ ids.append(name)
+ # Expect aln_length <= int(length.strip()), see below
+ lengths.append(int(length.strip()))
+ checks.append(int(check.strip()))
+ weights.append(float(weight.strip()))
+ else:
+ raise ValueError("Malformed GCG MSF name line: %r" % line)
+ if not line:
+ raise ValueError("End of file while looking for end of header // line.")
+
+ if aln_length != max(lengths):
+            # In broken examples from IMGTHLA it was possible to continue
+ # https://github.com/ANHIG/IMGTHLA/issues/201
+ max_length = max(lengths)
+ max_count = sum(1 for _ in lengths if _ == max_length)
+ raise ValueError(
+ "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
+ % (aln_length, max_count, len(ids), max_length)
+ )
+
+ line = handle.readline()
+ if not line:
+ raise ValueError("End of file after // line, expected sequences.")
+ if line.strip():
+ raise ValueError("After // line, expected blank line before sequences.")
+
+ # Now load the sequences
+ seqs = [[] for _ in ids] # list of empty lists
+ completed_length = 0
+ while completed_length < aln_length:
+ # Note might have a coordinate header line (seems to be optional)
+ for idx, name in enumerate(ids):
+ line = handle.readline()
+ if idx == 0 and not line.strip():
+ # T-COFFEE uses two blank lines between blocks, rather than one
+ while line and not line.strip():
+ line = handle.readline()
+ if not line:
+ raise ValueError("End of file where expecting sequence data.")
+ # print("Looking for seq for %s in line: %r" % (name, line))
+ words = line.strip().split()
+ # Should we use column numbers, rather than assuming no spaces in names?
+ if idx == 0 and words and words[0] != name:
+ # print("Actually have a coord line")
+ # Hopefully this is a coordinate header before the first seq
+ try:
+ i = int(words[0])
+ except ValueError:
+ i = -1
+ if i != completed_length + 1:
+ raise ValueError(
+ "Expected GCG MSF coordinate line starting %i, got: %r"
+ % (completed_length + 1, line)
+ )
+ if len(words) > 1:
+ # Final block usually not full 50 chars, so expect start only.
+ if len(words) != 2:
+ i = -1
+ else:
+ try:
+ i = int(words[1])
+ except ValueError:
+ i = -1
+ if i != (
+ completed_length + 50
+ if completed_length + 50 < aln_length
+ else aln_length
+ ):
+ raise ValueError(
+ "Expected GCG MSF coordinate line %i to %i, got: %r"
+ % (
+ completed_length + 1,
+ completed_length + 50
+ if completed_length + 50 < aln_length
+ else aln_length,
+ line,
+ )
+ )
+ line = handle.readline()
+ words = line.strip().split()
+ # print("Still looking for seq for %s in line: %r" % (name, line))
+ # Dealt with any coordinate header line, should now be sequence
+ if not words:
+ # Should be sequence here, but perhaps it's a short one?
+ if (
+ lengths[idx] < aln_length
+ and len("".join(seqs[idx])) == lengths[idx]
+ ):
+ # Is this actually allowed in the format? Personally I would
+ # expect a line with name and a block of trailing ~ here.
+ pass
+ else:
+ raise ValueError(
+ "Expected sequence for %s, got: %r" % (name, line)
+ )
+ elif words[0] == name:
+ assert len(words) > 1, line
+ # print(i, name, repr(words))
+ seqs[idx].extend(words[1:])
+ else:
+ raise ValueError("Expected sequence for %r, got: %r" % (name, line))
+ # TODO - check the sequence lengths thus far are consistent
+ # with blocks of 50?
+ completed_length += 50
+ line = handle.readline()
+ if line.strip():
+ raise ValueError("Expected blank line, got: %r" % line)
+
+ # Skip over any whitespace at the end...
+ while True:
+ line = handle.readline()
+ if not line:
+ # End of file, no more alignments
+ break
+ elif not line.strip():
+ # Blank line, ignore
+ pass
+ elif line.strip().split()[0] in known_headers:
+ # Looks like the start of another alignment:
+ self._header = line
+ break
+ else:
+ raise ValueError("Unexpected line after GCG MSF alignment: %r" % line)
+
+ # Combine list of strings into single string, remap gaps
+ seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]
+
+ # Apply any trailing padding for short sequences
+ padded = False
+ for idx, (length, s) in enumerate(zip(lengths, seqs)):
+ if len(s) < aln_length and len(s) == length:
+ padded = True
+ seqs[idx] = s + "-" * (aln_length - len(s))
+ if padded:
+ import warnings
+ from Bio import BiopythonParserWarning
+
+ warnings.warn(
+ "One of more alignment sequences were truncated and have been gap padded",
+ BiopythonParserWarning,
+ )
+
+ records = (
+ SeqRecord(Seq(s), id=i, name=i, description=i, annotations={"weight": w})
+ for (i, s, w) in zip(ids, seqs, weights)
+ )
+
+ # This will check alignment lengths are self-consistent:
+ align = MultipleSeqAlignment(records)
+ # Check matches the header:
+ if align.get_alignment_length() != aln_length:
+ raise ValueError(
+ "GCG MSF headers said alignment length %i, but have %i"
+ % (aln_length, align.get_alignment_length())
+ )
+ return align
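+
+# A minimal usage sketch (not part of the parser itself): MSF files are
+# normally read via Bio.AlignIO with the "msf" format name. The file name
+# below is hypothetical.
+#
+# from Bio import AlignIO
+# alignment = AlignIO.read("example.msf", "msf")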
diff --git a/code/lib/Bio/AlignIO/NexusIO.py b/code/lib/Bio/AlignIO/NexusIO.py
new file mode 100644
index 0000000..2c97e2e
--- /dev/null
+++ b/code/lib/Bio/AlignIO/NexusIO.py
@@ -0,0 +1,166 @@
+# Copyright 2008-2010, 2012-2014, 2016-2017 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for the "nexus" file format.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+See also the Bio.Nexus module (which this code calls internally),
+as this offers more than just accessing the alignment or its
+sequences as SeqRecord objects.
+"""
+from Bio.Align import MultipleSeqAlignment
+from Bio.AlignIO.Interfaces import AlignmentWriter
+from Bio.Nexus import Nexus
+from Bio.SeqRecord import SeqRecord
+
+
+# You can get a couple of example files here:
+# http://www.molecularevolution.org/resources/fileformats/
+
+
+# This is a generator function!
+def NexusIterator(handle, seq_count=None):
+ """Return SeqRecord objects from a Nexus file.
+
+ This uses the Bio.Nexus module to do the hard work.
+
+ You are expected to call this function via Bio.SeqIO or Bio.AlignIO
+ (and not use it directly).
+
+ NOTE - We only expect ONE alignment matrix per Nexus file,
+ meaning this iterator will only yield one MultipleSeqAlignment.
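+
+ A minimal usage sketch via the intended route (the file name here is
+ hypothetical)::
+
+ from Bio import AlignIO
+ alignment = AlignIO.read("example.nex", "nexus")  # hypothetical file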
+ """
+ n = Nexus.Nexus(handle)
+ if not n.matrix:
+ # No alignment found
+ return
+
+ # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
+ # The original names and the modified names are kept in these two lists:
+ assert len(n.unaltered_taxlabels) == len(n.taxlabels)
+
+ if seq_count and seq_count != len(n.unaltered_taxlabels):
+ raise ValueError(
+ "Found %i sequences, but seq_count=%i"
+ % (len(n.unaltered_taxlabels), seq_count)
+ )
+
+ # TODO - Can we extract any annotation too?
+ if n.datatype in ("dna", "nucleotide"):
+ annotations = {"molecule_type": "DNA"}
+ elif n.datatype == "rna":
+ annotations = {"molecule_type": "RNA"}
+ elif n.datatype == "protein":
+ annotations = {"molecule_type": "protein"}
+ else:
+ annotations = None
+ records = (
+ SeqRecord(
+ n.matrix[new_name],
+ id=new_name,
+ name=old_name,
+ description="",
+ annotations=annotations,
+ )
+ for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels)
+ )
+ # All done
+ yield MultipleSeqAlignment(records)
+
+
+class NexusWriter(AlignmentWriter):
+ """Nexus alignment writer.
+
+ Note that Nexus files are only expected to hold ONE alignment
+ matrix.
+
+ You are expected to call this class via the Bio.AlignIO.write() or
+ Bio.SeqIO.write() functions.
+ """
+
+ def write_file(self, alignments):
+ """Use this to write an entire file containing the given alignments.
+
+ Arguments:
+ - alignments - A list or iterator returning MultipleSeqAlignment objects.
+ This should hold ONE and only one alignment.
+
+ """
+ align_iter = iter(alignments) # Could have been a list
+ try:
+ alignment = next(align_iter)
+ except StopIteration:
+ # Nothing to write!
+ return 0
+
+ # Check there is only one alignment...
+ try:
+ next(align_iter)
+ raise ValueError("We can only write one Alignment to a Nexus file.")
+ except StopIteration:
+ pass
+
+ # Good. Actually write the single alignment,
+ self.write_alignment(alignment)
+ return 1 # we only support writing one alignment!
+
+ def write_alignment(self, alignment, interleave=None):
+ """Write an alignment to file.
+
+ Creates an empty Nexus object, adds the sequences
+ and then gets Nexus to prepare the output.
+ Default interleave behaviour: Interleave if columns > 1000
+ --> Override with interleave=[True/False]
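+
+ A minimal sketch forcing interleaved output (assumes handle is an open
+ text handle and each record carries the molecule_type annotation this
+ writer requires)::
+
+ writer = NexusWriter(handle)
+ writer.write_alignment(alignment, interleave=True)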
+ """
+ if len(alignment) == 0:
+ raise ValueError("Must have at least one sequence")
+ columns = alignment.get_alignment_length()
+ if columns == 0:
+ raise ValueError("Non-empty sequences are required")
+ datatype = self._classify_mol_type_for_nexus(alignment)
+ minimal_record = (
+ "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=%s; end;"
+ % datatype
+ )
+ n = Nexus.Nexus(minimal_record)
+ for record in alignment:
+ # Sanity test sequences (should this be even stricter?)
+ if datatype == "dna" and "U" in record.seq:
+ raise ValueError(f"{record.id} contains U, but DNA alignment")
+ elif datatype == "rna" and "T" in record.seq:
+ raise ValueError(f"{record.id} contains T, but RNA alignment")
+ n.add_sequence(record.id, str(record.seq))
+
+ # Note: MrBayes may choke on large alignments if not interleaved
+ if interleave is None:
+ interleave = columns > 1000
+ n.write_nexus_data(self.handle, interleave=interleave)
+
+ def _classify_mol_type_for_nexus(self, alignment):
+ """Return 'protein', 'dna', or 'rna' based on records' molecule type (PRIVATE).
+
+ All the records must have a molecule_type annotation, and they must
+ agree.
+
+ Raises an exception if this is not possible.
+ """
+ values = {_.annotations.get("molecule_type", None) for _ in alignment}
+ if all(_ and "DNA" in _ for _ in values):
+ return "dna" # could have been a mix of "DNA" and "gDNA"
+ elif all(_ and "RNA" in _ for _ in values):
+ return "rna" # could have been a mix of "RNA" and "mRNA"
+ elif all(_ and "protein" in _ for _ in values):
+ return "protein"
+ else:
+ raise ValueError("Need the molecule type to be defined")
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest(verbose=0)
diff --git a/code/lib/Bio/AlignIO/PhylipIO.py b/code/lib/Bio/AlignIO/PhylipIO.py
new file mode 100644
index 0000000..cc3f665
--- /dev/null
+++ b/code/lib/Bio/AlignIO/PhylipIO.py
@@ -0,0 +1,454 @@
+# Copyright 2006-2016 by Peter Cock. All rights reserved.
+# Revisions copyright 2011 Brandon Invergo. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools.
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+Support for "relaxed phylip" format is also provided. Relaxed phylip differs
+from standard phylip format in the following ways:
+
+ - No whitespace is allowed in the sequence ID.
+ - No truncation is performed. Instead, sequence IDs are padded to the longest
+ ID length, rather than 10 characters. A space separates the sequence
+ identifier from the sequence.
+
+Relaxed phylip is supported by RAxML and PHYML.
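+
+For example, relaxed phylip output is a one-liner via Bio.AlignIO (a sketch;
+the file name and the alignments variable are hypothetical)::
+
+ from Bio import AlignIO
+ alignments = ...
+ AlignIO.write(alignments, "example.phy", "phylip-relaxed")  # hypothetical file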
+
+Note
+====
+
+In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
+a dot/period (".") in a sequence is interpreted as meaning the same
+character as in the first sequence. The PHYLIP documentation from 3.3 to 3.69
+http://evolution.genetics.washington.edu/phylip/doc/sequence.html says:
+
+"a period was also previously allowed but it is no longer allowed,
+because it sometimes is used in different senses in other programs"
+
+Biopython 1.58 or later treats dots/periods in the sequence as invalid, both
+for reading and writing. Older versions did nothing special with a dot/period.
+"""
+import string
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import AlignmentIterator
+from .Interfaces import SequentialAlignmentWriter
+
+
+_PHYLIP_ID_WIDTH = 10
+_NO_DOTS = "PHYLIP format no longer allows dots in sequence"
+
+
+class PhylipWriter(SequentialAlignmentWriter):
+ """Phylip alignment writer."""
+
+ def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
+ """Use this to write (another) single alignment to an open file.
+
+ This code will write interlaced alignments (when the sequences are
+ longer than 50 characters).
+
+ Note that record identifiers are strictly truncated to id_width,
+ defaulting to the value required to comply with the PHYLIP standard.
+
+ For more information on the file format, please see:
+ http://evolution.genetics.washington.edu/phylip/doc/sequence.html
+ http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
+ """
+ handle = self.handle
+
+ if len(alignment) == 0:
+ raise ValueError("Must have at least one sequence")
+ length_of_seqs = alignment.get_alignment_length()
+ for record in alignment:
+ if length_of_seqs != len(record.seq):
+ raise ValueError("Sequences must all be the same length")
+ if length_of_seqs <= 0:
+ raise ValueError("Non-empty sequences are required")
+
+ # Check for repeated identifiers...
+ # Apply this test *after* cleaning the identifiers
+ names = []
+ seqs = []
+ for record in alignment:
+ """
+ Quoting the PHYLIP version 3.6 documentation:
+
+ The name should be ten characters in length, filled out to
+ the full ten characters by blanks if shorter. Any printable
+ ASCII/ISO character is allowed in the name, except for
+ parentheses ("(" and ")"), square brackets ("[" and "]"),
+ colon (":"), semicolon (";") and comma (","). If you forget
+ to extend the names to ten characters in length by blanks,
+ the program [i.e. PHYLIP] will get out of synchronization
+ with the contents of the data file, and an error message will
+ result.
+
+ Note that Tab characters count as only one character in the
+ species names. Their inclusion can cause trouble.
+ """
+ name = sanitize_name(record.id, id_width)
+ if name in names:
+ raise ValueError(
+ "Repeated name %r (originally %r), possibly due to truncation"
+ % (name, record.id)
+ )
+ names.append(name)
+ sequence = str(record.seq)
+ if "." in sequence:
+ # Do this check here (once per record, not once per block)
+ raise ValueError(_NO_DOTS)
+ seqs.append(sequence)
+
+ # From experimentation, the use of tabs is not understood by the
+ # EMBOSS suite. The nature of the expected white space is not
+ # defined in the PHYLIP documentation, simply "These are in free
+ # format, separated by blanks". We'll use spaces to keep EMBOSS
+ # happy.
+ handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
+ block = 0
+ while True:
+ for name, sequence in zip(names, seqs):
+ if block == 0:
+ # Write name (truncated/padded to id_width characters)
+ # Now truncate and right pad to expected length.
+ handle.write(name[:id_width].ljust(id_width))
+ else:
+ # write indent
+ handle.write(" " * id_width)
+ # Write five chunks of ten letters per line...
+ for chunk in range(0, 5):
+ i = block * 50 + chunk * 10
+ seq_segment = sequence[i : i + 10]
+ # TODO - Force any gaps to be '-' character?
+ # TODO - How to cope with '?' or '.' in the sequence?
+ handle.write(" %s" % seq_segment)
+ if i + 10 > length_of_seqs:
+ break
+ handle.write("\n")
+ block += 1
+ if block * 50 >= length_of_seqs:
+ break
+ handle.write("\n")
+
+
+class PhylipIterator(AlignmentIterator):
+ """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.
+
+ Record identifiers are limited to at most 10 characters.
+
+ It only copes with interlaced phylip files! Sequential files won't work
+ where the sequences are split over multiple lines.
+
+ For more information on the file format, please see:
+ http://evolution.genetics.washington.edu/phylip/doc/sequence.html
+ http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
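+
+ A minimal usage sketch via Bio.AlignIO (the file name is hypothetical)::
+
+ from Bio import AlignIO
+ for alignment in AlignIO.parse("example.phy", "phylip"):  # hypothetical file
+     print(alignment.get_alignment_length())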
+ """
+
+ # Default truncation length
+ id_width = _PHYLIP_ID_WIDTH
+
+ _header = None # for caching lines between __next__ calls
+
+ def _is_header(self, line):
+ line = line.strip()
+ parts = [x for x in line.split() if x]
+ if len(parts) != 2:
+ return False # First line should have two integers
+ try:
+ number_of_seqs = int(parts[0])
+ length_of_seqs = int(parts[1])
+ return True
+ except ValueError:
+ return False # First line should have two integers
+
+ def _split_id(self, line):
+ """Extract the sequence ID from a Phylip line (PRIVATE).
+
+ Returning a tuple containing: (sequence_id, sequence_residues)
+
+ The first 10 characters in the line are the sequence id; the
+ remainder are sequence data.
+ """
+ seq_id = line[: self.id_width].strip()
+ seq = line[self.id_width :].strip().replace(" ", "")
+ return seq_id, seq
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+
+ if self._header is None:
+ line = handle.readline()
+ else:
+ # Header we saved from when we were parsing
+ # the previous alignment.
+ line = self._header
+ self._header = None
+
+ if not line:
+ raise StopIteration
+ line = line.strip()
+ parts = [x for x in line.split() if x]
+ if len(parts) != 2:
+ raise ValueError("First line should have two integers")
+ try:
+ number_of_seqs = int(parts[0])
+ length_of_seqs = int(parts[1])
+ except ValueError:
+ raise ValueError("First line should have two integers") from None
+
+ assert self._is_header(line)
+
+ if (
+ self.records_per_alignment is not None
+ and self.records_per_alignment != number_of_seqs
+ ):
+ raise ValueError(
+ "Found %i records in this alignment, told to expect %i"
+ % (number_of_seqs, self.records_per_alignment)
+ )
+
+ ids = []
+ seqs = []
+
+ # By default, expects STRICT truncation / padding to 10 characters.
+ # Does not require any whitespace between name and seq.
+ for i in range(number_of_seqs):
+ line = handle.readline().rstrip()
+ sequence_id, s = self._split_id(line)
+ ids.append(sequence_id)
+ if "." in s:
+ raise ValueError(_NO_DOTS)
+ seqs.append([s])
+
+ # Look for further blocks
+ line = ""
+ while True:
+ # Skip any blank lines between blocks...
+ while "" == line.strip():
+ line = handle.readline()
+ if not line:
+ break # end of file
+ if not line:
+ break # end of file
+
+ if self._is_header(line):
+ # Looks like the start of a concatenated alignment
+ self._header = line
+ break
+
+ # print("New block...")
+ for i in range(number_of_seqs):
+ s = line.strip().replace(" ", "")
+ if "." in s:
+ raise ValueError(_NO_DOTS)
+ seqs[i].append(s)
+ line = handle.readline()
+ if (not line) and i + 1 < number_of_seqs:
+ raise ValueError("End of file mid-block")
+ if not line:
+ break # end of file
+
+ records = (
+ SeqRecord(Seq("".join(s)), id=i, name=i, description=i)
+ for (i, s) in zip(ids, seqs)
+ )
+ return MultipleSeqAlignment(records)
+
+
+# Relaxed Phylip
+class RelaxedPhylipWriter(PhylipWriter):
+ """Relaxed Phylip format writer."""
+
+ def write_alignment(self, alignment):
+ """Write a relaxed phylip alignment."""
+ # Check inputs
+ for name in (s.id.strip() for s in alignment):
+ if any(c in name for c in string.whitespace):
+ raise ValueError("Whitespace not allowed in identifier: %s" % name)
+
+ # Calculate a truncation length - maximum length of sequence ID plus a
+ # single character for padding
+ # If no sequences, set id_width to 1. super(...) call will raise a
+ # ValueError
+ if len(alignment) == 0:
+ id_width = 1
+ else:
+ id_width = max(len(s.id.strip()) for s in alignment) + 1
+ super().write_alignment(alignment, id_width)
+
+
+class RelaxedPhylipIterator(PhylipIterator):
+ """Relaxed Phylip format Iterator."""
+
+ def _split_id(self, line):
+ """Extract the sequence ID from a Phylip line (PRIVATE).
+
+ Returns a tuple containing: (sequence_id, sequence_residues)
+
+ For relaxed format split at the first whitespace character.
+ """
+ seq_id, sequence = line.split(None, 1)
+ sequence = sequence.strip().replace(" ", "")
+ return seq_id, sequence
+
+
+class SequentialPhylipWriter(SequentialAlignmentWriter):
+ """Sequential Phylip format Writer."""
+
+ def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
+ """Write a Phylip alignment to the handle."""
+ handle = self.handle
+
+ if len(alignment) == 0:
+ raise ValueError("Must have at least one sequence")
+ length_of_seqs = alignment.get_alignment_length()
+ for record in alignment:
+ if length_of_seqs != len(record.seq):
+ raise ValueError("Sequences must all be the same length")
+ if length_of_seqs <= 0:
+ raise ValueError("Non-empty sequences are required")
+
+ # Check for repeated identifiers...
+ # Apply this test *after* cleaning the identifiers
+ names = []
+ for record in alignment:
+ # Either remove the banned characters, or map them to something
+ # else like an underscore "_" or pipe "|" character...
+ name = sanitize_name(record.id, id_width)
+ if name in names:
+ raise ValueError(
+ "Repeated name %r (originally %r), possibly due to truncation"
+ % (name, record.id)
+ )
+ names.append(name)
+
+ # From experimentation, the use of tabs is not understood by the
+ # EMBOSS suite. The nature of the expected white space is not
+ # defined in the PHYLIP documentation, simply "These are in free
+ # format, separated by blanks". We'll use spaces to keep EMBOSS
+ # happy.
+ handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
+ for name, record in zip(names, alignment):
+ sequence = str(record.seq)
+ if "." in sequence:
+ raise ValueError(_NO_DOTS)
+ handle.write(name[:id_width].ljust(id_width))
+ # Write the entire sequence to one line (see sequential format
+ # notes in the SequentialPhylipIterator docstring).
+ handle.write(sequence)
+ handle.write("\n")
+
+
+class SequentialPhylipIterator(PhylipIterator):
+ """Sequential Phylip format Iterator.
+
+ The sequential format carries the same restrictions as the normal
+ interleaved one, with the difference being that the sequences are listed
+ sequentially, each sequence written in its entirety before the start of
+ the next. According to the PHYLIP documentation for input file
+ formatting, newlines and spaces may optionally be entered at any point
+ in the sequences.
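+
+ A minimal reading sketch (the file name is hypothetical)::
+
+ from Bio import AlignIO
+ alignment = AlignIO.read("example.phy", "phylip-sequential")  # hypothetical file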
+ """
+
+ _header = None # for caching lines between __next__ calls
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+
+ if self._header is None:
+ line = handle.readline()
+ else:
+ # Header we saved from when we were parsing
+ # the previous alignment.
+ line = self._header
+ self._header = None
+
+ if not line:
+ raise StopIteration
+ line = line.strip()
+ parts = [x for x in line.split() if x]
+ if len(parts) != 2:
+ raise ValueError("First line should have two integers")
+ try:
+ number_of_seqs = int(parts[0])
+ length_of_seqs = int(parts[1])
+ except ValueError:
+ raise ValueError("First line should have two integers") from None
+
+ assert self._is_header(line)
+
+ if (
+ self.records_per_alignment is not None
+ and self.records_per_alignment != number_of_seqs
+ ):
+ raise ValueError(
+ "Found %i records in this alignment, told to expect %i"
+ % (number_of_seqs, self.records_per_alignment)
+ )
+
+ ids = []
+ seqs = []
+
+ # By default, expects STRICT truncation / padding to 10 characters.
+ # Does not require any whitespace between name and seq.
+ for i in range(number_of_seqs):
+ line = handle.readline().rstrip()
+ sequence_id, s = self._split_id(line)
+ ids.append(sequence_id)
+ while len(s) < length_of_seqs:
+ # The sequence may be split into multiple lines
+ line = handle.readline().strip()
+ if not line:
+ # After strip(), a blank line (or EOF) is empty and ends this record
+ break
+ s = "".join([s, line.strip().replace(" ", "")])
+ if len(s) > length_of_seqs:
+ raise ValueError(
+ "Found a record of length %i, "
+ "should be %i" % (len(s), length_of_seqs)
+ )
+ if "." in s:
+ raise ValueError(_NO_DOTS)
+ seqs.append(s)
+ while True:
+ # Find other alignments in the file
+ line = handle.readline()
+ if not line:
+ break
+ if self._is_header(line):
+ self._header = line
+ break
+
+ records = (
+ SeqRecord(Seq(s), id=i, name=i, description=i) for (i, s) in zip(ids, seqs)
+ )
+ return MultipleSeqAlignment(records)
+
+
+def sanitize_name(name, width=None):
+ """Sanitise sequence identifier for output.
+
+ Removes the banned characters "[]()" and replaces the characters ":;"
+ with "|". The name is truncated to "width" characters if specified.
+ """
+ name = name.strip()
+ for char in "[](),":
+ name = name.replace(char, "")
+ for char in ":;":
+ name = name.replace(char, "|")
+ if width is not None:
+ name = name[:width]
+ return name
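+
+# A quick behavioural sketch of the sanitiser (hypothetical input):
+# sanitize_name("seq one:a;b[x]", width=10) == "seq one|a|"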
diff --git a/code/lib/Bio/AlignIO/StockholmIO.py b/code/lib/Bio/AlignIO/StockholmIO.py
new file mode 100644
index 0000000..386e762
--- /dev/null
+++ b/code/lib/Bio/AlignIO/StockholmIO.py
@@ -0,0 +1,630 @@
+# Copyright 2006-2016 by Peter Cock. All rights reserved.
+# Revisions copyright 2015 by Ben Woodcroft. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.AlignIO support for "stockholm" format (used in the PFAM database).
+
+You are expected to use this module via the Bio.AlignIO functions (or the
+Bio.SeqIO functions if you want to work directly with the gapped sequences).
+
+For example, consider a Stockholm alignment file containing the following::
+
+ # STOCKHOLM 1.0
+ #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
+ AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
+ #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
+ AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
+ #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
+
+ #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
+ AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+ #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
+ AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+ #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
+ //
+
+This is a single multiple sequence alignment, so you would probably load this
+using the Bio.AlignIO.read() function:
+
+ >>> from Bio import AlignIO
+ >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm")
+ >>> print(align)
+ Alignment with 2 rows and 104 columns
+ UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
+ AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
+ >>> for record in align:
+ ... print("%s %i" % (record.id, len(record)))
+ AP001509.1 104
+ AE007476.1 104
+
+In addition to the sequences themselves, this example alignment also includes
+some GR lines for the secondary structure of the sequences. These are
+strings, with one character for each letter in the associated sequence:
+
+ >>> for record in align:
+ ... print(record.id)
+ ... print(record.seq)
+ ... print(record.letter_annotations['secondary_structure'])
+ AP001509.1
+ UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+ -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
+ AE007476.1
+ AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+ -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
+
+Any general annotation for each row is recorded in the SeqRecord's annotations
+dictionary. Any per-column annotation for the entire alignment is in the
+alignment's column annotations dictionary, such as the secondary structure
+consensus in this example:
+
+ >>> sorted(align.column_annotations.keys())
+ ['secondary_structure']
+ >>> align.column_annotations["secondary_structure"]
+ '.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............'
+
+You can output this alignment in many different file formats
+using Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method:
+
+ >>> print(format(align, "fasta"))
+ >AP001509.1
+ UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
+ GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+ >AE007476.1
+ AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
+ GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+
+
+Most output formats won't be able to hold the annotation possible in a
+Stockholm file:
+
+ >>> print(format(align, "stockholm"))
+ # STOCKHOLM 1.0
+ #=GF SQ 2
+ AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+ #=GS AP001509.1 AC AP001509.1
+ #=GS AP001509.1 DE AP001509.1
+ #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
+ AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+ #=GS AE007476.1 AC AE007476.1
+ #=GS AE007476.1 DE AE007476.1
+ #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
+ #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............
+ //
+
+
+Note that when writing Stockholm files, AlignIO does not break long sequences
+up and interleave them (as in the input file shown above). The standard
+allows this simpler layout, and it is more likely to be understood by other
+tools.
+
+Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to
+iterate over the alignment rows as SeqRecord objects - rather than working
+with Alignment objects.
+
+ >>> from Bio import SeqIO
+ >>> for record in SeqIO.parse("Stockholm/simple.sth", "stockholm"):
+ ... print(record.id)
+ ... print(record.seq)
+ ... print(record.letter_annotations['secondary_structure'])
+ AP001509.1
+ UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
+ -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
+ AE007476.1
+ AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
+ -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
+
+Remember that if you slice a SeqRecord, the per-letter-annotations, like the
+secondary structure string here, are also sliced:
+
+ >>> sub_record = record[10:20]
+ >>> print(sub_record.seq)
+ AUCGUUUUAC
+ >>> print(sub_record.letter_annotations['secondary_structure'])
+ -------<<<
+
+Likewise with the alignment object, as long as you are not dropping any rows,
+slicing specific columns of an alignment will slice any per-column-annotations:
+
+ >>> align.column_annotations["secondary_structure"]
+ '.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............'
+ >>> part_align = align[:,10:20]
+ >>> part_align.column_annotations["secondary_structure"]
+ '.......<<<'
+
+You can also see this in the Stockholm output of this partial-alignment:
+
+ >>> print(format(part_align, "stockholm"))
+ # STOCKHOLM 1.0
+ #=GF SQ 2
+ AP001509.1 UCAACACUCU
+ #=GS AP001509.1 AC AP001509.1
+ #=GS AP001509.1 DE AP001509.1
+ #=GR AP001509.1 SS -------<<<
+ AE007476.1 AUCGUUUUAC
+ #=GS AE007476.1 AC AE007476.1
+ #=GS AE007476.1 DE AE007476.1
+ #=GR AE007476.1 SS -------<<<
+ #=GC SS_cons .......<<<
+ //
+
+
+"""
+from collections import OrderedDict
+
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from .Interfaces import AlignmentIterator
+from .Interfaces import SequentialAlignmentWriter
+
+
+class StockholmWriter(SequentialAlignmentWriter):
+ """Stockholm/PFAM alignment writer."""
+
+ # These dictionaries should be kept in sync with those
+ # defined in the StockholmIterator class.
+ pfam_gr_mapping = {
+ "secondary_structure": "SS",
+ "surface_accessibility": "SA",
+ "transmembrane": "TM",
+ "posterior_probability": "PP",
+ "ligand_binding": "LI",
+ "active_site": "AS",
+ "intron": "IN",
+ }
+ # These GC mappings are in addition to *_cons in GR mapping:
+ pfam_gc_mapping = {"reference_annotation": "RF", "model_mask": "MM"}
+ # Following dictionary deliberately does not cover AC, DE or DR
+ pfam_gs_mapping = {"organism": "OS", "organism_classification": "OC", "look": "LO"}
+
+ def write_alignment(self, alignment):
+ """Use this to write (another) single alignment to an open file.
+
+ Note that sequences and their annotation are recorded
+ together (rather than having a block of annotation followed
+ by a block of aligned sequences).
+ """
+ count = len(alignment)
+
+ self._length_of_sequences = alignment.get_alignment_length()
+ self._ids_written = []
+
+ if count == 0:
+ raise ValueError("Must have at least one sequence")
+ if self._length_of_sequences == 0:
+ raise ValueError("Non-empty sequences are required")
+
+ self.handle.write("# STOCKHOLM 1.0\n")
+ self.handle.write("#=GF SQ %i\n" % count)
+ for record in alignment:
+ self._write_record(record)
+ # This shouldn't be None... but just in case,
+ if alignment.column_annotations:
+ for k, v in sorted(alignment.column_annotations.items()):
+ if k in self.pfam_gc_mapping:
+ self.handle.write("#=GC %s %s\n" % (self.pfam_gc_mapping[k], v))
+ elif k in self.pfam_gr_mapping:
+ self.handle.write(
+ "#=GC %s %s\n" % (self.pfam_gr_mapping[k] + "_cons", v)
+ )
+ else:
+ # It doesn't follow the PFAM standards, but should we record
+ # this data anyway?
+ pass
+ self.handle.write("//\n")
+
+ def _write_record(self, record):
+ """Write a single SeqRecord to the file (PRIVATE)."""
+ if self._length_of_sequences != len(record.seq):
+ raise ValueError("Sequences must all be the same length")
+
+ # For the stockholm-to-stockholm case, try to use record.name
+ seq_name = record.id
+ if record.name is not None:
+ if "accession" in record.annotations:
+ if record.id == record.annotations["accession"]:
+ seq_name = record.name
+
+ # In the Stockholm file format, spaces are not allowed in the id
+ seq_name = seq_name.replace(" ", "_")
+
+ if "start" in record.annotations and "end" in record.annotations:
+ suffix = "/%s-%s" % (
+ record.annotations["start"],
+ record.annotations["end"],
+ )
+ if seq_name[-len(suffix) :] != suffix:
+ seq_name = "%s/%s-%s" % (
+ seq_name,
+ record.annotations["start"],
+ record.annotations["end"],
+ )
+
+ if seq_name in self._ids_written:
+ raise ValueError("Duplicate record identifier: %s" % seq_name)
+ self._ids_written.append(seq_name)
+ self.handle.write("%s %s\n" % (seq_name, record.seq))
+
+ # The recommended placement for GS lines (per sequence annotation)
+ # is above the alignment (as a header block) or just below the
+ # corresponding sequence.
+ #
+ # The recommended placement for GR lines (per sequence per column
+ # annotation such as secondary structure) is just below the
+ # corresponding sequence.
+ #
+ # We put both just below the corresponding sequence as this allows
+ # us to write the file using a single pass through the records.
+
+ # AC = Accession
+ if "accession" in record.annotations:
+ self.handle.write(
+ "#=GS %s AC %s\n"
+ % (seq_name, self.clean(record.annotations["accession"]))
+ )
+ elif record.id:
+ self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id)))
+
+ # DE = description
+ if record.description:
+ self.handle.write(
+ "#=GS %s DE %s\n" % (seq_name, self.clean(record.description))
+ )
+
+ # DR = database links
+ for xref in record.dbxrefs:
+ self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref)))
+
+ # GS = other per sequence annotation
+ for key, value in record.annotations.items():
+ if key in self.pfam_gs_mapping:
+ data = self.clean(str(value))
+ if data:
+ self.handle.write(
+ "#=GS %s %s %s\n"
+ % (seq_name, self.clean(self.pfam_gs_mapping[key]), data)
+ )
+ else:
+ # It doesn't follow the PFAM standards, but should we record
+ # this data anyway?
+ pass
+
+ # GR = per row per column sequence annotation
+ for key, value in record.letter_annotations.items():
+ if key in self.pfam_gr_mapping and len(str(value)) == len(record.seq):
+ data = self.clean(str(value))
+ if data:
+ self.handle.write(
+ "#=GR %s %s %s\n"
+ % (seq_name, self.clean(self.pfam_gr_mapping[key]), data)
+ )
+ else:
+ # It doesn't follow the PFAM standards, but should we record
+ # this data anyway?
+ pass
+
+
+class StockholmIterator(AlignmentIterator):
+ """Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.
+
+ The file may contain multiple concatenated alignments, which are loaded
+ and returned incrementally.
+
+ This parser will detect if the Stockholm file follows the PFAM
+ conventions for sequence specific meta-data (lines starting #=GS
+ and #=GR) and populates the SeqRecord fields accordingly.
+
+ Any annotation which does not follow the PFAM conventions is currently
+ ignored.
+
+ If an accession is provided for an entry in the meta data, IT WILL NOT
+ be used as the record.id (it will be recorded in the record's
+ annotations). This is because some files have (sub) sequences from
+ different parts of the same accession (differentiated by different
+ start-end positions).
+
+ Wrap-around alignments are not supported - each sequence must be on
+ a single line. However, interlaced sequences should work.
+
+ For more information on the file format, please see:
+ http://sonnhammer.sbc.su.se/Stockholm.html
+ https://en.wikipedia.org/wiki/Stockholm_format
+ http://bioperl.org/formats/alignment_formats/Stockholm_multiple_alignment_format.html
+
+ For consistency with BioPerl and EMBOSS we call this the "stockholm"
+ format.
+ """
+
+ # These dictionaries should be kept in sync with those
+ # defined in the PfamStockholmWriter class.
+ pfam_gr_mapping = {
+ "SS": "secondary_structure",
+ "SA": "surface_accessibility",
+ "TM": "transmembrane",
+ "PP": "posterior_probability",
+ "LI": "ligand_binding",
+ "AS": "active_site",
+ "IN": "intron",
+ }
+ # These GC mappings are in addition to *_cons in GR mapping:
+ pfam_gc_mapping = {"RF": "reference_annotation", "MM": "model_mask"}
+ # Following dictionary deliberately does not cover AC, DE or DR
+ pfam_gs_mapping = {"OS": "organism", "OC": "organism_classification", "LO": "look"}
+
+ _header = None # for caching lines between __next__ calls
+
+ def __next__(self):
+ """Parse the next alignment from the handle."""
+ handle = self.handle
+
+ if self._header is None:
+ line = handle.readline()
+ else:
+ # Header we saved from when we were parsing
+ # the previous alignment.
+ line = self._header
+ self._header = None
+
+ if not line:
+ # Empty file - just give up.
+ raise StopIteration
+ if line.strip() != "# STOCKHOLM 1.0":
+ raise ValueError("Did not find STOCKHOLM header")
+
+ # Note: If this file follows the PFAM conventions, there should be
+ # a line containing the number of sequences, e.g. "#=GF SQ 67"
+ # We do not check for this - perhaps we should, and verify that
+ # if present it agrees with our parsing.
+
+ seqs = {}
+ ids = OrderedDict() # Really only need an OrderedSet, but python lacks this
+ gs = {}
+ gr = {}
+ gf = {}
+ gc = {}
+ passed_end_alignment = False
+ while True:
+ line = handle.readline()
+ if not line:
+ break # end of file
+ line = line.strip() # remove trailing \n
+ if line == "# STOCKHOLM 1.0":
+ self._header = line
+ break
+ elif line == "//":
+ # The "//" line indicates the end of the alignment.
+ # There may still be more meta-data
+ passed_end_alignment = True
+ elif line == "":
+ # blank line, ignore
+ pass
+ elif line[0] != "#":
+ # Sequence
+ # Format: ""
+ assert not passed_end_alignment
+ parts = [x.strip() for x in line.split(" ", 1)]
+ if len(parts) != 2:
+ # This might be someone attempting to store a zero length sequence?
+ raise ValueError(
+ "Could not split line into identifier and sequence:\n" + line
+ )
+ seq_id, seq = parts
+ if seq_id not in ids:
+ ids[seq_id] = True
+ seqs.setdefault(seq_id, "")
+ seqs[seq_id] += seq.replace(".", "-")
+ elif len(line) >= 5:
+ # Comment line or meta-data
+ if line[:5] == "#=GF ":
+ # Generic per-File annotation, free text
+ # Format: #=GF <feature> <free text>
+ feature, text = line[5:].strip().split(None, 1)
+ # Each feature key could be used more than once,
+ # so store the entries as a list of strings.
+ if feature not in gf:
+ gf[feature] = [text]
+ else:
+ gf[feature].append(text)
+ elif line[:5] == "#=GC ":
+ # Generic per-Column annotation, exactly 1 char per column
+ # Format: "#=GC "
+ feature, text = line[5:].strip().split(None, 2)
+ if feature not in gc:
+ gc[feature] = ""
+ gc[feature] += text.strip() # append to any previous entry
+ # Might be interleaved blocks, so can't check length yet
+ elif line[:5] == "#=GS ":
+ # Generic per-Sequence annotation, free text
+ # Format: "#=GS "
+ try:
+ seq_id, feature, text = line[5:].strip().split(None, 2)
+ except ValueError:
+ # Free text can sometimes be empty, in which case the three-way split above raises a ValueError.
+ # See https://github.com/biopython/biopython/issues/2982 for more details
+ seq_id, feature = line[5:].strip().split(None, 1)
+ text = ""
+ # if seq_id not in ids:
+ # ids.append(seq_id)
+ if seq_id not in gs:
+ gs[seq_id] = {}
+ if feature not in gs[seq_id]:
+ gs[seq_id][feature] = [text]
+ else:
+ gs[seq_id][feature].append(text)
+ elif line[:5] == "#=GR ":
+ # Generic per-Sequence AND per-Column markup
+ # Format: "#=GR "
+ seq_id, feature, text = line[5:].strip().split(None, 2)
+ # if seq_id not in ids:
+ # ids.append(seq_id)
+ if seq_id not in gr:
+ gr[seq_id] = {}
+ if feature not in gr[seq_id]:
+ gr[seq_id][feature] = ""
+ gr[seq_id][feature] += text.strip() # append to any previous entry
+ # Might be interleaved blocks, so can't check length yet
+ # Next line...
+
+ assert len(seqs) <= len(ids)
+ # assert len(gs) <= len(ids)
+ # assert len(gr) <= len(ids)
+
+ self.ids = ids.keys()
+ self.sequences = seqs
+ self.seq_annotation = gs
+ self.seq_col_annotation = gr
+
+ if ids and seqs:
+
+ if (
+ self.records_per_alignment is not None
+ and self.records_per_alignment != len(ids)
+ ):
+ raise ValueError(
+ "Found %i records in this alignment, told to expect %i"
+ % (len(ids), self.records_per_alignment)
+ )
+
+ alignment_length = len(list(seqs.values())[0])
+ records = [] # Alignment obj will put them all in a list anyway
+ for seq_id in ids:
+ seq = seqs[seq_id]
+ if alignment_length != len(seq):
+ raise ValueError(
+ "Sequences have different lengths, or repeated identifier"
+ )
+ name, start, end = self._identifier_split(seq_id)
+ record = SeqRecord(
+ Seq(seq),
+ id=seq_id,
+ name=name,
+ description=seq_id,
+ annotations={"accession": name},
+ )
+ # Accession will be overridden by _populate_meta_data if an explicit
+ # accession is provided:
+ record.annotations["accession"] = name
+
+ if start is not None:
+ record.annotations["start"] = start
+ if end is not None:
+ record.annotations["end"] = end
+
+ self._populate_meta_data(seq_id, record)
+ records.append(record)
+ for k, v in gc.items():
+ if len(v) != alignment_length:
+ raise ValueError(
+ "%s length %i, expected %i" % (k, len(v), alignment_length)
+ )
+ alignment = MultipleSeqAlignment(records)
+
+ for k, v in sorted(gc.items()):
+ if k in self.pfam_gc_mapping:
+ alignment.column_annotations[self.pfam_gc_mapping[k]] = v
+ elif k.endswith("_cons") and k[:-5] in self.pfam_gr_mapping:
+ alignment.column_annotations[self.pfam_gr_mapping[k[:-5]]] = v
+ else:
+ # Ignore it?
+ alignment.column_annotations["GC:" + k] = v
+
+ # TODO - Introduce an annotated alignment class?
+ # For now, store the annotation in a new private property:
+ alignment._annotations = gr
+
+ return alignment
+ else:
+ raise StopIteration
+
+ def _identifier_split(self, identifier):
+ """Return (name, start, end) string tuple from an identifier (PRIVATE)."""
+ if "/" in identifier:
+ name, start_end = identifier.rsplit("/", 1)
+ if start_end.count("-") == 1:
+ try:
+ start, end = start_end.split("-")
+ return name, int(start), int(end)
+ except ValueError:
+ # Non-integers after final '/' - fall through
+ pass
+ return identifier, None, None
+
+ def _get_meta_data(self, identifier, meta_dict):
+ """Take an itentifier and returns dict of all meta-data matching it (PRIVATE).
+
+ For example, given "Q9PN73_CAMJE/149-220" will return all matches to
+ this or "Q9PN73_CAMJE" which the identifier without its /start-end
+ suffix.
+
+ In the example below, the suffix is required to match the AC, but must
+ be removed to match the OS and OC meta-data::
+
+ # STOCKHOLM 1.0
+ #=GS Q9PN73_CAMJE/149-220 AC Q9PN73
+ ...
+ Q9PN73_CAMJE/149-220 NKA...
+ ...
+ #=GS Q9PN73_CAMJE OS Campylobacter jejuni
+ #=GS Q9PN73_CAMJE OC Bacteria
+
+ This function will return an empty dictionary if no data is found.
+ """
+ name, start, end = self._identifier_split(identifier)
+ if name == identifier:
+ identifier_keys = [identifier]
+ else:
+ identifier_keys = [identifier, name]
+ answer = {}
+ for identifier_key in identifier_keys:
+ try:
+ for feature_key in meta_dict[identifier_key]:
+ answer[feature_key] = meta_dict[identifier_key][feature_key]
+ except KeyError:
+ pass
+ return answer
+
+ def _populate_meta_data(self, identifier, record):
+ """Add meta-date to a SecRecord's annotations dictionary (PRIVATE).
+
+ This function applies the PFAM conventions.
+ """
+ seq_data = self._get_meta_data(identifier, self.seq_annotation)
+ for feature in seq_data:
+ # Note this dictionary contains lists!
+ if feature == "AC": # ACcession number
+ assert len(seq_data[feature]) == 1
+ record.annotations["accession"] = seq_data[feature][0]
+ elif feature == "DE": # DEscription
+ record.description = "\n".join(seq_data[feature])
+ elif feature == "DR": # Database Reference
+ # Should we try and parse the strings?
+ record.dbxrefs = seq_data[feature]
+ elif feature in self.pfam_gs_mapping:
+ record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(
+ seq_data[feature]
+ )
+ else:
+ # Ignore it?
+ record.annotations["GS:" + feature] = ", ".join(seq_data[feature])
+
+ # Now record the per-letter-annotations
+ seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation)
+ for feature in seq_col_data:
+ # Note this dictionary contains strings!
+ if feature in self.pfam_gr_mapping:
+ record.letter_annotations[self.pfam_gr_mapping[feature]] = seq_col_data[
+ feature
+ ]
+ else:
+ # Ignore it?
+ record.letter_annotations["GR:" + feature] = seq_col_data[feature]
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/AlignIO/__init__.py b/code/lib/Bio/AlignIO/__init__.py
new file mode 100644
index 0000000..fe01f8f
--- /dev/null
+++ b/code/lib/Bio/AlignIO/__init__.py
@@ -0,0 +1,480 @@
+# Copyright 2008-2018 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Multiple sequence alignment input/output as alignment objects.
+
+The Bio.AlignIO interface is deliberately very similar to Bio.SeqIO, and in
+fact the two are connected internally. Both modules use the same set of file
+format names (lower case strings). From the user's perspective, you can read
+in a PHYLIP file containing one or more alignments using Bio.AlignIO, or you
+can read in the sequences within these alignments using Bio.SeqIO.
+
+Bio.AlignIO is also documented at http://biopython.org/wiki/AlignIO and by
+a whole chapter in our tutorial:
+
+* `HTML Tutorial`_
+* `PDF Tutorial`_
+
+.. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
+.. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+
+Input
+-----
+For the typical special case when your file or handle contains one and only
+one alignment, use the function Bio.AlignIO.read(). This takes an input file
+handle (or in recent versions of Biopython a filename as a string), format
+string and optional number of sequences per alignment. It will return a single
+MultipleSeqAlignment object (or raise an exception if there isn't just one
+alignment):
+
+>>> from Bio import AlignIO
+>>> align = AlignIO.read("Phylip/interlaced.phy", "phylip")
+>>> print(align)
+Alignment with 3 rows and 384 columns
+-----MKVILLFVLAVFTVFVSS---------------RGIPPE...I-- CYS1_DICDI
+MAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTL...VAA ALEU_HORVU
+------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSL...PLV CATH_HUMAN
+
+For the general case, when the handle could contain any number of alignments,
+use the function Bio.AlignIO.parse(...) which takes the same arguments, but
+returns an iterator giving MultipleSeqAlignment objects (typically used in a
+for loop). If you want random access to the alignments by number, turn this
+into a list:
+
+>>> from Bio import AlignIO
+>>> alignments = list(AlignIO.parse("Emboss/needle.txt", "emboss"))
+>>> print(alignments[2])
+Alignment with 2 rows and 120 columns
+-KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKER...--- ref_rec
+LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHP...HKE gi|94967506|receiver
+
+Most alignment file formats can be concatenated so as to hold any number
+of multiple sequence alignments. One common example is the output of the
+tool seqboot in the PHYLIP suite. Sometimes there
+can be a file header and footer, as seen in the EMBOSS alignment output.
+
+Output
+------
+Use the function Bio.AlignIO.write(...), which takes a complete set of
+Alignment objects (either as a list, or an iterator), an output file handle
+(or filename in recent versions of Biopython) and of course the file format::
+
+ from Bio import AlignIO
+ alignments = ...
+ count = SeqIO.write(alignments, "example.faa", "fasta")
+
+If using a handle make sure to close it to flush the data to the disk::
+
+ from Bio import AlignIO
+ alignments = ...
+ with open("example.faa", "w") as handle:
+ count = AlignIO.write(alignments, handle, "fasta")
+
+In general, you are expected to call this function once (with all your
+alignments) and then close the file handle. However, for file formats
+like PHYLIP where multiple alignments are stored sequentially (with no file
+header and footer), then multiple calls to the write function should work as
+expected when using handles.
+
+If you are using a filename, the repeated calls to the write functions will
+overwrite the existing file each time.
+
+Conversion
+----------
+The Bio.AlignIO.convert(...) function provides an easy interface for simple
+alignment file format conversions. Additionally, it may use file format
+specific optimisations so this should be the fastest way too.
+
+In general however, you can combine the Bio.AlignIO.parse(...) function with
+the Bio.AlignIO.write(...) function for sequence file conversion. Using
+generator expressions provides a memory efficient way to perform filtering or
+other extra operations as part of the process.
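+
+For example (a sketch; the file names here are hypothetical)::
+
+ from Bio import AlignIO
+
+ # One-step conversion:
+ count = AlignIO.convert("example.phy", "phylip", "example.aln", "clustal")
+
+ # The equivalent parse/write combination:
+ alignments = AlignIO.parse("example.phy", "phylip")
+ count = AlignIO.write(alignments, "example.aln", "clustal")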
+
+File Formats
+------------
+When specifying the file format, use lowercase strings. The same format
+names are also used in Bio.SeqIO and include the following:
+
+ - clustal - Output from Clustal W or X, see also the module Bio.Clustalw
+ which can be used to run the command line tool from Biopython.
+ - emboss - EMBOSS tools' "pairs" and "simple" alignment formats.
+ - fasta - The generic sequence file format where each record starts with
+ an identifier line starting with a ">" character, followed by
+ lines of sequence.
+ - fasta-m10 - For the pairwise alignments output by Bill Pearson's FASTA
+ tools when used with the -m 10 command line option for machine
+ readable output.
+ - ig - The IntelliGenetics file format, apparently the same as the
+ MASE alignment format.
+ - msf - The GCG MSF alignment format, originally from PileUp tool.
+ - nexus - Output from NEXUS, see also the module Bio.Nexus which can also
+ read any phylogenetic trees in these files.
+ - phylip - Interlaced PHYLIP, as used by the PHYLIP tools.
+ - phylip-sequential - Sequential PHYLIP.
+ - phylip-relaxed - PHYLIP like format allowing longer names.
+ - stockholm - A richly annotated alignment file format used by PFAM.
+ - mauve - Output from progressiveMauve/Mauve
+
+Note that while Bio.AlignIO can read all the above file formats, it cannot
+write to all of them.
+
+You can also use any file format supported by Bio.SeqIO, such as "fasta" or
+"ig" (which are listed above), PROVIDED the sequences in your file are all the
+same length.
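+
+For example, a FASTA file of equal-length sequences can be read as one
+alignment (a sketch; the file name is hypothetical)::
+
+ from Bio import AlignIO
+ alignment = AlignIO.read("aligned.fasta", "fasta")  # hypothetical file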
+"""
+# TODO
+# - define policy on reading aligned sequences with gaps in
+# (e.g. - and . characters)
+#
+# - Can we build the to_alignment(...) functionality
+# into the generic Alignment class instead?
+#
+# - How best to handle unique/non-unique record.id when writing.
+# For most file formats reading such files is fine; the stockholm
+# parser would fail.
+#
+# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
+# http://www.bioperl.org/wiki/MSF_multiple_alignment_format
+from Bio.Align import MultipleSeqAlignment
+from Bio.File import as_handle
+
+from . import ClustalIO
+from . import EmbossIO
+from . import FastaIO
+from . import MafIO
+from . import MauveIO
+from . import MsfIO
+from . import NexusIO
+from . import PhylipIO
+from . import StockholmIO
+
+# Convention for format names is "mainname-subtype" in lower case.
+# Please use the same names as BioPerl and EMBOSS where possible.
+
+_FormatToIterator = { # "fasta" is done via Bio.SeqIO
+ "clustal": ClustalIO.ClustalIterator,
+ "emboss": EmbossIO.EmbossIterator,
+ "fasta-m10": FastaIO.FastaM10Iterator,
+ "maf": MafIO.MafIterator,
+ "mauve": MauveIO.MauveIterator,
+ "msf": MsfIO.MsfIterator,
+ "nexus": NexusIO.NexusIterator,
+ "phylip": PhylipIO.PhylipIterator,
+ "phylip-sequential": PhylipIO.SequentialPhylipIterator,
+ "phylip-relaxed": PhylipIO.RelaxedPhylipIterator,
+ "stockholm": StockholmIO.StockholmIterator,
+}
+
+_FormatToWriter = { # "fasta" is done via Bio.SeqIO
+ "clustal": ClustalIO.ClustalWriter,
+ "maf": MafIO.MafWriter,
+ "mauve": MauveIO.MauveWriter,
+ "nexus": NexusIO.NexusWriter,
+ "phylip": PhylipIO.PhylipWriter,
+ "phylip-sequential": PhylipIO.SequentialPhylipWriter,
+ "phylip-relaxed": PhylipIO.RelaxedPhylipWriter,
+ "stockholm": StockholmIO.StockholmWriter,
+}
+
+
+def write(alignments, handle, format):
+ """Write complete set of alignments to a file.
+
+ Arguments:
+ - alignments - A list (or iterator) of MultipleSeqAlignment objects,
+ or a single alignment object.
+ - handle - File handle object to write to, or filename as string
+ (note older versions of Biopython only took a handle).
+ - format - lower case string describing the file format to write.
+
+ You should close the handle after calling this function.
+
+ Returns the number of alignments written (as an integer).
+ """
+ from Bio import SeqIO
+
+ # Try and give helpful error messages:
+ if not isinstance(format, str):
+ raise TypeError("Need a string for the file format (lower case)")
+ if not format:
+ raise ValueError("Format required (lower case string)")
+ if format != format.lower():
+ raise ValueError("Format string '%s' should be lower case" % format)
+
+ if isinstance(alignments, MultipleSeqAlignment):
+ # This raised an exception in older versions of Biopython
+ alignments = [alignments]
+
+ with as_handle(handle, "w") as fp:
+ # Map the file format to a writer class
+ if format in _FormatToWriter:
+ writer_class = _FormatToWriter[format]
+ count = writer_class(fp).write_file(alignments)
+ elif format in SeqIO._FormatToWriter:
+ # Exploit the existing SeqIO parser to do the dirty work!
+ # TODO - Can we make one call to SeqIO.write() and count the alignments?
+ count = 0
+ for alignment in alignments:
+ if not isinstance(alignment, MultipleSeqAlignment):
+ raise TypeError(
+ "Expect a list or iterator of MultipleSeqAlignment "
+ "objects, got: %r" % alignment
+ )
+ SeqIO.write(alignment, fp, format)
+ count += 1
+ elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
+ raise ValueError(
+ "Reading format '%s' is supported, but not writing" % format
+ )
+ else:
+ raise ValueError("Unknown format '%s'" % format)
+
+ if not isinstance(count, int):
+ raise RuntimeError(
+ "Internal error - the underlying %s "
+ "writer should have returned the alignment count, not %r" % (format, count)
+ )
+
+ return count
+
+
+# This is a generator function!
+def _SeqIO_to_alignment_iterator(handle, format, seq_count=None):
+ """Use Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE).
+
+ Arguments:
+ - handle - handle to the file.
+ - format - string describing the file format.
+ - seq_count - Optional integer, number of sequences expected in each
+ alignment. Recommended for fasta format files.
+
+ If seq_count is omitted (default) then all the sequences in the file are
+ combined into a single MultipleSeqAlignment.
+ """
+ from Bio import SeqIO
+
+ if format not in SeqIO._FormatToIterator:
+ raise ValueError("Unknown format '%s'" % format)
+
+ if seq_count:
+ # Use the count to split the records into batches.
+ seq_record_iterator = SeqIO.parse(handle, format)
+
+ records = []
+ for record in seq_record_iterator:
+ records.append(record)
+ if len(records) == seq_count:
+ yield MultipleSeqAlignment(records)
+ records = []
+ if records:
+ raise ValueError("Check seq_count argument, not enough sequences?")
+ else:
+ # Must assume that there is a single alignment using all
+ # the SeqRecord objects:
+ records = list(SeqIO.parse(handle, format))
+ if records:
+ yield MultipleSeqAlignment(records)
+
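+# For example (hypothetical file): parsing a FASTA file holding six
+# sequences with seq_count=3 yields two MultipleSeqAlignment objects of
+# three records each, while omitting seq_count yields one six-record
+# alignment:
+#
+#     from Bio import AlignIO
+#     pairs = list(AlignIO.parse("six_seqs.fasta", "fasta", seq_count=3))
+#     assert len(pairs) == 2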
+
+def parse(handle, format, seq_count=None):
+ """Iterate over an alignment file as MultipleSeqAlignment objects.
+
+ Arguments:
+ - handle - handle to the file, or the filename as a string
+ (note older versions of Biopython only took a handle).
+ - format - string describing the file format.
+ - seq_count - Optional integer, number of sequences expected in each
+ alignment. Recommended for fasta format files.
+
+ If you have the file name in a string 'filename', use:
+
+ >>> from Bio import AlignIO
+ >>> filename = "Emboss/needle.txt"
+ >>> format = "emboss"
+ >>> for alignment in AlignIO.parse(filename, format):
+ ... print("Alignment of length %i" % alignment.get_alignment_length())
+ Alignment of length 124
+ Alignment of length 119
+ Alignment of length 120
+ Alignment of length 118
+ Alignment of length 125
+
+ If you have a string 'data' containing the file contents, use::
+
+ from Bio import AlignIO
+ from io import StringIO
+ my_iterator = AlignIO.parse(StringIO(data), format)
+
+ Use the Bio.AlignIO.read() function when you expect a single record only.
+ """
+ from Bio import SeqIO
+
+ # Try and give helpful error messages:
+ if not isinstance(format, str):
+ raise TypeError("Need a string for the file format (lower case)")
+ if not format:
+ raise ValueError("Format required (lower case string)")
+ if format != format.lower():
+ raise ValueError("Format string '%s' should be lower case" % format)
+ if seq_count is not None and not isinstance(seq_count, int):
+ raise TypeError("Need integer for seq_count (sequences per alignment)")
+
+ with as_handle(handle) as fp:
+ # Map the file format to a sequence iterator:
+ if format in _FormatToIterator:
+ iterator_generator = _FormatToIterator[format]
+ i = iterator_generator(fp, seq_count)
+
+ elif format in SeqIO._FormatToIterator:
+            # Exploit the existing SeqIO parser to do the dirty work!
+ i = _SeqIO_to_alignment_iterator(fp, format, seq_count=seq_count)
+ else:
+ raise ValueError("Unknown format '%s'" % format)
+
+ yield from i
+
+
+def read(handle, format, seq_count=None):
+ """Turn an alignment file into a single MultipleSeqAlignment object.
+
+ Arguments:
+ - handle - handle to the file, or the filename as a string
+ (note older versions of Biopython only took a handle).
+ - format - string describing the file format.
+ - seq_count - Optional integer, number of sequences expected in each
+ alignment. Recommended for fasta format files.
+
+ If the handle contains no alignments, or more than one alignment,
+ an exception is raised. For example, using a PFAM/Stockholm file
+ containing one alignment:
+
+ >>> from Bio import AlignIO
+ >>> filename = "Clustalw/protein.aln"
+ >>> format = "clustal"
+ >>> alignment = AlignIO.read(filename, format)
+ >>> print("Alignment of length %i" % alignment.get_alignment_length())
+ Alignment of length 411
+
+ If however you want the first alignment from a file containing
+ multiple alignments this function would raise an exception.
+
+ >>> from Bio import AlignIO
+ >>> filename = "Emboss/needle.txt"
+ >>> format = "emboss"
+ >>> alignment = AlignIO.read(filename, format)
+ Traceback (most recent call last):
+ ...
+ ValueError: More than one record found in handle
+
+ Instead use:
+
+ >>> from Bio import AlignIO
+ >>> filename = "Emboss/needle.txt"
+ >>> format = "emboss"
+ >>> alignment = next(AlignIO.parse(filename, format))
+ >>> print("First alignment has length %i" % alignment.get_alignment_length())
+ First alignment has length 124
+
+ You must use the Bio.AlignIO.parse() function if you want to read multiple
+ records from the handle.
+ """
+ iterator = parse(handle, format, seq_count)
+ try:
+ alignment = next(iterator)
+ except StopIteration:
+ raise ValueError("No records found in handle") from None
+ try:
+ next(iterator)
+ raise ValueError("More than one record found in handle")
+ except StopIteration:
+ pass
+ if seq_count:
+ if len(alignment) != seq_count:
+            raise RuntimeError(
+                "Number of sequences in alignment (%i) does not match "
+                "seq_count: %s." % (len(alignment), seq_count)
+            )
+ return alignment
+
+
+def convert(in_file, in_format, out_file, out_format, molecule_type=None):
+ """Convert between two alignment files, returns number of alignments.
+
+ Arguments:
+ - in_file - an input handle or filename
+ - in_format - input file format, lower case string
+      - out_file - an output handle or filename
+      - out_format - output file format, lower case string
+ - molecule_type - optional molecule type to apply, string containing
+ "DNA", "RNA" or "protein".
+
+    **NOTE** - If you provide an output filename, it will be opened, which
+    will overwrite any existing file without warning. This may happen even if
+    the conversion is aborted (e.g. an invalid out_format name is given).
+
+ Some output formats require the molecule type be specified where this
+    cannot be determined by the parser. For example, converting from FASTA,
+ Clustal, or PHYLIP format to NEXUS:
+
+ >>> from io import StringIO
+ >>> from Bio import AlignIO
+ >>> handle = StringIO()
+ >>> AlignIO.convert("Phylip/horses.phy", "phylip", handle, "nexus", "DNA")
+ 1
+ >>> print(handle.getvalue())
+ #NEXUS
+ begin data;
+ dimensions ntax=10 nchar=40;
+ format datatype=dna missing=? gap=-;
+ matrix
+ Mesohippus AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+ Hypohippus AAACCCCCCCAAAAAAAAACAAAAAAAAAAAAAAAAAAAA
+ Archaeohip CAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAA
+ Parahippus CAAACAACAACAAAAAAAACAAAAAAAAAAAAAAAAAAAA
+ Merychippu CCAACCACCACCCCACACCCAAAAAAAAAAAAAAAAAAAA
+ 'M. secundu' CCAACCACCACCCACACCCCAAAAAAAAAAAAAAAAAAAA
+ Nannipus CCAACCACAACCCCACACCCAAAAAAAAAAAAAAAAAAAA
+ Neohippari CCAACCCCCCCCCCACACCCAAAAAAAAAAAAAAAAAAAA
+ Calippus CCAACCACAACCCACACCCCAAAAAAAAAAAAAAAAAAAA
+ Pliohippus CCCACCCCCCCCCACACCCCAAAAAAAAAAAAAAAAAAAA
+ ;
+ end;
+
+ """
+ if molecule_type:
+ if not isinstance(molecule_type, str):
+ raise TypeError("Molecule type should be a string, not %r" % molecule_type)
+ elif (
+ "DNA" in molecule_type
+ or "RNA" in molecule_type
+ or "protein" in molecule_type
+ ):
+ pass
+ else:
+ raise ValueError("Unexpected molecule type, %r" % molecule_type)
+
+ # TODO - Add optimised versions of important conversions
+ # For now just off load the work to SeqIO parse/write
+ # Don't open the output file until we've checked the input is OK:
+ alignments = parse(in_file, in_format, None)
+
+ if molecule_type:
+ # Edit the records on the fly to set molecule type
+
+ def over_ride(alignment):
+ """Over-ride molecule in-place."""
+ for record in alignment:
+ record.annotations["molecule_type"] = molecule_type
+ return alignment
+
+ alignments = (over_ride(_) for _ in alignments)
+ return write(alignments, out_file, out_format)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/AlignIO/__pycache__/ClustalIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/ClustalIO.cpython-37.pyc
new file mode 100644
index 0000000..967a616
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/ClustalIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/EmbossIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/EmbossIO.cpython-37.pyc
new file mode 100644
index 0000000..dc69b07
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/EmbossIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/FastaIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/FastaIO.cpython-37.pyc
new file mode 100644
index 0000000..590a863
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/FastaIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/Interfaces.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/Interfaces.cpython-37.pyc
new file mode 100644
index 0000000..50cee59
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/Interfaces.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/MafIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/MafIO.cpython-37.pyc
new file mode 100644
index 0000000..6495934
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/MafIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/MauveIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/MauveIO.cpython-37.pyc
new file mode 100644
index 0000000..9a01d82
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/MauveIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/MsfIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/MsfIO.cpython-37.pyc
new file mode 100644
index 0000000..41d6c6a
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/MsfIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/NexusIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/NexusIO.cpython-37.pyc
new file mode 100644
index 0000000..7de4464
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/NexusIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/PhylipIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/PhylipIO.cpython-37.pyc
new file mode 100644
index 0000000..15268e3
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/PhylipIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/StockholmIO.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/StockholmIO.cpython-37.pyc
new file mode 100644
index 0000000..e638dbf
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/StockholmIO.cpython-37.pyc differ
diff --git a/code/lib/Bio/AlignIO/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/AlignIO/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..cf0b44e
Binary files /dev/null and b/code/lib/Bio/AlignIO/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Alphabet/__init__.py b/code/lib/Bio/Alphabet/__init__.py
new file mode 100644
index 0000000..5109136
--- /dev/null
+++ b/code/lib/Bio/Alphabet/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2000-2002 by Andrew Dalke.
+# Revisions copyright 2007-2010 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Alphabets were previously used to declare sequence type and letters (OBSOLETE).
+
+The design of Bio.Alphabet included a number of historic design choices
+which, with the benefit of hindsight, were regrettable. Bio.Alphabet was
+therefore removed from Biopython in release 1.78. Instead, the molecule type is
+included as an annotation on SeqRecords where appropriate.
+
+Please see https://biopython.org/wiki/Alphabet for examples showing how to
+transition from Bio.Alphabet to molecule type annotations.
+"""
+
+raise ImportError(
+ "Bio.Alphabet has been removed from Biopython. In many cases, the alphabet can simply be ignored and removed from scripts. In a few cases, you may need to specify the ``molecule_type`` as an annotation on a SeqRecord for your script to work correctly. Please see https://biopython.org/wiki/Alphabet for more information."
+)
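+
+# A minimal sketch of the replacement idiom (hypothetical record): set the
+# molecule type as a SeqRecord annotation instead of using an alphabet:
+#
+#     from Bio.Seq import Seq
+#     from Bio.SeqRecord import SeqRecord
+#     record = SeqRecord(Seq("ACGT"), id="example")
+#     record.annotations["molecule_type"] = "DNA"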
diff --git a/code/lib/Bio/Alphabet/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Alphabet/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..9ee4b01
Binary files /dev/null and b/code/lib/Bio/Alphabet/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Application/__init__.py b/code/lib/Bio/Application/__init__.py
new file mode 100644
index 0000000..f844d27
--- /dev/null
+++ b/code/lib/Bio/Application/__init__.py
@@ -0,0 +1,838 @@
+# Copyright 2001-2004 Brad Chapman.
+# Revisions copyright 2009-2013 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""General mechanisms to access applications in Biopython (OBSOLETE).
+
+This module is not intended for direct use. It provides the basic objects which
+are subclassed by our command line wrappers, such as:
+
+ - Bio.Align.Applications
+ - Bio.Blast.Applications
+ - Bio.Emboss.Applications
+ - Bio.Sequencing.Applications
+
+These modules provide wrapper classes for command line tools to help you
+construct command line strings by setting the values of each parameter.
+The finished command line strings are then normally invoked via the built-in
+Python module subprocess.
+
+Due to the ongoing maintenance burden of keeping command line application
+wrappers up to date, we have decided to deprecate and eventually remove them.
+We instead now recommend building your command line and invoking it directly
+with the subprocess module.
+"""
+import os
+import platform
+import sys
+import subprocess
+import re
+
+
+# Use this regular expression to test the property names are going to
+# be valid as Python properties or arguments
+_re_prop_name = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]*$")
+assert _re_prop_name.match("t")
+assert _re_prop_name.match("test")
+assert _re_prop_name.match("_test") is None # we don't want private names
+assert _re_prop_name.match("-test") is None
+assert _re_prop_name.match("any-hyphen") is None
+assert _re_prop_name.match("underscore_ok")
+assert _re_prop_name.match("test_name")
+assert _re_prop_name.match("test2")
+# These are reserved names in Python itself:
+_reserved_names = [
+ "and",
+ "del",
+ "from",
+ "not",
+ "while",
+ "as",
+ "elif",
+ "global",
+ "or",
+ "with",
+ "assert",
+ "else",
+ "if",
+ "pass",
+ "yield",
+ "break",
+ "except",
+ "import",
+ "print",
+ "class",
+ "exec",
+ "in",
+ "raise",
+ "continue",
+ "finally",
+ "is",
+ "return",
+ "def",
+ "for",
+ "lambda",
+ "try",
+]
+# These are reserved names due to the way the wrappers work
+_local_reserved_names = ["set_parameter"]
+
+
+class ApplicationError(subprocess.CalledProcessError):
+ """Raised when an application returns a non-zero exit status (OBSOLETE).
+
+ The exit status will be stored in the returncode attribute, similarly
+ the command line string used in the cmd attribute, and (if captured)
+ stdout and stderr as strings.
+
+ This exception is a subclass of subprocess.CalledProcessError.
+
+ >>> err = ApplicationError(-11, "helloworld", "", "Some error text")
+ >>> err.returncode, err.cmd, err.stdout, err.stderr
+ (-11, 'helloworld', '', 'Some error text')
+ >>> print(err)
+ Non-zero return code -11 from 'helloworld', message 'Some error text'
+
+ """
+
+ def __init__(self, returncode, cmd, stdout="", stderr=""):
+ """Initialize the class."""
+ self.returncode = returncode
+ self.cmd = cmd
+ self.stdout = stdout
+ self.stderr = stderr
+
+ def __str__(self):
+ """Format the error as a string."""
+ # get first line of any stderr message
+ try:
+ msg = self.stderr.lstrip().split("\n", 1)[0].rstrip()
+ except Exception: # TODO, ValueError? AttributeError?
+ msg = ""
+ if msg:
+ return "Non-zero return code %d from %r, message %r" % (
+ self.returncode,
+ self.cmd,
+ msg,
+ )
+ else:
+ return "Non-zero return code %d from %r" % (self.returncode, self.cmd)
+
+ def __repr__(self):
+ """Represent the error as a string."""
+ return "ApplicationError(%i, %s, %s, %s)" % (
+ self.returncode,
+ self.cmd,
+ self.stdout,
+ self.stderr,
+ )
+
+
+class AbstractCommandline:
+ r"""Generic interface for constructing command line strings (OBSOLETE).
+
+ This class shouldn't be called directly; it should be subclassed to
+ provide an implementation for a specific application.
+
+ For a usage example we'll show one of the EMBOSS wrappers. You can set
+ options when creating the wrapper object using keyword arguments - or
+ later using their corresponding properties:
+
+ >>> from Bio.Emboss.Applications import WaterCommandline
+ >>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
+ >>> cline
+ WaterCommandline(cmd='water', gapopen=10, gapextend=0.5)
+
+ You can instead manipulate the parameters via their properties, e.g.
+
+ >>> cline.gapopen
+ 10
+ >>> cline.gapopen = 20
+ >>> cline
+ WaterCommandline(cmd='water', gapopen=20, gapextend=0.5)
+
+ You can clear a parameter you have already added by 'deleting' the
+ corresponding property:
+
+ >>> del cline.gapopen
+ >>> cline.gapopen
+ >>> cline
+ WaterCommandline(cmd='water', gapextend=0.5)
+
+ Once you have set the parameters you need, you can turn the object into
+ a string (e.g. to log the command):
+
+ >>> str(cline)
+ Traceback (most recent call last):
+ ...
+ ValueError: You must either set outfile (output filename), or enable filter or stdout (output to stdout).
+
+ In this case the wrapper knows certain arguments are required to construct
+ a valid command line for the tool. For a complete example,
+
+ >>> from Bio.Emboss.Applications import WaterCommandline
+ >>> water_cmd = WaterCommandline(gapopen=10, gapextend=0.5)
+ >>> water_cmd.asequence = "asis:ACCCGGGCGCGGT"
+ >>> water_cmd.bsequence = "asis:ACCCGAGCGCGGT"
+ >>> water_cmd.outfile = "temp_water.txt"
+ >>> print(water_cmd)
+ water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
+ >>> water_cmd
+ WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5)
+
+ You would typically run the command line via a standard Python operating
+ system call using the subprocess module for full control. For the simple
+ case where you just want to run the command and get the output:
+
+ stdout, stderr = water_cmd()
+
+    Note that by default we assume the underlying tool is available via the
+    system $PATH environment variable. This is normal under Linux/Unix, but
+ may need to be done manually under Windows. Alternatively, you can specify
+ the full path to the binary as the first argument (cmd):
+
+ >>> from Bio.Emboss.Applications import WaterCommandline
+ >>> water_cmd = WaterCommandline(r"C:\Program Files\EMBOSS\water.exe",
+ ... gapopen=10, gapextend=0.5,
+ ... asequence="asis:ACCCGGGCGCGGT",
+ ... bsequence="asis:ACCCGAGCGCGGT",
+ ... outfile="temp_water.txt")
+ >>> print(water_cmd)
+ "C:\Program Files\EMBOSS\water.exe" -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
+
+ Notice that since the path name includes a space it has automatically
+ been quoted.
+
+ """
+
+ # TODO - Replace the above example since EMBOSS doesn't work properly
+ # if installed into a folder with a space like "C:\Program Files\EMBOSS"
+ #
+ # Note the call example above is not a doctest as we can't handle EMBOSS
+ # (or any other tool) being missing in the unit tests.
+
+ parameters = None # will be a list defined in subclasses
+
+ def __init__(self, cmd, **kwargs):
+ """Create a new instance of a command line wrapper object."""
+ # Init method - should be subclassed!
+ #
+ # The subclass methods should look like this:
+ #
+ # def __init__(self, cmd="muscle", **kwargs):
+ # self.parameters = [...]
+ # AbstractCommandline.__init__(self, cmd, **kwargs)
+ #
+        # i.e. There should be an optional argument "cmd" to set the location
+ # of the executable (with a sensible default which should work if the
+ # command is on the path on Unix), and keyword arguments. It should
+ # then define a list of parameters, all objects derived from the base
+ # class _AbstractParameter.
+ #
+ # The keyword arguments should be any valid parameter name, and will
+ # be used to set the associated parameter.
+ self.program_name = cmd
+ try:
+ parameters = self.parameters
+ except AttributeError:
+ raise AttributeError(
+ "Subclass should have defined self.parameters"
+ ) from None
+ # Create properties for each parameter at run time
+ aliases = set()
+ for p in parameters:
+ if not p.names:
+ if not isinstance(p, _StaticArgument):
+ raise TypeError("Expected %r to be of type _StaticArgument" % p)
+ continue
+ for name in p.names:
+ if name in aliases:
+ raise ValueError("Parameter alias %s multiply defined" % name)
+ aliases.add(name)
+ name = p.names[-1]
+ if _re_prop_name.match(name) is None:
+ raise ValueError(
+ "Final parameter name %r cannot be used as "
+ "an argument or property name in python" % name
+ )
+ if name in _reserved_names:
+ raise ValueError(
+ "Final parameter name %r cannot be used as "
+ "an argument or property name because it is "
+ "a reserved word in python" % name
+ )
+ if name in _local_reserved_names:
+ raise ValueError(
+ "Final parameter name %r cannot be used as "
+ "an argument or property name due to the "
+ "way the AbstractCommandline class works" % name
+ )
+
+ # Beware of binding-versus-assignment confusion issues
+ def getter(name):
+ return lambda x: x._get_parameter(name)
+
+ def setter(name):
+ return lambda x, value: x.set_parameter(name, value)
+
+ def deleter(name):
+ return lambda x: x._clear_parameter(name)
+
+ doc = p.description
+ if isinstance(p, _Switch):
+ doc += (
+ "\n\nThis property controls the addition of the %s "
+ "switch, treat this property as a boolean." % p.names[0]
+ )
+ else:
+ doc += (
+ "\n\nThis controls the addition of the %s parameter "
+ "and its associated value. Set this property to the "
+ "argument value required." % p.names[0]
+ )
+ prop = property(getter(name), setter(name), deleter(name), doc)
+ setattr(self.__class__, name, prop) # magic!
+ for key, value in kwargs.items():
+ self.set_parameter(key, value)
+
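+    # A standalone illustration (not used by this class) of the late-binding
+    # pitfall that the getter/setter/deleter factory functions above avoid:
+    #
+    #     funcs = [lambda: name for name in ("a", "b")]
+    #     [f() for f in funcs]      # ['b', 'b'] - name resolved at call time
+    #     funcs = [(lambda n: (lambda: n))(name) for name in ("a", "b")]
+    #     [f() for f in funcs]      # ['a', 'b'] - value captured per lambda
+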
+ def _validate(self):
+ """Make sure the required parameters have been set (PRIVATE).
+
+ No return value - it either works or raises a ValueError.
+
+ This is a separate method (called from __str__) so that subclasses may
+ override it.
+ """
+ for p in self.parameters:
+ # Check for missing required parameters:
+ if p.is_required and not (p.is_set):
+ raise ValueError("Parameter %s is not set." % p.names[-1])
+ # Also repeat the parameter validation here, just in case?
+
+ def __str__(self):
+ """Make the commandline string with the currently set options.
+
+ e.g.
+
+ >>> from Bio.Emboss.Applications import WaterCommandline
+ >>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
+ >>> cline.asequence = "asis:ACCCGGGCGCGGT"
+ >>> cline.bsequence = "asis:ACCCGAGCGCGGT"
+ >>> cline.outfile = "temp_water.txt"
+ >>> print(cline)
+ water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
+ >>> str(cline)
+ 'water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5'
+ """
+ self._validate()
+ commandline = "%s " % _escape_filename(self.program_name)
+ for parameter in self.parameters:
+ if parameter.is_set:
+ # This will include a trailing space:
+ commandline += str(parameter)
+ return commandline.strip() # remove trailing space
+
+ def __repr__(self):
+ """Return a representation of the command line object for debugging.
+
+ e.g.
+
+ >>> from Bio.Emboss.Applications import WaterCommandline
+ >>> cline = WaterCommandline(gapopen=10, gapextend=0.5)
+ >>> cline.asequence = "asis:ACCCGGGCGCGGT"
+ >>> cline.bsequence = "asis:ACCCGAGCGCGGT"
+ >>> cline.outfile = "temp_water.txt"
+ >>> print(cline)
+ water -outfile=temp_water.txt -asequence=asis:ACCCGGGCGCGGT -bsequence=asis:ACCCGAGCGCGGT -gapopen=10 -gapextend=0.5
+ >>> cline
+ WaterCommandline(cmd='water', outfile='temp_water.txt', asequence='asis:ACCCGGGCGCGGT', bsequence='asis:ACCCGAGCGCGGT', gapopen=10, gapextend=0.5)
+ """
+ answer = "%s(cmd=%r" % (self.__class__.__name__, self.program_name)
+ for parameter in self.parameters:
+ if parameter.is_set:
+ if isinstance(parameter, _Switch):
+ answer += ", %s=True" % parameter.names[-1]
+ else:
+ answer += ", %s=%r" % (parameter.names[-1], parameter.value)
+ answer += ")"
+ return answer
+
+ def _get_parameter(self, name):
+ """Get a commandline option value (PRIVATE)."""
+ for parameter in self.parameters:
+ if name in parameter.names:
+ if isinstance(parameter, _Switch):
+ return parameter.is_set
+ else:
+ return parameter.value
+ raise ValueError("Option name %s was not found." % name)
+
+ def _clear_parameter(self, name):
+ """Reset or clear a commandline option value (PRIVATE)."""
+ cleared_option = False
+ for parameter in self.parameters:
+ if name in parameter.names:
+ parameter.value = None
+ parameter.is_set = False
+ cleared_option = True
+ if not cleared_option:
+ raise ValueError("Option name %s was not found." % name)
+
+ def set_parameter(self, name, value=None):
+ """Set a commandline option for a program (OBSOLETE).
+
+ Every parameter is available via a property and as a named
+ keyword when creating the instance. Using either of these is
+ preferred to this legacy set_parameter method which is now
+ OBSOLETE, and likely to be DEPRECATED and later REMOVED in
+ future releases.
+ """
+ set_option = False
+ for parameter in self.parameters:
+ if name in parameter.names:
+ if isinstance(parameter, _Switch):
+ if value is None:
+ import warnings
+
+ warnings.warn(
+ "For a switch type argument like %s, "
+ "we expect a boolean. None is treated "
+ "as FALSE!" % parameter.names[-1]
+ )
+ parameter.is_set = bool(value)
+ set_option = True
+ else:
+ if value is not None:
+ self._check_value(value, name, parameter.checker_function)
+ parameter.value = value
+ parameter.is_set = True
+ set_option = True
+ if not set_option:
+ raise ValueError("Option name %s was not found." % name)
+
+ def _check_value(self, value, name, check_function):
+ """Check whether the given value is valid (PRIVATE).
+
+ No return value - it either works or raises a ValueError.
+
+ This uses the passed function 'check_function', which can either
+ return a [0, 1] (bad, good) value or raise an error. Either way
+ this function will raise an error if the value is not valid, or
+ finish silently otherwise.
+ """
+ if check_function is not None:
+ is_good = check_function(value) # May raise an exception
+ if is_good not in [0, 1, True, False]:
+ raise ValueError(
+ "Result of check_function: %r is of an unexpected value" % is_good
+ )
+ if not is_good:
+ raise ValueError(
+ "Invalid parameter value %r for parameter %s" % (value, name)
+ )
+
+ def __setattr__(self, name, value):
+ """Set attribute name to value (PRIVATE).
+
+ This code implements a workaround for a user interface issue.
+ Without this __setattr__ attribute-based assignment of parameters
+ will silently accept invalid parameters, leading to known instances
+ of the user assuming that parameters for the application are set,
+ when they are not.
+
+ >>> from Bio.Emboss.Applications import WaterCommandline
+ >>> cline = WaterCommandline(gapopen=10, gapextend=0.5, stdout=True)
+ >>> cline.asequence = "a.fasta"
+ >>> cline.bsequence = "b.fasta"
+ >>> cline.csequence = "c.fasta"
+ Traceback (most recent call last):
+ ...
+ ValueError: Option name csequence was not found.
+ >>> print(cline)
+ water -stdout -asequence=a.fasta -bsequence=b.fasta -gapopen=10 -gapextend=0.5
+
+ This workaround uses a whitelist of object attributes, and sets the
+ object attribute list as normal, for these. Other attributes are
+ assumed to be parameters, and passed to the self.set_parameter method
+ for validation and assignment.
+ """
+ if name in ["parameters", "program_name"]: # Allowed attributes
+ self.__dict__[name] = value
+ else:
+ self.set_parameter(name, value) # treat as a parameter
+
+ def __call__(self, stdin=None, stdout=True, stderr=True, cwd=None, env=None):
+ """Execute command, wait for it to finish, return (stdout, stderr).
+
+ Runs the command line tool and waits for it to finish. If it returns
+ a non-zero error level, an exception is raised. Otherwise two strings
+ are returned containing stdout and stderr.
+
+ The optional stdin argument should be a string of data which will be
+ passed to the tool as standard input.
+
+        The optional stdout and stderr arguments may be filenames (string),
+        but otherwise are treated as booleans, and control whether the output
+ should be captured as strings (True, default), or ignored by sending
+ it to /dev/null to avoid wasting memory (False). If sent to a file
+ or ignored, then empty string(s) are returned.
+
+ The optional cwd argument is a string giving the working directory
+ to run the command from. See Python's subprocess module documentation
+ for more details.
+
+ The optional env argument is a dictionary setting the environment
+ variables to be used in the new process. By default the current
+ process' environment variables are used. See Python's subprocess
+ module documentation for more details.
+
+ Default example usage::
+
+ from Bio.Emboss.Applications import WaterCommandline
+ water_cmd = WaterCommandline(gapopen=10, gapextend=0.5,
+ stdout=True, auto=True,
+ asequence="a.fasta", bsequence="b.fasta")
+ print("About to run: %s" % water_cmd)
+ std_output, err_output = water_cmd()
+
+ This functionality is similar to subprocess.check_output(). In general
+ if you require more control over running the command, use subprocess
+ directly.
+
+ When the program called returns a non-zero error level, a custom
+ ApplicationError exception is raised. This includes any stdout and
+ stderr strings captured as attributes of the exception object, since
+ they may be useful for diagnosing what went wrong.
+ """
+ if not stdout:
+ stdout_arg = open(os.devnull, "w")
+ elif isinstance(stdout, str):
+ stdout_arg = open(stdout, "w")
+ else:
+ stdout_arg = subprocess.PIPE
+
+ if not stderr:
+ stderr_arg = open(os.devnull, "w")
+ elif isinstance(stderr, str):
+ if stdout == stderr:
+ stderr_arg = stdout_arg # Write both to the same file
+ else:
+ stderr_arg = open(stderr, "w")
+ else:
+ stderr_arg = subprocess.PIPE
+
+        # We may not need to supply any piped input, but we set up the
+        # standard input pipe anyway as a workaround for a Python
+ # bug if this is called from a Windows GUI program. For
+ # details, see http://bugs.python.org/issue1124861
+ #
+ # Using universal newlines is important on Python 3, this
+ # gives unicode handles rather than bytes handles.
+
+ # Windows 7, 8, 8.1 and 10 want shell = True
+ if sys.platform != "win32":
+ use_shell = True
+ else:
+ win_ver = platform.win32_ver()[0]
+ if win_ver in ["7", "8", "post2012Server", "10"]:
+ use_shell = True
+ else:
+ use_shell = False
+ child_process = subprocess.Popen(
+ str(self),
+ stdin=subprocess.PIPE,
+ stdout=stdout_arg,
+ stderr=stderr_arg,
+ universal_newlines=True,
+ cwd=cwd,
+ env=env,
+ shell=use_shell,
+ )
+ # Use .communicate as can get deadlocks with .wait(), see Bug 2804
+ stdout_str, stderr_str = child_process.communicate(stdin)
+ if not stdout:
+ assert not stdout_str, stdout_str
+ if not stderr:
+ assert not stderr_str, stderr_str
+ return_code = child_process.returncode
+
+ # Particularly important to close handles on Jython and PyPy
+ # (where garbage collection is less predictable) and on Windows
+ # (where cannot delete files with an open handle):
+ if not stdout or isinstance(stdout, str):
+ # We opened /dev/null or a file
+ stdout_arg.close()
+ if not stderr or (isinstance(stderr, str) and stdout != stderr):
+ # We opened /dev/null or a file
+ stderr_arg.close()
+
+ if return_code:
+ raise ApplicationError(return_code, str(self), stdout_str, stderr_str)
+ return stdout_str, stderr_str
+
+
+class _AbstractParameter:
+ """A class to hold information about a parameter for a commandline.
+
+ Do not use this directly, instead use one of the subclasses.
+ """
+
+ def __init__(self):
+ raise NotImplementedError
+
+ def __str__(self):
+ raise NotImplementedError
+
+
+class _Option(_AbstractParameter):
+ """Represent an option that can be set for a program.
+
+ This holds UNIXish options like --append=yes and -a yes,
+ where a value (here "yes") is generally expected.
+
+ For UNIXish options like -kimura in clustalw which don't
+ take a value, use the _Switch object instead.
+
+ Attributes:
+ - names -- a list of string names (typically two entries) by which
+ the parameter can be set via the legacy set_parameter method
+ (eg ["-a", "--append", "append"]). The first name in list is used
+ when building the command line. The last name in the list is a
+ "human readable" name describing the option in one word. This
+ must be a valid Python identifier as it is used as the property
+ name and as a keyword argument, and should therefore follow PEP8
+ naming.
+ - description -- a description of the option. This is used as
+ the property docstring.
+ - filename -- True if this argument is a filename (or other argument
+ that should be quoted) and should be automatically quoted if it
+ contains spaces.
+ - checker_function -- a reference to a function that will determine
+ if a given value is valid for this parameter. This function can either
+ raise an error when given a bad value, or return a [0, 1] decision on
+ whether the value is correct.
+ - equate -- should an equals sign be inserted if a value is used?
+ - is_required -- a flag to indicate if the parameter must be set for
+ the program to be run.
+ - is_set -- if the parameter has been set
+ - value -- the value of a parameter
+
+ """
+
+ def __init__(
+ self,
+ names,
+ description,
+ filename=False,
+ checker_function=None,
+ is_required=False,
+ equate=True,
+ ):
+ self.names = names
+ if not isinstance(description, str):
+ raise TypeError("Should be a string: %r for %s" % (description, names[-1]))
+ # Note 'filename' is for any string with spaces that needs quoting
+ self.is_filename = filename
+ self.checker_function = checker_function
+ self.description = description
+ self.equate = equate
+ self.is_required = is_required
+
+ self.is_set = False
+ self.value = None
+
+ def __str__(self):
+ """Return the value of this option for the commandline.
+
+ Includes a trailing space.
+ """
+ # Note: Before equate was handled explicitly, the old
+ # code would do either "--name " or "--name=value ",
+ # or " -name " or " -name value ". This choice is now
+ # now made explicitly when setting up the option.
+ if self.value is None:
+ return "%s " % self.names[0]
+ if self.is_filename:
+ v = _escape_filename(self.value)
+ else:
+ v = str(self.value)
+ if self.equate:
+ return "%s=%s " % (self.names[0], v)
+ else:
+ return "%s %s " % (self.names[0], v)
+
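+# For illustration (hypothetical option): an _Option with
+# names=["--append", "append"] and value "yes" renders as "--append=yes "
+# when equate=True, or as "--append yes " when equate=False, matching the
+# two UNIXish styles described in the class docstring.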
+
+class _Switch(_AbstractParameter):
+ """Represent an optional argument switch for a program.
+
+ This holds UNIXish options like -kimura in clustalw which don't
+ take a value, they are either included in the command string
+ or omitted.
+
+ Attributes:
+ - names -- a list of string names (typically two entries) by which
+ the parameter can be set via the legacy set_parameter method
+ (eg ["-a", "--append", "append"]). The first name in list is used
+ when building the command line. The last name in the list is a
+ "human readable" name describing the option in one word. This
+ must be a valid Python identifier as it is used as the property
+ name and as a keyword argument, and should therefore follow PEP8
+ naming.
+ - description -- a description of the option. This is used as
+ the property docstring.
+ - is_set -- if the parameter has been set
+
+    NOTE - There is no value attribute, see is_set instead.
+
+ """
+
+ def __init__(self, names, description):
+ self.names = names
+ self.description = description
+ self.is_set = False
+ self.is_required = False
+
+ def __str__(self):
+ """Return the value of this option for the commandline.
+
+ Includes a trailing space.
+ """
+ assert not hasattr(self, "value")
+ if self.is_set:
+ return "%s " % self.names[0]
+ else:
+ return ""
+
+
+class _Argument(_AbstractParameter):
+ """Represent an argument on a commandline.
+
+ The names argument should be a list containing one string.
+ This must be a valid Python identifier as it is used as the
+ property name and as a keyword argument, and should therefore
+ follow PEP8 naming.
+ """
+
+ def __init__(
+ self,
+ names,
+ description,
+ filename=False,
+ checker_function=None,
+ is_required=False,
+ ):
+ # if len(names) != 1:
+ # raise ValueError("The names argument to _Argument should be a "
+ # "single entry list with a PEP8 property name.")
+ self.names = names
+ if not isinstance(description, str):
+ raise TypeError("Should be a string: %r for %s" % (description, names[-1]))
+ # Note 'filename' is for any string with spaces that needs quoting
+ self.is_filename = filename
+ self.checker_function = checker_function
+ self.description = description
+ self.is_required = is_required
+ self.is_set = False
+ self.value = None
+
+ def __str__(self):
+ if self.value is None:
+ return " "
+ elif self.is_filename:
+ return "%s " % _escape_filename(self.value)
+ else:
+ return "%s " % self.value
+
+
+class _ArgumentList(_Argument):
+ """Represent a variable list of arguments on a command line, e.g. multiple filenames."""
+
+ # TODO - Option to require at least one value? e.g. min/max count?
+
+ def __str__(self):
+ if not isinstance(self.value, list):
+ raise TypeError("Arguments should be a list")
+ if not self.value:
+ raise ValueError("Requires at least one filename")
+ # A trailing space is required so that parameters following the last filename
+ # do not appear merged.
+ # e.g.: samtools cat in1.bam in2.bam-o out.sam [without trailing space][Incorrect]
+ # samtools cat in1.bam in2.bam -o out.sam [with trailing space][Correct]
+ if self.is_filename:
+ return " ".join(_escape_filename(v) for v in self.value) + " "
+ else:
+ return " ".join(self.value) + " "
+
+
+class _StaticArgument(_AbstractParameter):
+ """Represent a static (read only) argument on a commandline.
+
+ This is not intended to be exposed as a named argument or
+ property of a command line wrapper object.
+ """
+
+ def __init__(self, value):
+ self.names = []
+ self.is_required = False
+ self.is_set = True
+ self.value = value
+
+ def __str__(self):
+ return "%s " % self.value
+
+
+def _escape_filename(filename):
+ """Escape filenames with spaces by adding quotes (PRIVATE).
+
+ Note this will not add quotes if they are already included:
+
+ >>> print((_escape_filename('example with spaces')))
+ "example with spaces"
+ >>> print((_escape_filename('"example with spaces"')))
+ "example with spaces"
+ >>> print((_escape_filename(1)))
+ 1
+
+ Note the function is more generic than the name suggests, since it
+ is used to add quotes around any string arguments containing spaces.
+ """
+    # Is adding the following helpful?
+ # if os.path.isfile(filename):
+ # # On Windows, if the file exists, we can ask for
+ # # its alternative short name (DOS style 8.3 format)
+ # # which has no spaces in it. Note that this name
+    #     # is not portable between machines, or even folders!
+ # try:
+ # import win32api
+ # short = win32api.GetShortPathName(filename)
+ # assert os.path.isfile(short)
+ # return short
+ # except ImportError:
+ # pass
+ if not isinstance(filename, str):
+ # for example the NCBI BLAST+ -outfmt argument can be an integer
+ return filename
+ if " " not in filename:
+ return filename
+ # We'll just quote it - works on Windows, Mac OS X etc
+ if filename.startswith('"') and filename.endswith('"'):
+        # It's already quoted
+ return filename
+ else:
+ return '"%s"' % filename
+
+
+def _test():
+ """Run the Bio.Application module's doctests (PRIVATE)."""
+ import doctest
+
+ doctest.testmod(verbose=1)
+
+
+if __name__ == "__main__":
+ # Run the doctests
+ _test()
diff --git a/code/lib/Bio/Application/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Application/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..ee141eb
Binary files /dev/null and b/code/lib/Bio/Application/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/Applications.py b/code/lib/Bio/Blast/Applications.py
new file mode 100644
index 0000000..954a254
--- /dev/null
+++ b/code/lib/Bio/Blast/Applications.py
@@ -0,0 +1,1602 @@
+# Copyright 2001 Brad Chapman.
+# Revisions copyright 2009-2010 by Peter Cock.
+# Revisions copyright 2010 by Phillip Garland.
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Definitions for interacting with BLAST related applications (OBSOLETE).
+
+Wrappers for the new NCBI BLAST+ tools (written in C++):
+
+ - NcbiblastpCommandline - Protein-Protein BLAST
+ - NcbiblastnCommandline - Nucleotide-Nucleotide BLAST
+ - NcbiblastxCommandline - Translated Query-Protein Subject BLAST
+ - NcbitblastnCommandline - Protein Query-Translated Subject BLAST
+ - NcbitblastxCommandline - Translated Query-Translated Subject BLAST
+ - NcbipsiblastCommandline - Position-Specific Initiated BLAST
+ - NcbirpsblastCommandline - Reverse Position Specific BLAST
+ - NcbirpstblastnCommandline - Translated Reverse Position Specific BLAST
+ - NcbideltablastCommandline - Protein-Protein domain enhanced lookup time accelerated blast
+ - NcbiblastformatterCommandline - Convert ASN.1 to other BLAST output formats
+ - NcbimakeblastdbCommandline - Application to create BLAST databases
+
+For further details, see:
+
+Camacho et al. BLAST+: architecture and applications
+BMC Bioinformatics 2009, 10:421
+https://doi.org/10.1186/1471-2105-10-421
+
+We have decided to remove this module in future, and instead recommend
+building your command and invoking it via the subprocess module directly.
+"""
+
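+# A minimal sketch of the recommended subprocess idiom (hypothetical
+# filenames; assumes the blastp binary is on the system $PATH):
+#
+#     import subprocess
+#     subprocess.run(
+#         ["blastp", "-query", "rosemary.pro", "-db", "nr",
+#          "-evalue", "0.001", "-outfmt", "5", "-out", "rosemary.xml"],
+#         check=True,
+#     )
+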
+from Bio.Application import _Option, AbstractCommandline, _Switch
+
+
+class _NcbibaseblastCommandline(AbstractCommandline):
+ """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).
+
+ This is provided for subclassing, it deals with shared options
+    common to all the BLAST tools (blastn, rpsblast, rpstblastn, etc
+ AND blast_formatter).
+ """
+
+ def __init__(self, cmd=None, **kwargs):
+ assert cmd is not None
+ extra_parameters = [
+ # Core:
+ _Switch(
+ ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments."
+ ),
+ _Switch(
+ ["-help", "help"],
+ "Print USAGE, DESCRIPTION and ARGUMENTS description; "
+ "ignore other arguments.",
+ ),
+ _Switch(
+ ["-version", "version"],
+ "Print version number; ignore other arguments.",
+ ),
+ # Output configuration options
+ _Option(
+ ["-out", "out"],
+ "Output file for alignment.",
+ filename=True,
+ equate=False,
+ ),
+ # Formatting options:
+ _Option(
+ ["-outfmt", "outfmt"],
+ "Alignment view. Typically an integer 0-14 but for some "
+ "formats can be named columns like '6 qseqid sseqid'. "
+ "Use 5 for XML output (differs from classic BLAST which "
+ "used 7 for XML).",
+ filename=True, # to ensure spaced inputs are quoted
+ equate=False,
+ ),
+ # TODO - Document and test the column options
+ _Switch(["-show_gis", "show_gis"], "Show NCBI GIs in deflines?"),
+ _Option(
+ ["-num_descriptions", "num_descriptions"],
+ "Number of database sequences to show one-line descriptions for.\n\n"
+ "Integer argument (at least zero). Default is 500. "
+ "See also num_alignments.",
+ equate=False,
+ ),
+ _Option(
+ ["-num_alignments", "num_alignments"],
+ "Number of database sequences to show num_alignments for.\n\n"
+ "Integer argument (at least zero). Default is 200. "
+ "See also num_alignments.",
+ equate=False,
+ ),
+ _Option(
+ ["-line_length", "line_length"],
+ "Line length for formatting alignments "
+ "(integer, at least 1, default 60).\n\n"
+ "Not applicable for outfmt > 4. Added in BLAST+ 2.2.30.",
+ equate=False,
+ ),
+ _Switch(
+ ["-html", "html"], "Produce HTML output? See also the outfmt option."
+ ),
+ # Miscellaneous options
+ _Switch(
+ ["-parse_deflines", "parse_deflines"],
+ "Should the query and subject defline(s) be parsed?",
+ ),
+ ]
+ try:
+ # Insert extra parameters - at the start just in case there
+ # are any arguments which must come last:
+ self.parameters = extra_parameters + self.parameters
+ except AttributeError:
+ # Should we raise an error? The subclass should have set this up!
+ self.parameters = extra_parameters
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate_incompatibilities(self, incompatibles):
+ """Validate parameters for incompatibilities (PRIVATE).
+
+ Used by the _validate method.
+ """
+ for a in incompatibles:
+ if self._get_parameter(a):
+ for b in incompatibles[a]:
+ if self._get_parameter(b):
+ raise ValueError("Options %s and %s are incompatible." % (a, b))
+
+
+class _NcbiblastCommandline(_NcbibaseblastCommandline):
+ """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).
+
+ This is provided for subclassing, it deals with shared options
+    common to all the BLAST tools (blastn, rpsblast, rpstblastn, etc).
+ """
+
+ def __init__(self, cmd=None, **kwargs):
+ assert cmd is not None
+ extra_parameters = [
+ # Input query options:
+ _Option(
+ ["-query", "query"],
+ "The sequence to search with.",
+ filename=True,
+ equate=False,
+ ), # Should this be required?
+ _Option(
+ ["-query_loc", "query_loc"],
+ "Location on the query sequence (Format: start-stop).",
+ equate=False,
+ ),
+ # General search options:
+ _Option(["-db", "db"], "The database to BLAST against.", equate=False),
+ _Option(["-evalue", "evalue"], "Expectation value cutoff.", equate=False),
+ _Option(
+ ["-word_size", "word_size"],
+ "Word size for wordfinder algorithm.\n\nInteger. Minimum 2.",
+ equate=False,
+ ),
+ # BLAST-2-Sequences options:
+ # - see subclass
+ # Formatting options:
+ # - see baseclass
+ # Query filtering options
+ _Option(
+ ["-soft_masking", "soft_masking"],
+ "Apply filtering locations as soft masks (Boolean, Default = true).",
+ equate=False,
+ ),
+ _Switch(
+ ["-lcase_masking", "lcase_masking"],
+ "Use lower case filtering in query and subject sequence(s)?",
+ ),
+ # Restrict search or results
+ _Option(
+ ["-gilist", "gilist"],
+ "Restrict search of database to list of GI's.\n\n"
+ "Incompatible with: negative_gilist, seqidlist, negative_seqidlist, "
+ "remote, subject, subject_loc",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-negative_gilist", "negative_gilist"],
+ "Restrict search of database to everything except the listed GIs.\n\n"
+ "Incompatible with: gilist, seqidlist, remote, subject, subject_loc",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-seqidlist", "seqidlist"],
+ "Restrict search of database to list of SeqID's.\n\n"
+ "Incompatible with: gilist, negative_gilist, remote, subject, "
+ "subject_loc",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-negative_seqidlist", "negative_seqidlist"],
+ "Restrict search of database to everything except listed SeqID's.\n\n"
+ "Incompatible with: gilist, seqidlist, remote, subject, subject_loc",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-entrez_query", "entrez_query"],
+ "Restrict search with the given Entrez query (requires remote).",
+ equate=False,
+ ),
+ _Option(
+ ["-qcov_hsp_perc", "qcov_hsp_perc"],
+ "Percent query coverage per hsp (float, 0 to 100).\n\n"
+ "Added in BLAST+ 2.2.30.",
+ equate=False,
+ ),
+ _Option(
+ ["-max_target_seqs", "max_target_seqs"],
+ "Maximum number of aligned sequences to keep (integer, at least one).",
+ equate=False,
+ ),
+ # Statistical options
+ _Option(
+ ["-dbsize", "dbsize"],
+ "Effective length of the database (integer).",
+ equate=False,
+ ),
+ _Option(
+ ["-searchsp", "searchsp"],
+ "Effective length of the search space (integer).",
+ equate=False,
+ ),
+ _Option(
+ ["-max_hsps_per_subject", "max_hsps_per_subject"],
+ "Override max number of HSPs per subject saved for ungapped searches "
+ "(integer).",
+ equate=False,
+ ),
+ _Option(
+ ["-max_hsps", "max_hsps"],
+ "Set max number of HSPs saved per subject sequence\n\n"
+ "Ddefault 0 means no limit.",
+ equate=False,
+ ),
+ _Switch(["-sum_statistics", "sum_statistics"], "Use sum statistics."),
+ # Is -sum_stats a BLAST+ bug, why not use -sum_statistics switch?
+ _Option(
+ ["-sum_stats", "sum_stats"],
+ "Use sum statistics (boolean).\n\nAdded in BLAST+ 2.2.30.",
+ equate=False,
+ ),
+ # Extension options
+ _Option(
+ ["-xdrop_ungap", "xdrop_ungap"],
+ "X-dropoff value (in bits) for ungapped extensions (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-xdrop_gap", "xdrop_gap"],
+ "X-dropoff value (in bits) for preliminary gapped extensions (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-xdrop_gap_final", "xdrop_gap_final"],
+ "X-dropoff value (in bits) for final gapped alignment (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-window_size", "window_size"],
+ "Multiple hits window size, use 0 to specify 1-hit algorithm "
+ "(integer).",
+ equate=False,
+ ),
+ # Search strategy options
+ _Option(
+ ["-import_search_strategy", "import_search_strategy"],
+ "Search strategy to use.\n\n"
+ "Incompatible with: export_search_strategy",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-export_search_strategy", "export_search_strategy"],
+ "File name to record the search strategy used.\n\n"
+ "Incompatible with: import_search_strategy",
+ filename=True,
+ equate=False,
+ ),
+ # Miscellaneous options
+ _Option(
+ ["-num_threads", "num_threads"],
+ "Number of threads to use in the BLAST search.\n\n"
+ "Integer, at least one. Default is one. Incompatible with: remote",
+ equate=False,
+ ),
+ _Switch(
+ ["-remote", "remote"],
+ "Execute search remotely?\n\n"
+ "Incompatible with: gilist, negative_gilist, subject_loc, "
+ "num_threads, ...",
+ ),
+ ]
+ try:
+ # Insert extra parameters - at the start just in case there
+ # are any arguments which must come last:
+ self.parameters = extra_parameters + self.parameters
+ except AttributeError:
+ # Should we raise an error? The subclass should have set this up!
+ self.parameters = extra_parameters
+ _NcbibaseblastCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ incompatibles = {
+ "remote": ["gilist", "negative_gilist", "num_threads"],
+ "import_search_strategy": ["export_search_strategy"],
+ "gilist": ["negative_gilist"],
+ "seqidlist": ["gilist", "negative_gilist", "remote"],
+ }
+ self._validate_incompatibilities(incompatibles)
+ if self.entrez_query and not self.remote:
+ raise ValueError("Option entrez_query requires remote option.")
+ AbstractCommandline._validate(self)
+
+
+class _Ncbiblast2SeqCommandline(_NcbiblastCommandline):
+ """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).
+
+ This is provided for subclassing, it deals with shared options
+ common to all the BLAST tools supporting two-sequence BLAST
+ (blastn, psiblast, etc) but not rpsblast or rpstblastn.
+ """
+
+ def __init__(self, cmd=None, **kwargs):
+ assert cmd is not None
+ extra_parameters = [
+ # General search options:
+ _Option(
+ ["-gapopen", "gapopen"], "Cost to open a gap (integer).", equate=False
+ ),
+ _Option(
+ ["-gapextend", "gapextend"],
+ "Cost to extend a gap (integer).",
+ equate=False,
+ ),
+ # BLAST-2-Sequences options:
+ _Option(
+ ["-subject", "subject"],
+ "Subject sequence(s) to search.\n\n"
+ "Incompatible with: db, gilist, seqidlist, negative_gilist, "
+ "negative_seqidlist, db_soft_mask, db_hard_mask\n\n"
+ "See also subject_loc.",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-subject_loc", "subject_loc"],
+ "Location on the subject sequence (Format: start-stop).\n\n"
+ "Incompatible with: db, gilist, seqidlist, negative_gilist, "
+ "negative_seqidlist, db_soft_mask, db_hard_mask, remote.\n\n"
+ "See also subject.",
+ equate=False,
+ ),
+ # Restrict search or results:
+ _Option(
+ ["-culling_limit", "culling_limit"],
+ "Hit culling limit (integer).\n\n"
+ "If the query range of a hit is enveloped by that of at "
+ "least this many higher-scoring hits, delete the hit.\n\n"
+ "Incompatible with: best_hit_overhang, best_hit_score_edge.",
+ equate=False,
+ ),
+ _Option(
+ ["-best_hit_overhang", "best_hit_overhang"],
+ "Best Hit algorithm overhang value (float, recommended value: 0.1)\n\n"
+ "Float between 0.0 and 0.5 inclusive. "
+ "Incompatible with: culling_limit.",
+ equate=False,
+ ),
+ _Option(
+ ["-best_hit_score_edge", "best_hit_score_edge"],
+ "Best Hit algorithm score edge value (float).\n\n"
+ "Float between 0.0 and 0.5 inclusive. Recommended value: 0.1\n\n"
+ "Incompatible with: culling_limit.",
+ equate=False,
+ ),
+ ]
+ try:
+ # Insert extra parameters - at the start just in case there
+ # are any arguments which must come last:
+ self.parameters = extra_parameters + self.parameters
+ except AttributeError:
+ # Should we raise an error? The subclass should have set this up!
+ self.parameters = extra_parameters
+ _NcbiblastCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ incompatibles = {
+ "subject_loc": ["db", "gilist", "negative_gilist", "seqidlist", "remote"],
+ "culling_limit": ["best_hit_overhang", "best_hit_score_edge"],
+ "subject": ["db", "gilist", "negative_gilist", "seqidlist"],
+ }
+ self._validate_incompatibilities(incompatibles)
+ _NcbiblastCommandline._validate(self)
+
+
+class _NcbiblastMain2SeqCommandline(_Ncbiblast2SeqCommandline):
+ """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE).
+
+ This is provided for subclassing, it deals with shared options
+ common to the main BLAST tools blastp, blastn, blastx, tblastx, tblastn
+ but not psiblast, rpsblast or rpstblastn.
+ """
+
+ def __init__(self, cmd=None, **kwargs):
+ assert cmd is not None
+ extra_parameters = [
+ # Restrict search or results:
+ _Option(
+ ["-db_soft_mask", "db_soft_mask"],
+ "Filtering algorithm for soft masking (integer).\n\n"
+ "Filtering algorithm ID to apply to BLAST database as soft masking. "
+ "Incompatible with: db_hard_mask, subject, subject_loc",
+ equate=False,
+ ),
+ _Option(
+ ["-db_hard_mask", "db_hard_mask"],
+ "Filtering algorithm for hard masking (integer).\n\n"
+ "Filtering algorithm ID to apply to BLAST database as hard masking. "
+ "Incompatible with: db_soft_mask, subject, subject_loc",
+ equate=False,
+ ),
+ ]
+ try:
+ # Insert extra parameters - at the start just in case there
+ # are any arguments which must come last:
+ self.parameters = extra_parameters + self.parameters
+ except AttributeError:
+ # Should we raise an error? The subclass should have set this up!
+ self.parameters = extra_parameters
+ _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ incompatibles = {
+ "db_soft_mask": ["db_hard_mask", "subject", "subject_loc"],
+ "db_hard_mask": ["db_soft_mask", "subject", "subject_loc"],
+ }
+ self._validate_incompatibilities(incompatibles)
+ _Ncbiblast2SeqCommandline._validate(self)
+
+
+class NcbiblastpCommandline(_NcbiblastMain2SeqCommandline):
+ """Create a commandline for the NCBI BLAST+ program blastp (for proteins).
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old blastall tool with separate tools for each of the searches.
+ This wrapper therefore replaces BlastallCommandline with option -p blastp.
+
+ >>> from Bio.Blast.Applications import NcbiblastpCommandline
+ >>> cline = NcbiblastpCommandline(query="rosemary.pro", db="nr",
+ ... evalue=0.001, remote=True, ungapped=True)
+ >>> cline
+ NcbiblastpCommandline(cmd='blastp', query='rosemary.pro', db='nr', evalue=0.001, remote=True, ungapped=True)
+ >>> print(cline)
+ blastp -query rosemary.pro -db nr -evalue 0.001 -remote -ungapped
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="blastp", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # General search options:
+ _Option(
+ ["-task", "task"],
+ "Task to execute (string, blastp (default), blastp-fast or blastp-short).",
+ checker_function=lambda value: value
+ in ["blastp", "blastp-fast", "blastp-short"],
+ equate=False,
+ ),
+ _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."),
+ _Option(
+ ["-threshold", "threshold"],
+ "Minimum score for words to be added to the BLAST lookup table (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics (string, default 2, i.e. True).\n\n"
+ "0, F or f: no composition-based statistics\n\n"
+ "2, T or t, D or d : Composition-based score adjustment as in "
+ "Bioinformatics 21:902-911, 2005, conditioned on sequence "
+ "properties\n\n"
+ "Note that tblastn also supports values of 1 and 3.",
+ checker_function=lambda value: value in "0Ft2TtDd",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable\n'
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # Extension options:
+ _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+ # Miscellaneous options:
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ ]
+ _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbiblastnCommandline(_NcbiblastMain2SeqCommandline):
+ """Wrapper for the NCBI BLAST+ program blastn (for nucleotides).
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old blastall tool with separate tools for each of the searches.
+ This wrapper therefore replaces BlastallCommandline with option -p blastn.
+
+ For example, to run a search against the "nt" nucleotide database using the
+ FASTA nucleotide file "m_cold.fasta" as the query, with an expectation value
+ cut off of 0.001, saving the output to a file in XML format:
+
+ >>> from Bio.Blast.Applications import NcbiblastnCommandline
+ >>> cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt", strand="plus",
+ ... evalue=0.001, out="m_cold.xml", outfmt=5)
+ >>> cline
+ NcbiblastnCommandline(cmd='blastn', out='m_cold.xml', outfmt=5, query='m_cold.fasta', db='nt', evalue=0.001, strand='plus')
+ >>> print(cline)
+ blastn -out m_cold.xml -outfmt 5 -query m_cold.fasta -db nt -evalue 0.001 -strand plus
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="blastn", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # Input query options:
+ _Option(
+ ["-strand", "strand"],
+ "Query strand(s) to search against database/subject.\n\n"
+ 'Values allowed are "both" (default), "minus", "plus".',
+ checker_function=lambda value: value in ["both", "minus", "plus"],
+ equate=False,
+ ),
+ # General search options:
+ _Option(
+ ["-task", "task"],
+ "Task to execute (string, default 'megablast')\n\n"
+ "Allowed values 'blastn', 'blastn-short', 'dc-megablast', 'megablast' "
+ "(the default), or 'vecscreen'.",
+ checker_function=lambda value: value
+ in ["blastn", "blastn-short", "dc-megablast", "megablast", "vecscreen"],
+ equate=False,
+ ),
+ _Option(
+ ["-penalty", "penalty"],
+ "Penalty for a nucleotide mismatch (integer, at most zero).",
+ equate=False,
+ ),
+ _Option(
+ ["-reward", "reward"],
+ "Reward for a nucleotide match (integer, at least zero).",
+ equate=False,
+ ),
+ _Option(
+ ["-use_index", "use_index"],
+ "Use MegaBLAST database index (Boolean, Default = False)",
+ equate=False,
+ ),
+ _Option(
+ ["-index_name", "index_name"],
+ "MegaBLAST database index name.",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-dust", "dust"],
+ "Filter query sequence with DUST (string).\n\n"
+ "Format: 'yes', 'level window linker', or 'no' to disable.\n\n"
+ "Default = '20 64 1'.",
+ equate=False,
+ ),
+ _Option(
+ ["-filtering_db", "filtering_db"],
+ "BLAST database containing filtering elements (i.e. repeats).",
+ equate=False,
+ ),
+ _Option(
+ ["-window_masker_taxid", "window_masker_taxid"],
+ "Enable WindowMasker filtering using a Taxonomic ID (integer).",
+ equate=False,
+ ),
+ _Option(
+ ["-window_masker_db", "window_masker_db"],
+ "Enable WindowMasker filtering using this repeats database (string).",
+ equate=False,
+ ),
+ # Restrict search or results:
+ _Option(
+ ["-perc_identity", "perc_identity"],
+ "Percent identity (real, 0 to 100 inclusive).",
+ equate=False,
+ ),
+ # Discontiguous MegaBLAST options
+ _Option(
+ ["-template_type", "template_type"],
+ "Discontiguous MegaBLAST template type (string).\n\n"
+ "Allowed values: 'coding', 'coding_and_optimal' or 'optimal'.\n"
+ "Requires: template_length.",
+ checker_function=lambda value: value
+ in ["coding", "coding_and_optimal", "optimal"],
+ equate=False,
+ ),
+ _Option(
+ ["-template_length", "template_length"],
+ "Discontiguous MegaBLAST template length (integer).\n\n"
+ "Allowed values: 16, 18, 21.\n\n"
+ "Requires: template_type.",
+ checker_function=lambda value: value in [16, 18, 21, "16", "18", "21"],
+ equate=False,
+ ),
+ # Extension options:
+ _Switch(
+ ["-no_greedy", "no_greedy"],
+ "Use non-greedy dynamic programming extension",
+ ),
+ _Option(
+ ["-min_raw_gapped_score", "min_raw_gapped_score"],
+ "Minimum raw gapped score to keep an alignment in the "
+ "preliminary gapped and traceback stages (integer).",
+ equate=False,
+ ),
+ _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+ _Option(
+ ["-off_diagonal_range", "off_diagonal_range"],
+ "Number of off-diagonals to search for the 2nd hit (integer).\n\n"
+ "Expects a positive integer, or 0 (default) to turn off."
+ "Added in BLAST 2.2.23+",
+ equate=False,
+ ),
+ ]
+ _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ if (self.template_type and not self.template_length) or (
+ self.template_length and not self.template_type
+ ):
+ raise ValueError(
+ "Options template_type and template_type require each other."
+ )
+ _NcbiblastMain2SeqCommandline._validate(self)
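+ # Illustrative sketch (not part of the upstream file): the discontiguous
+ # MegaBLAST options above must be supplied together, e.g.
+ #
+ #     cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt",
+ #                                   task="dc-megablast",
+ #                                   template_type="coding",
+ #                                   template_length=18)
+ #
+ # Supplying only one of the pair makes _validate() raise a ValueError.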
+
+
+class NcbiblastxCommandline(_NcbiblastMain2SeqCommandline):
+ """Wrapper for the NCBI BLAST+ program blastx (nucleotide query, protein database).
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old blastall tool with separate tools for each of the searches.
+ This wrapper therefore replaces BlastallCommandline with option -p blastx.
+
+ >>> from Bio.Blast.Applications import NcbiblastxCommandline
+ >>> cline = NcbiblastxCommandline(query="m_cold.fasta", db="nr", evalue=0.001)
+ >>> cline
+ NcbiblastxCommandline(cmd='blastx', query='m_cold.fasta', db='nr', evalue=0.001)
+ >>> print(cline)
+ blastx -query m_cold.fasta -db nr -evalue 0.001
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="blastx", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # Input query options:
+ _Option(
+ ["-task", "task"],
+ "Task to execute (string, blastx (default) or blastx-fast).",
+ checker_function=lambda value: value in ["blastx", "blastx-fast"],
+ equate=False,
+ ),
+ _Option(
+ ["-strand", "strand"],
+ "Query strand(s) to search against database/subject.\n\n"
+ 'Values allowed are "both" (default), "minus", "plus".',
+ checker_function=lambda value: value in ["both", "minus", "plus"],
+ equate=False,
+ ),
+ # Input query options:
+ _Option(
+ ["-query_gencode", "query_gencode"],
+ "Genetic code to use to translate query (integer, default 1).",
+ equate=False,
+ ),
+ # General search options:
+ _Option(
+ ["-frame_shift_penalty", "frame_shift_penalty"],
+ "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n"
+ "This was removed in BLAST 2.2.27+",
+ equate=False,
+ ),
+ _Option(
+ ["-max_intron_length", "max_intron_length"],
+ "Maximum intron length (integer).\n\n"
+ "Length of the largest intron allowed in a translated nucleotide "
+ "sequence when linking multiple distinct alignments (a negative "
+ "value disables linking). Default zero.",
+ equate=False,
+ ),
+ _Option(
+ ["-matrix", "matrix"],
+ "Scoring matrix name (default BLOSUM62).",
+ equate=False,
+ ),
+ _Option(
+ ["-threshold", "threshold"],
+ "Minimum score for words to be added to the BLAST lookup table (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics for blastp, blastx, or tblastn.\n\n"
+ "D or d: default (equivalent to 2 )\n\n"
+ "0 or F or f: no composition-based statistics\n\n"
+ "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n"
+ "2 or T or t : Composition-based score adjustment as in "
+ "Bioinformatics 21:902-911, 2005, conditioned on sequence "
+ "properties\n\n"
+ "3: Composition-based score adjustment as in Bioinformatics "
+ "21:902-911, 2005, unconditionally.\n\n"
+ "For programs other than tblastn, must either be absent or be "
+ "D, F or 0\n\n"
+ "Default = 2.",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable.'
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # Extension options:
+ _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ ]
+ _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbitblastnCommandline(_NcbiblastMain2SeqCommandline):
+ """Wrapper for the NCBI BLAST+ program tblastn.
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old blastall tool with separate tools for each of the searches.
+ This wrapper therefore replaces BlastallCommandline with option -p tblastn.
+
+ >>> from Bio.Blast.Applications import NcbitblastnCommandline
+ >>> cline = NcbitblastnCommandline(help=True)
+ >>> cline
+ NcbitblastnCommandline(cmd='tblastn', help=True)
+ >>> print(cline)
+ tblastn -help
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="tblastn", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # General search options:
+ _Option(
+ ["-task", "task"],
+ "Task to execute (string, tblastn (default) or tblastn-fast).",
+ checker_function=lambda value: value in ["tblastn", "tblastn-fast"],
+ equate=False,
+ ),
+ _Option(
+ ["-db_gencode", "db_gencode"],
+ "Genetic code to use to translate query (integer, default 1).",
+ equate=False,
+ ),
+ _Option(
+ ["-frame_shift_penalty", "frame_shift_penalty"],
+ "Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE).\n\n"
+ "This was removed in BLAST 2.2.27+",
+ equate=False,
+ ),
+ _Option(
+ ["-max_intron_length", "max_intron_length"],
+ "Maximum intron length (integer).\n\n"
+ "Length of the largest intron allowed in a translated nucleotide "
+ "sequence when linking multiple distinct alignments (a negative "
+ "value disables linking). Default zero.",
+ equate=False,
+ ),
+ _Option(
+ ["-matrix", "matrix"],
+ "Scoring matrix name (default BLOSUM62).",
+ equate=False,
+ ),
+ _Option(
+ ["-threshold", "threshold"],
+ "Minimum score for words to be added to the BLAST lookup table (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics (string, default 2, i.e. True).\n\n"
+ "0, F or f: no composition-based statistics\n\n"
+ "1: Composition-based statistics as in NAR 29:2994-3005, 2001\n\n"
+ "2, T or t, D or d : Composition-based score adjustment as in "
+ "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
+ "3: Composition-based score adjustment as in Bioinformatics 21:902-911, "
+ "2005, unconditionally\n\n"
+ "Note that only tblastn supports values of 1 and 3.",
+ checker_function=lambda value: value in "0Ft12TtDd3",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # Extension options:
+ _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+ # Miscellaneous options:
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ # PSI-TBLASTN options:
+ _Option(
+ ["-in_pssm", "in_pssm"],
+ "PSI-BLAST checkpoint file.\n\nIncompatible with: remote, query",
+ filename=True,
+ equate=False,
+ ),
+ ]
+ _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbitblastxCommandline(_NcbiblastMain2SeqCommandline):
+ """Wrapper for the NCBI BLAST+ program tblastx.
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old blastall tool with separate tools for each of the searches.
+ This wrapper therefore replaces BlastallCommandline with option -p tblastx.
+
+ >>> from Bio.Blast.Applications import NcbitblastxCommandline
+ >>> cline = NcbitblastxCommandline(help=True)
+ >>> cline
+ NcbitblastxCommandline(cmd='tblastx', help=True)
+ >>> print(cline)
+ tblastx -help
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="tblastx", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # Input query options:
+ _Option(
+ ["-strand", "strand"],
+ "Query strand(s) to search against database/subject.\n\n"
+ 'Values allowed are "both" (default), "minus", "plus".',
+ checker_function=lambda value: value in ["both", "minus", "plus"],
+ equate=False,
+ ),
+ # Input query options:
+ _Option(
+ ["-query_gencode", "query_gencode"],
+ "Genetic code to use to translate query (integer, default 1).",
+ equate=False,
+ ),
+ # General search options:
+ _Option(
+ ["-db_gencode", "db_gencode"],
+ "Genetic code to use to translate query (integer, default 1).",
+ equate=False,
+ ),
+ _Option(
+ ["-max_intron_length", "max_intron_length"],
+ "Maximum intron length (integer).\n\n"
+ "Length of the largest intron allowed in a translated nucleotide "
+ "sequence when linking multiple distinct alignments (a negative "
+ "value disables linking). Default zero.",
+ equate=False,
+ ),
+ _Option(
+ ["-matrix", "matrix"],
+ "Scoring matrix name (default BLOSUM62).",
+ equate=False,
+ ),
+ _Option(
+ ["-threshold", "threshold"],
+ "Minimum score for words to be added to the BLAST lookup table (float).",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable.\n\n'
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ ]
+ _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbipsiblastCommandline(_Ncbiblast2SeqCommandline):
+ """Wrapper for the NCBI BLAST+ program psiblast.
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old blastpgp tool with a similar tool psiblast. This wrapper
+ therefore replaces BlastpgpCommandline, the wrapper for blastpgp.
+
+ >>> from Bio.Blast.Applications import NcbipsiblastCommandline
+ >>> cline = NcbipsiblastCommandline(help=True)
+ >>> cline
+ NcbipsiblastCommandline(cmd='psiblast', help=True)
+ >>> print(cline)
+ psiblast -help
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="psiblast", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # General search options:
+ _Option(
+ ["-matrix", "matrix"],
+ "Scoring matrix name (default BLOSUM62).",
+ equate=False,
+ ),
+ _Option(
+ ["-threshold", "threshold"],
+ "Minimum score for words to be added to the BLAST lookup table (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics (string, default 2, i.e. True).\n\n"
+ "0, F or f: no composition-based statistics\n\n"
+ "2, T or t, D or d : Composition-based score adjustment as in "
+ "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
+ "Note that tblastn also supports values of 1 and 3.",
+ checker_function=lambda value: value in "0Ft2TtDd",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable. '
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # Extension options:
+ _Option(
+ ["-gap_trigger", "gap_trigger"],
+ "Number of bits to trigger gapping (float, default 22).",
+ equate=False,
+ ),
+ # Miscellaneous options:
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ # PSI-BLAST options:
+ _Option(
+ ["-num_iterations", "num_iterations"],
+ "Number of iterations to perform (integer, at least one).\n\n"
+ "Default is one. Incompatible with: remote",
+ equate=False,
+ ),
+ _Option(
+ ["-out_pssm", "out_pssm"],
+ "File name to store checkpoint file.",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-out_ascii_pssm", "out_ascii_pssm"],
+ "File name to store ASCII version of PSSM.",
+ filename=True,
+ equate=False,
+ ),
+ _Switch(
+ ["-save_pssm_after_last_round", "save_pssm_after_last_round"],
+ "Save PSSM after the last database search.",
+ ),
+ _Switch(
+ ["-save_each_pssm", "save_each_pssm"],
+ "Save PSSM after each iteration\n\n"
+ "File name is given in -save_pssm or -save_ascii_pssm options.",
+ ),
+ _Option(
+ ["-in_msa", "in_msa"],
+ "File name of multiple sequence alignment to restart PSI-BLAST.\n\n"
+ "Incompatible with: in_pssm, query",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-msa_master_idx", "msa_master_idx"],
+ "Index of sequence to use as master in MSA.\n\n"
+ "Index (1-based) of sequence to use as the master in the multiple "
+ "sequence alignment. If not specified, the first sequence is used.",
+ equate=False,
+ ),
+ _Option(
+ ["-in_pssm", "in_pssm"],
+ "PSI-BLAST checkpoint file.\n\n"
+ "Incompatible with: in_msa, query, phi_pattern",
+ filename=True,
+ equate=False,
+ ),
+ # PSSM engine options:
+ _Option(
+ ["-pseudocount", "pseudocount"],
+ "Pseudo-count value used when constructing PSSM.\n\n"
+ "Integer. Default is zero.",
+ equate=False,
+ ),
+ _Option(
+ ["-inclusion_ethresh", "inclusion_ethresh"],
+ "E-value inclusion threshold for pairwise alignments (float, default 0.002).",
+ equate=False,
+ ),
+ _Switch(
+ ["-ignore_msa_master", "ignore_msa_master"],
+ "Ignore the master sequence when creating PSSM.\n\n"
+ "Requires: in_msa\n"
+ "Incompatible with: msa_master_idx, in_pssm, query, query_loc, "
+ "phi_pattern",
+ ),
+ # PHI-BLAST options:
+ _Option(
+ ["-phi_pattern", "phi_pattern"],
+ "File name containing pattern to search.\n\n"
+ "Incompatible with: in_pssm",
+ filename=True,
+ equate=False,
+ ),
+ ]
+ _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ incompatibles = {
+ "num_iterations": ["remote"],
+ "in_msa": ["in_pssm", "query"],
+ "in_pssm": ["in_msa", "query", "phi_pattern"],
+ "ignore_msa_master": [
+ "msa_master_idx",
+ "in_pssm",
+ "query",
+ "query_loc",
+ "phi_pattern",
+ ],
+ }
+ self._validate_incompatibilities(incompatibles)
+ _Ncbiblast2SeqCommandline._validate(self)
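+ # Illustrative sketch (not part of the upstream file): a multi-iteration
+ # PSI-BLAST search saving the final PSSM; file names here are hypothetical:
+ #
+ #     cline = NcbipsiblastCommandline(query="rosemary.pro", db="nr",
+ #                                     num_iterations=3,
+ #                                     out_pssm="rosemary.pssm")
+ #
+ # Note that num_iterations is incompatible with remote, as enforced above.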
+
+
+class NcbirpsblastCommandline(_NcbiblastCommandline):
+ """Wrapper for the NCBI BLAST+ program rpsblast.
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old rpsblast tool with a similar tool of the same name. This
+ wrapper replaces RpsBlastCommandline, the wrapper for the old rpsblast.
+
+ >>> from Bio.Blast.Applications import NcbirpsblastCommandline
+ >>> cline = NcbirpsblastCommandline(help=True)
+ >>> cline
+ NcbirpsblastCommandline(cmd='rpsblast', help=True)
+ >>> print(cline)
+ rpsblast -help
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="rpsblast", **kwargs):
+ """Initialize the class."""
+ # TODO - remove the -word_size argument as per BLAST+ 2.2.30
+ # (BLAST team say it should never have been included, since
+ # the word size is set when building the domain database.)
+ # This likely means reviewing the class hierarchy again.
+ self.parameters = [
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable.'
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # Restrict search or results:
+ _Option(
+ ["-culling_limit", "culling_limit"],
+ "Hit culling limit (integer).\n\n"
+ "If the query range of a hit is enveloped by that of at "
+ "least this many higher-scoring hits, delete the hit. "
+ "Incompatible with: best_hit_overhang, best_hit_score_edge.",
+ equate=False,
+ ),
+ _Option(
+ ["-best_hit_overhang", "best_hit_overhang"],
+ "Best Hit algorithm overhang value (recommended value: 0.1).\n\n"
+ "Float between 0.0 and 0.5 inclusive. "
+ "Incompatible with: culling_limit.",
+ equate=False,
+ ),
+ _Option(
+ ["-best_hit_score_edge", "best_hit_score_edge"],
+ "Best Hit algorithm score edge value (recommended value: 0.1).\n\n"
+ "Float between 0.0 and 0.5 inclusive. "
+ "Incompatible with: culling_limit.",
+ equate=False,
+ ),
+ # General search options:
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics.\n\n"
+ "D or d: default (equivalent to 0)\n\n"
+ "0 or F or f: Simplified Composition-based statistics as in "
+ "Bioinformatics 15:1000-1011, 1999\n\n"
+ "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, "
+ "2001\n\n"
+ "Default = 0.",
+ checker_function=lambda value: value in "Dd0Ff1Tt",
+ equate=False,
+ ),
+ # Misc options:
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ ]
+ _NcbiblastCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ incompatibles = {"culling_limit": ["best_hit_overhang", "best_hit_score_edge"]}
+ self._validate_incompatibilities(incompatibles)
+ _NcbiblastCommandline._validate(self)
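+ # Illustrative sketch (not part of the upstream file): culling_limit cannot
+ # be combined with the Best Hit options, as enforced above. For example,
+ #
+ #     cline = NcbirpsblastCommandline(query="rosemary.pro", db="Cdd",
+ #                                     culling_limit=2)
+ #
+ # is accepted, but adding best_hit_overhang=0.1 raises a ValueError.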
+
+
+class NcbirpstblastnCommandline(_NcbiblastCommandline):
+ """Wrapper for the NCBI BLAST+ program rpstblastn.
+
+ With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI
+ replaced the old rpsblast tool with a similar tool of the same name, and a
+ separate tool rpstblastn for Translated Reverse Position Specific BLAST.
+
+ >>> from Bio.Blast.Applications import NcbirpstblastnCommandline
+ >>> cline = NcbirpstblastnCommandline(help=True)
+ >>> cline
+ NcbirpstblastnCommandline(cmd='rpstblastn', help=True)
+ >>> print(cline)
+ rpstblastn -help
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="rpstblastn", **kwargs):
+ """Initialize the class."""
+ # TODO - remove the -word_size argument as per BLAST+ 2.2.30
+ # (BLAST team say it should never have been included, since
+ # the word size is set when building the domain database.)
+ # This likely means reviewing the class hierarchy again.
+ self.parameters = [
+ # Input query options:
+ _Option(
+ ["-strand", "strand"],
+ "Query strand(s) to search against database/subject.\n\n"
+ 'Values allowed are "both" (default), "minus", "plus".',
+ checker_function=lambda value: value in ["both", "minus", "plus"],
+ equate=False,
+ ),
+ # Input query options:
+ _Option(
+ ["-query_gencode", "query_gencode"],
+ "Genetic code to use to translate query (integer, default 1).",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable. '
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # General search options:
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics.\n\n"
+ "D or d: default (equivalent to 0)\n\n"
+ "0 or F or f: Simplified Composition-based statistics as in "
+ "Bioinformatics 15:1000-1011, 1999\n\n"
+ "1 or T or t: Composition-based statistics as in NAR 29:2994-3005, "
+ "2001\n\n"
+ "Default = 0.",
+ checker_function=lambda value: value in "Dd0Ff1Tt",
+ equate=False,
+ ),
+ # Extension options:
+ _Switch(["-ungapped", "ungapped"], "Perform ungapped alignment only?"),
+ # Miscellaneous options:
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ ]
+ _NcbiblastCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbiblastformatterCommandline(_NcbibaseblastCommandline):
+ """Wrapper for the NCBI BLAST+ program blast_formatter.
+
+ With the release of BLAST 2.2.24+ (i.e. the BLAST suite rewritten in C++
+ instead of C), the NCBI added the ASN.1 output format option to all the
+ search tools, and extended the blast_formatter to support this as input.
+
+ The blast_formatter command allows you to convert the ASN.1 output into
+ the other output formats (XML, tabular, plain text, HTML).
+
+ >>> from Bio.Blast.Applications import NcbiblastformatterCommandline
+ >>> cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=5, out="example.xml")
+ >>> cline
+ NcbiblastformatterCommandline(cmd='blast_formatter', out='example.xml', outfmt=5, archive='example.asn')
+ >>> print(cline)
+ blast_formatter -out example.xml -outfmt 5 -archive example.asn
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+
+ Note that this wrapper is for the version of blast_formatter from BLAST
+ 2.2.24+ (or later), which is when the NCBI first announced the inclusion
+ of this tool. There was actually an early version in BLAST 2.2.23+ (and
+ possibly in older releases) but that version did not have the -archive
+ option (instead -rid was a mandatory argument), and is not supported by this
+ wrapper.
+ """
+
+ def __init__(self, cmd="blast_formatter", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # Input options
+ _Option(
+ ["-rid", "rid"],
+ "BLAST Request ID (RID), not compatible with archive arg.",
+ equate=False,
+ ),
+ _Option(
+ ["-archive", "archive"],
+ "Archive file of results, not compatible with rid arg.",
+ filename=True,
+ equate=False,
+ ),
+ # Restrict search or results
+ _Option(
+ ["-max_target_seqs", "max_target_seqs"],
+ "Maximum number of aligned sequences to keep.",
+ checker_function=lambda value: value >= 1,
+ equate=False,
+ ),
+ ]
+ _NcbibaseblastCommandline.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ incompatibles = {"rid": ["archive"]}
+ self._validate_incompatibilities(incompatibles)
+ _NcbibaseblastCommandline._validate(self)
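+ # Illustrative sketch (not part of the upstream file): reformatting a saved
+ # ASN.1 archive ("example.asn", hypothetical) into tabular output (outfmt 6):
+ #
+ #     cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=6,
+ #                                           out="example.tsv")
+ #     stdout, stderr = cline()
+ #
+ # Passing both rid and archive raises a ValueError, as enforced above.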
+
+
+class NcbideltablastCommandline(_Ncbiblast2SeqCommandline):
+ """Create a commandline for the NCBI BLAST+ program deltablast (for proteins).
+
+ This is a wrapper for the deltablast command line command included in
+ the NCBI BLAST+ software (not present in the original BLAST).
+
+ >>> from Bio.Blast.Applications import NcbideltablastCommandline
+ >>> cline = NcbideltablastCommandline(query="rosemary.pro", db="nr",
+ ... evalue=0.001, remote=True)
+ >>> cline
+ NcbideltablastCommandline(cmd='deltablast', query='rosemary.pro', db='nr', evalue=0.001, remote=True)
+ >>> print(cline)
+ deltablast -query rosemary.pro -db nr -evalue 0.001 -remote
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="deltablast", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # General search options:
+ _Option(["-matrix", "matrix"], "Scoring matrix name (default BLOSUM62)."),
+ _Option(
+ ["-threshold", "threshold"],
+ "Minimum score for words to be added to the BLAST lookup table (float).",
+ equate=False,
+ ),
+ _Option(
+ ["-comp_based_stats", "comp_based_stats"],
+ "Use composition-based statistics (string, default 2, i.e. True).\n\n"
+ "0, F or f: no composition-based statistics.\n\n"
+ "2, T or t, D or d : Composition-based score adjustment as in "
+ "Bioinformatics 21:902-911, 2005, conditioned on sequence properties\n\n"
+ "Note that tblastn also supports values of 1 and 3.",
+ checker_function=lambda value: value in "0Ft2TtDd",
+ equate=False,
+ ),
+ # Query filtering options:
+ _Option(
+ ["-seg", "seg"],
+ "Filter query sequence with SEG (string).\n\n"
+ 'Format: "yes", "window locut hicut", or "no" to disable. '
+ 'Default is "12 2.2 2.5"',
+ equate=False,
+ ),
+ # Extension options:
+ _Option(
+ ["-gap_trigger", "gap_trigger"],
+ "Number of bits to trigger gapping. Default = 22.",
+ equate=False,
+ ),
+ # Miscellaneous options:
+ _Switch(
+ ["-use_sw_tback", "use_sw_tback"],
+ "Compute locally optimal Smith-Waterman alignments?",
+ ),
+ # PSI-BLAST options
+ _Option(
+ ["-num_iterations", "num_iterations"],
+ "Number of iterations to perform. (integer >=1, Default is 1).\n\n"
+ "Incompatible with: remote",
+ equate=False,
+ ),
+ _Option(
+ ["-out_pssm", "out_pssm"],
+ "File name to store checkpoint file.",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-out_ascii_pssm", "out_ascii_pssm"],
+ "File name to store ASCII version of PSSM.",
+ filename=True,
+ equate=False,
+ ),
+ _Switch(
+ ["-save_pssm_after_last_round", "save_pssm_after_last_round"],
+ "Save PSSM after the last database search.",
+ ),
+ _Switch(
+ ["-save_each_pssm", "save_each_pssm"],
+ "Save PSSM after each iteration.\n\n"
+ "File name is given in -save_pssm or -save_ascii_pssm options.",
+ ),
+ # PSSM engine options
+ _Option(
+ ["-pseudocount", "pseudocount"],
+ "Pseudo-count value used when constructing PSSM (integer, default 0).",
+ equate=False,
+ ),
+ _Option(
+ ["-domain_inclusion_ethresh", "domain_inclusion_ethresh"],
+ "E-value inclusion threshold for alignments with conserved domains.\n\n"
+ "(float, Default is 0.05)",
+ equate=False,
+ ),
+ _Option(
+ ["-inclusion_ethresh", "inclusion_ethresh"],
+ "Pairwise alignment e-value inclusion threshold (float, default 0.002).",
+ equate=False,
+ ),
+ # DELTA-BLAST options
+ _Option(
+ ["-rpsdb", "rpsdb"],
+ "BLAST domain database name (dtring, Default = 'cdd_delta').",
+ equate=False,
+ ),
+ _Switch(
+ ["-show_domain_hits", "show_domain_hits"],
+ "Show domain hits?\n\nIncompatible with: remote, subject",
+ ),
+ ]
+ _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
+
+
+class NcbimakeblastdbCommandline(AbstractCommandline):
+ """Wrapper for the NCBI BLAST+ program makeblastdb.
+
+ This is a wrapper for the NCBI BLAST+ makeblastdb application
+ to create BLAST databases. By default, this creates a blast database
+ with the same name as the input file. The default output location
+ is the same directory as the input.
+
+ >>> from Bio.Blast.Applications import NcbimakeblastdbCommandline
+ >>> cline = NcbimakeblastdbCommandline(dbtype="prot",
+ ... input_file="NC_005816.faa")
+ >>> cline
+ NcbimakeblastdbCommandline(cmd='makeblastdb', dbtype='prot', input_file='NC_005816.faa')
+ >>> print(cline)
+ makeblastdb -dbtype prot -in NC_005816.faa
+
+ You would typically run the command line with cline() or via the Python
+ subprocess module, as described in the Biopython tutorial.
+ """
+
+ def __init__(self, cmd="makeblastdb", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ # Basic input options
+ _Switch(
+ ["-h", "h"], "Print USAGE and DESCRIPTION; ignore other arguments."
+ ),
+ _Switch(
+ ["-help", "help"],
+ "Print USAGE, DESCRIPTION and ARGUMENTS description; "
+ "ignore other arguments.",
+ ),
+ _Switch(
+ ["-version", "version"],
+ "Print version number; ignore other arguments.",
+ ),
+ # Output configuration options
+ _Option(
+ ["-out", "out"],
+ "Output file for alignment.",
+ filename=True,
+ equate=False,
+ ),
+ # makeblastdb specific options
+ _Option(
+ ["-blastdb_version", "blastdb_version"],
+ "Version of BLAST database to be created. "
+ "Tip: use BLAST database version 4 on 32 bit CPU. "
+ "Default = 5",
+ equate=False,
+ checker_function=lambda x: x == 4 or x == 5,
+ ),
+ _Option(
+ ["-dbtype", "dbtype"],
+ "Molecule type of target db ('nucl' or 'prot').",
+ equate=False,
+ is_required=True,
+ checker_function=lambda x: x == "nucl" or x == "prot",
+ ),
+ _Option(
+ ["-in", "input_file"],
+ "Input file/database name.",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-input_type", "input_type"],
+ "Type of the data specified in input_file.\n\n"
+ "Default = 'fasta'. Added in BLAST 2.2.26.",
+ filename=False,
+ equate=False,
+ checker_function=self._input_type_checker,
+ ),
+ _Option(
+ ["-title", "title"],
+ "Title for BLAST database.",
+ filename=False,
+ equate=False,
+ ),
+ _Switch(
+ ["-parse_seqids", "parse_seqids"],
+ "Option to parse seqid for FASTA input if set.\n\n"
+ "For all other input types, seqids are parsed automatically",
+ ),
+ _Switch(
+ ["-hash_index", "hash_index"], "Create index of sequence hash values."
+ ),
+ _Option(
+ ["-mask_data", "mask_data"],
+ "Comma-separated list of input files containing masking "
+ "data as produced by NCBI masking applications "
+ "(e.g. dustmasker, segmasker, windowmasker).",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-mask_id", "mask_id"],
+ "Comma-separated list of strings to uniquely identify the "
+ "masking algorithm.",
+ filename=False,
+ equate=False,
+ ),
+ _Option(
+ ["-mask_desc", "mask_desc"],
+ "Comma-separated list of free form strings to describe "
+ "the masking algorithm details.",
+ filename=False,
+ equate=False,
+ ),
+ _Switch(["-gi_mask", "gi_mask"], "Create GI indexed masking data."),
+ _Option(
+ ["-gi_mask_name", "gi_mask_name"],
+ "Comma-separated list of masking data output files.",
+ filename=False,
+ equate=False,
+ ),
+ _Option(
+ ["-max_file_sz", "max_file_sz"],
+ "Maximum file size for BLAST database files. Default = '1GB'.",
+ filename=False,
+ equate=False,
+ ),
+ _Option(
+ ["-logfile", "logfile"],
+ "File to which the program log should be redirected.",
+ filename=True,
+ equate=False,
+ ),
+ _Option(
+ ["-taxid", "taxid"],
+ "Taxonomy ID to assign to all sequences.",
+ filename=False,
+ equate=False,
+ checker_function=lambda x: type(x)(int(x)) == x,
+ ),
+ _Option(
+ ["-taxid_map", "taxid_map"],
+ "Text file mapping sequence IDs to taxonomy IDs.\n\n"
+ "Format:",
+ filename=True,
+ equate=False,
+ ),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+ def _input_type_checker(self, command):
+ return command in ("asn1_bin", "asn1_txt", "blastdb", "fasta")
+
+ def _validate(self):
+ incompatibles = {
+ "mask_id": ["gi_mask"],
+ "gi_mask": ["mask_id"],
+ "taxid": ["taxid_map"],
+ }
+
+ # Copied from _NcbibaseblastCommandline class above.
+ # Code repeated here for python2 and 3 compatibility,
+ # because this is not a _NcbibaseblastCommandline subclass.
+ for a in incompatibles:
+ if self._get_parameter(a):
+ for b in incompatibles[a]:
+ if self._get_parameter(b):
+ raise ValueError("Options %s and %s are incompatible." % (a, b))
+
+ if self.mask_id and not self.mask_data:
+ raise ValueError("Option mask_id requires mask_data to be set.")
+ if self.mask_desc and not self.mask_id:
+ raise ValueError("Option mask_desc requires mask_id to be set.")
+ if self.gi_mask and not self.parse_seqids:
+ raise ValueError("Option gi_mask requires parse_seqids to be set.")
+ if self.gi_mask_name and not (self.mask_data and self.gi_mask):
+ raise ValueError(
+ "Option gi_mask_name requires mask_data and gi_mask to be set."
+ )
+ if self.taxid_map and not self.parse_seqids:
+ raise ValueError("Option taxid_map requires parse_seqids to be set.")
+ AbstractCommandline._validate(self)
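+ # Illustrative sketch (not part of the upstream file): taxid_map requires
+ # parse_seqids, as enforced by _validate() above; file names are hypothetical:
+ #
+ #     cline = NcbimakeblastdbCommandline(dbtype="nucl",
+ #                                        input_file="genomes.fasta",
+ #                                        parse_seqids=True,
+ #                                        taxid_map="taxid_map.txt")
+ #     stdout, stderr = cline()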
+
+
+def _test():
+ """Run the Bio.Blast.Applications module's doctests (PRIVATE)."""
+ import doctest
+
+ doctest.testmod(verbose=1)
+
+
+if __name__ == "__main__":
+ # Run the doctests
+ _test()
diff --git a/code/lib/Bio/Blast/NCBIWWW.py b/code/lib/Bio/Blast/NCBIWWW.py
new file mode 100644
index 0000000..4bcca3f
--- /dev/null
+++ b/code/lib/Bio/Blast/NCBIWWW.py
@@ -0,0 +1,348 @@
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Patched by Brad Chapman.
+# Chris Wroe added modifications for work in myGrid
+
+"""Code to invoke the NCBI BLAST server over the internet.
+
+This module provides code to work with the WWW version of BLAST
+provided by the NCBI. https://blast.ncbi.nlm.nih.gov/
+"""
+
+
+import warnings
+
+from io import StringIO
+import time
+
+from urllib.request import urlopen
+from urllib.parse import urlencode
+from urllib.request import Request
+
+from Bio import BiopythonWarning
+
+
+NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
+
+
+def qblast(
+ program,
+ database,
+ sequence,
+ url_base=NCBI_BLAST_URL,
+ auto_format=None,
+ composition_based_statistics=None,
+ db_genetic_code=None,
+ endpoints=None,
+ entrez_query="(none)",
+ expect=10.0,
+ filter=None,
+ gapcosts=None,
+ genetic_code=None,
+ hitlist_size=50,
+ i_thresh=None,
+ layout=None,
+ lcase_mask=None,
+ matrix_name=None,
+ nucl_penalty=None,
+ nucl_reward=None,
+ other_advanced=None,
+ perc_ident=None,
+ phi_pattern=None,
+ query_file=None,
+ query_believe_defline=None,
+ query_from=None,
+ query_to=None,
+ searchsp_eff=None,
+ service=None,
+ threshold=None,
+ ungapped_alignment=None,
+ word_size=None,
+ short_query=None,
+ alignments=500,
+ alignment_view=None,
+ descriptions=500,
+ entrez_links_new_window=None,
+ expect_low=None,
+ expect_high=None,
+ format_entrez_query=None,
+ format_object=None,
+ format_type="XML",
+ ncbi_gi=None,
+ results_file=None,
+ show_overview=None,
+ megablast=None,
+ template_type=None,
+ template_length=None,
+):
+ """BLAST search using NCBI's QBLAST server or a cloud service provider.
+
+ Supports all parameters of the old qblast API for Put and Get.
+
+ Please note that NCBI uses the new Common URL API for BLAST searches
+ on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
+ some of the parameters used by this function are not (or are no longer)
+ officially supported by NCBI. Although they are still functioning, this
+ may change in the future.
+
+ The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
+ doing BLAST searches on cloud servers. To use this feature, please set
+ ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
+ and ``format_object='Alignment'``. For more details, please see
+ https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast
+
+ Some useful parameters:
+
+ - program blastn, blastp, blastx, tblastn, or tblastx (lower case)
+ - database Which database to search against (e.g. "nr").
+ - sequence The sequence to search.
+ - ncbi_gi TRUE/FALSE whether to give 'gi' identifier.
+ - descriptions Number of descriptions to show. Def 500.
+ - alignments Number of alignments to show. Def 500.
+ - expect An expect value cutoff. Def 10.0.
+ - matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
+ - filter "none" turns off filtering. Default no filtering
+ - format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML".
+ - entrez_query Entrez query to limit Blast search
+ - hitlist_size Number of hits to return. Default 50
+ - megablast TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
+ - short_query TRUE/FALSE whether to adjust the search parameters for a
+ short query sequence. Note that this will override
+ manually set parameters like word size and e value. Turns
+ off when sequence length is > 30 residues. Default: None.
+ - service plain, psi, phi, rpsblast, megablast (lower case)
+
+ This function does no checking of the validity of the parameters
+ and passes the values to the server as is. More help is available at:
+ https://ncbi.github.io/blast-cloud/dev/api.html
+
+ """
+ programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
+ if program not in programs:
+ raise ValueError(
+ "Program specified is %s. Expected one of %s"
+ % (program, ", ".join(programs))
+ )
+
+ # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
+ # assignment from NCBIs side).
+ # Thus we set the (known) parameters directly:
+ if short_query and program == "blastn":
+ short_query = None
+ # We only use the 'short-query' parameters for short sequences:
+ if len(sequence) < 31:
+ expect = 1000
+ word_size = 7
+ nucl_reward = 1
+ filter = None
+ lcase_mask = None
+ warnings.warn(
+ '"SHORT_QUERY_ADJUST" is incorrectly implemented (by NCBI) for blastn.'
+ " We bypass the problem by manually adjusting the search parameters."
+ " Thus, results may slightly differ from web page searches.",
+ BiopythonWarning,
+ )
+
+ # Format the "Put" command, which sends search requests to qblast.
+ # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
+ # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
+ # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
+ # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
+ parameters = [
+ ("AUTO_FORMAT", auto_format),
+ ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
+ ("DATABASE", database),
+ ("DB_GENETIC_CODE", db_genetic_code),
+ ("ENDPOINTS", endpoints),
+ ("ENTREZ_QUERY", entrez_query),
+ ("EXPECT", expect),
+ ("FILTER", filter),
+ ("GAPCOSTS", gapcosts),
+ ("GENETIC_CODE", genetic_code),
+ ("HITLIST_SIZE", hitlist_size),
+ ("I_THRESH", i_thresh),
+ ("LAYOUT", layout),
+ ("LCASE_MASK", lcase_mask),
+ ("MEGABLAST", megablast),
+ ("MATRIX_NAME", matrix_name),
+ ("NUCL_PENALTY", nucl_penalty),
+ ("NUCL_REWARD", nucl_reward),
+ ("OTHER_ADVANCED", other_advanced),
+ ("PERC_IDENT", perc_ident),
+ ("PHI_PATTERN", phi_pattern),
+ ("PROGRAM", program),
+ # ('PSSM',pssm), - Is it possible to use PSI-BLAST via this API?
+ ("QUERY", sequence),
+ ("QUERY_FILE", query_file),
+ ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
+ ("QUERY_FROM", query_from),
+ ("QUERY_TO", query_to),
+ # ('RESULTS_FILE',...), - Can we use this parameter?
+ ("SEARCHSP_EFF", searchsp_eff),
+ ("SERVICE", service),
+ ("SHORT_QUERY_ADJUST", short_query),
+ ("TEMPLATE_TYPE", template_type),
+ ("TEMPLATE_LENGTH", template_length),
+ ("THRESHOLD", threshold),
+ ("UNGAPPED_ALIGNMENT", ungapped_alignment),
+ ("WORD_SIZE", word_size),
+ ("CMD", "Put"),
+ ]
+ query = [x for x in parameters if x[1] is not None]
+ message = urlencode(query).encode()
+
+ # Send off the initial query to qblast.
+ # Note the NCBI do not currently impose a rate limit here, other
+ # than the request not to make say 50 queries at once using multiple
+ # threads.
+ request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
+ handle = urlopen(request)
+
+ # Format the "Get" command, which gets the formatted results from qblast
+ # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
+ rid, rtoe = _parse_qblast_ref_page(handle)
+ parameters = [
+ ("ALIGNMENTS", alignments),
+ ("ALIGNMENT_VIEW", alignment_view),
+ ("DESCRIPTIONS", descriptions),
+ ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window),
+ ("EXPECT_LOW", expect_low),
+ ("EXPECT_HIGH", expect_high),
+ ("FORMAT_ENTREZ_QUERY", format_entrez_query),
+ ("FORMAT_OBJECT", format_object),
+ ("FORMAT_TYPE", format_type),
+ ("NCBI_GI", ncbi_gi),
+ ("RID", rid),
+ ("RESULTS_FILE", results_file),
+ ("SERVICE", service),
+ ("SHOW_OVERVIEW", show_overview),
+ ("CMD", "Get"),
+ ]
+ query = [x for x in parameters if x[1] is not None]
+ message = urlencode(query).encode()
+
+ # Poll NCBI until the results are ready.
+ # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+ # 1. Do not contact the server more often than once every 10 seconds.
+ # 2. Do not poll for any single RID more often than once a minute.
+ # 3. Use the URL parameter email and tool, so that the NCBI
+ # can contact you if there is a problem.
+ # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
+ # on weekdays if more than 50 searches will be submitted.
+ # --
+ # Could start with a 10s delay, but expect most short queries to
+ # take longer, i.e. at least 70s in total with that delay. Therefore,
+ # start with a 20s delay, thereafter polling once a minute.
+ delay = 20 # seconds
+ while True:
+ current = time.time()
+ wait = qblast._previous + delay - current
+ if wait > 0:
+ time.sleep(wait)
+ qblast._previous = current + wait
+ else:
+ qblast._previous = current
+ # delay by at least 60 seconds only if running the request against the public NCBI API
+ if delay < 60 and url_base == NCBI_BLAST_URL:
+ # Wasn't a quick return, must wait at least a minute
+ delay = 60
+
+ request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
+ handle = urlopen(request)
+ results = handle.read().decode()
+
+ # Can see an "\n\n" page while results are in progress,
+ # if so just wait a bit longer...
+ if results == "\n\n":
+ continue
+ # XML results don't have the Status tag when finished
+ if "Status=" not in results:
+ break
+ i = results.index("Status=")
+ j = results.index("\n", i)
+ status = results[i + len("Status=") : j].strip()
+ if status.upper() == "READY":
+ break
+ return StringIO(results)
+
+
+qblast._previous = 0
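+
+
+# Minimal usage sketch (illustrative, not part of the upstream file), assuming
+# "m_cold.fasta" holds a single nucleotide sequence in FASTA format:
+#
+#     from Bio.Blast import NCBIWWW, NCBIXML
+#     with open("m_cold.fasta") as f:
+#         fasta_string = f.read()
+#     handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
+#     blast_record = NCBIXML.read(handle)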
+
+
+def _parse_qblast_ref_page(handle):
+ """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).
+
+ The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
+ 'Request Time of Execution' and RID would be 'Request Identifier'.
+ """
+ s = handle.read().decode()
+ i = s.find("RID =")
+ if i == -1:
+ rid = None
+ else:
+ j = s.find("\n", i)
+ rid = s[i + len("RID =") : j].strip()
+
+ i = s.find("RTOE =")
+ if i == -1:
+ rtoe = None
+ else:
+ j = s.find("\n", i)
+ rtoe = s[i + len("RTOE =") : j].strip()
+
+ if not rid and not rtoe:
+ # Can we reliably extract the error message from the HTML page?
+ # e.g. "Message ID#24 Error: Failed to read the Blast query:
+ # Nucleotide FASTA provided for protein sequence"
+ # or "Message ID#32 Error: Query contains no data: Query
+ # contains no sequence data"
+ #
+ # This used to occur inside a <div class="error msInf"> entry:
+ i = s.find('<div class="error msInf">')
+ if i != -1:
+ msg = s[i + len('<div class="error msInf">') :].strip()
+ msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip()
+ if msg:
+ raise ValueError("Error message from NCBI: %s" % msg)
+ # In spring 2010 the markup was like this:
+ i = s.find('<p class="error">')
+ if i != -1:
+ msg = s[i + len('<p class="error">') :].strip()
+ msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip()
+ if msg:
+ raise ValueError("Error message from NCBI: %s" % msg)
+ # Generic search based on the way the error messages start:
+ i = s.find("Message ID#")
+ if i != -1:
+ # Break the message at the first HTML tag
+ msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
+ raise ValueError("Error message from NCBI: %s" % msg)
+ # We didn't recognise the error layout :(
+ # print(s)
+ raise ValueError(
+ "No RID and no RTOE found in the 'please wait' page, "
+ "there was probably an error in your request but we "
+ "could not extract a helpful error message."
+ )
+ elif not rid:
+ # Can this happen?
+ raise ValueError(
+ "No RID found in the 'please wait' page. (although RTOE = %r)" % rtoe
+ )
+ elif not rtoe:
+ # Can this happen?
+ raise ValueError(
+ "No RTOE found in the 'please wait' page. (although RID = %r)" % rid
+ )
+
+ try:
+ return rid, int(rtoe)
+ except ValueError:
+ raise ValueError(
+ "A non-integer RTOE found in the 'please wait' page, %r" % rtoe
+ ) from None
diff --git a/code/lib/Bio/Blast/NCBIXML.py b/code/lib/Bio/Blast/NCBIXML.py
new file mode 100644
index 0000000..90e91a9
--- /dev/null
+++ b/code/lib/Bio/Blast/NCBIXML.py
@@ -0,0 +1,864 @@
+# Copyright 2000 by Bertrand Frottier. All rights reserved.
+# Revisions 2005-2006 copyright Michiel de Hoon
+# Revisions 2006-2009 copyright Peter Cock
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code to work with the BLAST XML output.
+
+The BLAST XML DTD file is on the NCBI FTP site at:
+ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
+"""
+
+from Bio.Blast import Record
+import xml.sax
+from xml.sax.handler import ContentHandler
+
+
+class _XMLparser(ContentHandler):
+ """Generic SAX Parser (PRIVATE).
+
+ Just a very basic SAX parser.
+
+ Redefine the methods startElement, characters and endElement.
+ """
+
+ def __init__(self, debug=0):
+ """Initialize the parser.
+
+ Arguments:
+ - debug - integer, amount of debug information to print
+
+ """
+ self._tag = []
+ self._value = ""
+ self._debug = debug
+ self._debug_ignore_list = []
+ self._method_name_level = 1
+ self._method_map = None
+
+ def startElement(self, name, attr):
+ """Found XML start tag.
+
+ No real need of attr, as the BLAST DTD doesn't use attributes
+
+ Arguments:
+ - name -- name of the tag
+ - attr -- tag attributes
+
+ """
+ self._tag.append(name)
+
+ if len(self._tag) == 1:
+ # root node
+ self._on_root_node(name)
+ return
+
+ # Try to call a method (defined in subclasses)
+ method = "start_" + self._node_method_name(name)
+
+ # Note: we could use try / except AttributeError here,
+ # BUT I found that it is often triggered by nested errors...
+ if method in self._method_map:
+ self._method_map[method]()
+ if self._debug > 4:
+ print("NCBIXML: Parsed: " + method)
+ elif self._debug > 3:
+ # Doesn't exist (yet) and may want to warn about it
+ if method not in self._debug_ignore_list:
+ print("NCBIXML: Ignored: " + method)
+ self._debug_ignore_list.append(method)
+
+ # We don't care about white space in parent tags like Hsp,
+ # but that white space doesn't belong to child tags like Hsp_midline
+ if self._value.strip():
+ raise ValueError(
+ "What should we do with %s before the %r tag?" % (self._value, name)
+ )
+ self._value = ""
+
+ def characters(self, ch):
+ """Found some text.
+
+ Arguments:
+ - ch -- characters read
+
+ """
+ self._value += ch # You don't ever get the whole string
+
+ def endElement(self, name):
+ """Found XML end tag.
+
+ Arguments:
+ - name -- tag name
+
+ """
+ # DON'T strip any white space, we may need it e.g. the hsp-midline
+
+ # Try to call a method (defined in subclasses)
+ method = "end_" + self._node_method_name(name)
+
+ # Note: we could use try / except AttributeError here,
+ # BUT I found that it is often triggered by nested errors...
+ if method in self._method_map:
+ self._method_map[method]()
+ if self._debug > 2:
+ print("NCBIXML: Parsed: %s %s" % (method, self._value))
+ elif self._debug > 1:
+ # Doesn't exist (yet) and may want to warn about it
+ if method not in self._debug_ignore_list:
+ print("NCBIXML: Ignored: %s %s" % (method, self._value))
+ self._debug_ignore_list.append(method)
+
+ # Reset character buffer
+ self._value = ""
+
+ self._tag.pop()
+
+ def _node_method_name(self, name):
+ if self._method_name_level == 1:
+ return name
+ return "/".join(self._tag[-self._method_name_level :])
+
+
+class BlastParser(_XMLparser):
+ """Parse XML BLAST data into a Record.Blast object.
+
+ Parses XML output from BLAST (direct use discouraged).
+ This (now) returns a list of Blast records.
+ Historically it returned a single Blast record.
+ You are expected to use this via the parse or read functions.
+
+ All XML 'action' methods are private methods and may be:
+
+ - ``_start_TAG`` called when the start tag is found
+ - ``_end_TAG`` called when the end tag is found
+
+ """
+
+ def __init__(self, debug=0):
+ """Initialize the parser.
+
+ Arguments:
+ - debug - integer, amount of debug information to print
+
+ """
+ # Calling superclass method
+ _XMLparser.__init__(self, debug)
+
+ self._parser = xml.sax.make_parser()
+ self._parser.setContentHandler(self)
+
+ # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd
+ self._parser.setFeature(xml.sax.handler.feature_validation, 0)
+ self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
+ self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
+ self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
+
+ self._xml_version = 1
+
+ self.reset()
+
+ def reset(self):
+ """Reset all the data allowing reuse of the BlastParser() object."""
+ self._records = []
+ self._header = Record.Header()
+ self._parameters = Record.Parameters()
+ self._parameters.filter = None # Maybe I should update the class?
+
+ def _on_root_node(self, name):
+ if name == "BlastOutput":
+ self._setup_blast_v1()
+ elif name == "BlastXML2":
+ self._setup_blast_v2()
+ else:
+ raise ValueError(
+ "Invalid root node name: %s. Root node should be either"
+ " BlastOutput or BlastXML2" % name
+ )
+
+ def _setup_blast_v1(self):
+ self._method_map = {
+ "start_Iteration": self._start_blast_record,
+ "end_Iteration": self._end_blast_record,
+ "end_BlastOutput_program": self._set_header_application,
+ "end_BlastOutput_version": self._set_header_version,
+ "end_BlastOutput_reference": self._set_header_reference,
+ "end_BlastOutput_db": self._set_header_database,
+ "end_BlastOutput_query-ID": self._set_header_query_id,
+ "end_BlastOutput_query-def": self._set_header_query,
+ "end_BlastOutput_query-len": self._set_header_query_letters,
+ "end_Iteration_query-ID": self._set_record_query_id,
+ "end_Iteration_query-def": self._set_record_query_def,
+ "end_Iteration_query-len": self._set_record_query_letters,
+ "end_BlastOutput_hits": self._set_record_hits,
+ "end_Parameters_matrix": self._set_parameters_matrix,
+ "end_Parameters_expect": self._set_parameters_expect,
+ "end_Parameters_sc-match": self._set_parameters_sc_match,
+ "end_Parameters_sc-mismatch": self._set_parameters_sc_mismatch,
+ "end_Parameters_gap-open": self._set_parameters_gap_penalties,
+ "end_Parameters_gap-extend": self._set_parameters_gap_extend,
+ "end_Parameters_filter": self._set_parameters_filter,
+ "start_Hit": self._start_hit,
+ "end_Hit": self._end_hit,
+ "end_Hit_id": self.set_hit_id,
+ "end_Hit_def": self.set_hit_def,
+ "end_Hit_accession": self.set_hit_accession,
+ "end_Hit_len": self.set_hit_len,
+ "start_Hsp": self._start_hsp,
+ "end_Hsp_score": self._set_hsp_score,
+ "end_Hsp_bit-score": self._set_hsp_bit_score,
+ "end_Hsp_evalue": self._set_hsp_e_value,
+ "end_Hsp_query-from": self._set_hsp_query_start,
+ "end_Hsp_query-to": self._set_hsp_query_end,
+ "end_Hsp_hit-from": self._set_hsp_hit_from,
+ "end_Hsp_hit-to": self._set_hsp_hit_to,
+ "end_Hsp_query-frame": self._set_hsp_query_frame,
+ "end_Hsp_hit-frame": self._set_hsp_hit_frame,
+ "end_Hsp_identity": self._set_hsp_identity,
+ "end_Hsp_positive": self._set_hsp_positive,
+ "end_Hsp_gaps": self._set_hsp_gaps,
+ "end_Hsp_align-len": self._set_hsp_align_len,
+ "end_Hsp_qseq": self._set_hsp_query_seq,
+ "end_Hsp_hseq": self._set_hsp_subject_seq,
+ "end_Hsp_midline": self._set_hsp_midline,
+ "end_Statistics_db-num": self._set_statistics_db_num,
+ "end_Statistics_db-len": self._set_statistics_db_len,
+ "end_Statistics_hsp-len": self._set_statistics_hsp_len,
+ "end_Statistics_eff-space": self._set_statistics_eff_space,
+ "end_Statistics_kappa": self._set_statistics_kappa,
+ "end_Statistics_lambda": self._set_statistics_lambda,
+ "end_Statistics_entropy": self._set_statistics_entropy,
+ }
+
+ def _setup_blast_v2(self):
+ self._method_name_level = 2
+ self._xml_version = 2
+ self._method_map = {
+ "start_report/Report": self._start_blast_record,
+ "end_report/Report": self._end_blast_record,
+ "end_Report/program": self._set_header_application,
+ "end_Report/version": self._set_header_version,
+ "end_Report/reference": self._set_header_reference,
+ "end_Target/db": self._set_header_database,
+ "end_Search/query-id": self._set_record_query_id,
+ "end_Search/query-title": self._set_record_query_def,
+ "end_Search/query-len": self._set_record_query_letters,
+ "end_BlastOutput_hits": self._set_record_hits,
+ "end_Parameters/matrix": self._set_parameters_matrix,
+ "end_Parameters/expect": self._set_parameters_expect,
+ "end_Parameters/sc-match": self._set_parameters_sc_match,
+ "end_Parameters/sc-mismatch": self._set_parameters_sc_mismatch,
+ "end_Parameters/gap-open": self._set_parameters_gap_penalties,
+ "end_Parameters/gap-extend": self._set_parameters_gap_extend,
+ "end_Parameters/filter": self._set_parameters_filter,
+ "start_hits/Hit": self._start_hit,
+ "end_hits/Hit": self._end_hit,
+ "start_description/HitDescr": self._start_hit_descr_item,
+ "end_description/HitDescr": self._end_hit_descr_item,
+ "end_HitDescr/id": self._end_description_id,
+ "end_HitDescr/accession": self._end_description_accession,
+ "end_HitDescr/title": self._end_description_title,
+ "end_HitDescr/taxid": self._end_description_taxid,
+ "end_HitDescr/sciname": self._end_description_sciname,
+ "end_Hit/len": self.set_hit_len,
+ "start_hsps/Hsp": self._start_hsp,
+ "end_hsps/Hsp": self._end_hsp,
+ "end_Hsp/score": self._set_hsp_score,
+ "end_Hsp/bit-score": self._set_hsp_bit_score,
+ "end_Hsp/evalue": self._set_hsp_e_value,
+ "end_Hsp/query-from": self._set_hsp_query_start,
+ "end_Hsp/query-to": self._set_hsp_query_end,
+ "end_Hsp/hit-from": self._set_hsp_hit_from,
+ "end_Hsp/hit-to": self._set_hsp_hit_to,
+ "end_Hsp/query-frame": self._set_hsp_query_frame,
+ "end_Hsp/hit-frame": self._set_hsp_hit_frame,
+ "end_Hsp/query-strand": self._set_hsp_query_strand,
+ "end_Hsp/hit-strand": self._set_hsp_hit_strand,
+ "end_Hsp/identity": self._set_hsp_identity,
+ "end_Hsp/positive": self._set_hsp_positive,
+ "end_Hsp/gaps": self._set_hsp_gaps,
+ "end_Hsp/align-len": self._set_hsp_align_len,
+ "end_Hsp/qseq": self._set_hsp_query_seq,
+ "end_Hsp/hseq": self._set_hsp_subject_seq,
+ "end_Hsp/midline": self._set_hsp_midline,
+ "end_Statistics/db-num": self._set_statistics_db_num,
+ "end_Statistics/db-len": self._set_statistics_db_len,
+ "end_Statistics/hsp-len": self._set_statistics_hsp_len,
+ "end_Statistics/eff-space": self._set_statistics_eff_space,
+ "end_Statistics/kappa": self._set_statistics_kappa,
+ "end_Statistics/lambda": self._set_statistics_lambda,
+ "end_Statistics/entropy": self._set_statistics_entropy,
+ }
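+
+    # These two tables map expat start/end events to handler methods:
+    # each key combines the event type with the element name, qualified
+    # by its parent element when _method_name_level is 2 (the XML v2
+    # format), e.g. "end_Hsp/evalue" -> _set_hsp_e_value.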
+
+ def _start_blast_record(self):
+ """Start interaction (PRIVATE)."""
+ self._blast = Record.Blast()
+
+ def _end_blast_record(self):
+ """End interaction (PRIVATE)."""
+ # We stored a lot of generic "top level" information
+ # in self._header (an object of type Record.Header)
+ self._blast.reference = self._header.reference
+ self._blast.date = self._header.date
+ self._blast.version = self._header.version
+ self._blast.database = self._header.database
+ self._blast.application = self._header.application
+
+        # These are required for "old" pre 2.2.14 files
+        # where only <BlastOutput_query-ID>, <BlastOutput_query-def>
+        # and <BlastOutput_query-len> were used.  Now they
+        # are supplemented/replaced by <Iteration_query-ID>,
+        # <Iteration_query-def> and <Iteration_query-len>
+ if not hasattr(self._blast, "query") or not self._blast.query:
+ self._blast.query = self._header.query
+ if not hasattr(self._blast, "query_id") or not self._blast.query_id:
+ self._blast.query_id = self._header.query_id
+ if not hasattr(self._blast, "query_letters") or not self._blast.query_letters:
+ self._blast.query_letters = self._header.query_letters
+
+ # Hack to record the query length as both the query_letters and
+ # query_length properties (as in the plain text parser, see
+ # Bug 2176 comment 12):
+ self._blast.query_length = self._blast.query_letters
+ # Perhaps in the long term we should deprecate one, but I would
+ # prefer to drop query_letters - so we need a transition period
+ # with both.
+
+ # Hack to record the claimed database size as database_length
+ # (as well as in num_letters_in_database, see Bug 2176 comment 13):
+ self._blast.database_length = self._blast.num_letters_in_database
+ # TODO? Deprecate database_letters next?
+
+ # Hack to record the claimed database sequence count as database_sequences
+ self._blast.database_sequences = self._blast.num_sequences_in_database
+
+ # Apply the "top level" parameter information
+ self._blast.matrix = self._parameters.matrix
+ self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
+ self._blast.gap_penalties = self._parameters.gap_penalties
+ self._blast.filter = self._parameters.filter
+ self._blast.expect = self._parameters.expect
+ self._blast.sc_match = self._parameters.sc_match
+ self._blast.sc_mismatch = self._parameters.sc_mismatch
+
+ # Add to the list
+ self._records.append(self._blast)
+        # Clear the object (a new empty one is created in _start_blast_record)
+ self._blast = None
+
+ if self._debug:
+ print("NCBIXML: Added Blast record to results")
+
+ # Header
+ def _set_header_application(self):
+ """BLAST program, e.g., blastp, blastn, etc. (PRIVATE).
+
+ Save this to put on each blast record object
+ """
+ self._header.application = self._value.upper()
+
+ def _set_header_version(self):
+ """Version number and date of the BLAST engine (PRIVATE).
+
+ e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
+ variants like "BLASTP 2.2.18+" without the date.
+
+ Save this to put on each blast record object
+ """
+ parts = self._value.split()
+ # TODO - Check the first word starts with BLAST?
+
+ # The version is the second word (field one)
+ self._header.version = parts[1]
+
+ # Check there is a third word (the date)
+ if len(parts) >= 3:
+ if parts[2][0] == "[" and parts[2][-1] == "]":
+ self._header.date = parts[2][1:-1]
+ else:
+ # Assume this is still a date, but without the
+ # square brackets
+ self._header.date = parts[2]
+
+ def _set_header_reference(self):
+ """Record any article reference describing the algorithm (PRIVATE).
+
+ Save this to put on each blast record object
+ """
+ self._header.reference = self._value
+
+ def _set_header_database(self):
+ """Record the database(s) searched (PRIVATE).
+
+ Save this to put on each blast record object
+ """
+ self._header.database = self._value
+
+ def _set_header_query_id(self):
+ """Record the identifier of the query (PRIVATE).
+
+        Important in old pre 2.2.14 BLAST, for recent versions
+        <Iteration_query-ID> is enough
+ """
+ self._header.query_id = self._value
+
+ def _set_header_query(self):
+ """Record the definition line of the query (PRIVATE).
+
+        Important in old pre 2.2.14 BLAST, for recent versions
+        <Iteration_query-def> is enough
+ """
+ self._header.query = self._value
+
+ def _set_header_query_letters(self):
+ """Record the length of the query (PRIVATE).
+
+        Important in old pre 2.2.14 BLAST, for recent versions
+        <Iteration_query-len> is enough
+ """
+ self._header.query_letters = int(self._value)
+
+ def _set_record_query_id(self):
+ """Record the identifier of the query (PRIVATE)."""
+ self._blast.query_id = self._value
+
+ def _set_record_query_def(self):
+ """Record the definition line of the query (PRIVATE)."""
+ self._blast.query = self._value
+
+ def _set_record_query_letters(self):
+ """Record the length of the query (PRIVATE)."""
+ self._blast.query_letters = int(self._value)
+
+ # def _end_BlastOutput_query_seq(self):
+ # """The query sequence (PRIVATE)."""
+ # pass # XXX Missing in Record.Blast ?
+
+ # def _end_BlastOutput_iter_num(self):
+ # """The psi-blast iteration number (PRIVATE)."""
+ # pass # XXX TODO PSI
+
+ def _set_record_hits(self):
+ """Hits to the database sequences, one for every sequence (PRIVATE)."""
+ self._blast.num_hits = int(self._value)
+
+ # def _end_BlastOutput_message(self):
+ # """error messages (PRIVATE)."""
+ # pass # XXX What to do ?
+
+ # Parameters
+ def _set_parameters_matrix(self):
+ """Matrix used (-M on legacy BLAST) (PRIVATE)."""
+ self._parameters.matrix = self._value
+
+ def _set_parameters_expect(self):
+ """Expect values cutoff (PRIVATE)."""
+ # NOTE: In old text output there was a line:
+ # Number of sequences better than 1.0e-004: 1
+ # As far as I can see, parameters.num_seqs_better_e
+ # would take the value of 1, and the expectation
+ # value was not recorded.
+ #
+ # Anyway we should NOT record this against num_seqs_better_e
+ self._parameters.expect = self._value
+
+ # def _end_Parameters_include(self):
+ # """Inclusion threshold for a psi-blast iteration (-h) (PRIVATE)."""
+ # pass # XXX TODO PSI
+
+ def _set_parameters_sc_match(self):
+ """Match score for nucleotide-nucleotide comparison (-r) (PRIVATE)."""
+ self._parameters.sc_match = int(self._value)
+
+ def _set_parameters_sc_mismatch(self):
+ """Mismatch penalty for nucleotide-nucleotide comparison (-r) (PRIVATE)."""
+ self._parameters.sc_mismatch = int(self._value)
+
+ def _set_parameters_gap_penalties(self):
+ """Gap existence cost (-G) (PRIVATE)."""
+ self._parameters.gap_penalties = int(self._value)
+
+ def _set_parameters_gap_extend(self):
+ """Gap extension cose (-E) (PRIVATE)."""
+ self._parameters.gap_penalties = (
+ self._parameters.gap_penalties,
+ int(self._value),
+ )
+
+ def _set_parameters_filter(self):
+ """Record filtering options (-F) (PRIVATE)."""
+ self._parameters.filter = self._value
+
+ # def _end_Parameters_pattern(self):
+ # """Pattern used for phi-blast search (PRIVATE).
+ # """
+ # pass # XXX TODO PSI
+
+ # def _end_Parameters_entrez_query(self):
+ # """Entrez query used to limit search (PRIVATE).
+ # """
+ # pass # XXX TODO PSI
+
+ # Hits
+ def _start_hit(self):
+ """Start filling records (PRIVATE)."""
+ self._blast.alignments.append(Record.Alignment())
+ self._descr = (
+ Record.Description() if self._xml_version == 1 else Record.DescriptionExt()
+ )
+ self._blast.descriptions.append(self._descr)
+ self._blast.multiple_alignment = []
+ self._hit = self._blast.alignments[-1]
+
+ self._descr.num_alignments = 0
+
+ def _end_hit(self):
+ """Clear variables (PRIVATE)."""
+ # Cleanup
+ self._blast.multiple_alignment = None
+ self._hit = None
+ self._descr = None
+
+ def set_hit_id(self):
+ """Record the identifier of the database sequence (PRIVATE)."""
+ self._hit.hit_id = self._value
+ self._hit.title = self._value + " "
+
+ def set_hit_def(self):
+ """Record the definition line of the database sequence (PRIVATE)."""
+ self._hit.hit_def = self._value
+ self._hit.title += self._value
+ self._descr.title = self._hit.title
+
+ def set_hit_accession(self):
+ """Record the accession value of the database sequence (PRIVATE)."""
+ self._hit.accession = self._value
+ self._descr.accession = self._value
+
+ def set_hit_len(self):
+ """Record the length of the hit."""
+ self._hit.length = int(self._value)
+
+ # HSPs
+    def _start_hsp(self):
+        """Start a new HSP within the current hit (PRIVATE)."""
+        # Note that self._start_hit() should have been called
+        # to set up things like self._blast.multiple_alignment
+ self._hsp = Record.HSP()
+ self._hsp.positives = None
+ self._hit.hsps.append(self._hsp)
+ self._descr.num_alignments += 1
+ self._blast.multiple_alignment.append(Record.MultipleAlignment())
+ self._mult_al = self._blast.multiple_alignment[-1]
+
+    def _end_hsp(self):
+        """Pad the frame tuple to two values if only one was set (PRIVATE)."""
+        if self._hsp.frame and len(self._hsp.frame) == 1:
+            self._hsp.frame += (0,)
+
+ # Hsp_num is useless
+ def _set_hsp_score(self):
+ """Record the raw score of HSP (PRIVATE)."""
+ self._hsp.score = float(self._value)
+ if self._descr.score is None:
+ self._descr.score = float(self._value)
+
+ def _set_hsp_bit_score(self):
+ """Record the Bit score of HSP (PRIVATE)."""
+ self._hsp.bits = float(self._value)
+ if self._descr.bits is None:
+ self._descr.bits = float(self._value)
+
+ def _set_hsp_e_value(self):
+ """Record the expect value of the HSP (PRIVATE)."""
+ self._hsp.expect = float(self._value)
+ if self._descr.e is None:
+ self._descr.e = float(self._value)
+
+ def _set_hsp_query_start(self):
+ """Offset of query at the start of the alignment (one-offset) (PRIVATE)."""
+ self._hsp.query_start = int(self._value)
+
+ def _set_hsp_query_end(self):
+ """Offset of query at the end of the alignment (one-offset) (PRIVATE)."""
+ self._hsp.query_end = int(self._value)
+
+ def _set_hsp_hit_from(self):
+ """Offset of the database at the start of the alignment (one-offset) (PRIVATE)."""
+ self._hsp.sbjct_start = int(self._value)
+
+ def _set_hsp_hit_to(self):
+ """Offset of the database at the end of the alignment (one-offset) (PRIVATE)."""
+ self._hsp.sbjct_end = int(self._value)
+
+ # def _end_Hsp_pattern_from(self):
+ # """Start of phi-blast pattern on the query (one-offset) (PRIVATE)."""
+ # pass # XXX TODO PSI
+
+ # def _end_Hsp_pattern_to(self):
+ # """End of phi-blast pattern on the query (one-offset) (PRIVATE)."""
+ # pass # XXX TODO PSI
+
+ def _set_hsp_query_frame(self):
+ """Frame of the query if applicable (PRIVATE)."""
+ v = int(self._value)
+ self._hsp.frame = (v,)
+ if self._header.application == "BLASTN":
+ self._hsp.strand = ("Plus" if v > 0 else "Minus",)
+
+ def _set_hsp_hit_frame(self):
+ """Frame of the database sequence if applicable (PRIVATE)."""
+ v = int(self._value)
+ if len(self._hsp.frame) == 0:
+ self._hsp.frame = (0, v)
+ else:
+ self._hsp.frame += (v,)
+ if self._header.application == "BLASTN":
+ self._hsp.strand += ("Plus" if v > 0 else "Minus",)
+
+ def _set_hsp_query_strand(self):
+ """Frame of the query if applicable (PRIVATE)."""
+ self._hsp.strand = (self._value,)
+ if self._header.application == "BLASTN":
+ self._hsp.frame = (1 if self._value == "Plus" else -1,)
+
+ def _set_hsp_hit_strand(self):
+ """Frame of the database sequence if applicable (PRIVATE)."""
+ self._hsp.strand += (self._value,)
+ if self._header.application == "BLASTN":
+ self._hsp.frame += (1 if self._value == "Plus" else -1,)
+
+ def _set_hsp_identity(self):
+ """Record the number of identities in the alignment (PRIVATE)."""
+ v = int(self._value)
+ self._hsp.identities = v
+ if self._hsp.positives is None:
+ self._hsp.positives = v
+
+ def _set_hsp_positive(self):
+ """Record the number of positive (conservative) substitutions in the alignment (PRIVATE)."""
+ self._hsp.positives = int(self._value)
+
+ def _set_hsp_gaps(self):
+ """Record the number of gaps in the alignment (PRIVATE)."""
+ self._hsp.gaps = int(self._value)
+
+ def _set_hsp_align_len(self):
+ """Record the length of the alignment (PRIVATE)."""
+ self._hsp.align_length = int(self._value)
+
+ # def _en_Hsp_density(self):
+ # """Score density (PRIVATE)."""
+ # pass # XXX ???
+
+ def _set_hsp_query_seq(self):
+ """Record the alignment string for the query (PRIVATE)."""
+ self._hsp.query = self._value
+
+ def _set_hsp_subject_seq(self):
+ """Record the alignment string for the database (PRIVATE)."""
+ self._hsp.sbjct = self._value
+
+ def _set_hsp_midline(self):
+ """Record the middle line as normally seen in BLAST report (PRIVATE)."""
+ self._hsp.match = self._value # do NOT strip spaces!
+ assert len(self._hsp.match) == len(self._hsp.query)
+ assert len(self._hsp.match) == len(self._hsp.sbjct)
+
+ # Statistics
+ def _set_statistics_db_num(self):
+ """Record the number of sequences in the database (PRIVATE)."""
+ self._blast.num_sequences_in_database = int(self._value)
+
+ def _set_statistics_db_len(self):
+ """Record the number of letters in the database (PRIVATE)."""
+ self._blast.num_letters_in_database = int(self._value)
+
+ def _set_statistics_hsp_len(self):
+ """Record the effective HSP length (PRIVATE)."""
+ self._blast.effective_hsp_length = int(self._value)
+
+ def _set_statistics_eff_space(self):
+ """Record the effective search space (PRIVATE)."""
+ self._blast.effective_search_space = float(self._value)
+
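+    # The next three handlers assemble self._blast.ka_params in stages:
+    # kappa stores K, lambda wraps it into (Lambda, K), and entropy
+    # appends H, yielding the (lambda, k, h) layout documented for
+    # DatabaseReport.ka_params (assuming the elements arrive in the
+    # usual kappa, lambda, entropy order, as they do in BLAST XML).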
+ def _set_statistics_kappa(self):
+ """Karlin-Altschul parameter K (PRIVATE)."""
+ self._blast.ka_params = float(self._value)
+
+ def _set_statistics_lambda(self):
+ """Karlin-Altschul parameter Lambda (PRIVATE)."""
+ self._blast.ka_params = (float(self._value), self._blast.ka_params)
+
+ def _set_statistics_entropy(self):
+ """Karlin-Altschul parameter H (PRIVATE)."""
+ self._blast.ka_params = self._blast.ka_params + (float(self._value),)
+
+ def _start_hit_descr_item(self):
+ """XML v2. Start hit description item."""
+ self._hit_descr_item = Record.DescriptionExtItem()
+
+ def _end_hit_descr_item(self):
+ """XML v2. Start hit description item."""
+ self._descr.append_item(self._hit_descr_item)
+ if not self._hit.title:
+ self._hit.title = str(self._hit_descr_item)
+ self._hit_descr_item = None
+
+ def _end_description_id(self):
+ """XML v2. The identifier of the database sequence(PRIVATE)."""
+ self._hit_descr_item.id = self._value
+ if not self._hit.hit_id:
+ self._hit.hit_id = self._value
+
+ def _end_description_accession(self):
+ """XML v2. The accession value of the database sequence (PRIVATE)."""
+ self._hit_descr_item.accession = self._value
+ if not getattr(self._hit, "accession", None):
+ self._hit.accession = self._value
+
+ def _end_description_title(self):
+ """XML v2. The hit description title (PRIVATE)."""
+ self._hit_descr_item.title = self._value
+
+    def _end_description_taxid(self):
+        """XML v2. The NCBI taxonomy identifier of the subject, if given (PRIVATE)."""
+        try:
+            self._hit_descr_item.taxid = int(self._value)
+        except ValueError:
+            pass
+
+    def _end_description_sciname(self):
+        """XML v2. The scientific name of the subject, if given (PRIVATE)."""
+        self._hit_descr_item.sciname = self._value
+
+
+def read(handle, debug=0):
+ """Return a single Blast record (assumes just one query).
+
+ Uses the BlastParser internally.
+
+ This function is for use when there is one and only one BLAST
+ result in your XML file.
+
+ Use the Bio.Blast.NCBIXML.parse() function if you expect more than
+ one BLAST record (i.e. if you have more than one query sequence).
+ """
+ iterator = parse(handle, debug)
+ try:
+ record = next(iterator)
+ except StopIteration:
+ raise ValueError("No records found in handle") from None
+ try:
+ next(iterator)
+ raise ValueError("More than one record found in handle")
+ except StopIteration:
+ pass
+ return record
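+
+
+# Illustrative sketch (not part of the original module): choosing
+# between read() and parse(). "my_blast.xml" is a hypothetical file
+# name used only for this example.
+def _example_read_vs_parse():
+    with open("my_blast.xml") as handle:
+        record = read(handle)  # exactly one query in the file
+    print(record.query)
+    with open("my_blast.xml") as handle:
+        for rec in parse(handle):  # one record per query
+            print(rec.query)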
+
+
+def parse(handle, debug=0):
+ """Return an iterator a Blast record for each query.
+
+ Incremental parser, this is an iterator that returns
+ Blast records. It uses the BlastParser internally.
+
+ handle - file handle to and XML file to parse
+ debug - integer, amount of debug information to print
+
+ This is a generator function that returns multiple Blast records
+ objects - one for each query sequence given to blast. The file
+ is read incrementally, returning complete records as they are read
+ in.
+
+ Should cope with new BLAST 2.2.14+ which gives a single XML file
+ for multiple query records.
+
+ Should also cope with XML output from older versions BLAST which
+ gave multiple XML files concatenated together (giving a single file
+ which strictly speaking wasn't valid XML).
+ """
+ from xml.parsers import expat
+
+ BLOCK = 1024
+ MARGIN = 10 # must be at least length of newline + XML start
+    XML_START = "<?xml"
diff --git a/code/lib/Bio/Blast/Record.py b/code/lib/Bio/Blast/Record.py
new file mode 100644
--- /dev/null
+++ b/code/lib/Bio/Blast/Record.py
+"""Record classes to hold BLAST output."""
+
+
+def fmt_(value, format_spec="%s", default_str="<unknown>"):
+ """Ensure the given value formats to a string correctly."""
+ if value is None:
+ return default_str
+ return format_spec % value
+
+
+class Header:
+ """Saves information from a blast header.
+
+ Members:
+ application The name of the BLAST flavor that generated this data.
+ version Version of blast used.
+ date Date this data was generated.
+ reference Reference for blast.
+
+ query Name of query sequence.
+ query_letters Number of letters in the query sequence. (int)
+
+ database Name of the database.
+ database_sequences Number of sequences in the database. (int)
+ database_letters Number of letters in the database. (int)
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.application = ""
+ self.version = ""
+ self.date = ""
+ self.reference = ""
+
+ self.query = ""
+ self.query_letters = None
+
+ self.database = ""
+ self.database_sequences = None
+ self.database_letters = None
+
+
+class Description:
+ """Stores information about one hit in the descriptions section.
+
+ Members:
+ title Title of the hit.
+ score Number of bits. (int)
+ bits Bit score. (float)
+ e E value. (float)
+ num_alignments Number of alignments for the same subject. (int)
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.title = ""
+ self.score = None
+ self.bits = None
+ self.e = None
+ self.num_alignments = None
+
+ def __str__(self):
+ """Return the description as a string."""
+ return "%-66s %5s %s" % (self.title, self.score, self.e)
+
+
+class DescriptionExt(Description):
+ """Extended description record for BLASTXML version 2.
+
+ Members:
+ items List of DescriptionExtItem
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ super().__init__()
+
+ self.items = []
+
+ def append_item(self, item):
+ """Add a description extended record."""
+ if len(self.items) == 0:
+ self.title = str(item)
+ self.items.append(item)
+
+
+class DescriptionExtItem:
+ """Stores information about one record in hit description for BLASTXML version 2.
+
+    Members:
+    id             Database identifier
+    title          Title of the hit.
+    accession      Accession of the hit.
+    taxid          NCBI taxonomy identifier, if present.
+    sciname        Scientific name, if present.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.id = None
+ self.title = None
+ self.accession = None
+ self.taxid = None
+ self.sciname = None
+
+ def __str__(self):
+ """Return the description identifier and title as a string."""
+ return "%s %s" % (self.id, self.title)
+
+
+class Alignment:
+ """Stores information about one hit in the alignments section.
+
+ Members:
+ title Name.
+ hit_id Hit identifier. (str)
+ hit_def Hit definition. (str)
+ length Length. (int)
+ hsps A list of HSP objects.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.title = ""
+ self.hit_id = ""
+ self.hit_def = ""
+ self.length = None
+ self.hsps = []
+
+ def __str__(self):
+ """Return the BLAST alignment as a formatted string."""
+ lines = self.title.split("\n")
+ lines.append("Length = %s\n" % self.length)
+ return "\n ".join(lines)
+
+
+class HSP:
+ """Stores information about one hsp in an alignment hit.
+
+ Members:
+ - score BLAST score of hit. (float)
+ - bits Number of bits for that score. (float)
+ - expect Expect value. (float)
+ - num_alignments Number of alignments for same subject. (int)
+ - identities Number of identities (int) if using the XML parser.
+ Tuple of number of identities/total aligned (int, int)
+ if using the (obsolete) plain text parser.
+ - positives Number of positives (int) if using the XML parser.
+ Tuple of number of positives/total aligned (int, int)
+ if using the (obsolete) plain text parser.
+ - gaps Number of gaps (int) if using the XML parser.
+ Tuple of number of gaps/total aligned (int, int) if
+ using the (obsolete) plain text parser.
+ - align_length Length of the alignment. (int)
+ - strand Tuple of (query, target) strand.
+ - frame Tuple of 1 or 2 frame shifts, depending on the flavor.
+
+ - query The query sequence.
+ - query_start The start residue for the query sequence. (1-based)
+ - query_end The end residue for the query sequence. (1-based)
+ - match The match sequence.
+ - sbjct The sbjct sequence.
+ - sbjct_start The start residue for the sbjct sequence. (1-based)
+ - sbjct_end The end residue for the sbjct sequence. (1-based)
+
+ Not all flavors of BLAST return values for every attribute::
+
+                 score  expect  identities  positives  strand  frame
+        BLASTP     X       X        X           X
+        BLASTN     X       X        X           X         X
+        BLASTX     X       X        X           X                 X
+        TBLASTN    X       X        X           X                 X
+        TBLASTX    X       X        X           X                X/X
+
+ Note: for BLASTX, the query sequence is shown as a protein sequence,
+ but the numbering is based on the nucleotides. Thus, the numbering
+ is 3x larger than the number of amino acid residues. A similar effect
+ can be seen for the sbjct sequence in TBLASTN, and for both sequences
+ in TBLASTX.
+
+ Also, for negative frames, the sequence numbering starts from
+ query_start and counts down.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.score = None
+ self.bits = None
+ self.expect = None
+ self.num_alignments = None
+ self.identities = (None, None)
+ self.positives = (None, None)
+ self.gaps = (None, None)
+ self.align_length = None
+ self.strand = (None, None)
+ self.frame = ()
+
+ self.query = ""
+ self.query_start = None
+ self.query_end = None
+ self.match = ""
+ self.sbjct = ""
+ self.sbjct_start = None
+ self.sbjct_end = None
+
+ def __str__(self):
+ """Return the BLAST HSP as a formatted string."""
+ lines = [
+ "Score %s (%s bits), expectation %s, alignment length %s"
+ % (
+ fmt_(self.score, "%i"),
+ fmt_(self.bits, "%i"),
+ fmt_(self.expect, "%0.1e"),
+ fmt_(self.align_length, "%i"),
+ )
+ ]
+ if self.align_length is None:
+ return "\n".join(lines)
+ if self.align_length < 50:
+ lines.append(
+ "Query:%8s %s %s" % (self.query_start, self.query, self.query_end)
+ )
+ lines.append(" %s" % self.match)
+ lines.append(
+ "Sbjct:%8s %s %s" % (self.sbjct_start, self.sbjct, self.sbjct_end)
+ )
+ else:
+ lines.append(
+ "Query:%8s %s...%s %s"
+ % (self.query_start, self.query[:45], self.query[-3:], self.query_end,)
+ )
+ lines.append(" %s...%s" % (self.match[:45], self.match[-3:]))
+ lines.append(
+ "Sbjct:%8s %s...%s %s"
+ % (self.sbjct_start, self.sbjct[:45], self.sbjct[-3:], self.sbjct_end)
+ )
+ return "\n".join(lines)
+
+
+class MultipleAlignment:
+ """Holds information about a multiple alignment.
+
+ Members:
+ alignment A list of tuples (name, start residue, sequence, end residue).
+
+ The start residue is 1-based. It may be blank, if that sequence is
+ not aligned in the multiple alignment.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.alignment = []
+
+ def to_generic(self):
+ """Retrieve generic alignment object for the given alignment.
+
+ Instead of the tuples, this returns a MultipleSeqAlignment object
+ from Bio.Align, through which you can manipulate and query
+ the object.
+
+ Thanks to James Casbon for the code.
+ """
+ seq_parts = []
+ seq_names = []
+ parse_number = 0
+ n = 0
+ for name, start, seq, end in self.alignment:
+ if name == "QUERY": # QUERY is the first in each alignment block
+ parse_number += 1
+ n = 0
+
+ if parse_number == 1: # create on first_parse, append on all others
+ seq_parts.append(seq)
+ seq_names.append(name)
+ else:
+ seq_parts[n] += seq
+ n += 1
+
+ records = (
+ SeqRecord(Seq(seq), name) for (name, seq) in zip(seq_names, seq_parts)
+ )
+ return MultipleSeqAlignment(records)
+
+
+class Round:
+ """Holds information from a PSI-BLAST round.
+
+ Members:
+ number Round number. (int)
+ reused_seqs Sequences in model, found again. List of Description objects.
+ new_seqs Sequences not found, or below threshold. List of Description.
+ alignments A list of Alignment objects.
+ multiple_alignment A MultipleAlignment object.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.number = None
+ self.reused_seqs = []
+ self.new_seqs = []
+ self.alignments = []
+ self.multiple_alignment = None
+
+
+class DatabaseReport:
+ """Holds information about a database report.
+
+ Members:
+ database_name List of database names. (can have multiple dbs)
+ num_letters_in_database Number of letters in the database. (int)
+ num_sequences_in_database List of number of sequences in the database.
+ posted_date List of the dates the databases were posted.
+ ka_params A tuple of (lambda, k, h) values. (floats)
+ gapped # XXX this isn't set right!
+ ka_params_gap A tuple of (lambda, k, h) values. (floats)
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.database_name = []
+ self.posted_date = []
+ self.num_letters_in_database = []
+ self.num_sequences_in_database = []
+ self.ka_params = (None, None, None)
+ self.gapped = 0
+ self.ka_params_gap = (None, None, None)
+
+
+class Parameters:
+ """Holds information about the parameters.
+
+ Members:
+ matrix Name of the matrix.
+ gap_penalties Tuple of (open, extend) penalties. (floats)
+ sc_match Match score for nucleotide-nucleotide comparison
+ sc_mismatch Mismatch penalty for nucleotide-nucleotide comparison
+ num_hits Number of hits to the database. (int)
+ num_sequences Number of sequences. (int)
+ num_good_extends Number of extensions. (int)
+ num_seqs_better_e Number of sequences better than e-value. (int)
+ hsps_no_gap Number of HSP's better, without gapping. (int)
+ hsps_prelim_gapped Number of HSP's gapped in prelim test. (int)
+ hsps_prelim_gapped_attemped Number of HSP's attempted in prelim. (int)
+ hsps_gapped Total number of HSP's gapped. (int)
+ query_length Length of the query. (int)
+ query_id Identifier of the query sequence. (str)
+ database_length Number of letters in the database. (int)
+ effective_hsp_length Effective HSP length. (int)
+ effective_query_length Effective length of query. (int)
+ effective_database_length Effective length of database. (int)
+ effective_search_space Effective search space. (int)
+ effective_search_space_used Effective search space used. (int)
+ frameshift Frameshift window. Tuple of (int, float)
+ threshold Threshold. (int)
+ window_size Window size. (int)
+ dropoff_1st_pass Tuple of (score, bits). (int, float)
+ gap_x_dropoff Tuple of (score, bits). (int, float)
+ gap_x_dropoff_final Tuple of (score, bits). (int, float)
+ gap_trigger Tuple of (score, bits). (int, float)
+ blast_cutoff Tuple of (score, bits). (int, float)
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.matrix = ""
+ self.gap_penalties = (None, None)
+ self.sc_match = None
+ self.sc_mismatch = None
+ self.num_hits = None
+ self.num_sequences = None
+ self.num_good_extends = None
+ self.num_seqs_better_e = None
+ self.hsps_no_gap = None
+ self.hsps_prelim_gapped = None
+ self.hsps_prelim_gapped_attemped = None
+ self.hsps_gapped = None
+ self.query_id = None
+ self.query_length = None
+ self.database_length = None
+ self.effective_hsp_length = None
+ self.effective_query_length = None
+ self.effective_database_length = None
+ self.effective_search_space = None
+ self.effective_search_space_used = None
+ self.frameshift = (None, None)
+ self.threshold = None
+ self.window_size = None
+ self.dropoff_1st_pass = (None, None)
+ self.gap_x_dropoff = (None, None)
+ self.gap_x_dropoff_final = (None, None)
+ self.gap_trigger = (None, None)
+ self.blast_cutoff = (None, None)
+
+
+# TODO - Add a friendly __str__ method to BLAST results
+class Blast(Header, DatabaseReport, Parameters):
+ """Saves the results from a blast search.
+
+ Members:
+ descriptions A list of Description objects.
+ alignments A list of Alignment objects.
+ multiple_alignment A MultipleAlignment object.
+ + members inherited from base classes
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ Header.__init__(self)
+ DatabaseReport.__init__(self)
+ Parameters.__init__(self)
+ self.descriptions = []
+ self.alignments = []
+ self.multiple_alignment = None
+
+
+class PSIBlast(Header, DatabaseReport, Parameters):
+ """Saves the results from a blastpgp search.
+
+ Members:
+ rounds A list of Round objects.
+ converged Whether the search converged.
+ + members inherited from base classes
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ Header.__init__(self)
+ DatabaseReport.__init__(self)
+ Parameters.__init__(self)
+ self.rounds = []
+ self.converged = 0
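+
+
+# Illustrative sketch (not part of the original module): walking a
+# parsed Blast record; the 1e-10 cutoff is an arbitrary example value.
+def _example_walk_record(record, e_value_thresh=1e-10):
+    for alignment in record.alignments:
+        for hsp in alignment.hsps:
+            if hsp.expect is not None and hsp.expect < e_value_thresh:
+                print(alignment.title, hsp.expect)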
diff --git a/code/lib/Bio/Blast/__init__.py b/code/lib/Bio/Blast/__init__.py
new file mode 100644
index 0000000..27c0ec3
--- /dev/null
+++ b/code/lib/Bio/Blast/__init__.py
@@ -0,0 +1,7 @@
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for dealing with BLAST programs and output."""
diff --git a/code/lib/Bio/Blast/__pycache__/Applications.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/Applications.cpython-37.pyc
new file mode 100644
index 0000000..44e9314
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/Applications.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/NCBIWWW.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/NCBIWWW.cpython-37.pyc
new file mode 100644
index 0000000..f6823c0
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/NCBIWWW.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/NCBIXML.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/NCBIXML.cpython-37.pyc
new file mode 100644
index 0000000..18c8311
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/NCBIXML.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/ParseBlastTable.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/ParseBlastTable.cpython-37.pyc
new file mode 100644
index 0000000..e31274f
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/ParseBlastTable.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/Record.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/Record.cpython-37.pyc
new file mode 100644
index 0000000..37eead1
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/Record.cpython-37.pyc differ
diff --git a/code/lib/Bio/Blast/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Blast/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..9e31daf
Binary files /dev/null and b/code/lib/Bio/Blast/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/CAPS/__init__.py b/code/lib/Bio/CAPS/__init__.py
new file mode 100644
index 0000000..2c28e8f
--- /dev/null
+++ b/code/lib/Bio/CAPS/__init__.py
@@ -0,0 +1,135 @@
+# Copyright 2005 by Jonathan Taylor.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Cleaved amplified polymorphic sequence (CAPS) markers.
+
+A CAPS marker is a location (a DifferentialCutsite, as described below)
+together with a set of primers that can be used to visualize it. More
+information can be found in the paper `Konieczny and Ausubel (1993)`_ (PMID 8106085).
+
+.. _`Konieczny and Ausubel (1993)`: https://doi.org/10.1046/j.1365-313X.1993.04020403.x
+
+"""
+
+
+class DifferentialCutsite:
+ """Differential enzyme cutsite in an alignment.
+
+    A differential cutsite is a location in an alignment where an enzyme
+    cuts in at least one sequence but is blocked in at least one other
+    sequence.
+
+ Members:
+ - start - Where it lives in the alignment.
+ - enzyme - The enzyme that causes this.
+ - cuts_in - A list of sequences (as indexes into the alignment) the
+ enzyme cuts in.
+ - blocked_in - A list of sequences (as indexes into the alignment) the
+ enzyme is blocked in.
+
+ """
+
+ def __init__(self, **kwds):
+ """Initialize a DifferentialCutsite.
+
+ Each member (as listed in the class description) should be included as a
+ keyword.
+ """
+ self.start = int(kwds["start"])
+ self.enzyme = kwds["enzyme"]
+ self.cuts_in = kwds["cuts_in"]
+ self.blocked_in = kwds["blocked_in"]
+
+
+class AlignmentHasDifferentLengthsError(Exception):
+ """Exception where sequences in alignment have different lengths."""
+
+ pass
+
+
+class CAPSMap:
+ """A map of an alignment showing all possible dcuts.
+
+ Members:
+ - alignment - The alignment that is mapped.
+ - dcuts - A list of possible CAPS markers in the form of
+ DifferentialCutsites.
+
+ """
+
+ def __init__(self, alignment, enzymes=None):
+ """Initialize the CAPSMap.
+
+ Required:
+ - alignment - The alignment to be mapped.
+
+ Optional:
+ - enzymes - List of enzymes to be used to create the map.
+ Defaults to an empty list.
+
+ """
+ if enzymes is None:
+ enzymes = []
+ self.sequences = [rec.seq for rec in alignment]
+ self.size = len(self.sequences)
+ self.length = len(self.sequences[0])
+ for seq in self.sequences:
+ if len(seq) != self.length:
+ raise AlignmentHasDifferentLengthsError
+
+ self.alignment = alignment
+ self.enzymes = enzymes
+
+ # look for dcuts
+ self._digest()
+
+ def _digest_with(self, enzyme):
+        cuts = []  # list of lists, one per sequence
+        all_cuts = []
+
+        # go through each sequence
+        for seq in self.sequences:
+            # grab all the cuts in the sequence
+            seq_cuts = [cut - enzyme.fst5 for cut in enzyme.search(seq)]
+            # maintain a list of all cuts in all sequences
+            all_cuts.extend(seq_cuts)
+            cuts.append(seq_cuts)
+
+        # sort the combined list and drop duplicates; all_cuts now holds
+        # every cut index seen in any sequence of the alignment
+        all_cuts = sorted(set(all_cuts))
+
+        for cut in all_cuts:
+ # test for dcuts
+
+ cuts_in = []
+ blocked_in = []
+
+ for i in range(0, self.size):
+ seq = self.sequences[i]
+ if cut in cuts[i]:
+ cuts_in.append(i)
+ else:
+ blocked_in.append(i)
+
+ if cuts_in != [] and blocked_in != []:
+ self.dcuts.append(
+ DifferentialCutsite(
+ start=cut, enzyme=enzyme, cuts_in=cuts_in, blocked_in=blocked_in
+ )
+ )
+
+ def _digest(self):
+ self.dcuts = []
+
+ for enzyme in self.enzymes:
+ self._digest_with(enzyme)
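+
+
+# Illustrative sketch (not part of the original module): mapping
+# differential cutsites with Bio.Restriction enzyme objects, which
+# provide the search() method and fst5 attribute used by CAPSMap.
+def _example_caps(alignment):
+    from Bio.Restriction import EcoRI, AluI
+    caps_map = CAPSMap(alignment, enzymes=[EcoRI, AluI])
+    for dcut in caps_map.dcuts:
+        print(dcut.enzyme, dcut.start, dcut.cuts_in, dcut.blocked_in)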
diff --git a/code/lib/Bio/CAPS/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/CAPS/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..06df99c
Binary files /dev/null and b/code/lib/Bio/CAPS/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Cluster/__init__.py b/code/lib/Bio/Cluster/__init__.py
new file mode 100644
index 0000000..32444da
--- /dev/null
+++ b/code/lib/Bio/Cluster/__init__.py
@@ -0,0 +1,1293 @@
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+"""Cluster Analysis.
+
+The Bio.Cluster module provides commonly used clustering algorithms and was
+designed with the application to gene expression data in mind. However,
+this module can also be used for cluster analysis of other types of data.
+
+Bio.Cluster and the underlying C Clustering Library are described in
+M. de Hoon et al. (2004) https://doi.org/10.1093/bioinformatics/bth078
+"""
+
+import numbers
+
+try:
+ import numpy
+except ImportError:
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError(
+ "Please install numpy if you want to use Bio.Cluster. "
+ "See http://www.numpy.org/"
+ ) from None
+
+from . import _cluster
+
+__all__ = (
+ "Node",
+ "Tree",
+ "kcluster",
+ "kmedoids",
+ "treecluster",
+ "somcluster",
+ "clusterdistance",
+ "clustercentroids",
+ "distancematrix",
+ "pca",
+ "Record",
+ "read",
+)
+
+
+__version__ = _cluster.version()
+
+
+class Node(_cluster.Node):
+ """Element of a hierarchical clustering tree.
+
+    A node contains items or other Nodes (sub-nodes).
+ """
+
+ __doc__ = _cluster.Node.__doc__
+
+
+class Tree(_cluster.Tree):
+ """Hierarchical clustering tree.
+
+ A Tree consists of Nodes.
+ """
+
+ def sort(self, order=None):
+ """Sort the hierarchical clustering tree.
+
+ Sort the hierarchical clustering tree by switching the left and
+ right subnode of nodes such that the elements in the left-to-right
+ order of the tree tend to have increasing order values.
+
+ Return the indices of the elements in the left-to-right order in
+ the hierarchical clustering tree, such that the element with index
+ indices[i] occurs at position i in the dendrogram.
+
+ """
+ n = len(self) + 1
+ indices = numpy.ones(n, dtype="intc")
+ if order is None:
+ order = numpy.ones(n, dtype="d")
+ elif isinstance(order, numpy.ndarray):
+ order = numpy.require(order, dtype="d", requirements="C")
+ else:
+ order = numpy.array(order, dtype="d")
+ _cluster.Tree.sort(self, indices, order)
+ return indices
+
+ def cut(self, nclusters=None):
+ """Create clusters by cutting the hierarchical clustering tree.
+
+        Divide the elements in this hierarchical clustering tree
+ into clusters, and return an array with the number of the cluster
+ to which each element was assigned.
+
+ Keyword arguments:
+ - nclusters: The desired number of clusters.
+ """
+ n = len(self) + 1
+ indices = numpy.ones(n, dtype="intc")
+ if nclusters is None:
+ nclusters = n
+ _cluster.Tree.cut(self, indices, nclusters)
+ return indices
+
+
+def kcluster(
+ data,
+ nclusters=2,
+ mask=None,
+ weight=None,
+ transpose=False,
+ npass=1,
+ method="a",
+ dist="e",
+ initialid=None,
+):
+ """Perform k-means clustering.
+
+ This function performs k-means clustering on the values in data, and
+ returns the cluster assignments, the within-cluster sum of distances
+ of the optimal k-means clustering solution, and the number of times
+ the optimal solution was found.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values.
+ - nclusters: number of clusters (the 'k' in k-means).
+ - mask: nrows x ncolumns array of integers, showing which data
+ are missing. If mask[i,j]==0, then data[i,j] is missing.
+ - weight: the weights to be used when calculating distances
+ - transpose:
+ - if False: rows are clustered;
+ - if True: columns are clustered.
+ - npass: number of times the k-means clustering algorithm is
+ performed, each time with a different (random) initial
+ condition.
+ - method: specifies how the center of a cluster is found:
+ - method == 'a': arithmetic mean;
+ - method == 'm': median.
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance;
+ - dist == 'b': City Block distance;
+ - dist == 'c': Pearson correlation;
+ - dist == 'a': absolute value of the correlation;
+ - dist == 'u': uncentered correlation;
+ - dist == 'x': absolute uncentered correlation;
+ - dist == 's': Spearman's rank correlation;
+ - dist == 'k': Kendall's tau.
+ - initialid: the initial clustering from which the algorithm
+ should start.
+ If initialid is None, the routine carries out npass
+ repetitions of the EM algorithm, each time starting from a
+ different random initial clustering. If initialid is given,
+ the routine carries out the EM algorithm only once, starting
+ from the given initial clustering and without randomizing the
+ order in which items are assigned to clusters (i.e., using
+ the same order as in the data matrix). In that case, the
+ k-means algorithm is fully deterministic.
+
+ Return values:
+ - clusterid: array containing the number of the cluster to which each
+ item was assigned in the best k-means clustering solution that was
+ found in the npass runs;
+ - error: the within-cluster sum of distances for the returned k-means
+ clustering solution;
+ - nfound: the number of times this solution was found.
+ """
+ data = __check_data(data)
+ shape = data.shape
+ if transpose:
+ ndata, nitems = shape
+ else:
+ nitems, ndata = shape
+ mask = __check_mask(mask, shape)
+ weight = __check_weight(weight, ndata)
+ clusterid, npass = __check_initialid(initialid, npass, nitems)
+ error, nfound = _cluster.kcluster(
+ data, nclusters, mask, weight, transpose, npass, method, dist, clusterid
+ )
+ return clusterid, error, nfound
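+
+
+# Illustrative sketch (not part of the original module): k-means on a
+# toy 4x2 matrix; the data values are made up for this example.
+def _example_kcluster():
+    data = numpy.array([[1.0, 2.0], [1.1, 2.1], [8.0, 9.0], [8.1, 9.2]])
+    clusterid, error, nfound = kcluster(data, nclusters=2, npass=5)
+    return clusterid, error, nfound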
+
+
+def kmedoids(distance, nclusters=2, npass=1, initialid=None):
+ """Perform k-medoids clustering.
+
+ This function performs k-medoids clustering, and returns the cluster
+ assignments, the within-cluster sum of distances of the optimal
+ k-medoids clustering solution, and the number of times the optimal
+ solution was found.
+
+ Keyword arguments:
+ - distance: The distance matrix between the items. There are three
+ ways in which you can pass a distance matrix:
+ 1. a 2D Numerical Python array (in which only the left-lower
+ part of the array will be accessed);
+ 2. a 1D Numerical Python array containing the distances
+ consecutively;
+ 3. a list of rows containing the lower-triangular part of
+ the distance matrix.
+
+ Examples are:
+
+ >>> from numpy import array
+ >>> # option 1:
+ >>> distance = array([[0.0, 1.1, 2.3],
+ ... [1.1, 0.0, 4.5],
+ ... [2.3, 4.5, 0.0]])
+ >>> # option 2:
+ >>> distance = array([1.1, 2.3, 4.5])
+ >>> # option 3:
+ >>> distance = [array([]),
+ ... array([1.1]),
+ ... array([2.3, 4.5])]
+
+
+ These three correspond to the same distance matrix.
+ - nclusters: number of clusters (the 'k' in k-medoids)
+ - npass: the number of times the k-medoids clustering algorithm
+ is performed, each time with a different (random) initial
+ condition.
+ - initialid: the initial clustering from which the algorithm should start.
+ If initialid is not given, the routine carries out npass
+ repetitions of the EM algorithm, each time starting from a
+ different random initial clustering. If initialid is given,
+ the routine carries out the EM algorithm only once, starting
+ from the initial clustering specified by initialid and
+ without randomizing the order in which items are assigned to
+ clusters (i.e., using the same order as in the data matrix).
+ In that case, the k-medoids algorithm is fully deterministic.
+
+ Return values:
+ - clusterid: array containing the number of the cluster to which each
+ item was assigned in the best k-means clustering solution that was
+ found in the npass runs;
+ - error: the within-cluster sum of distances for the returned k-means
+ clustering solution;
+ - nfound: the number of times this solution was found.
+ """
+ distance = __check_distancematrix(distance)
+ nitems = len(distance)
+ clusterid, npass = __check_initialid(initialid, npass, nitems)
+ error, nfound = _cluster.kmedoids(distance, nclusters, npass, clusterid)
+ return clusterid, error, nfound
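+
+
+# Illustrative sketch (not part of the original module): k-medoids on
+# the three-item distance matrix from the docstring above (option 2).
+def _example_kmedoids():
+    distance = numpy.array([1.1, 2.3, 4.5])
+    clusterid, error, nfound = kmedoids(distance, nclusters=2, npass=10)
+    return clusterid, error, nfound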
+
+
+def treecluster(
+ data,
+ mask=None,
+ weight=None,
+ transpose=False,
+ method="m",
+ dist="e",
+ distancematrix=None,
+):
+ """Perform hierarchical clustering, and return a Tree object.
+
+ This function implements the pairwise single, complete, centroid, and
+ average linkage hierarchical clustering methods.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values.
+ - mask: nrows x ncolumns array of integers, showing which data are
+ missing. If mask[i][j]==0, then data[i][j] is missing.
+ - weight: the weights to be used when calculating distances.
+ - transpose:
+ - if False, rows are clustered;
+ - if True, columns are clustered.
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+ - method: specifies which linkage method is used:
+ - method == 's': Single pairwise linkage
+ - method == 'm': Complete (maximum) pairwise linkage (default)
+ - method == 'c': Centroid linkage
+ - method == 'a': Average pairwise linkage
+ - distancematrix: The distance matrix between the items. There are
+ three ways in which you can pass a distance matrix:
+ 1. a 2D Numerical Python array (in which only the left-lower
+ part of the array will be accessed);
+ 2. a 1D Numerical Python array containing the distances
+ consecutively;
+ 3. a list of rows containing the lower-triangular part of
+ the distance matrix.
+
+ Examples are:
+
+ >>> from numpy import array
+ >>> # option 1:
+ >>> distance = array([[0.0, 1.1, 2.3],
+ ... [1.1, 0.0, 4.5],
+ ... [2.3, 4.5, 0.0]])
+ >>> # option 2:
+ >>> distance = array([1.1, 2.3, 4.5])
+ >>> # option 3:
+ >>> distance = [array([]),
+ ... array([1.1]),
+ ... array([2.3, 4.5])]
+
+ These three correspond to the same distance matrix.
+
+ PLEASE NOTE:
+ As the treecluster routine may shuffle the values in the
+ distance matrix as part of the clustering algorithm, be sure
+ to save this array in a different variable before calling
+ treecluster if you need it later.
+
+ Either data or distancematrix should be None. If distancematrix is None,
+ the hierarchical clustering solution is calculated from the values stored
+ in the argument data. If data is None, the hierarchical clustering solution
+ is instead calculated from the distance matrix. Pairwise centroid-linkage
+ clustering can be performed only from the data values and not from the
+ distance matrix. Pairwise single-, maximum-, and average-linkage clustering
+ can be calculated from the data values or from the distance matrix.
+
+ Return value:
+ treecluster returns a Tree object describing the hierarchical clustering
+ result. See the description of the Tree class for more information.
+ """
+ if data is None and distancematrix is None:
+ raise ValueError("use either data or distancematrix")
+ if data is not None and distancematrix is not None:
+ raise ValueError("use either data or distancematrix; do not use both")
+ if data is not None:
+ data = __check_data(data)
+ shape = data.shape
+ ndata = shape[0] if transpose else shape[1]
+ mask = __check_mask(mask, shape)
+ weight = __check_weight(weight, ndata)
+ if distancematrix is not None:
+ distancematrix = __check_distancematrix(distancematrix)
+ if mask is not None:
+ raise ValueError("mask is ignored if distancematrix is used")
+ if weight is not None:
+ raise ValueError("weight is ignored if distancematrix is used")
+ tree = Tree()
+ _cluster.treecluster(
+ tree, data, mask, weight, transpose, method, dist, distancematrix
+ )
+ return tree
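+
+
+# Illustrative sketch (not part of the original module): hierarchical
+# clustering of toy data, then cutting the tree into two flat clusters.
+def _example_treecluster():
+    data = numpy.array([[1.0, 2.0], [1.2, 1.9], [7.0, 8.0], [7.1, 8.2]])
+    tree = treecluster(data, method="a", dist="e")
+    clusterid = tree.cut(nclusters=2)  # cluster index for each row
+    return clusterid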
+
+
+def somcluster(
+ data,
+ mask=None,
+ weight=None,
+ transpose=False,
+ nxgrid=2,
+ nygrid=1,
+ inittau=0.02,
+ niter=1,
+ dist="e",
+):
+ """Calculate a Self-Organizing Map.
+
+ This function implements a Self-Organizing Map on a rectangular grid.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values;
+ - mask: nrows x ncolumns array of integers, showing which data are
+ missing. If mask[i][j]==0, then data[i][j] is missing.
+ - weight: the weights to be used when calculating distances
+ - transpose:
+ - if False: rows are clustered;
+ - if True: columns are clustered.
+ - nxgrid: the horizontal dimension of the rectangular SOM map
+ - nygrid: the vertical dimension of the rectangular SOM map
+    - inittau: the initial value of tau (the neighborhood function)
+ - niter: the number of iterations
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+
+ Return values:
+
+ - clusterid: array with two columns, with the number of rows equal to
+ the items that are being clustered. Each row in the array contains
+ the x and y coordinates of the cell in the rectangular SOM grid to
+ which the item was assigned.
+ - celldata: an array with dimensions [nxgrid, nygrid, number of columns]
+      if rows are being clustered, or [nxgrid, nygrid, number of rows] if
+ columns are being clustered.
+ Each element [ix, iy] of this array is a 1D vector containing the
+ data values for the centroid of the cluster in the SOM grid cell
+ with coordinates [ix, iy].
+ """
+    data = __check_data(data)
+    shape = data.shape
+    if transpose:
+        ndata, nitems = shape
+    else:
+        nitems, ndata = shape
+    mask = __check_mask(mask, shape)
+    weight = __check_weight(weight, ndata)
+ if nxgrid < 1:
+ raise ValueError("nxgrid should be a positive integer (default is 2)")
+ if nygrid < 1:
+ raise ValueError("nygrid should be a positive integer (default is 1)")
+ clusterids = numpy.ones((nitems, 2), dtype="intc")
+ celldata = numpy.empty((nxgrid, nygrid, ndata), dtype="d")
+ _cluster.somcluster(
+ clusterids, celldata, data, mask, weight, transpose, inittau, niter, dist
+ )
+ return clusterids, celldata
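+
+
+# Illustrative sketch (not part of the original module): a 2x1 SOM on
+# toy data; row i of clusterid gives the (x, y) grid cell for row i.
+def _example_somcluster():
+    data = numpy.array([[0.0, 1.0], [0.1, 1.1], [5.0, 6.0], [5.2, 6.1]])
+    clusterid, celldata = somcluster(data, nxgrid=2, nygrid=1, niter=10)
+    return clusterid, celldata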
+
+
+def clusterdistance(
+ data,
+ mask=None,
+ weight=None,
+ index1=None,
+ index2=None,
+ method="a",
+ dist="e",
+ transpose=False,
+):
+ """Calculate and return the distance between two clusters.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values.
+ - mask: nrows x ncolumns array of integers, showing which data are
+ missing. If mask[i, j]==0, then data[i, j] is missing.
+ - weight: the weights to be used when calculating distances
+ - index1: 1D array identifying which items belong to the
+ first cluster. If the cluster contains only one item, then
+ index1 can also be written as a single integer.
+ - index2: 1D array identifying which items belong to the
+ second cluster. If the cluster contains only one item, then
+ index2 can also be written as a single integer.
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+ - method: specifies how the distance between two clusters is defined:
+ - method == 'a': the distance between the arithmetic means
+ of the two clusters
+ - method == 'm': the distance between the medians of the two clusters
+ - method == 's': the smallest pairwise distance between members
+ of the two clusters
+ - method == 'x': the largest pairwise distance between members
+ of the two clusters
+ - method == 'v': average of the pairwise distances between members
+ of the two clusters
+ - transpose:
+ - if False: clusters of rows are considered;
+ - if True: clusters of columns are considered.
+ """
+ data = __check_data(data)
+ shape = data.shape
+ ndata = shape[0] if transpose else shape[1]
+ mask = __check_mask(mask, shape)
+ weight = __check_weight(weight, ndata)
+ index1 = __check_index(index1)
+ index2 = __check_index(index2)
+ return _cluster.clusterdistance(
+ data, mask, weight, index1, index2, method, dist, transpose
+ )
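+
+
+# Illustrative sketch (not part of the original module): distance
+# between the cluster of rows 0-1 and the cluster of rows 2-3.
+def _example_clusterdistance():
+    data = numpy.array([[1.0, 2.0], [1.1, 2.1], [8.0, 9.0], [8.1, 9.1]])
+    return clusterdistance(data, index1=[0, 1], index2=[2, 3])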
+
+
+def clustercentroids(data, mask=None, clusterid=None, method="a", transpose=False):
+ """Calculate and return the centroid of each cluster.
+
+ The clustercentroids routine calculates the cluster centroids, given to
+ which cluster each item belongs. The centroid is defined as either
+ the mean or the median over all items for each dimension.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values.
+ - mask: nrows x ncolumns array of integers, showing which data are
+ missing. If mask[i, j]==0, then data[i, j] is missing.
+ - clusterid: array containing the cluster number for each item.
+ The cluster number should be non-negative.
+ - method: specifies whether the centroid is calculated from the
+ arithmetic mean (method == 'a', default) or the median (method == 'm')
+ over each dimension.
+ - transpose: if False, each row contains the data for one item;
+ if True, each column contains the data for one item.
+
+ Return values:
+ - cdata: 2D array containing the cluster centroids.
+ If transpose is False, then the dimensions of cdata are
+ nclusters x ncolumns.
+ If transpose is True, then the dimensions of cdata are
+ nrows x nclusters.
+ - cmask: 2D array of integers describing which items in cdata,
+ if any, are missing.
+ """
+ data = __check_data(data)
+ mask = __check_mask(mask, data.shape)
+ nrows, ncolumns = data.shape
+ if clusterid is None:
+ n = ncolumns if transpose else nrows
+ clusterid = numpy.zeros(n, dtype="intc")
+ nclusters = 1
+ else:
+ clusterid = numpy.require(clusterid, dtype="intc", requirements="C")
+ nclusters = max(clusterid + 1)
+ if transpose:
+ shape = (nrows, nclusters)
+ else:
+ shape = (nclusters, ncolumns)
+ cdata = numpy.zeros(shape, dtype="d")
+ cmask = numpy.zeros(shape, dtype="intc")
+ _cluster.clustercentroids(data, mask, clusterid, method, transpose, cdata, cmask)
+ return cdata, cmask
+
+
+def distancematrix(data, mask=None, weight=None, transpose=False, dist="e"):
+ """Calculate and return a distance matrix from the data.
+
+ This function returns the distance matrix calculated from the data.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values.
+ - mask: nrows x ncolumns array of integers, showing which data are
+ missing. If mask[i, j]==0, then data[i, j] is missing.
+ - weight: the weights to be used when calculating distances.
+ - transpose: if False: the distances between rows are calculated;
+ if True: the distances between columns are calculated.
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+
+ Return value:
+ The distance matrix is returned as a list of 1D arrays containing the
+    distance matrix calculated from the data. The number of columns in each
+ row is equal to the row number. Hence, the first row has zero length.
+ For example:
+
+ >>> from numpy import array
+ >>> from Bio.Cluster import distancematrix
+ >>> data = array([[0, 1, 2, 3],
+ ... [4, 5, 6, 7],
+ ... [8, 9, 10, 11],
+ ... [1, 2, 3, 4]])
+ >>> distances = distancematrix(data, dist='e')
+ >>> distances
+ [array([], dtype=float64), array([ 16.]), array([ 64., 16.]), array([ 1., 9., 49.])]
+
+ which can be rewritten as
+ distances = [array([], dtype=float64),
+ array([ 16.]),
+ array([ 64., 16.]),
+ array([ 1., 9., 49.])]
+
+ This corresponds to the distance matrix:
+
+ [ 0., 16., 64., 1.]
+ [16., 0., 16., 9.]
+ [64., 16., 0., 49.]
+ [ 1., 9., 49., 0.]
+ """
+ data = __check_data(data)
+ shape = data.shape
+ mask = __check_mask(mask, shape)
+ if transpose:
+ ndata, nitems = shape
+ else:
+ nitems, ndata = shape
+ weight = __check_weight(weight, ndata)
+ matrix = [numpy.empty(i, dtype="d") for i in range(nitems)]
+ _cluster.distancematrix(data, mask, weight, transpose, dist, matrix)
+ return matrix
+
+
+def pca(data):
+ """Perform principal component analysis.
+
+ Keyword arguments:
+ - data: nrows x ncolumns array containing the data values.
+
+ Return value:
+ This function returns an array containing the mean of each column, the
+ principal components as an nmin x ncolumns array, as well as the
+ coordinates (an nrows x nmin array) of the data along the principal
+ components, and the associated eigenvalues. The principal components, the
+ coordinates, and the eigenvalues are sorted by the magnitude of the
+ eigenvalue, with the largest eigenvalues appearing first. Here, nmin is
+ the smaller of nrows and ncolumns.
+ Adding the column means to the dot product of the coordinates and the
+ principal components recreates the data matrix:
+
+ >>> from numpy import array, dot, amax, amin
+ >>> from Bio.Cluster import pca
+ >>> matrix = array([[ 0., 0., 0.],
+ ... [ 1., 0., 0.],
+ ... [ 7., 3., 0.],
+ ... [ 4., 2., 6.]])
+ >>> columnmean, coordinates, pc, _ = pca(matrix)
+ >>> m = matrix - (columnmean + dot(coordinates, pc))
+ >>> amax(m) < 1e-12 and amin(m) > -1e-12
+ True
+
+ """
+ data = __check_data(data)
+ nrows, ncols = data.shape
+ nmin = min(nrows, ncols)
+ columnmean = numpy.empty(ncols, dtype="d")
+ pc = numpy.empty((nmin, ncols), dtype="d")
+ coordinates = numpy.empty((nrows, nmin), dtype="d")
+ eigenvalues = numpy.empty(nmin, dtype="d")
+ _cluster.pca(data, columnmean, coordinates, pc, eigenvalues)
+ return columnmean, coordinates, pc, eigenvalues
+
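+# Illustrative sketch (not part of this module): since the principal
+# components are sorted by decreasing eigenvalue, a rank-k approximation of
+# the data can be formed from the first k components, for a user-chosen k:
+#
+#     columnmean, coordinates, pc, eigenvalues = pca(data)
+#     approx = columnmean + numpy.dot(coordinates[:, :k], pc[:k, :])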
+
+class Record:
+ """Store gene expression data.
+
+ A Record stores the gene expression data and related information contained
+ in a data file following the file format defined for Michael Eisen's
+ Cluster/TreeView program.
+
+ Attributes:
+ - data: a matrix containing the gene expression data
+ - mask: a matrix containing only 1's and 0's, denoting which values
+ are present (1) or missing (0). If all items of mask are
+ one (no missing data), then mask is set to None.
+ - geneid: a list containing a unique identifier for each gene
+ (e.g., ORF name)
+ - genename: a list containing an additional description for each gene
+ (e.g., gene name)
+ - gweight: the weight to be used for each gene when calculating the
+ distance
+ - gorder: an array of real numbers indicating the preferred order of the
+ genes in the output file
+ - expid: a list containing a unique identifier for each sample.
+ - eweight: the weight to be used for each sample when calculating the
+ distance
+ - eorder: an array of real numbers indicating the preferred order of the
+ samples in the output file
+ - uniqid: the string that was used instead of UNIQID in the input file.
+
+ """
+
+ def __init__(self, handle=None):
+ """Read gene expression data from the file handle and return a Record.
+
+ The file should be in the format defined for Michael Eisen's
+ Cluster/TreeView program.
+ """
+ self.data = None
+ self.mask = None
+ self.geneid = None
+ self.genename = None
+ self.gweight = None
+ self.gorder = None
+ self.expid = None
+ self.eweight = None
+ self.eorder = None
+ self.uniqid = None
+ if not handle:
+ return
+ line = handle.readline().strip("\r\n").split("\t")
+ n = len(line)
+ self.uniqid = line[0]
+ self.expid = []
+ cols = {0: "GENEID"}
+ for word in line[1:]:
+ if word == "NAME":
+ cols[line.index(word)] = word
+ self.genename = []
+ elif word == "GWEIGHT":
+ cols[line.index(word)] = word
+ self.gweight = []
+ elif word == "GORDER":
+ cols[line.index(word)] = word
+ self.gorder = []
+ else:
+ self.expid.append(word)
+ self.geneid = []
+ self.data = []
+ self.mask = []
+ needmask = 0
+ for line in handle:
+ line = line.strip("\r\n").split("\t")
+ if len(line) != n:
+ raise ValueError(
+ "Line with %d columns found (expected %d)" % (len(line), n)
+ )
+ if line[0] == "EWEIGHT":
+ i = max(cols) + 1
+ self.eweight = numpy.array(line[i:], float)
+ continue
+ if line[0] == "EORDER":
+ i = max(cols) + 1
+ self.eorder = numpy.array(line[i:], float)
+ continue
+ rowdata = []
+ rowmask = []
+ n = len(line)
+ for i in range(n):
+ word = line[i]
+ if i in cols:
+ if cols[i] == "GENEID":
+ self.geneid.append(word)
+ if cols[i] == "NAME":
+ self.genename.append(word)
+ if cols[i] == "GWEIGHT":
+ self.gweight.append(float(word))
+ if cols[i] == "GORDER":
+ self.gorder.append(float(word))
+ continue
+ if not word:
+ rowdata.append(0.0)
+ rowmask.append(0)
+ needmask = 1
+ else:
+ rowdata.append(float(word))
+ rowmask.append(1)
+ self.data.append(rowdata)
+ self.mask.append(rowmask)
+ self.data = numpy.array(self.data)
+ if needmask:
+ self.mask = numpy.array(self.mask, int)
+ else:
+ self.mask = None
+ if self.gweight:
+ self.gweight = numpy.array(self.gweight)
+ if self.gorder:
+ self.gorder = numpy.array(self.gorder)
+
+ def treecluster(self, transpose=False, method="m", dist="e"):
+ """Apply hierarchical clustering and return a Tree object.
+
+ The pairwise single, complete, centroid, and average linkage
+ hierarchical clustering methods are available.
+
+ Keyword arguments:
+ - transpose: if False: rows are clustered;
+ if True: columns are clustered.
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+ - method: specifies which linkage method is used:
+ - method == 's': Single pairwise linkage
+ - method == 'm': Complete (maximum) pairwise linkage (default)
+ - method == 'c': Centroid linkage
+ - method == 'a': Average pairwise linkage
+
+ See the description of the Tree class for more information about
+ the Tree object returned by this method.
+ """
+ if transpose:
+ weight = self.gweight
+ else:
+ weight = self.eweight
+ return treecluster(self.data, self.mask, weight, transpose, method, dist)
+
+ def kcluster(
+ self,
+ nclusters=2,
+ transpose=False,
+ npass=1,
+ method="a",
+ dist="e",
+ initialid=None,
+ ):
+ """Apply k-means or k-median clustering.
+
+ This method returns a tuple (clusterid, error, nfound).
+
+ Keyword arguments:
+ - nclusters: number of clusters (the 'k' in k-means)
+ - transpose: if False, genes (rows) are clustered;
+ if True, samples (columns) are clustered.
+ - npass: number of times the k-means clustering algorithm is
+ performed, each time with a different (random) initial condition.
+ - method: specifies how the center of a cluster is found:
+ - method == 'a': arithmetic mean
+ - method == 'm': median
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+ - initialid: the initial clustering from which the algorithm should
+ start. If initialid is None, the routine carries out npass
+ repetitions of the EM algorithm, each time starting from a different
+ random initial clustering. If initialid is given, the routine
+ carries out the EM algorithm only once, starting from the given
+ initial clustering and without randomizing the order in which items
+ are assigned to clusters (i.e., using the same order as in the data
+ matrix). In that case, the k-means algorithm is fully deterministic.
+
+ Return values:
+ - clusterid: array containing the number of the cluster to which each
+ gene/sample was assigned in the best k-means clustering
+ solution that was found in the npass runs;
+ - error: the within-cluster sum of distances for the returned
+ k-means clustering solution;
+ - nfound: the number of times this solution was found.
+ """
+ if transpose:
+ weight = self.gweight
+ else:
+ weight = self.eweight
+ return kcluster(
+ self.data,
+ nclusters,
+ self.mask,
+ weight,
+ transpose,
+ npass,
+ method,
+ dist,
+ initialid,
+ )
+
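+ # Illustrative usage (hypothetical Record instance named record): repeated
+ # random starts, followed by a deterministic rerun from the best solution,
+ # as described above:
+ #
+ #     clusterid, error, nfound = record.kcluster(nclusters=5, npass=100)
+ #     clusterid, error, nfound = record.kcluster(nclusters=5,
+ #                                                initialid=clusterid)
+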
+ def somcluster(
+ self, transpose=False, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist="e"
+ ):
+ """Calculate a self-organizing map on a rectangular grid.
+
+ The somcluster method returns a tuple (clusterid, celldata).
+
+ Keyword arguments:
+ - transpose: if False, genes (rows) are clustered;
+ if True, samples (columns) are clustered.
+ - nxgrid: the horizontal dimension of the rectangular SOM map
+ - nygrid: the vertical dimension of the rectangular SOM map
+ - inittau: the initial value of tau (the neighborhood function)
+ - niter: the number of iterations
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+
+ Return values:
+ - clusterid: array with two columns, where the number of rows is equal
+ to the number of genes or the number of samples, depending on
+ whether genes or samples are being clustered. Each row in
+ the array contains the x and y coordinates of the cell in the
+ rectangular SOM grid to which the gene or sample was assigned.
+ - celldata: an array with dimensions (nxgrid, nygrid, number of
+ samples) if genes are being clustered, or (nxgrid, nygrid,
+ number of genes) if samples are being clustered. Each item
+ [ix, iy] of this array is a 1D vector containing the gene
+ expression data for the centroid of the cluster in the SOM grid
+ cell with coordinates [ix, iy].
+ """
+ if transpose:
+ weight = self.gweight
+ else:
+ weight = self.eweight
+ return somcluster(
+ self.data,
+ self.mask,
+ weight,
+ transpose,
+ nxgrid,
+ nygrid,
+ inittau,
+ niter,
+ dist,
+ )
+
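+ # Illustrative usage (hypothetical Record instance named record): a 3x2 SOM
+ # over the genes; clusterid[i] then holds the [ix, iy] grid cell assigned
+ # to gene i:
+ #
+ #     clusterid, celldata = record.somcluster(nxgrid=3, nygrid=2, niter=100)
+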
+ def clustercentroids(self, clusterid=None, method="a", transpose=False):
+ """Calculate the cluster centroids and return a tuple (cdata, cmask).
+
+ The centroid is defined as either the mean or the median over all
+ items for each dimension.
+
+ The expression data and mask are taken from the Record object itself.
+
+ Keyword arguments:
+ - clusterid: array containing the cluster number for each gene or
+ sample. The cluster numbers should be non-negative.
+ - transpose: if False, gene (row) clusters are considered;
+ if True, sample (column) clusters are considered.
+ - method: specifies how the centroid is calculated:
+ - method == 'a': arithmetic mean over each dimension. (default)
+ - method == 'm': median over each dimension.
+
+ Return values:
+ - cdata: 2D array containing the cluster centroids. If transpose
+ is False, then the dimensions of cdata are nclusters x ncolumns.
+ If transpose is True, then the dimensions of cdata are nrows x
+ nclusters.
+ - cmask: 2D array of integers describing which items in cdata,
+ if any, are missing.
+ """
+ return clustercentroids(self.data, self.mask, clusterid, method, transpose)
+
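+ # Illustrative usage (hypothetical Record instance named record): centroids
+ # of a k-means partition computed on the same Record:
+ #
+ #     clusterid, error, nfound = record.kcluster(nclusters=3)
+ #     cdata, cmask = record.clustercentroids(clusterid=clusterid)
+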
+ def clusterdistance(
+ self, index1=0, index2=0, method="a", dist="e", transpose=False
+ ):
+ """Calculate the distance between two clusters.
+
+ Keyword arguments:
+ - index1: 1D array identifying which genes/samples belong to the
+ first cluster. If the cluster contains only one gene, then
+ index1 can also be written as a single integer.
+ - index2: 1D array identifying which genes/samples belong to the
+ second cluster. If the cluster contains only one gene, then
+ index2 can also be written as a single integer.
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+ - method: specifies how the distance between two clusters is defined:
+ - method == 'a': the distance between the arithmetic means
+ of the two clusters
+ - method == 'm': the distance between the medians of the
+ two clusters
+ - method == 's': the smallest pairwise distance between members
+ of the two clusters
+ - method == 'x': the largest pairwise distance between members
+ of the two clusters
+ - method == 'v': average of the pairwise distances between members
+ of the two clusters
+ - transpose: if False: clusters of rows are considered;
+ if True: clusters of columns are considered.
+ """
+ if transpose:
+ weight = self.gweight
+ else:
+ weight = self.eweight
+ return clusterdistance(
+ self.data, self.mask, weight, index1, index2, method, dist, transpose
+ )
+
+ def distancematrix(self, transpose=False, dist="e"):
+ """Calculate the distance matrix and return it as a list of arrays.
+
+ Keyword arguments:
+ - transpose:
+ if False: calculate the distances between genes (rows);
+ if True: calculate the distances between samples (columns).
+ - dist: specifies the distance function to be used:
+ - dist == 'e': Euclidean distance
+ - dist == 'b': City Block distance
+ - dist == 'c': Pearson correlation
+ - dist == 'a': absolute value of the correlation
+ - dist == 'u': uncentered correlation
+ - dist == 'x': absolute uncentered correlation
+ - dist == 's': Spearman's rank correlation
+ - dist == 'k': Kendall's tau
+
+ Return value:
+
+ The distance matrix is returned as a list of 1D arrays containing the
+ distance matrix between the gene expression data. The number of columns
+ in each row is equal to the row number. Hence, the first row has zero
+ length. An example of the return value is:
+
+ matrix = [[],
+ array([1.]),
+ array([7., 3.]),
+ array([4., 2., 6.])]
+
+ This corresponds to the distance matrix:
+
+ [0., 1., 7., 4.]
+ [1., 0., 3., 2.]
+ [7., 3., 0., 6.]
+ [4., 2., 6., 0.]
+
+ """
+ if transpose:
+ weight = self.gweight
+ else:
+ weight = self.eweight
+ return distancematrix(self.data, self.mask, weight, transpose, dist)
+
+ def save(self, jobname, geneclusters=None, expclusters=None):
+ """Save the clustering results.
+
+ The saved files follow the convention for the Java TreeView program,
+ which can therefore be used to view the clustering result.
+
+ Keyword arguments:
+ - jobname: The base name of the files to be saved. The filenames
+ are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical
+ clustering, and jobname_K_G*.kgg, jobname_K_A*.kag, and a
+ correspondingly suffixed jobname_K*.cdt for k-means clustering results.
+ - geneclusters: For hierarchical clustering results, geneclusters
+ is a Tree object as returned by the treecluster method. For k-means
+ clustering results, geneclusters is a vector containing ngenes
+ integers, describing to which cluster a given gene belongs. This
+ vector can be calculated by kcluster.
+ - expclusters: For hierarchical clustering results, expclusters
+ is a Tree object as returned by the treecluster method. For k-means
+ clustering results, expclusters is a vector containing nexps
+ integers, describing to which cluster a given sample belongs. This
+ vector can be calculated by kcluster.
+ """
+ (ngenes, nexps) = numpy.shape(self.data)
+ if self.gorder is None:
+ gorder = numpy.arange(ngenes)
+ else:
+ gorder = self.gorder
+ if self.eorder is None:
+ eorder = numpy.arange(nexps)
+ else:
+ eorder = self.eorder
+ if (
+ geneclusters is not None
+ and expclusters is not None
+ and type(geneclusters) != type(expclusters)
+ ):
+ raise ValueError(
+ "found one k-means and one hierarchical "
+ "clustering solution in geneclusters and "
+ "expclusters"
+ )
+ gid = 0
+ aid = 0
+ filename = jobname
+ postfix = ""
+ if isinstance(geneclusters, Tree):
+ # This is a hierarchical clustering result.
+ geneindex = self._savetree(jobname, geneclusters, gorder, False)
+ gid = 1
+ elif geneclusters is not None:
+ # This is a k-means clustering result.
+ filename = jobname + "_K"
+ k = max(geneclusters) + 1
+ kggfilename = "%s_K_G%d.kgg" % (jobname, k)
+ geneindex = self._savekmeans(kggfilename, geneclusters, gorder, False)
+ postfix = "_G%d" % k
+ else:
+ geneindex = numpy.argsort(gorder)
+ if isinstance(expclusters, Tree):
+ # This is a hierarchical clustering result.
+ expindex = self._savetree(jobname, expclusters, eorder, True)
+ aid = 1
+ elif expclusters is not None:
+ # This is a k-means clustering result.
+ filename = jobname + "_K"
+ k = max(expclusters) + 1
+ kagfilename = "%s_K_A%d.kag" % (jobname, k)
+ expindex = self._savekmeans(kagfilename, expclusters, eorder, True)
+ postfix += "_A%d" % k
+ else:
+ expindex = numpy.argsort(eorder)
+ filename = filename + postfix
+ self._savedata(filename, gid, aid, geneindex, expindex)
+
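+ # Illustrative usage (hypothetical Record instance named record):
+ # geneclusters and expclusters must be of the same kind, so a consistent
+ # hierarchical call looks like:
+ #
+ #     genetree = record.treecluster()
+ #     exptree = record.treecluster(transpose=True)
+ #     record.save("myjob", geneclusters=genetree, expclusters=exptree)
+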
+ def _savetree(self, jobname, tree, order, transpose):
+ """Save the hierarchical clustering solution (PRIVATE)."""
+ if transpose:
+ extension = ".atr"
+ keyword = "ARRY"
+ else:
+ extension = ".gtr"
+ keyword = "GENE"
+ index = tree.sort(order)
+ nnodes = len(tree)
+ with open(jobname + extension, "w") as outputfile:
+ nodeID = [""] * nnodes
+ nodedist = numpy.array([node.distance for node in tree[:]])
+ for nodeindex in range(nnodes):
+ min1 = tree[nodeindex].left
+ min2 = tree[nodeindex].right
+ nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1)
+ outputfile.write(nodeID[nodeindex])
+ outputfile.write("\t")
+ if min1 < 0:
+ index1 = -min1 - 1
+ outputfile.write(nodeID[index1] + "\t")
+ nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1])
+ else:
+ outputfile.write("%s%dX\t" % (keyword, min1))
+ if min2 < 0:
+ index2 = -min2 - 1
+ outputfile.write(nodeID[index2] + "\t")
+ nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2])
+ else:
+ outputfile.write("%s%dX\t" % (keyword, min2))
+ outputfile.write(str(1.0 - nodedist[nodeindex]))
+ outputfile.write("\n")
+ return index
+
+ def _savekmeans(self, filename, clusterids, order, transpose):
+ """Save the k-means clustering solution (PRIVATE)."""
+ if transpose:
+ label = "ARRAY"
+ names = self.expid
+ else:
+ label = self.uniqid
+ names = self.geneid
+ with open(filename, "w") as outputfile:
+ outputfile.write(label + "\tGROUP\n")
+ index = numpy.argsort(order)
+ n = len(names)
+ sortedindex = numpy.zeros(n, int)
+ counter = 0
+ cluster = 0
+ while counter < n:
+ for j in index:
+ if clusterids[j] == cluster:
+ outputfile.write("%s\t%s\n" % (names[j], cluster))
+ sortedindex[counter] = j
+ counter += 1
+ cluster += 1
+ return sortedindex
+
+ def _savedata(self, jobname, gid, aid, geneindex, expindex):
+ """Save the clustered data (PRIVATE)."""
+ if self.genename is None:
+ genename = self.geneid
+ else:
+ genename = self.genename
+ (ngenes, nexps) = numpy.shape(self.data)
+ with open(jobname + ".cdt", "w") as outputfile:
+ if self.mask is not None:
+ mask = self.mask
+ else:
+ mask = numpy.ones((ngenes, nexps), int)
+ if self.gweight is not None:
+ gweight = self.gweight
+ else:
+ gweight = numpy.ones(ngenes)
+ if self.eweight is not None:
+ eweight = self.eweight
+ else:
+ eweight = numpy.ones(nexps)
+ if gid:
+ outputfile.write("GID\t")
+ outputfile.write(self.uniqid)
+ outputfile.write("\tNAME\tGWEIGHT")
+ # Now add headers for data columns.
+ for j in expindex:
+ outputfile.write("\t%s" % self.expid[j])
+ outputfile.write("\n")
+ if aid:
+ outputfile.write("AID")
+ if gid:
+ outputfile.write("\t")
+ outputfile.write("\t\t")
+ for j in expindex:
+ outputfile.write("\tARRY%dX" % j)
+ outputfile.write("\n")
+ outputfile.write("EWEIGHT")
+ if gid:
+ outputfile.write("\t")
+ outputfile.write("\t\t")
+ for j in expindex:
+ outputfile.write("\t%f" % eweight[j])
+ outputfile.write("\n")
+ for i in geneindex:
+ if gid:
+ outputfile.write("GENE%dX\t" % i)
+ outputfile.write(
+ "%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i])
+ )
+ for j in expindex:
+ outputfile.write("\t")
+ if mask[i, j]:
+ outputfile.write(str(self.data[i, j]))
+ outputfile.write("\n")
+
+
+def read(handle):
+ """Read gene expression data from the file handle and return a Record.
+
+ The file should be in the file format defined for Michael Eisen's
+ Cluster/TreeView program.
+ """
+ return Record(handle)
+
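+# Illustrative usage (hypothetical file name):
+#
+#     with open("expression.txt") as handle:
+#         record = read(handle)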
+
+# Everything below is private
+#
+
+
+def __check_data(data):
+ if isinstance(data, numpy.ndarray):
+ data = numpy.require(data, dtype="d", requirements="C")
+ else:
+ data = numpy.array(data, dtype="d")
+ if data.ndim != 2:
+ raise ValueError("data should be 2-dimensional")
+ if numpy.isnan(data).any():
+ raise ValueError("data contains NaN values")
+ return data
+
+
+def __check_mask(mask, shape):
+ if mask is None:
+ return numpy.ones(shape, dtype="intc")
+ elif isinstance(mask, numpy.ndarray):
+ return numpy.require(mask, dtype="intc", requirements="C")
+ else:
+ return numpy.array(mask, dtype="intc")
+
+
+def __check_weight(weight, ndata):
+ if weight is None:
+ return numpy.ones(ndata, dtype="d")
+ if isinstance(weight, numpy.ndarray):
+ weight = numpy.require(weight, dtype="d", requirements="C")
+ else:
+ weight = numpy.array(weight, dtype="d")
+ if numpy.isnan(weight).any():
+ raise ValueError("weight contains NaN values")
+ return weight
+
+
+def __check_initialid(initialid, npass, nitems):
+ if initialid is None:
+ if npass <= 0:
+ raise ValueError("npass should be a positive integer")
+ clusterid = numpy.empty(nitems, dtype="intc")
+ else:
+ npass = 0
+ clusterid = numpy.array(initialid, dtype="intc")
+ return clusterid, npass
+
+
+def __check_index(index):
+ if index is None:
+ return numpy.zeros(1, dtype="intc")
+ elif isinstance(index, numbers.Integral):
+ return numpy.array([index], dtype="intc")
+ elif isinstance(index, numpy.ndarray):
+ return numpy.require(index, dtype="intc", requirements="C")
+ else:
+ return numpy.array(index, dtype="intc")
+
+
+def __check_distancematrix(distancematrix):
+ if distancematrix is None:
+ return distancematrix
+ if isinstance(distancematrix, numpy.ndarray):
+ distancematrix = numpy.require(distancematrix, dtype="d", requirements="C")
+ else:
+ try:
+ distancematrix = numpy.array(distancematrix, dtype="d")
+ except ValueError:
+ n = len(distancematrix)
+ d = [None] * n
+ for i, row in enumerate(distancematrix):
+ if isinstance(row, numpy.ndarray):
+ row = numpy.require(row, dtype="d", requirements="C")
+ else:
+ row = numpy.array(row, dtype="d")
+ if row.ndim != 1:
+ raise ValueError("row %d is not one-dimensional" % i) from None
+ m = len(row)
+ if m != i:
+ raise ValueError(
+ "row %d has incorrect size (%d, expected %d)" % (i, m, i)
+ ) from None
+ if numpy.isnan(row).any():
+ raise ValueError("distancematrix contains NaN values") from None
+ d[i] = row
+ return d
+ if numpy.isnan(distancematrix).any():
+ raise ValueError("distancematrix contains NaN values")
+ return distancematrix
diff --git a/code/lib/Bio/Cluster/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Cluster/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..0a45e9e
Binary files /dev/null and b/code/lib/Bio/Cluster/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Cluster/_cluster.cp37-win_amd64.pyd b/code/lib/Bio/Cluster/_cluster.cp37-win_amd64.pyd
new file mode 100644
index 0000000..e000cce
Binary files /dev/null and b/code/lib/Bio/Cluster/_cluster.cp37-win_amd64.pyd differ
diff --git a/code/lib/Bio/Cluster/cluster.c b/code/lib/Bio/Cluster/cluster.c
new file mode 100644
index 0000000..89db792
--- /dev/null
+++ b/code/lib/Bio/Cluster/cluster.c
@@ -0,0 +1,5061 @@
+/* The C clustering library.
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
+ *
+ * This library was written at the Laboratory of DNA Information Analysis,
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
+ * Contact: michiel.dehoon 'AT' riken.jp
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation with or without modifications and for any purpose and
+ * without fee is hereby granted, provided that any copyright notices
+ * appear in all copies and that both those copyright notices and this
+ * permission notice appear in supporting documentation, and that the
+ * names of the contributors or copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software
+ * without specific prior permission.
+ *
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <time.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <string.h>
+#include "cluster.h"
+
+/* ************************************************************************ */
+/* SORTING FUNCTIONS */
+/*
+* C qsort() is much slower than C++ std::sort(): it cannot use data-type
+* information at compile time, and every comparison goes through a function
+* pointer, an extra indirection. For projects that must remain in C,
+* converting to C++ (or a newer C) is not an option.
+*
+* So we implement a simple quicksort that is ~25% faster than std::sort()
+* on mostly random data, and much faster on structured/sorted data.
+*/
+
+static const int INF = INT_MAX; // 2^31 - 1
+
+static int TEMP_SWAP_INT;
+#define swap_int(x,y) {TEMP_SWAP_INT = (x); (x) = (y); (y) = TEMP_SWAP_INT;}
+
+/* For quicksort, we need to choose a random pivot. Any random function should work, even a bad one. */
+static int
+cheap_random()
+{
+ const int base = 2 * 100 * 1000 * 1000 + 33;
+ static int seed = 0;
+ seed = seed * 7 + 13;
+ if (seed > base) seed %= base;
+ return seed;
+}
+
+static inline int
+median_index_of3_index(const double arr[], int index[], const int a, const int b, const int c)
+{
+ if (arr[index[a]] < arr[index[b]]) {
+ if (arr[index[b]] < arr[index[c]]) return b;
+ else if (arr[index[a]] < arr[index[c]]) return c;
+ else return a;
+ }
+ else {
+ if (arr[index[a]] < arr[index[c]]) return a;
+ else if (arr[index[b]] < arr[index[c]]) return c;
+ else return b;
+ }
+}
+
+
+/* Insertion sort is best when the array is small. */
+static void
+insertion_sort_index(const double a[], int index[], int l, int r)
+{
+ int i, j, current_index;
+ double value;
+
+ if (r <= l) return;
+ i = l; j = r;
+ value = a[index[(l + r) >> 1]];
+ while (i <= j) {
+ while (a[index[i]] < value) i++;
+ while (a[index[j]] > value) j--;
+
+ if (i <= j) {
+ swap_int(index[i], index[j]);
+ i++;
+ j--;
+ }
+ }
+
+ for (i = l + 1; i <= r; i++) {
+ j = i - 1;
+ value = a[index[i]];
+ current_index = index[i];
+
+ while (j >= l && a[index[j]] > value) {
+ index[j + 1] = index[j];
+ j--;
+ }
+ index[j + 1] = current_index;
+ }
+}
+
+//***************
+static void
+fastsort_partition_index(const double a[], int index[], const int left, const int right, int* first_end_ptr, int* second_start_ptr) {
+ int low, high, i, pivot, mid;
+ double value;
+ int increasing = 1, decreasing = 1;
+
+ /*******/
+ /* Randomly choose the pivot-selection strategy, to avoid pathological worst cases */
+ if ((right - left) & 1) pivot = left + cheap_random() % (right - left);
+ else pivot = median_index_of3_index(a, index, left, (left + right) >> 1, right);
+ value = a[index[pivot]];
+
+ /*******/
+ /* Skip through smaller values on left and larger values on right*/
+ low = left; high = right;
+ while (a[index[low]] < value) {
+ low++;
+ decreasing = 0;
+ if (a[index[low]] < a[index[low - 1]]) increasing = 0;
+ }
+
+ while (a[index[high]] > value) {
+ high--;
+ decreasing = 0;
+ if (a[index[high]] > a[index[high + 1]]) increasing = 0;
+ }
+
+ increasing &= a[index[high]] >= a[index[low]];
+ decreasing &= a[index[high]] <= a[index[low]];
+
+ /*******/
+ /* Resolve degenerate input cases */
+ if (increasing) {
+ if ((right - left) & 1) {
+ for (i = low + 1; i <= high; i++) if (a[index[i]] < a[index[i - 1]]) {
+ increasing = 0;
+ break;
+ }
+ }
+ else {
+ for (i = high; i >= low + 1; i--) if (a[index[i]] < a[index[i - 1]]) {
+ increasing = 0;
+ break;
+ }
+ }
+ if (increasing) { /* sorted */
+ *first_end_ptr = INF;
+ return;
+ }
+ }
+
+ if (decreasing) {
+ if ((right - left) & 1) {
+ for (i = low + 1; i <= high; i++) if (a[index[i]] > a[index[i - 1]]) {
+ decreasing = 0;
+ break;
+ }
+ }
+ else {
+ for (i = high; i >= low + 1; i--) if (a[index[i]] > a[index[i - 1]]) {
+ decreasing = 0;
+ break;
+ }
+ }
+ if (decreasing) {
+ mid = (right - left + 1) >> 1;
+ for (i = 0; i < mid; i++) swap_int(index[left + i], index[right - i]);
+ *first_end_ptr = INF;
+ return;
+ }
+ }
+
+ /******/
+ while (low <= high) {
+ while (a[index[low]] < value) low++;
+ while (a[index[high]] > value) high--;
+
+ if (low <= high) {
+ swap_int(index[low], index[high]);
+ low++;
+ high--;
+ }
+ }
+
+ *first_end_ptr = high;
+ *second_start_ptr = low;
+}
+
+//***************
+static void
+fastsort_recursive_index(const double a[], int index[], int l, int r)
+{
+ int first_end, second_start;
+ while (l < r) {
+ if (r - l <= 70) { /* determined through experiments and benchmarks, not randomly. 70-150 works fine on random/mixed (hard) data */
+ insertion_sort_index(a, index, l, r);
+ return;
+ }
+
+ fastsort_partition_index(a, index, l, r, &first_end, &second_start);
+ if (first_end == INF) return; /* sorted */
+
+ /* Recurse into smaller branch to avoid stack overflow */
+ if (first_end - l < r - second_start) {
+ fastsort_recursive_index(a, index, l, first_end);
+ l = second_start;
+ }
+ else {
+ fastsort_recursive_index(a, index, second_start, r);
+ r = first_end;
+ }
+ }
+}
+
+/* ************************************************************************ */
+
+double
+mean(int n, double a[])
+/*
+ Accumulate 4 elements per iteration instead of 1. The advantages are:
+ 1. less loop overhead;
+ 2. compiled with -O2, the loop can be vectorized (SSE/AVX);
+ 3. even without AVX, the 4 independent additions can execute in parallel;
+ 4. smaller accumulated floating-point error.
+*/
+{
+ double result = 0.;
+ int i;
+ double sum[4] = {0., 0., 0., 0.};
+
+ int nstep4 = n - n % 4;
+ for (i = 0; i < nstep4; i += 4) {
+ sum[0] += a[i];
+ sum[1] += a[i + 1];
+ sum[2] += a[i + 2];
+ sum[3] += a[i + 3];
+ }
+
+ for (i = nstep4; i < n; i++) result += a[i];
+ result += (sum[0] + sum[1]) + (sum[2] + sum[3]);
+
+ return result / n;
+}
+
+/* ************************************************************************ */
+
+double
+median(int n, double x[])
+/*
+Find the median of X(1), ... , X(N), using as much of the quicksort
+algorithm as is needed to isolate it.
+N.B. On exit, the array X is partially ordered.
+Based on Alan J. Miller's median.f90 routine.
+*/
+
+{
+ int i, j;
+ int nr = n / 2;
+ int nl = nr - 1;
+ int even = 0;
+ /* hi & lo are position limits encompassing the median. */
+ int lo = 0;
+ int hi = n-1;
+
+ if (n == 2*nr) even = 1;
+ if (n < 3) {
+ if (n < 1) return 0.;
+ if (n == 1) return x[0];
+ return 0.5*(x[0]+x[1]);
+ }
+
+ /* Find median of 1st, middle & last values. */
+ do {
+ int loop;
+ int mid = (lo + hi)/2;
+ double result = x[mid];
+ double xlo = x[lo];
+ double xhi = x[hi];
+ if (xhi < xlo) { /* Swap xlo and xhi */
+ double temp = xlo;
+ xlo = xhi;
+ xhi = temp;
+ }
+ if (result > xhi) result = xhi;
+ else if (result < xlo) result = xlo;
+ /* The basic quicksort algorithm to move all values <= the sort key
+ * (XMED) to the left-hand end, and all higher values to the other end. */
+ i = lo;
+ j = hi;
+ do {
+ while (x[i] < result) i++;
+ while (x[j] > result) j--;
+ loop = 0;
+ if (i < j) {
+ double temp = x[i];
+ x[i] = x[j];
+ x[j] = temp;
+ i++;
+ j--;
+ if (i <= j) loop = 1;
+ }
+ } while (loop); /* Decide which half the median is in. */
+ if (even) {
+ if (j == nl && i == nr) {
+ /* Special case: n even, j = n/2 and i = j + 1, so the median is
+ * between the two halves of the series. Find the maximum of the
+ * first half and the minimum of the second half, then average. */
+ int k;
+ double xmax = x[0];
+ double xmin = x[n-1];
+ for (k = lo; k <= j; k++) xmax = max(xmax, x[k]);
+ for (k = i; k <= hi; k++) xmin = min(xmin, x[k]);
+ return 0.5*(xmin + xmax);
+ }
+ if (j < nl) lo = i;
+ if (i > nr) hi = j;
+ if (i == j) {
+ if (i == nl) lo = nl;
+ if (j == nr) hi = nr;
+ }
+ }
+ else {
+ if (j < nr) lo = i;
+ if (i > nr) hi = j;
+ /* Test whether the median has been isolated. */
+ if (i == j && i == nr) return result;
+ }
+ } while (lo < hi - 1);
+
+ if (even) return 0.5*(x[nl]+x[nr]);
+ if (x[lo] > x[hi]) {
+ double temp = x[lo];
+ x[lo] = x[hi];
+ x[hi] = temp;
+ }
+ return x[nr];
+}
+
+/* ********************************************************************** */
+
+void
+sort_index(int n, const double data[], int index[])
+/* Sets up an index table given the data, such that data[index[]] is in
+ * increasing order. Sorting is done on the indices; the array data
+ * is unchanged.
+ */
+{
+ int i;
+ for (i = 0; i < n; i++) index[i] = i;
+ fastsort_recursive_index(data, index, 0, n - 1);
+}
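+
+/* Illustrative example (not in the original source): given
+ * double data[4] = {3.0, 1.0, 2.0, 0.5} and int index[4],
+ * sort_index(4, data, index) leaves data unchanged and sets index
+ * to {3, 1, 2, 0}, the positions of the values in increasing order.
+ */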
+
+/* ********************************************************************** */
+
+static double*
+getrank(int n, const double data[], const double weight[])
+/* Calculates the ranks of the elements in the array data. Two elements with
+ * the same value get the same rank, equal to the average of the ranks they
+ * would have received had their values differed. The ranks are returned as a newly allocated
+ * array that should be freed by the calling routine. If getrank fails due to
+ * a memory allocation error, it returns NULL.
+ */
+{
+ int i, j, k, l;
+ double* rank;
+ int* index;
+ double total = 0.0;
+ double subtotal;
+ double current;
+ double value;
+
+ rank = malloc(n*sizeof(double));
+ if (!rank) return NULL;
+ index = malloc(n*sizeof(int));
+ if (!index) {
+ free(rank);
+ return NULL;
+ }
+ /* Call sort to get an index table */
+ sort_index(n, data, index);
+ /* Build a rank table */
+ k = 0;
+ j = index[0];
+ current = data[j];
+ subtotal = weight[j];
+ for (i = 1; i < n; i++) {
+ j = index[i];
+ value = data[j];
+ if (value != current) {
+ current = value;
+ value = total + (subtotal + 1.0) / 2.0;
+ for (l = k; l < i; l++) rank[index[l]] = value;
+ k = i;
+ total += subtotal;
+ subtotal = 0.0;
+ }
+ subtotal += weight[j];
+ }
+ value = total + (subtotal + 1.0) / 2.0;
+ for (l = k; l < i; l++) rank[index[l]] = value;
+ free(index);
+ return rank;
+}
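+
+/* Illustrative example (assuming unit weights): for data = {2.0, 1.0, 2.0},
+ * the sorted order is {1.0, 2.0, 2.0}; the two tied values share the average
+ * of ranks 2 and 3, so getrank returns {2.5, 1.0, 2.5}.
+ */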
+
+/* ---------------------------------------------------------------------- */
+
+static int
+makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
+{
+ int i;
+ double** data;
+ int** mask;
+
+ data = malloc(nrows*sizeof(double*));
+ if (!data) return 0;
+ mask = malloc(nrows*sizeof(int*));
+ if (!mask) {
+ free(data);
+ return 0;
+ }
+ for (i = 0; i < nrows; i++) {
+ data[i] = malloc(ncols*sizeof(double));
+ if (!data[i]) break;
+ mask[i] = malloc(ncols*sizeof(int));
+ if (!mask[i]) {
+ free(data[i]);
+ break;
+ }
+ }
+ if (i == nrows) { /* break not encountered */
+ *pdata = data;
+ *pmask = mask;
+ return 1;
+ }
+ *pdata = NULL;
+ *pmask = NULL;
+ nrows = i;
+ for (i = 0; i < nrows; i++) {
+ free(data[i]);
+ free(mask[i]);
+ }
+ free(data);
+ free(mask);
+ return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void
+freedatamask(int n, double** data, int** mask)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ free(mask[i]);
+ free(data[i]);
+ }
+ free(mask);
+ free(data);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static double
+find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
+/*
+This function searches the distance matrix to find the pair with the shortest
+distance between them. The indices of the pair are returned in ip and jp; the
+distance itself is returned by the function.
+
+n (input) int
+The number of elements in the distance matrix.
+
+distmatrix (input) double**
+A ragged array containing the distance matrix. The number of columns in each
+row is equal to the row index; hence, the first row has zero length.
+
+ip (output) int*
+A pointer to the integer that is to receive the first index of the pair with
+the shortest distance.
+
+jp (output) int*
+A pointer to the integer that is to receive the second index of the pair with
+the shortest distance.
+*/
+{
+ int i, j;
+ double temp;
+ double distance = distmatrix[1][0];
+
+ *ip = 1;
+ *jp = 0;
+ for (i = 1; i < n; i++) {
+ for (j = 0; j < i; j++) {
+ temp = distmatrix[i][j];
+ if (temp < distance) {
+ distance = temp;
+ *ip = i;
+ *jp = j;
+ }
+ }
+ }
+ return distance;
+}
+
+/* ********************************************************************* */
+
+static int
+svd(int m, int n, double** u, double w[], double** vt)
+/*
+Purpose
+=======
+
+This routine computes the singular value decomposition A = u w vt of a real
+m by n rectangular matrix, using Householder bidiagonalization and a variant
+of the QR algorithm, following the Algol procedure svd of Golub and Reinsch
+(Num. Math. 14, 403-420, 1970). If m < n, the routine effectively operates
+on the transpose of the matrix, with the roles of u and vt interchanged.
+
+The function returns 0 if successful, -1 if memory allocation fails, and a
+positive integer if the QR iteration fails to converge.
+*/
+{
+ int i, j, k, i1, k1, l1, its;
+ double c, f, h, s, x, y, z;
+ int l = 0;
+ int ierr = 0;
+ double g = 0.0;
+ double scale = 0.0;
+ double anorm = 0.0;
+ double* rv1 = malloc(n*sizeof(double));
+
+ if (!rv1) return -1;
+ if (m >= n) {
+ /* Householder reduction to bidiagonal form */
+ for (i = 0; i < n; i++) {
+ l = i + 1;
+ rv1[i] = scale * g;
+ g = 0.0;
+ s = 0.0;
+ scale = 0.0;
+ for (k = i; k < m; k++) scale += fabs(u[k][i]);
+ if (scale != 0.0) {
+ for (k = i; k < m; k++) {
+ u[k][i] /= scale;
+ s += u[k][i]*u[k][i];
+ }
+ f = u[i][i];
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
+ h = f * g - s;
+ u[i][i] = f - g;
+ if (i < n-1) {
+ for (j = l; j < n; j++) {
+ s = 0.0;
+ for (k = i; k < m; k++) s += u[k][i] * u[k][j];
+ f = s / h;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ }
+ for (k = i; k < m; k++) u[k][i] *= scale;
+ }
+ w[i] = scale * g;
+ g = 0.0;
+ s = 0.0;
+ scale = 0.0;
+ if (i < m && i != n-1) {
+ for (k = l; k < n; k++) scale += fabs(u[i][k]);
+ if (scale != 0.0) {
+ for (k = l; k < n; k++) {
+ u[i][k] /= scale;
+ s += u[i][k]*u[i][k];
+ }
+ f = u[i][l];
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
+ h = f * g - s;
+ u[i][l] = f - g;
+ for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+ for (j = l; j < m; j++) {
+ s = 0.0;
+ for (k = l; k < n; k++) s += u[j][k] * u[i][k];
+ for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+ }
+ for (k = l; k < n; k++) u[i][k] *= scale;
+ }
+ }
+ anorm = max(anorm, fabs(w[i])+fabs(rv1[i]));
+ }
+ /* accumulation of right-hand transformations */
+ for (i = n-1; i >= 0; i--) {
+ if (i < n-1) {
+ if (g != 0.0) {
+ for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
+ /* double division avoids possible underflow */
+ for (j = l; j < n; j++) {
+ s = 0.0;
+ for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
+ for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
+ }
+ }
+ }
+ for (j = l; j < n; j++) {
+ vt[j][i] = 0.0;
+ vt[i][j] = 0.0;
+ }
+ vt[i][i] = 1.0;
+ g = rv1[i];
+ l = i;
+ }
+ /* accumulation of left-hand transformations */
+ for (i = n-1; i >= 0; i--) {
+ l = i + 1;
+ g = w[i];
+ if (i != n-1)
+ for (j = l; j < n; j++) u[i][j] = 0.0;
+ if (g != 0.0) {
+ if (i != n-1) {
+ for (j = l; j < n; j++) {
+ s = 0.0;
+ for (k = l; k < m; k++) s += u[k][i] * u[k][j];
+ /* double division avoids possible underflow */
+ f = (s / u[i][i]) / g;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ }
+ for (j = i; j < m; j++) u[j][i] /= g;
+ }
+ else
+ for (j = i; j < m; j++) u[j][i] = 0.0;
+ u[i][i] += 1.0;
+ }
+ /* diagonalization of the bidiagonal form */
+ for (k = n-1; k >= 0; k--) {
+ k1 = k-1;
+ its = 0;
+ while (1) {
+ /* test for splitting */
+ for (l = k; l >= 0; l--) {
+ l1 = l-1;
+ if (fabs(rv1[l]) + anorm == anorm) break;
+ /* rv1[0] is always zero, so there is no exit
+ * through the bottom of the loop */
+ if (fabs(w[l1]) + anorm == anorm) {
+ /* cancellation of rv1[l] if l greater than 0 */
+ c = 0.0;
+ s = 1.0;
+ for (i = l; i <= k; i++) {
+ f = s * rv1[i];
+ rv1[i] *= c;
+ if (fabs(f) + anorm == anorm) break;
+ g = w[i];
+ h = sqrt(f*f+g*g);
+ w[i] = h;
+ c = g / h;
+ s = -f / h;
+ for (j = 0; j < m; j++) {
+ y = u[j][l1];
+ z = u[j][i];
+ u[j][l1] = y * c + z * s;
+ u[j][i] = -y * s + z * c;
+ }
+ }
+ break;
+ }
+ }
+ /* test for convergence */
+ z = w[k];
+ if (l == k) { /* convergence */
+ if (z < 0.0) {
+ /* w[k] is made non-negative */
+ w[k] = -z;
+ for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
+ }
+ break;
+ }
+ else if (its == 30) {
+ ierr = k;
+ break;
+ }
+ else {
+ /* shift from bottom 2 by 2 minor */
+ its++;
+ x = w[l];
+ y = w[k1];
+ g = rv1[k1];
+ h = rv1[k];
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0*h*y);
+ g = sqrt(f*f+1.0);
+ f = ((x - z) * (x + z)
+ + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
+ /* next qr transformation */
+ c = 1.0;
+ s = 1.0;
+ for (i1 = l; i1 <= k1; i1++) {
+ i = i1 + 1;
+ g = rv1[i];
+ y = w[i];
+ h = s * g;
+ g = c * g;
+ z = sqrt(f*f+h*h);
+ rv1[i1] = z;
+ c = f / z;
+ s = h / z;
+ f = x * c + g * s;
+ g = -x * s + g * c;
+ h = y * s;
+ y = y * c;
+ for (j = 0; j < n; j++) {
+ x = vt[i1][j];
+ z = vt[i][j];
+ vt[i1][j] = x * c + z * s;
+ vt[i][j] = -x * s + z * c;
+ }
+ z = sqrt(f*f+h*h);
+ w[i1] = z;
+ /* rotation can be arbitrary if z is zero */
+ if (z != 0.0) {
+ c = f / z;
+ s = h / z;
+ }
+ f = c * g + s * y;
+ x = -s * g + c * y;
+ for (j = 0; j < m; j++) {
+ y = u[j][i1];
+ z = u[j][i];
+ u[j][i1] = y * c + z * s;
+ u[j][i] = -y * s + z * c;
+ }
+ }
+ rv1[l] = 0.0;
+ rv1[k] = f;
+ w[k] = x;
+ }
+ }
+ }
+ }
+ else /* m < n */ {
+ /* Householder reduction to bidiagonal form */
+ for (i = 0; i < m; i++) {
+ l = i + 1;
+ rv1[i] = scale * g;
+ g = 0.0;
+ s = 0.0;
+ scale = 0.0;
+ for (k = i; k < n; k++) scale += fabs(u[i][k]);
+ if (scale != 0.0) {
+ for (k = i; k < n; k++) {
+ u[i][k] /= scale;
+ s += u[i][k]*u[i][k];
+ }
+ f = u[i][i];
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
+ h = f * g - s;
+ u[i][i] = f - g;
+ if (i < m-1) {
+ for (j = l; j < m; j++) {
+ s = 0.0;
+ for (k = i; k < n; k++) s += u[i][k] * u[j][k];
+ f = s / h;
+ for (k = i; k < n; k++) u[j][k] += f * u[i][k];
+ }
+ }
+ for (k = i; k < n; k++) u[i][k] *= scale;
+ }
+ w[i] = scale * g;
+ g = 0.0;
+ s = 0.0;
+ scale = 0.0;
+ if (i < n && i != m-1) {
+ for (k = l; k < m; k++) scale += fabs(u[k][i]);
+ if (scale != 0.0) {
+ for (k = l; k < m; k++) {
+ u[k][i] /= scale;
+ s += u[k][i]*u[k][i];
+ }
+ f = u[l][i];
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
+ h = f * g - s;
+ u[l][i] = f - g;
+ for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
+ for (j = l; j < n; j++) {
+ s = 0.0;
+ for (k = l; k < m; k++) s += u[k][j] * u[k][i];
+ for (k = l; k < m; k++) u[k][j] += s * rv1[k];
+ }
+ for (k = l; k < m; k++) u[k][i] *= scale;
+ }
+ }
+ anorm = max(anorm, fabs(w[i])+fabs(rv1[i]));
+ }
+ /* accumulation of right-hand transformations */
+ for (i = m-1; i >= 0; i--) {
+ if (i < m-1) {
+ if (g != 0.0) {
+ for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
+ /* double division avoids possible underflow */
+ for (j = l; j < m; j++) {
+ s = 0.0;
+ for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
+ for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
+ }
+ }
+ }
+ for (j = l; j < m; j++) {
+ vt[i][j] = 0.0;
+ vt[j][i] = 0.0;
+ }
+ vt[i][i] = 1.0;
+ g = rv1[i];
+ l = i;
+ }
+ /* accumulation of left-hand transformations */
+ for (i = m-1; i >= 0; i--) {
+ l = i + 1;
+ g = w[i];
+ if (i != m-1)
+ for (j = l; j < m; j++) u[j][i] = 0.0;
+ if (g != 0.0) {
+ if (i != m-1) {
+ for (j = l; j < m; j++) {
+ s = 0.0;
+ for (k = l; k < n; k++) s += u[i][k] * u[j][k];
+ /* double division avoids possible underflow */
+ f = (s / u[i][i]) / g;
+ for (k = i; k < n; k++) u[j][k] += f * u[i][k];
+ }
+ }
+ for (j = i; j < n; j++) u[i][j] /= g;
+ }
+ else
+ for (j = i; j < n; j++) u[i][j] = 0.0;
+ u[i][i] += 1.0;
+ }
+ /* diagonalization of the bidiagonal form */
+ for (k = m-1; k >= 0; k--) {
+ k1 = k-1;
+ its = 0;
+ while (1) {
+ /* test for splitting */
+ for (l = k; l >= 0; l--) {
+ l1 = l-1;
+ if (fabs(rv1[l]) + anorm == anorm) break;
+ /* rv1[0] is always zero, so there is no exit
+ * through the bottom of the loop */
+ if (fabs(w[l1]) + anorm == anorm) {
+ /* cancellation of rv1[l] if l greater than 0 */
+ c = 0.0;
+ s = 1.0;
+ for (i = l; i <= k; i++) {
+ f = s * rv1[i];
+ rv1[i] *= c;
+ if (fabs(f) + anorm == anorm) break;
+ g = w[i];
+ h = sqrt(f*f+g*g);
+ w[i] = h;
+ c = g / h;
+ s = -f / h;
+ for (j = 0; j < n; j++) {
+ y = u[l1][j];
+ z = u[i][j];
+ u[l1][j] = y * c + z * s;
+ u[i][j] = -y * s + z * c;
+ }
+ }
+ break;
+ }
+ }
+ /* test for convergence */
+ z = w[k];
+ if (l == k) /* convergence */ {
+ if (z < 0.0) {
+ /* w[k] is made non-negative */
+ w[k] = -z;
+ for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
+ }
+ break;
+ }
+ else if (its == 30) {
+ ierr = k;
+ break;
+ }
+ else {
+ /* shift from bottom 2 by 2 minor */
+ its++;
+ x = w[l];
+ y = w[k1];
+ g = rv1[k1];
+ h = rv1[k];
+ f = ((y - z) * (y + z)
+ + (g - h) * (g + h)) / (2.0 * h * y);
+ g = sqrt(f*f+1.0);
+ f = ((x - z) * (x + z)
+ + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
+ /* next qr transformation */
+ c = 1.0;
+ s = 1.0;
+ for (i1 = l; i1 <= k1; i1++) {
+ i = i1 + 1;
+ g = rv1[i];
+ y = w[i];
+ h = s * g;
+ g = c * g;
+ z = sqrt(f*f+h*h);
+ rv1[i1] = z;
+ c = f / z;
+ s = h / z;
+ f = x * c + g * s;
+ g = -x * s + g * c;
+ h = y * s;
+ y = y * c;
+ for (j = 0; j < m; j++) {
+ x = vt[j][i1];
+ z = vt[j][i];
+ vt[j][i1] = x * c + z * s;
+ vt[j][i] = -x * s + z * c;
+ }
+ z = sqrt(f*f+h*h);
+ w[i1] = z;
+ /* rotation can be arbitrary if z is zero */
+ if (z != 0.0) {
+ c = f / z;
+ s = h / z;
+ }
+ f = c * g + s * y;
+ x = -s * g + c * y;
+ for (j = 0; j < n; j++) {
+ y = u[i1][j];
+ z = u[i][j];
+ u[i1][j] = y * c + z * s;
+ u[i][j] = -y * s + z * c;
+ }
+ }
+ rv1[l] = 0.0;
+ rv1[k] = f;
+ w[k] = x;
+ }
+ }
+ }
+ }
+ free(rv1);
+ return ierr;
+}
+
+/* ********************************************************************* */
+
+int
+pca(int nrows, int ncolumns, double** u, double** v, double* w)
+/*
+Purpose
+=======
+
+This subroutine uses the singular value decomposition to perform principal
+components analysis of a real nrows by ncolumns rectangular matrix.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the matrix u.
+
+ncolumns (input) int
+The number of columns in the matrix v.
+
+u (input) double[nrows][ncolumns]
+On input, the array containing the data to which the principal component
+analysis should be applied. The function assumes that the mean has already been
+subtracted from each column, and hence that the mean of each column is zero.
+On output, see below.
+
+v (input) double[n][n], where n = min(nrows, ncolumns)
+Not used on input.
+
+w (input) double[n], where n = min(nrows, ncolumns)
+Not used on input.
+
+
+Return value
+============
+
+On output:
+
+If nrows >= ncolumns, then
+
+u contains the coordinates with respect to the principal components;
+v contains the principal component vectors.
+
+The dot product u . v reproduces the data that were passed in u.
+
+
+If nrows < ncolumns, then
+
+u contains the principal component vectors;
+v contains the coordinates with respect to the principal components.
+
+The dot product v . u reproduces the data that were passed in u.
+
+The eigenvalues of the covariance matrix are returned in w.
+
+The arrays u, v, and w are sorted according to eigenvalue, with the largest
+eigenvalues appearing first.
+
+The function returns 0 if successful, -1 if memory allocation fails, and a
+positive integer if the singular value decomposition fails to converge.
+*/
+{
+ int i;
+ int j;
+ int error;
+ int* index = malloc(ncolumns*sizeof(int));
+ double* temp = malloc(ncolumns*sizeof(double));
+
+ if (!index || !temp) {
+ if (index) free(index);
+ if (temp) free(temp);
+ return -1;
+ }
+ error = svd(nrows, ncolumns, u, w, v);
+ if (error == 0) {
+ if (nrows >= ncolumns) {
+ for (j = 0; j < ncolumns; j++) {
+ const double s = w[j];
+ for (i = 0; i < nrows; i++) u[i][j] *= s;
+ }
+ sort_index(ncolumns, w, index);
+ for (i = 0; i < ncolumns/2; i++) {
+ j = index[i];
+ index[i] = index[ncolumns-1-i];
+ index[ncolumns-1-i] = j;
+ }
+ for (i = 0; i < nrows; i++) {
+ for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
+ for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
+ }
+ for (i = 0; i < ncolumns; i++) {
+ for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
+ for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
+ }
+ for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
+ for (i = 0; i < ncolumns; i++) w[i] = temp[i];
+ }
+ else /* nrows < ncolumns */ {
+ for (j = 0; j < nrows; j++) {
+ const double s = w[j];
+ for (i = 0; i < nrows; i++) v[i][j] *= s;
+ }
+ sort_index(nrows, w, index);
+ for (i = 0; i < nrows/2; i++) {
+ j = index[i];
+ index[i] = index[nrows-1-i];
+ index[nrows-1-i] = j;
+ }
+ for (j = 0; j < ncolumns; j++) {
+ for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
+ for (i = 0; i < nrows; i++) u[i][j] = temp[i];
+ }
+ for (j = 0; j < nrows; j++) {
+ for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
+ for (i = 0; i < nrows; i++) v[j][i] = temp[i];
+ }
+ for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
+ for (i = 0; i < nrows; i++) w[i] = temp[i];
+ }
+ }
+ free(index);
+ free(temp);
+ return error;
+}
+
+/* ********************************************************************* */
+
+static double
+euclid(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+
+/*
+Purpose
+=======
+
+The euclid routine calculates the weighted Euclidean distance between two
+rows or columns in a matrix.
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+
+============================================================================
+*/
+{
+ double result = 0.0;
+ double tweight = 0;
+ int i;
+
+ if (transpose == 0) /* Calculate the distance between two rows */ {
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ double term = data1[index1][i] - data2[index2][i];
+ result += weight[i]*term*term;
+ tweight += weight[i];
+ }
+ }
+ }
+ else {
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ double term = data1[i][index1] - data2[i][index2];
+ result += weight[i]*term*term;
+ tweight += weight[i];
+ }
+ }
+ }
+ if (!tweight) return 0; /* usually due to empty clusters */
+ result /= tweight;
+ return result;
+}
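+
+/* In formula form: for rows a and b, the value computed above is
+ * d(a, b) = sum_i w_i (a_i - b_i)^2 / sum_i w_i,
+ * the weighted mean of the squared differences over the non-missing
+ * entries; note that no square root is taken.
+ */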
+
+/* ********************************************************************* */
+
+static double
+cityblock(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+
+/*
+Purpose
+=======
+
+The cityblock routine calculates the weighted "City Block" distance between
+two rows or columns in a matrix. City Block distance is defined as the
+absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
+equivalent to taking an "up and over" path.
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+
+============================================================================ */
+{
+ double result = 0.;
+ double tweight = 0;
+ int i;
+
+ if (transpose == 0) /* Calculate the distance between two rows */ {
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ double term = data1[index1][i] - data2[index2][i];
+ result = result + weight[i]*fabs(term);
+ tweight += weight[i];
+ }
+ }
+ }
+ else {
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ double term = data1[i][index1] - data2[i][index2];
+ result = result + weight[i]*fabs(term);
+ tweight += weight[i];
+ }
+ }
+ }
+ if (!tweight) return 0; /* usually due to empty clusters */
+ result /= tweight;
+ return result;
+}
+
+/* ********************************************************************* */
+
+static double
+correlation(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+/*
+Purpose
+=======
+
+The correlation routine calculates the weighted Pearson distance between two
+rows or columns in a matrix. We define the Pearson distance as one minus the
+Pearson correlation.
+This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b,
+but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
+(e.g., choose b = a + c).
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+============================================================================
+*/
+{
+ double result = 0.;
+ double sum1 = 0.;
+ double sum2 = 0.;
+ double denom1 = 0.;
+ double denom2 = 0.;
+ double tweight = 0.;
+
+ if (transpose == 0) /* Calculate the distance between two rows */ {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ double term1 = data1[index1][i];
+ double term2 = data2[index2][i];
+ double w = weight[i];
+ sum1 += w*term1;
+ sum2 += w*term2;
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ tweight += w;
+ }
+ }
+ }
+ else {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ double term1 = data1[i][index1];
+ double term2 = data2[i][index2];
+ double w = weight[i];
+ sum1 += w*term1;
+ sum2 += w*term2;
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ tweight += w;
+ }
+ }
+ }
+ if (!tweight) return 0; /* usually due to empty clusters */
+ result -= sum1 * sum2 / tweight;
+ denom1 -= sum1 * sum1 / tweight;
+ denom2 -= sum2 * sum2 / tweight;
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
+ result = result / sqrt(denom1*denom2);
+ result = 1. - result;
+ return result;
+}
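+
+/* In formula form: with W = sum_i w_i, the value computed above is 1 - r,
+ * where r is the weighted Pearson correlation
+ * r = (S_xy - S_x*S_y/W) / sqrt((S_xx - S_x^2/W) * (S_yy - S_y^2/W)),
+ * with S_x = sum w*x, S_y = sum w*y, S_xy = sum w*x*y, S_xx = sum w*x^2,
+ * S_yy = sum w*y^2, all taken over the non-missing entries.
+ */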
+
+/* ********************************************************************* */
+
+static double
+acorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+/*
+Purpose
+=======
+
+The acorrelation routine calculates the weighted Pearson distance between two
+rows or columns, using the absolute value of the correlation.
+This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b,
+but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
+(e.g., choose b = a + c).
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+============================================================================
+*/
+{
+ double result = 0.;
+ double sum1 = 0.;
+ double sum2 = 0.;
+ double denom1 = 0.;
+ double denom2 = 0.;
+ double tweight = 0.;
+
+ if (transpose == 0) /* Calculate the distance between two rows */ {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ double term1 = data1[index1][i];
+ double term2 = data2[index2][i];
+ double w = weight[i];
+ sum1 += w*term1;
+ sum2 += w*term2;
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ tweight += w;
+ }
+ }
+ }
+ else {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ double term1 = data1[i][index1];
+ double term2 = data2[i][index2];
+ double w = weight[i];
+ sum1 += w*term1;
+ sum2 += w*term2;
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ tweight += w;
+ }
+ }
+ }
+ if (!tweight) return 0; /* usually due to empty clusters */
+ result -= sum1 * sum2 / tweight;
+ denom1 -= sum1 * sum1 / tweight;
+ denom2 -= sum2 * sum2 / tweight;
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
+ result = fabs(result) / sqrt(denom1*denom2);
+ result = 1. - result;
+ return result;
+}
+
+/* ********************************************************************* */
+
+static double
+ucorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+/*
+Purpose
+=======
+
+The ucorrelation routine calculates the weighted Pearson distance between two
+rows or columns, using the uncentered version of the Pearson correlation. In
+the uncentered Pearson correlation, a zero mean is used for both vectors even
+if the actual mean is nonzero.
+This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b,
+but the triangle inequality d(a,b) + d(b,c) >= d(a,c) does not hold
+(e.g., choose b = a + c).
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+============================================================================
+*/
+{
+ double result = 0.;
+ double denom1 = 0.;
+ double denom2 = 0.;
+ int flag = 0;
+
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
+ * found.
+ */
+ if (transpose == 0) /* Calculate the distance between two rows */ {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ double term1 = data1[index1][i];
+ double term2 = data2[index2][i];
+ double w = weight[i];
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ flag = 1;
+ }
+ }
+ }
+ else {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ double term1 = data1[i][index1];
+ double term2 = data2[i][index2];
+ double w = weight[i];
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ flag = 1;
+ }
+ }
+ }
+ if (!flag) return 0.;
+ if (denom1 == 0.) return 1.;
+ if (denom2 == 0.) return 1.;
+ result = result / sqrt(denom1*denom2);
+ result = 1. - result;
+ return result;
+}
+
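+/*
+Note: the uncentered correlation computed above is the cosine of the angle
+between the two (weighted) vectors,
+
+    r_u = sum_i w_i x_i y_i / sqrt((sum_i w_i x_i^2) * (sum_i w_i y_i^2)),
+
+and coincides with the ordinary Pearson correlation whenever both vectors
+already have zero mean.
+*/
+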
+/* ********************************************************************* */
+
+static double
+uacorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+/*
+Purpose
+=======
+
+The uacorrelation routine calculates the weighted Pearson distance between two
+rows or columns, using the absolute value of the uncentered version of the
+Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
+for both vectors even if the actual mean is nonzero.
+This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b,
+but the triangle inequality d(a,b) + d(b,c) >= d(a,c) does not hold
+(e.g., choose b = a + c).
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+============================================================================
+*/
+{
+ double result = 0.;
+ double denom1 = 0.;
+ double denom2 = 0.;
+ int flag = 0;
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
+ * found.
+ */
+
+ if (transpose == 0) /* Calculate the distance between two rows */ {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ double term1 = data1[index1][i];
+ double term2 = data2[index2][i];
+ double w = weight[i];
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ flag = 1;
+ }
+ }
+ }
+ else {
+ int i;
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ double term1 = data1[i][index1];
+ double term2 = data2[i][index2];
+ double w = weight[i];
+ result += w*term1*term2;
+ denom1 += w*term1*term1;
+ denom2 += w*term2*term2;
+ flag = 1;
+ }
+ }
+ }
+ if (!flag) return 0.;
+ if (denom1 == 0.) return 1.;
+ if (denom2 == 0.) return 1.;
+ result = fabs(result) / sqrt(denom1*denom2);
+ result = 1. - result;
+ return result;
+}
+
+/* ********************************************************************* */
+
+static double
+spearman(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+/*
+Purpose
+=======
+
+The spearman routine calculates the Spearman distance between two rows or
+columns. The Spearman distance is defined as one minus the Spearman rank
+correlation.
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+============================================================================
+*/
+{
+ int i;
+ int m = 0;
+ double* rank1;
+ double* rank2;
+ double result = 0.;
+ double denom1 = 0.;
+ double denom2 = 0.;
+ double sum1 = 0.;
+ double sum2 = 0.;
+ double totalweight = 0.;
+ double* tdata1;
+ double* tdata2;
+
+ tdata1 = malloc(n*sizeof(double));
+ if (!tdata1) return 0.0; /* Memory allocation error */
+ tdata2 = malloc(n*sizeof(double));
+ if (!tdata2) /* Memory allocation error */ {
+ free(tdata1);
+ return 0.0;
+ }
+ if (transpose == 0) {
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ tdata1[m] = data1[index1][i];
+ tdata2[m] = data2[index2][i];
+ m++;
+ }
+ }
+ }
+ else {
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ tdata1[m] = data1[i][index1];
+ tdata2[m] = data2[i][index2];
+ m++;
+ }
+ }
+ }
+ if (m == 0) {
+ free(tdata1);
+ free(tdata2);
+ return 0;
+ }
+ rank1 = getrank(m, tdata1, weight);
+ free(tdata1);
+ if (!rank1) {
+ free(tdata2);
+ return 0.0; /* Memory allocation error */
+ }
+ rank2 = getrank(m, tdata2, weight);
+ free(tdata2);
+ if (!rank2) /* Memory allocation error */ {
+ free(rank1);
+ return 0.0;
+ }
+ for (i = 0; i < m; i++) {
+ const double term1 = rank1[i];
+ const double term2 = rank2[i];
+ const double w = weight[i];
+ sum1 += term1 * w;
+ sum2 += term2 * w;
+ result += term1 * term2 * w;
+ denom1 += term1 * term1 * w;
+ denom2 += term2 * term2 * w;
+ totalweight += w;
+ }
+ /* Note: denom1 and denom2 cannot be calculated directly from the number
+ * of elements. If two elements have the same rank, the squared sum of
+ * their ranks will change.
+ */
+ free(rank1);
+ free(rank2);
+ if (!totalweight) return 0; /* usually due to empty clusters */
+ result -= sum1 * sum2 / totalweight;
+ denom1 -= sum1 * sum1 / totalweight;
+ denom2 -= sum2 * sum2 / totalweight;
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
+ result = result / sqrt(denom1*denom2);
+ result = 1. - result;
+ return result;
+}
+
+/* ********************************************************************* */
+
+static double
+kendall(int n, double** data1, double** data2, int** mask1, int** mask2,
+ const double weight[], int index1, int index2, int transpose)
+/*
+Purpose
+=======
+
+The kendall routine calculates the Kendall distance between two
+rows or columns. The Kendall distance is defined as one minus Kendall's tau.
+
+Arguments
+=========
+
+n (input) int
+The number of elements in a row or column. If transpose == 0, then n is the
+number of columns; otherwise, n is the number of rows.
+
+data1 (input) double array
+The data array containing the first vector.
+
+data2 (input) double array
+The data array containing the second vector.
+
+mask1 (input) int array
+This array shows which elements in data1 are missing. If mask1[i][j] == 0, then
+data1[i][j] is missing.
+
+mask2 (input) int array
+This array shows which elements in data2 are missing. If mask2[i][j] == 0, then
+data2[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+index1 (input) int
+Index of the first row or column.
+
+index2 (input) int
+Index of the second row or column.
+
+transpose (input) int
+If transpose == 0, the distance between two rows in the matrix is calculated.
+Otherwise, the distance between two columns in the matrix is calculated.
+============================================================================
+*/
+{
+ double con = 0;
+ double dis = 0;
+ double exx = 0;
+ double exy = 0;
+ int flag = 0;
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
+ * found.
+ */
+ double denomx;
+ double denomy;
+ double tau;
+ int i, j;
+
+ if (transpose == 0) {
+ for (i = 0; i < n; i++) {
+ if (mask1[index1][i] && mask2[index2][i]) {
+ for (j = 0; j < i; j++) {
+ if (mask1[index1][j] && mask2[index2][j]) {
+ const double x1 = data1[index1][i];
+ const double x2 = data1[index1][j];
+ const double y1 = data2[index2][i];
+ const double y2 = data2[index2][j];
+ const double w = weight[i] * weight[j];
+ if (x1 < x2 && y1 < y2) con += w;
+ else if (x1 > x2 && y1 > y2) con += w;
+ else if (x1 < x2 && y1 > y2) dis += w;
+ else if (x1 > x2 && y1 < y2) dis += w;
+ else if (x1 == x2 && y1 != y2) exx += w;
+ else if (x1 != x2 && y1 == y2) exy += w;
+ flag = 1;
+ }
+ }
+ }
+ }
+ }
+ else {
+ for (i = 0; i < n; i++) {
+ if (mask1[i][index1] && mask2[i][index2]) {
+ for (j = 0; j < i; j++) {
+ if (mask1[j][index1] && mask2[j][index2]) {
+ const double x1 = data1[i][index1];
+ const double x2 = data1[j][index1];
+ const double y1 = data2[i][index2];
+ const double y2 = data2[j][index2];
+ const double w = weight[i] * weight[j];
+ if (x1 < x2 && y1 < y2) con += w;
+ else if (x1 > x2 && y1 > y2) con += w;
+ else if (x1 < x2 && y1 > y2) dis += w;
+ else if (x1 > x2 && y1 < y2) dis += w;
+ else if (x1 == x2 && y1 != y2) exx += w;
+ else if (x1 != x2 && y1 == y2) exy += w;
+ flag = 1;
+ }
+ }
+ }
+ }
+ }
+ if (!flag) return 0.;
+ denomx = con + dis + exx;
+ denomy = con + dis + exy;
+ if (denomx == 0) return 1;
+ if (denomy == 0) return 1;
+ tau = (con-dis)/sqrt(denomx*denomy);
+ return 1.-tau;
+}
+
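+/*
+Worked example (unit weights): for x = (1, 2, 3) and y = (1, 3, 2), the
+three pairs give con = 2, dis = 1, and no ties, so
+tau = (2 - 1)/sqrt(3*3) = 1/3 and the Kendall distance is 1 - 1/3 = 2/3.
+*/
+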
+/* ********************************************************************* */
+
+static double(*setmetric(char dist))
+ (int, double**, double**, int**, int**, const double[], int, int, int)
+{
+ switch(dist) {
+ case 'e': return &euclid;
+ case 'b': return &cityblock;
+ case 'c': return &correlation;
+ case 'a': return &acorrelation;
+ case 'u': return &ucorrelation;
+ case 'x': return &uacorrelation;
+ case 's': return &spearman;
+ case 'k': return &kendall;
+ default: return &euclid;
+ }
+}
+
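+/*
+Usage sketch (hypothetical caller, for illustration only; the variable
+names below are not part of the library): the returned function pointer is
+called the same way regardless of which metric was selected, and any
+unrecognized character falls back to the Euclidean distance.
+
+    double (*metric)(int, double**, double**, int**, int**,
+                     const double[], int, int, int) = setmetric('e');
+    double d = metric(ncolumns, data, data, mask, mask, weight, i, j, 0);
+*/
+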
+/* ********************************************************************* */
+
+static double
+uniform(void)
+/*
+Purpose
+=======
+
+This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
+and 1.0 are excluded. This random number generator is described in:
+
+Pierre l'Ecuyer
+Efficient and Portable Combined Random Number Generators
+Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749, 774.
+
+The first time this routine is called, it initializes the random number
+generator using the current time. First, the current epoch time in seconds is
+used as a seed for the random number generator in the C library. The first two
+random numbers generated by this generator are used to initialize the random
+number generator implemented in this routine.
+
+
+Arguments
+=========
+
+None.
+
+
+Return value
+============
+
+A double-precision number between 0.0 and 1.0.
+============================================================================
+*/
+{
+ int z;
+ static const int m1 = 2147483563;
+ static const int m2 = 2147483399;
+ const double scale = 1.0/m1;
+
+ static int s1 = 0;
+ static int s2 = 0;
+
+ if (s1 == 0 || s2 == 0) {
+ /* initialize */
+ unsigned int initseed = (unsigned int) time(0);
+ srand(initseed);
+ s1 = rand();
+ s2 = rand();
+ }
+
+ do {
+ int k = s1/53668;
+ s1 = 40014*(s1-k*53668)-k*12211;
+ if (s1 < 0) s1+=m1;
+ k = s2/52774;
+ s2 = 40692*(s2-k*52774)-k*3791;
+ if (s2 < 0) s2 += m2;
+ z = s1-s2;
+ if (z < 1) z += (m1-1);
+ } while (z == m1); /* To avoid returning 1.0 */
+
+ return z*scale;
+}
+
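+/*
+Note: combining the two generators (moduli m1 and m2) gives a period of
+roughly 2.3e18, far longer than either component alone. Because s1 and s2
+are static, the stream is shared process-wide, so this routine is not
+thread-safe.
+*/
+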
+/* ************************************************************************ */
+
+static int
+binomial(int n, double p)
+/*
+Purpose
+=======
+
+This routine generates a random number between 0 and n inclusive, following
+the binomial distribution with probability p and n trials. The routine is
+based on the BTPE algorithm, described in:
+
+Voratas Kachitvichyanukul and Bruce W. Schmeiser:
+Binomial Random Variate Generation
+Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
+
+
+Arguments
+=========
+
+p (input) double
+The probability of a single event. This probability should be less than or
+equal to 0.5.
+
+n (input) int
+The number of trials.
+
+
+Return value
+============
+
+An integer drawn from a binomial distribution with parameters (p, n).
+
+============================================================================
+*/
+{
+ const double q = 1 - p;
+
+ if (n*p < 30.0) /* Algorithm BINV */ {
+ const double s = p/q;
+ const double a = (n+1)*s;
+ double r = exp(n*log(q)); /* pow() causes a crash on AIX */
+ int x = 0;
+ double u = uniform();
+ while (1) {
+ if (u < r) return x;
+ u -= r;
+ x++;
+ r *= (a/x)-s;
+ }
+ }
+ else /* Algorithm BTPE */ {
+ /* Step 0 */
+ const double fm = n*p + p;
+ const int m = (int) fm;
+ const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
+ const double xm = m + 0.5;
+ const double xl = xm - p1;
+ const double xr = xm + p1;
+ const double c = 0.134 + 20.5/(15.3+m);
+ const double a = (fm-xl)/(fm-xl*p);
+ const double b = (xr-fm)/(xr*q);
+ const double lambdal = a*(1.0+0.5*a);
+ const double lambdar = b*(1.0+0.5*b);
+ const double p2 = p1*(1+2*c);
+ const double p3 = p2 + c/lambdal;
+ const double p4 = p3 + c/lambdar;
+ while (1) {
+ /* Step 1 */
+ int y;
+ int k;
+ double u = uniform();
+ double v = uniform();
+ u *= p4;
+ if (u <= p1) return (int)(xm-p1*v+u);
+ /* Step 2 */
+ if (u > p2) {
+ /* Step 3 */
+ if (u > p3) {
+ /* Step 4 */
+ y = (int)(xr-log(v)/lambdar);
+ if (y > n) continue;
+ /* Go to step 5 */
+ v = v*(u-p3)*lambdar;
+ }
+ else {
+ y = (int)(xl+log(v)/lambdal);
+ if (y < 0) continue;
+ /* Go to step 5 */
+ v = v*(u-p2)*lambdal;
+ }
+ }
+ else {
+ const double x = xl + (u-p1)/c;
+ v = v*c + 1.0 - fabs(m-x+0.5)/p1;
+ if (v > 1) continue;
+ /* Go to step 5 */
+ y = (int)x;
+ }
+ /* Step 5 */
+ /* Step 5.0 */
+ k = abs(y-m);
+ if (k > 20 && k < 0.5*n*p*q-1.0) {
+ /* Step 5.2 */
+ double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625)
+ + 0.1666666666666)/(n*p*q)+0.5);
+ double t = -k*k/(2*n*p*q);
+ double A = log(v);
+ if (A < t-rho) return y;
+ else if (A > t+rho) continue;
+ else {
+ /* Step 5.3 */
+ double x1 = y+1;
+ double f1 = m+1;
+ double z = n+1-m;
+ double w = n-y+1;
+ double x2 = x1*x1;
+ double f2 = f1*f1;
+ double z2 = z*z;
+ double w2 = w*w;
+ if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
+ + (y-m)*log(w*p/(x1*q))
+ + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
+ + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
+ + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
+ + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
+ continue;
+ return y;
+ }
+ }
+ else {
+ /* Step 5.1 */
+ int i;
+ const double s = p/q;
+ const double aa = s*(n+1);
+ double f = 1.0;
+ for (i = m; i < y; f *= (aa/(++i)-s));
+ for (i = y; i < m; f /= (aa/(++i)-s));
+ if (v > f) continue;
+ return y;
+ }
+ }
+ }
+ return -1;
+}
+
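+/*
+Usage sketch (illustration only): the routine expects p <= 0.5. A caller
+needing p > 0.5 can use the symmetry of the binomial distribution, since
+X ~ B(n, p) implies n - X ~ B(n, 1 - p):
+
+    int draw = (p <= 0.5) ? binomial(n, p) : n - binomial(n, 1.0 - p);
+*/
+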
+/* ************************************************************************ */
+
+static void
+randomassign(int nclusters, int nelements, int clusterid[])
+/*
+Purpose
+=======
+
+The randomassign routine performs an initial random clustering, needed for
+k-means or k-median clustering. Elements (genes or samples) are randomly
+assigned to clusters. The number of elements in each cluster is chosen
+randomly, making sure that each cluster will receive at least one element.
+
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters.
+
+nelements (input) int
+The number of elements to be clustered (i.e., the number of genes or samples
+to be clustered).
+
+clusterid (output) int[nelements]
+The cluster number to which an element was assigned.
+
+============================================================================
+*/
+{
+ int i, j;
+ int k = 0;
+ double p;
+ int n = nelements-nclusters;
+
+ /* Draw the number of elements in each cluster from a multinomial
+ * distribution, reserving ncluster elements to set independently
+ * in order to guarantee that none of the clusters are empty.
+ */
+ for (i = 0; i < nclusters-1; i++) {
+ p = 1.0/(nclusters-i);
+ j = binomial(n, p);
+ n -= j;
+ j += k+1; /* Assign at least one element to cluster i */
+ for ( ; k < j; k++) clusterid[k] = i;
+ }
+ /* Assign the remaining elements to the last cluster */
+ for ( ; k < nelements; k++) clusterid[k] = i;
+
+ /* Create a random permutation of the cluster assignments */
+ for (i = 0; i < nelements; i++) {
+ j = (int) (i + (nelements-i)*uniform());
+ k = clusterid[j];
+ clusterid[j] = clusterid[i];
+ clusterid[i] = k;
+ }
+}
+
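+/*
+Note: the cluster sizes are drawn by the conditional-binomial method: after
+reserving one element per cluster, cluster i receives
+j ~ Binomial(n, 1/(nclusters - i)) of the n still-unassigned elements,
+which is equivalent to one multinomial draw with equal probabilities. The
+final loop is a Fisher-Yates shuffle, so every assignment order is equally
+likely.
+*/
+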
+/* ********************************************************************* */
+
+static void
+getclustermeans(int nclusters, int nrows, int ncolumns,
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
+ int transpose)
+/*
+Purpose
+=======
+
+The getclustermeans routine calculates the cluster centroids, given to which
+cluster each element belongs. The centroid is defined as the mean over all
+elements for each dimension.
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters.
+
+nrows (input) int
+The number of rows in the gene expression data matrix, equal to the number of
+genes.
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix, equal to the number
+of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+clusterid (input) int[nrows] if transpose == 0
+ int[ncolumns] otherwise
+The cluster number to which each element belongs. If transpose == 0, then the
+dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
+is equal to ncolumns (the number of samples).
+
+cdata (output) double[nclusters][ncolumns] if transpose == 0
+ double[nrows][nclusters] otherwise
+On exit of getclustermeans, this array contains the cluster centroids.
+
+cmask (output) int[nclusters][ncolumns] if transpose == 0
+ int[nrows][nclusters] otherwise
+This array shows which data values are missing for each centroid. If
+cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for a
+centroid if all corresponding data values of the cluster members are missing.
+
+transpose (input) int
+If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
+of columns (samples) are specified.
+
+========================================================================
+*/
+{
+ int i, j, k;
+
+ if (transpose == 0) {
+ for (i = 0; i < nclusters; i++) {
+ for (j = 0; j < ncolumns; j++) {
+ cmask[i][j] = 0;
+ cdata[i][j] = 0.;
+ }
+ }
+ for (k = 0; k < nrows; k++) {
+ i = clusterid[k];
+ for (j = 0; j < ncolumns; j++) {
+ if (mask[k][j] != 0) {
+ cdata[i][j] += data[k][j];
+ cmask[i][j]++;
+ }
+ }
+ }
+ for (i = 0; i < nclusters; i++) {
+ for (j = 0; j < ncolumns; j++) {
+ if (cmask[i][j]>0) {
+ cdata[i][j] /= cmask[i][j];
+ cmask[i][j] = 1;
+ }
+ }
+ }
+ }
+ else {
+ for (i = 0; i < nrows; i++) {
+ for (j = 0; j < nclusters; j++) {
+ cdata[i][j] = 0.;
+ cmask[i][j] = 0;
+ }
+ }
+ for (k = 0; k < ncolumns; k++) {
+ i = clusterid[k];
+ for (j = 0; j < nrows; j++) {
+ if (mask[j][k] != 0) {
+ cdata[j][i] += data[j][k];
+ cmask[j][i]++;
+ }
+ }
+ }
+ for (i = 0; i < nrows; i++) {
+ for (j = 0; j < nclusters; j++) {
+ if (cmask[i][j]>0) {
+ cdata[i][j] /= cmask[i][j];
+ cmask[i][j] = 1;
+ }
+ }
+ }
+ }
+}
+
+/* ********************************************************************* */
+
+static void
+getclustermedians(int nclusters, int nrows, int ncolumns,
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
+ int transpose, double cache[])
+/*
+Purpose
+=======
+
+The getclustermedians routine calculates the cluster centroids, given to which
+cluster each element belongs. The centroid is defined as the median over all
+elements for each dimension.
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters.
+
+nrows (input) int
+The number of rows in the gene expression data matrix, equal to the number of
+genes.
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix, equal to the number
+of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+clusterid (input) int[nrows] if transpose == 0
+ int[ncolumns] otherwise
+The cluster number to which each element belongs. If transpose == 0, then the
+dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
+is equal to ncolumns (the number of samples).
+
+cdata (output) double[nclusters][ncolumns] if transpose == 0
+ double[nrows][nclusters] otherwise
+On exit of getclustermedians, this array contains the cluster centroids.
+
+cmask (output) int[nclusters][ncolumns] if transpose == 0
+ int[nrows][nclusters] otherwise
+This array shows which data values are missing for each centroid. If
+cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
+a centroid if all corresponding data values of the cluster members are missing.
+
+transpose (input) int
+If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
+of columns (samples) are specified.
+
+cache (input) double[nrows] if transpose == 0
+ double[ncolumns] otherwise
+This array should be allocated before calling getclustermedians; its contents
+on input is not relevant. This array is used as a temporary storage space when
+calculating the medians.
+
+========================================================================
+*/
+{
+ int i, j, k;
+
+ if (transpose == 0) {
+ for (i = 0; i < nclusters; i++) {
+ for (j = 0; j < ncolumns; j++) {
+ int count = 0;
+ for (k = 0; k < nrows; k++) {
+ if (i == clusterid[k] && mask[k][j]) {
+ cache[count] = data[k][j];
+ count++;
+ }
+ }
+ if (count>0) {
+ cdata[i][j] = median(count, cache);
+ cmask[i][j] = 1;
+ }
+ else {
+ cdata[i][j] = 0.;
+ cmask[i][j] = 0;
+ }
+ }
+ }
+ }
+ else {
+ for (i = 0; i < nclusters; i++) {
+ for (j = 0; j < nrows; j++) {
+ int count = 0;
+ for (k = 0; k < ncolumns; k++) {
+ if (i == clusterid[k] && mask[j][k]) {
+ cache[count] = data[j][k];
+ count++;
+ }
+ }
+ if (count>0) {
+ cdata[j][i] = median(count, cache);
+ cmask[j][i] = 1;
+ }
+ else {
+ cdata[j][i] = 0.;
+ cmask[j][i] = 0;
+ }
+ }
+ }
+ }
+}
+
+/* ********************************************************************* */
+
+int
+getclustercentroids(int nclusters, int nrows, int ncolumns,
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
+ int transpose, char method)
+/*
+Purpose
+=======
+
+The getclustercentroids routine calculates the cluster centroids, given to
+which cluster each element belongs. Depending on the argument method, the
+centroid is defined as either the mean or the median for each dimension over
+all elements belonging to a cluster.
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters.
+
+nrows (input) int
+The number of rows in the gene expression data matrix, equal to the number of
+genes.
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix, equal to the number
+of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+clusterid (input) int[nrows] if transpose == 0
+ int[ncolumns] otherwise
+The cluster number to which each element belongs. If transpose == 0, then the
+dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
+is equal to ncolumns (the number of samples).
+
+cdata (output) double[nclusters][ncolumns] if transpose == 0
+ double[nrows][nclusters] otherwise
+On exit of getclustercentroids, this array contains the cluster centroids.
+
+cmask (output) int[nclusters][ncolumns] if transpose == 0
+ int[nrows][nclusters] otherwise
+This array shows which data values are missing for each centroid. If
+cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
+a centroid if all corresponding data values of the cluster members are missing.
+
+transpose (input) int
+If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
+of columns (samples) are specified.
+
+method (input) char
+For method == 'a', the centroid is defined as the mean over all elements
+belonging to a cluster for each dimension.
+For method == 'm', the centroid is defined as the median over all elements
+belonging to a cluster for each dimension.
+
+Return value
+============
+
+The function returns an integer to indicate success or failure. If a
+memory error occurs, or if method is not 'm' or 'a', getclustercentroids
+returns 0. If successful, getclustercentroids returns 1.
+========================================================================
+*/
+{
+ switch(method) {
+ case 'm': {
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ double* cache = malloc(nelements*sizeof(double));
+ if (!cache) return 0;
+ getclustermedians(nclusters, nrows, ncolumns, data, mask,
+ clusterid, cdata, cmask, transpose, cache);
+ free(cache);
+ return 1;
+ }
+ case 'a': {
+ getclustermeans(nclusters, nrows, ncolumns, data, mask,
+ clusterid, cdata, cmask, transpose);
+ return 1;
+ }
+ }
+ return 0;
+}
+
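+/*
+Usage sketch (hypothetical caller, for illustration only): computing mean
+centroids for clusters of rows. The cdata and cmask arrays are allocated by
+the caller, with nclusters rows and ncolumns columns when transpose == 0.
+
+    if (!getclustercentroids(nclusters, nrows, ncolumns, data, mask,
+                             clusterid, cdata, cmask, 0, 'a'))
+        return;   // memory-allocation error, or method was not 'a'/'m'
+*/
+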
+/* ********************************************************************* */
+
+void
+getclustermedoids(int nclusters, int nelements, double** distance,
+ int clusterid[], int centroids[], double errors[])
+/*
+Purpose
+=======
+
+The getclustermedoids routine calculates the cluster centroids, given to which
+cluster each element belongs. The centroid is defined as the element with the
+smallest sum of distances to the other elements.
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters.
+
+nelements (input) int
+The total number of elements.
+
+distance (input) double array, ragged
+ (number of rows is nelements, number of columns in each row equals the row index)
+The distance matrix. To save space, the distance matrix is given in the
+form of a ragged array. The distance matrix is symmetric and has zeros
+on the diagonal. See distancematrix for a description of the content.
+
+clusterid (input) int[nelements]
+The cluster number to which each element belongs.
+
+centroids (output) int[nclusters]
+The index of the element that functions as the centroid for each cluster.
+
+errors (output) double[nclusters]
+The within-cluster sum of distances between the items and the cluster
+centroid.
+
+========================================================================
+*/
+{
+ int i, j, k;
+
+ for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
+ for (i = 0; i < nelements; i++) {
+ double d = 0.0;
+ j = clusterid[i];
+ for (k = 0; k < nelements; k++) {
+ if (i == k || clusterid[k]!=j) continue;
+ d += (i < k ? distance[k][i] : distance[i][k]);
+ if (d > errors[j]) break;
+ }
+ if (d < errors[j]) {
+ errors[j] = d;
+ centroids[j] = i;
+ }
+ }
+}
+
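+/*
+Note: only the lower-triangular half of the symmetric distance matrix is
+stored, so the distance between items i and k (i != k) is read as
+
+    d = (i < k) ? distance[k][i] : distance[i][k];
+
+as in the loop above. Breaking out once d exceeds errors[j] is purely an
+optimization: that candidate can no longer become the medoid of cluster j.
+*/
+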
+/* ********************************************************************* */
+
+static int
+kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int transpose, int npass, char dist,
+ double** cdata, int** cmask, int clusterid[], double* error,
+ int tclusterid[], int counts[], int mapping[])
+{
+ int i, j, k;
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+ int ifound = 1;
+ int ipass = 0;
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ /* Save the clustering solution periodically and check if it reappears */
+ int* saved = malloc(nelements*sizeof(int));
+ if (saved == NULL) return -1;
+
+ *error = DBL_MAX;
+
+ do {
+ double total = DBL_MAX;
+ int counter = 0;
+ int period = 10;
+
+ /* Perform the EM algorithm.
+ * First, randomly assign elements to clusters. */
+ if (npass != 0) randomassign(nclusters, nelements, tclusterid);
+
+ for (i = 0; i < nclusters; i++) counts[i] = 0;
+ for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
+
+ /* Start the loop */
+ while (1) {
+ double previous = total;
+ total = 0.0;
+
+ if (counter % period == 0) {
+ /* Save the current cluster assignments */
+ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
+ if (period < INT_MAX / 2) period *= 2;
+ }
+ counter++;
+
+ /* Find the center */
+ getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
+ cdata, cmask, transpose);
+
+ for (i = 0; i < nelements; i++) {
+ double distance;
+ /* Calculate the distances */
+ k = tclusterid[i];
+ if (counts[k] == 1) continue;
+ /* No reassignment if that would lead to an empty cluster */
+ /* Treat the present cluster as a special case */
+ distance = metric(ndata, data, cdata, mask, cmask, weight,
+ i, k, transpose);
+ for (j = 0; j < nclusters; j++) {
+ double tdistance;
+ if (j == k) continue;
+ tdistance = metric(ndata, data, cdata, mask, cmask, weight,
+ i, j, transpose);
+ if (tdistance < distance) {
+ distance = tdistance;
+ counts[tclusterid[i]]--;
+ tclusterid[i] = j;
+ counts[j]++;
+ }
+ }
+ total += distance;
+ }
+ if (total >= previous) break;
+ /* total >= previous is FALSE on some machines even if total and
+ * previous are bitwise identical. */
+ for (i = 0; i < nelements; i++)
+ if (saved[i]!=tclusterid[i]) break;
+ if (i == nelements)
+ break; /* Identical solution found; break out of this loop */
+ }
+
+ if (npass <= 1) {
+ *error = total;
+ break;
+ }
+
+ for (i = 0; i < nclusters; i++) mapping[i] = -1;
+ for (i = 0; i < nelements; i++) {
+ j = tclusterid[i];
+ k = clusterid[i];
+ if (mapping[k] == -1) mapping[k] = j;
+ else if (mapping[k] != j) {
+ if (total < *error) {
+ ifound = 1;
+ *error = total;
+ for (j = 0; j < nelements; j++)
+ clusterid[j] = tclusterid[j];
+ }
+ break;
+ }
+ }
+ if (i == nelements) ifound++; /* break statement not encountered */
+ } while (++ipass < npass);
+
+ free(saved);
+ return ifound;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int
+kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int transpose, int npass, char dist,
+ double** cdata, int** cmask, int clusterid[], double* error,
+ int tclusterid[], int counts[], int mapping[], double cache[])
+{
+ int i, j, k;
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+ int ifound = 1;
+ int ipass = 0;
+ int* saved;
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ /* Save the clustering solution periodically and check if it reappears */
+ saved = malloc(nelements*sizeof(int));
+ if (saved == NULL) return -1;
+
+ *error = DBL_MAX;
+
+ do {
+ double total = DBL_MAX;
+ int counter = 0;
+ int period = 10;
+
+ /* Perform the EM algorithm.
+ * First, randomly assign elements to clusters. */
+ if (npass != 0) randomassign(nclusters, nelements, tclusterid);
+
+ for (i = 0; i < nclusters; i++) counts[i] = 0;
+ for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
+
+ /* Start the loop */
+ while (1) {
+ double previous = total;
+ total = 0.0;
+
+ if (counter % period == 0) {
+ /* Save the current cluster assignments */
+ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
+ if (period < INT_MAX / 2) period *= 2;
+ }
+ counter++;
+
+ /* Find the center */
+ getclustermedians(nclusters, nrows, ncolumns, data, mask,
+ tclusterid, cdata, cmask, transpose, cache);
+
+ for (i = 0; i < nelements; i++) {
+ /* Calculate the distances */
+ double distance;
+ k = tclusterid[i];
+ if (counts[k] == 1) continue;
+ /* No reassignment if that would lead to an empty cluster */
+ /* Treat the present cluster as a special case */
+ distance = metric(ndata, data, cdata, mask, cmask, weight,
+ i, k, transpose);
+ for (j = 0; j < nclusters; j++) {
+ double tdistance;
+ if (j == k) continue;
+ tdistance = metric(ndata, data, cdata, mask, cmask, weight,
+ i, j, transpose);
+ if (tdistance < distance) {
+ distance = tdistance;
+ counts[tclusterid[i]]--;
+ tclusterid[i] = j;
+ counts[j]++;
+ }
+ }
+ total += distance;
+ }
+ if (total >= previous) break;
+ /* total >= previous is FALSE on some machines even if total and
+ * previous are bitwise identical. */
+ for (i = 0; i < nelements; i++)
+ if (saved[i]!=tclusterid[i]) break;
+ if (i == nelements)
+ break; /* Identical solution found; break out of this loop */
+ }
+
+ if (npass <= 1) {
+ *error = total;
+ break;
+ }
+
+ for (i = 0; i < nclusters; i++) mapping[i] = -1;
+ for (i = 0; i < nelements; i++) {
+ j = tclusterid[i];
+ k = clusterid[i];
+ if (mapping[k] == -1) mapping[k] = j;
+ else if (mapping[k] != j) {
+ if (total < *error) {
+ ifound = 1;
+ *error = total;
+ for (j = 0; j < nelements; j++)
+ clusterid[j] = tclusterid[j];
+ }
+ break;
+ }
+ }
+ if (i == nelements) ifound++; /* break statement not encountered */
+ } while (++ipass < npass);
+
+ free(saved);
+ return ifound;
+}
+
+/* ********************************************************************* */
+
+void
+kcluster(int nclusters, int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int transpose, int npass, char method, char dist,
+ int clusterid[], double* error, int* ifound)
+/*
+Purpose
+=======
+
+The kcluster routine performs k-means or k-median clustering on a given set of
+elements, using the specified distance measure. The number of clusters is given
+by the user. Multiple passes are made to find the optimal clustering
+solution, each time starting from a different initial clustering.
+
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters to be found.
+
+data (input) double[nrows][ncolumns]
+The array containing the data of the elements to be clustered (i.e., the gene
+expression data).
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If
+mask[i][j] == 0, then data[i][j] is missing.
+
+nrows (input) int
+The number of rows in the data matrix, equal to the number of genes.
+
+ncolumns (input) int
+The number of columns in the data matrix, equal to the number of samples.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
+of the matrix are clustered.
+
+npass (input) int
+The number of times clustering is performed. Clustering is performed npass
+times, each time starting from a different (random) initial assignment of
+genes to clusters. The clustering solution with the lowest within-cluster sum
+of distances is chosen.
+If npass == 0, then the clustering algorithm will be run once, where the
+initial assignment of elements to clusters is taken from the clusterid array.
+
+method (input) char
+Defines whether the arithmetic mean (method == 'a') or the median
+(method == 'm') is used to calculate the cluster center.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+clusterid (output; input) int[nrows] if transpose == 0
+ int[ncolumns] otherwise
+The cluster number to which a gene or microarray was assigned. If npass == 0,
+then on input clusterid contains the initial clustering assignment from which
+the clustering algorithm starts. On output, it contains the clustering solution
+that was found.
+
+error (output) double*
+The sum of distances to the cluster center of each item in the optimal k-means
+clustering solution that was found.
+
+ifound (output) int*
+The number of times the optimal clustering solution was
+found. The value of ifound is at least 1; its maximum value is npass. If the
+number of clusters is larger than the number of elements being clustered,
+*ifound is set to 0 as an error code. If a memory allocation error occurs,
+*ifound is set to -1.
+
+========================================================================
+*/
+{
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+
+ int i;
+ int ok;
+ int* tclusterid;
+ int* mapping = NULL;
+ double** cdata;
+ int** cmask;
+ int* counts;
+
+ if (nelements < nclusters) {
+ *ifound = 0;
+ return;
+ }
+ /* More clusters asked for than elements available */
+
+ *ifound = -1;
+
+ /* This will contain the number of elements in each cluster, which is
+ * needed to check for empty clusters. */
+ counts = malloc(nclusters*sizeof(int));
+ if (!counts) return;
+
+ /* Find out if the user specified an initial clustering */
+ if (npass <= 1) tclusterid = clusterid;
+ else {
+ tclusterid = malloc(nelements*sizeof(int));
+ if (!tclusterid) {
+ free(counts);
+ return;
+ }
+ mapping = malloc(nclusters*sizeof(int));
+ if (!mapping) {
+ free(counts);
+ free(tclusterid);
+ return;
+ }
+ for (i = 0; i < nelements; i++) clusterid[i] = 0;
+ }
+
+ /* Allocate space to store the centroid data */
+ if (transpose == 0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
+ else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
+ if (!ok) {
+ free(counts);
+ if (npass>1) {
+ free(tclusterid);
+ free(mapping);
+ }
+ return;
+ }
+
+ if (method == 'm') {
+ double* cache = malloc(nelements*sizeof(double));
+ if (cache) {
+ *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
+ transpose, npass, dist, cdata, cmask, clusterid,
+ error, tclusterid, counts, mapping, cache);
+ free(cache);
+ }
+ }
+ else
+ *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
+ transpose, npass, dist, cdata, cmask, clusterid,
+ error, tclusterid, counts, mapping);
+
+ /* Deallocate temporarily used space */
+ if (npass > 1) {
+ free(mapping);
+ free(tclusterid);
+ }
+
+ if (transpose == 0) freedatamask(nclusters, cdata, cmask);
+ else freedatamask(ndata, cdata, cmask);
+
+ free(counts);
+}
+
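+/*
+Usage sketch (hypothetical caller, for illustration only; none of the
+variable names below are part of the library): k-means ('a') with Euclidean
+distance ('e') on the rows of a fully observed 2 x 3 matrix, using unit
+weights and 10 passes.
+
+    double row0[] = {0., 0., 0.}, row1[] = {9., 9., 9.};
+    double* data[] = {row0, row1};
+    int m0[] = {1, 1, 1}, m1[] = {1, 1, 1};
+    int* mask[] = {m0, m1};
+    double weight[] = {1., 1., 1.};
+    int clusterid[2];
+    double error;
+    int ifound;
+
+    kcluster(2, 2, 3, data, mask, weight, 0, 10, 'a', 'e',
+             clusterid, &error, &ifound);
+
+On success, ifound >= 1 and clusterid[] holds the best solution found.
+*/
+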
+/* *********************************************************************** */
+
+void
+kmedoids(int nclusters, int nelements, double** distmatrix, int npass,
+ int clusterid[], double* error, int* ifound)
+/*
+Purpose
+=======
+
+The kmedoids routine performs k-medoids clustering on a given set of elements,
+using the distance matrix and the number of clusters passed by the user.
+Multiple passes are made to find the optimal clustering solution, each time
+starting from a different initial clustering.
+
+
+Arguments
+=========
+
+nclusters (input) int
+The number of clusters to be found.
+
+nelements (input) int
+The number of elements to be clustered.
+
+distmatrix (input) double array, ragged
+ (number of rows is nelements, number of columns in each row equals the row index)
+The distance matrix. To save space, the distance matrix is given in the
+form of a ragged array. The distance matrix is symmetric and has zeros
+on the diagonal. See distancematrix for a description of the content.
+
+npass (input) int
+The number of times clustering is performed. Clustering is performed npass
+times, each time starting from a different (random) initial assignment of genes
+to clusters. The clustering solution with the lowest within-cluster sum of
+distances is chosen.
+If npass == 0, then the clustering algorithm will be run once, where the
+initial assignment of elements to clusters is taken from the clusterid array.
+
+clusterid (output; input) int[nelements]
+On input, if npass == 0, then clusterid contains the initial clustering
+assignment from which the clustering algorithm starts; all numbers in clusterid
+should be between zero and nelements-1 inclusive. If npass != 0, clusterid is
+ignored on input.
+On output, clusterid contains the clustering solution that was found: clusterid
+contains the number of the cluster to which each item was assigned. On output,
+the number of a cluster is defined as the item number of the centroid of the
+cluster.
+
+error (output) double*
+The sum of distances to the cluster center of each item in the optimal
+k-medoids clustering solution that was found.
+
+ifound (output) int*
+If kmedoids is successful: the number of times the optimal clustering solution
+was found. The value of ifound is at least 1; its maximum value is npass.
+If the user requested more clusters than elements available, ifound is set
+to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
+
+========================================================================
+*/
+{
+ int i, j, icluster;
+ int* tclusterid;
+ int* saved;
+ int* centroids;
+ double* errors;
+ int ipass = 0;
+
+ if (nelements < nclusters) {
+ *ifound = 0;
+ return;
+ } /* More clusters asked for than elements available */
+
+ *ifound = -1;
+
+ /* Save the clustering solution periodically and check if it reappears */
+ saved = malloc(nelements*sizeof(int));
+ if (saved == NULL) return;
+
+ centroids = malloc(nclusters*sizeof(int));
+ if (!centroids) {
+ free(saved);
+ return;
+ }
+
+ errors = malloc(nclusters*sizeof(double));
+ if (!errors) {
+ free(saved);
+ free(centroids);
+ return;
+ }
+
+ /* Find out if the user specified an initial clustering */
+ if (npass <= 1) tclusterid = clusterid;
+ else {
+ tclusterid = malloc(nelements*sizeof(int));
+ if (!tclusterid) {
+ free(saved);
+ free(centroids);
+ free(errors);
+ return;
+ }
+ for (i = 0; i < nelements; i++) clusterid[i] = -1;
+ }
+
+ *error = DBL_MAX;
+ do /* Start the loop */ {
+ double total = DBL_MAX;
+ int counter = 0;
+ int period = 10;
+
+ if (npass != 0) randomassign(nclusters, nelements, tclusterid);
+ while (1) {
+ double previous = total;
+ total = 0.0;
+
+ if (counter % period == 0) {
+ /* Save the current cluster assignments */
+ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
+ if (period < INT_MAX / 2) period *= 2;
+ }
+ counter++;
+
+ /* Find the center */
+ getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
+ centroids, errors);
+
+ for (i = 0; i < nelements; i++) {
+ /* Find the closest cluster */
+ double distance = DBL_MAX;
+ for (icluster = 0; icluster < nclusters; icluster++) {
+ double tdistance;
+ j = centroids[icluster];
+ if (i == j) {
+ distance = 0.0;
+ tclusterid[i] = icluster;
+ break;
+ }
+ tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
+ if (tdistance < distance) {
+ distance = tdistance;
+ tclusterid[i] = icluster;
+ }
+ }
+ total += distance;
+ }
+ if (total >= previous) break;
+ /* total >= previous is FALSE on some machines even if total and
+ * previous are bitwise identical. */
+ for (i = 0; i < nelements; i++)
+ if (saved[i] != tclusterid[i]) break;
+ if (i == nelements)
+ break; /* Identical solution found; break out of this loop */
+ }
+
+ if (npass <= 1) {
+ *ifound = 1;
+ *error = total;
+ /* Replace by the centroid in each cluster. */
+ for (j = 0; j < nelements; j++) {
+ clusterid[j] = centroids[tclusterid[j]];
+ }
+ break;
+ }
+
+ for (i = 0; i < nelements; i++) {
+ if (clusterid[i]!=centroids[tclusterid[i]]) {
+ if (total < *error) {
+ *ifound = 1;
+ *error = total;
+ /* Replace by the centroid in each cluster. */
+ for (j = 0; j < nelements; j++) {
+ clusterid[j] = centroids[tclusterid[j]];
+ }
+ }
+ break;
+ }
+ }
+ if (i == nelements) (*ifound)++; /* break statement not encountered */
+ } while (++ipass < npass);
+
+ /* Deallocate temporarily used space */
+ if (npass > 1) free(tclusterid);
+
+ free(saved);
+ free(centroids);
+ free(errors);
+}
+
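+/*
+Usage sketch (hypothetical caller, for illustration only): clustering three
+items into two clusters from a ragged lower-triangular distance matrix;
+row i has i columns, so row 0 carries no entries.
+
+    double r1[] = {1.0};        // d(1,0)
+    double r2[] = {8.0, 7.5};   // d(2,0), d(2,1)
+    double* distmatrix[] = {NULL, r1, r2};
+    int clusterid[3];
+    double error;
+    int ifound;
+
+    kmedoids(2, 3, distmatrix, 100, clusterid, &error, &ifound);
+
+Since npass != 0, clusterid is ignored on input; on output each entry holds
+the item number of its cluster's medoid.
+*/
+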
+/* ******************************************************************** */
+
+void
+distancematrix(int nrows, int ncolumns, double** data, int** mask,
+ double weights[], char dist, int transpose, double** matrix)
+/*
+Purpose
+=======
+
+The distancematrix routine calculates the distance matrix between genes or
+samples using their measured gene expression data. Several distance measures
+can be used. As the distance matrix is symmetric, with zeros on the diagonal,
+only the lower triangular half of the distance matrix is stored.
+Space for the distance matrix should be allocated before calling this routine.
+If the parameter transpose is set to a nonzero value, the distances between
+columns of the data matrix are calculated, otherwise distances between the rows
+are calculated.
+
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the gene expression data matrix (i.e., the number of
+genes)
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix (i.e., the number of
+samples)
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+weights (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+transpose (input) int
+If transpose is equal to zero, the distances between the rows are
+calculated. Otherwise, the distances between the columns are calculated.
+The former is needed when genes are being clustered; the latter is used
+when samples are being clustered.
+
+matrix (output) double**
+A ragged array in which the number of columns in each row equals the row
+index (so matrix[i] has i columns). Upon return, the values of
+the distance matrix are stored in this array.
+
+
+========================================================================
+*/
+{
+ /* First determine the size of the distance matrix */
+ const int n = (transpose == 0) ? nrows : ncolumns;
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+ int i, j;
+
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ /* Calculate the distances and save them in the ragged array */
+ for (i = 1; i < n; i++)
+ for (j = 0; j < i; j++)
+ matrix[i][j] = metric(ndata, data, data, mask, mask, weights,
+ i, j, transpose);
+}
+
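+/*
+Allocation sketch (hypothetical caller, for illustration only; error
+handling for malloc is omitted): setting up the ragged matrix expected by
+distancematrix for n = nrows row-row distances (transpose == 0). Row 0 has
+zero columns and is left NULL.
+
+    double** matrix = malloc(n*sizeof(double*));
+    int i;
+    matrix[0] = NULL;
+    for (i = 1; i < n; i++) matrix[i] = malloc(i*sizeof(double));
+
+    distancematrix(nrows, ncolumns, data, mask, weights, 'e', 0, matrix);
+*/
+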
+/* ******************************************************************** */
+
+double*
+calculate_weights(int nrows, int ncolumns, double** data, int** mask,
+ double weights[], int transpose, char dist, double cutoff, double exponent)
+
+/*
+Purpose
+=======
+
+This function calculates the weights using the weighting scheme proposed by
+Michael Eisen:
+w[i] = 1.0 / sum_{j where d[i][j] < cutoff} (1 - d[i][j]/cutoff)**exponent
+where the cutoff and the exponent are specified by the user.
+========================================================================
+*/
+
+ if (i >= 0) {
+ clusterid[i] = k;
+ j = i;
+ i = previous;
+ previous = j;
+ }
+ else {
+ j = -i-1;
+ if (previous == tree[j].left) {
+ previous = i;
+ i = tree[j].right;
+ if (j >= n && (i >= 0 || -i-1 < n)) k++;
+ }
+ else if (previous == tree[j].right) {
+ previous = i;
+ i = parents[j];
+ if (i == nelements) break;
+ }
+ else {
+ parents[j] = previous;
+ previous = i;
+ i = tree[j].left;
+ if (j >= n && (i >= 0 || -i-1 < n)) k++;
+ }
+ }
+ }
+ free(parents);
+ return 1;
+}
+
+/* ******************************************************************** */
+
+static Node*
+pclcluster(int nrows, int ncolumns, double** data, int** mask, double weight[],
+ double** distmatrix, char dist, int transpose)
+
+/*
+
+Purpose
+=======
+
+The pclcluster routine performs clustering using pairwise centroid-linking on a
+given set of gene expression data, using the distance metric given by dist.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the gene expression data matrix, equal to the number of
+genes.
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix, equal to the number
+of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If
+mask[i][j] == 0, then data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0;
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
+of the matrix are clustered.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+distmatrix (input) double**
+The distance matrix. This matrix is precalculated by the calling routine
+treecluster. The pclcluster routine modifies the contents of distmatrix, but
+does not deallocate it.
+
+Return value
+============
+
+A pointer to a newly allocated array of Node structs, describing the
+hierarchical clustering solution consisting of nelements-1 nodes. Depending
+on whether genes (rows) or samples (columns) were clustered, nelements is
+equal to nrows or ncolumns. See src/cluster.h for a description of the Node
+structure.
+If a memory error occurs, pclcluster returns NULL.
+========================================================================
+*/
+{
+ int i, j;
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ int inode;
+ const int ndata = transpose ? nrows : ncolumns;
+ const int nnodes = nelements - 1;
+ Node* result;
+ double** newdata;
+ int** newmask;
+ int* distid;
+
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ distid = malloc(nelements*sizeof(int));
+ if (!distid) return NULL;
+ result = malloc(nnodes*sizeof(Node));
+ if (!result) {
+ free(distid);
+ return NULL;
+ }
+ if (!makedatamask(nelements, ndata, &newdata, &newmask)) {
+ free(result);
+ free(distid);
+ return NULL;
+ }
+
+ for (i = 0; i < nelements; i++) distid[i] = i;
+ /* To remember which row/column in the distance matrix contains what */
+
+ /* Storage for node data */
+ if (transpose) {
+ for (i = 0; i < nelements; i++) {
+ for (j = 0; j < ndata; j++) {
+ newdata[i][j] = data[j][i];
+ newmask[i][j] = mask[j][i];
+ }
+ }
+ data = newdata;
+ mask = newmask;
+ }
+ else {
+ for (i = 0; i < nelements; i++) {
+ memcpy(newdata[i], data[i], ndata*sizeof(double));
+ memcpy(newmask[i], mask[i], ndata*sizeof(int));
+ }
+ data = newdata;
+ mask = newmask;
+ }
+
+ for (inode = 0; inode < nnodes; inode++) {
+ /* Find the pair with the shortest distance */
+ int is = 1;
+ int js = 0;
+ result[inode].distance = find_closest_pair(nelements-inode, distmatrix,
+ &is, &js);
+ result[inode].left = distid[js];
+ result[inode].right = distid[is];
+
+ /* Make node js the new node */
+ for (i = 0; i < ndata; i++) {
+ data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i];
+ mask[js][i] += mask[is][i];
+ if (mask[js][i]) data[js][i] /= mask[js][i];
+ }
+ free(data[is]);
+ free(mask[is]);
+ data[is] = data[nnodes-inode];
+ mask[is] = mask[nnodes-inode];
+
+ /* Fix the distances */
+ distid[is] = distid[nnodes-inode];
+ for (i = 0; i < is; i++)
+ distmatrix[is][i] = distmatrix[nnodes-inode][i];
+ for (i = is + 1; i < nnodes-inode; i++)
+ distmatrix[i][is] = distmatrix[nnodes-inode][i];
+
+ distid[js] = -inode-1;
+ for (i = 0; i < js; i++)
+ distmatrix[js][i] = metric(ndata, data, data, mask, mask, weight,
+ js, i, 0);
+ for (i = js + 1; i < nnodes-inode; i++)
+ distmatrix[i][js] = metric(ndata, data, data, mask, mask, weight,
+ js, i, 0);
+ }
+
+ /* Free temporarily allocated space */
+ free(data[0]);
+ free(mask[0]);
+ free(data);
+ free(mask);
+ free(distid);
+
+ return result;
+}
+
+/* ******************************************************************** */
+
+static int
+nodecompare(const void* a, const void* b)
+/* Helper function for qsort. */
+{
+ const Node* node1 = (const Node*)a;
+ const Node* node2 = (const Node*)b;
+ const double term1 = node1->distance;
+ const double term2 = node2->distance;
+
+ if (term1 < term2) return -1;
+ if (term1 > term2) return +1;
+ return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static Node*
+pslcluster(int nrows, int ncolumns, double** data, int** mask,
+ double weight[], double** distmatrix, char dist, int transpose)
+
+/*
+
+Purpose
+=======
+
+The pslcluster routine performs single-linkage hierarchical clustering, using
+either the distance matrix directly, if available, or by calculating the
+distances from the data array. This implementation is based on the SLINK
+algorithm, described in:
+Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
+cluster method. The Computer Journal, 16(1): 30-34.
+The output of this algorithm is identical to conventional single-linkage
+hierarchical clustering, but is much more memory-efficient and faster. Hence,
+it can be applied to large data sets, for which the conventional single-
+linkage algorithm fails due to lack of memory.
+
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the gene expression data matrix, equal to the number of
+genes.
+
+ncolumns (input) int
+The number of columns in the gene expression data matrix, equal to the number
+of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If
+mask[i][j] == 0, then data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
+of the matrix are clustered.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+distmatrix (input) double**
+The distance matrix. If the distance matrix is passed by the calling routine
+treecluster, it is used by pslcluster to speed up the clustering calculation.
+The pslcluster routine does not modify the contents of distmatrix, and does
+not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
+by the pslcluster routine from the gene expression data (the data and mask
+arrays) and stored in temporary arrays. If distmatrix is passed, the original
+gene expression data (specified by the data and mask arguments) are not needed
+and are therefore ignored.
+
+
+Return value
+============
+
+A pointer to a newly allocated array of Node structs, describing the
+hierarchical clustering solution consisting of nelements-1 nodes. Depending
+on whether genes (rows) or samples (columns) were clustered, nelements is
+equal to nrows or ncolumns. See src/cluster.h for a description of the Node
+structure.
+If a memory error occurs, pslcluster returns NULL.
+
+========================================================================
+*/
+{
+ int i, j, k;
+ const int nelements = transpose ? ncolumns : nrows;
+ const int nnodes = nelements - 1;
+ int* vector;
+ double* temp;
+ int* index;
+ Node* result;
+
+ temp = malloc(nnodes*sizeof(double));
+ if (!temp) return NULL;
+ index = malloc(nelements*sizeof(int));
+ if (!index) {
+ free(temp);
+ return NULL;
+ }
+ vector = malloc(nnodes*sizeof(int));
+ if (!vector) {
+ free(index);
+ free(temp);
+ return NULL;
+ }
+ result = malloc(nelements*sizeof(Node));
+ if (!result) {
+ free(vector);
+ free(index);
+ free(temp);
+ return NULL;
+ }
+
+ for (i = 0; i < nnodes; i++) vector[i] = i;
+
+ if (distmatrix) {
+        /* The distance matrix has nelements rows (ncolumns of them when
+         * columns are being clustered). */
+        for (i = 0; i < nelements; i++) {
+ result[i].distance = DBL_MAX;
+ for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
+ for (j = 0; j < i; j++) {
+ k = vector[j];
+ if (result[j].distance >= temp[j]) {
+ if (result[j].distance < temp[k])
+ temp[k] = result[j].distance;
+ result[j].distance = temp[j];
+ vector[j] = i;
+ }
+ else if (temp[j] < temp[k]) temp[k] = temp[j];
+ }
+ for (j = 0; j < i; j++) {
+ if (result[j].distance >= result[vector[j]].distance)
+ vector[j] = i;
+ }
+ }
+ }
+ else {
+ const int ndata = transpose ? nrows : ncolumns;
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ for (i = 0; i < nelements; i++) {
+ result[i].distance = DBL_MAX;
+ for (j = 0; j < i; j++) temp[j] =
+ metric(ndata, data, data, mask, mask, weight, i, j, transpose);
+ for (j = 0; j < i; j++) {
+ k = vector[j];
+ if (result[j].distance >= temp[j]) {
+ if (result[j].distance < temp[k])
+ temp[k] = result[j].distance;
+ result[j].distance = temp[j];
+ vector[j] = i;
+ }
+ else if (temp[j] < temp[k]) temp[k] = temp[j];
+ }
+ for (j = 0; j < i; j++)
+ if (result[j].distance >= result[vector[j]].distance)
+ vector[j] = i;
+ }
+ }
+ free(temp);
+
+ for (i = 0; i < nnodes; i++) result[i].left = i;
+ qsort(result, nnodes, sizeof(Node), nodecompare);
+
+ for (i = 0; i < nelements; i++) index[i] = i;
+ for (i = 0; i < nnodes; i++) {
+ j = result[i].left;
+ k = vector[j];
+ result[i].left = index[j];
+ result[i].right = index[k];
+ index[k] = -i-1;
+ }
+ free(vector);
+ free(index);
+
+    /* Shrink result to its final size of nnodes entries. If realloc fails,
+       keep the original block; its first nnodes entries are valid. */
+    {
+        Node* const tmp = realloc(result, nnodes*sizeof(Node));
+        if (tmp) result = tmp;
+    }
+
+ return result;
+}
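+
+/*
+The qsort/relabeling step above yields the same node encoding as the other
+linkage routines: child values >= 0 refer to original items, and a child
+value of -k-1 refers to node k, which was formed earlier (see the Node
+description in cluster.h).
+*/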
+/* ******************************************************************** */
+
+static Node*
+pmlcluster(int nelements, double** distmatrix)
+/*
+
+Purpose
+=======
+
+The pmlcluster routine performs clustering using pairwise maximum- (complete-)
+linking on the given distance matrix.
+
+Arguments
+=========
+
+nelements (input) int
+The number of elements to be clustered.
+
+distmatrix (input) double**
+The distance matrix, with nelements rows, each row being filled up to the
+diagonal. The elements on the diagonal are not used, as they are assumed to be
+zero. The distance matrix will be modified by this routine.
+
+Return value
+============
+
+A pointer to a newly allocated array of Node structs, describing the
+hierarchical clustering solution consisting of nelements-1 nodes. Depending on
+whether genes (rows) or samples (columns) were clustered, nelements is equal
+to nrows or ncolumns. See src/cluster.h for a description of the Node
+structure.
+If a memory error occurs, pmlcluster returns NULL.
+========================================================================
+*/
+{
+ int j;
+ int n;
+ int* clusterid;
+ Node* result;
+
+ clusterid = malloc(nelements*sizeof(int));
+ if (!clusterid) return NULL;
+ result = malloc((nelements-1)*sizeof(Node));
+ if (!result) {
+ free(clusterid);
+ return NULL;
+ }
+
+ /* Setup a list specifying to which cluster a gene belongs */
+ for (j = 0; j < nelements; j++) clusterid[j] = j;
+
+ for (n = nelements; n > 1; n--) {
+ int is = 1;
+ int js = 0;
+
+ result[nelements-n].distance = find_closest_pair(n, distmatrix,
+ &is, &js);
+
+ /* Fix the distances */
+ for (j = 0; j < js; j++)
+ distmatrix[js][j] = max(distmatrix[is][j], distmatrix[js][j]);
+ for (j = js+1; j < is; j++)
+ distmatrix[j][js] = max(distmatrix[is][j], distmatrix[j][js]);
+ for (j = is+1; j < n; j++)
+ distmatrix[j][js] = max(distmatrix[j][is], distmatrix[j][js]);
+
+ for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
+ for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
+
+ /* Update clusterids */
+ result[nelements-n].left = clusterid[is];
+ result[nelements-n].right = clusterid[js];
+ clusterid[js] = n-nelements-1;
+ clusterid[is] = clusterid[n-1];
+ }
+ free(clusterid);
+
+ return result;
+}
+
+/* ******************************************************************* */
+
+static Node*
+palcluster(int nelements, double** distmatrix)
+/*
+Purpose
+=======
+
+The palcluster routine performs clustering using pairwise average
+linking on the given distance matrix.
+
+Arguments
+=========
+
+nelements (input) int
+The number of elements to be clustered.
+
+distmatrix (input) double**
+The distance matrix, with nelements rows, each row being filled up to the
+diagonal. The elements on the diagonal are not used, as they are assumed to be
+zero. The distance matrix will be modified by this routine.
+
+Return value
+============
+
+A pointer to a newly allocated array of Node structs, describing the
+hierarchical clustering solution consisting of nelements-1 nodes. Depending on
+whether genes (rows) or samples (columns) were clustered, nelements is equal
+to nrows or ncolumns. See src/cluster.h for a description of the Node
+structure.
+If a memory error occurs, palcluster returns NULL.
+========================================================================
+*/
+{
+ int j;
+ int n;
+ int* clusterid;
+ int* number;
+ Node* result;
+
+ clusterid = malloc(nelements*sizeof(int));
+ if (!clusterid) return NULL;
+ number = malloc(nelements*sizeof(int));
+ if (!number) {
+ free(clusterid);
+ return NULL;
+ }
+ result = malloc((nelements-1)*sizeof(Node));
+ if (!result) {
+ free(clusterid);
+ free(number);
+ return NULL;
+ }
+
+ /* Setup a list specifying to which cluster a gene belongs, and keep track
+ * of the number of elements in each cluster (needed to calculate the
+ * average). */
+ for (j = 0; j < nelements; j++) {
+ number[j] = 1;
+ clusterid[j] = j;
+ }
+
+ for (n = nelements; n > 1; n--) {
+ int sum;
+ int is = 1;
+ int js = 0;
+ result[nelements-n].distance = find_closest_pair(n, distmatrix,
+ &is, &js);
+
+ /* Save result */
+ result[nelements-n].left = clusterid[is];
+ result[nelements-n].right = clusterid[js];
+
+ /* Fix the distances */
+ sum = number[is] + number[js];
+ for (j = 0; j < js; j++) {
+ distmatrix[js][j] = distmatrix[is][j]*number[is]
+ + distmatrix[js][j]*number[js];
+ distmatrix[js][j] /= sum;
+ }
+ for (j = js+1; j < is; j++) {
+ distmatrix[j][js] = distmatrix[is][j]*number[is]
+ + distmatrix[j][js]*number[js];
+ distmatrix[j][js] /= sum;
+ }
+ for (j = is+1; j < n; j++) {
+ distmatrix[j][js] = distmatrix[j][is]*number[is]
+ + distmatrix[j][js]*number[js];
+ distmatrix[j][js] /= sum;
+ }
+
+ for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
+ for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
+
+ /* Update number of elements in the clusters */
+ number[js] = sum;
+ number[is] = number[n-1];
+
+ /* Update clusterids */
+ clusterid[js] = n-nelements-1;
+ clusterid[is] = clusterid[n-1];
+ }
+ free(clusterid);
+ free(number);
+
+ return result;
+}
+
+/* ******************************************************************* */
+
+Node*
+treecluster(int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int transpose, char dist, char method,
+ double** distmatrix)
+/*
+Purpose
+=======
+
+The treecluster routine performs hierarchical clustering using pairwise
+single-, maximum-, centroid-, or average-linkage, as defined by method, on a
+given set of gene expression data, using the distance metric given by dist.
+If successful, the function returns a pointer to a newly allocated Tree struct
+containing the hierarchical clustering solution, and NULL if a memory error
+occurs. The pointer should be freed by the calling routine to prevent memory
+leaks.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the data matrix, equal to the number of genes.
+
+ncolumns (input) int
+The number of columns in the data matrix, equal to the number of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the data of the vectors to be clustered.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0,
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
+of the matrix are clustered.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+method (input) char
+Defines which hierarchical clustering method is used:
+method == 's': pairwise single-linkage clustering
+method == 'm': pairwise maximum- (or complete-) linkage clustering
+method == 'a': pairwise average-linkage clustering
+method == 'c': pairwise centroid-linkage clustering
+For the first three, either the distance matrix or the gene expression data is
+sufficient to perform the clustering algorithm. For pairwise centroid-linkage
+clustering, however, the gene expression data are always needed, even if the
+distance matrix itself is available.
+
+distmatrix (input) double**
+The distance matrix. If the distance matrix is zero initially, the distance
+matrix will be allocated and calculated from the data by treecluster, and
+deallocated before treecluster returns. If the distance matrix is passed by the
+calling routine, treecluster will modify the contents of the distance matrix as
+part of the clustering algorithm, but will not deallocate it. The calling
+routine should deallocate the distance matrix after the return from
+treecluster.
+
+Return value
+============
+
+A pointer to a newly allocated array of Node structs, describing the
+hierarchical clustering solution consisting of nelements-1 nodes. Depending on
+whether genes (rows) or samples (columns) were clustered, nelements is equal
+to nrows or ncolumns. See src/cluster.h for a description of the Node
+structure.
+If a memory error occurs, treecluster returns NULL.
+
+========================================================================
+*/
+{
+ Node* result = NULL;
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ const int ldistmatrix = (distmatrix == NULL && method != 's') ? 1 : 0;
+
+ if (nelements < 2) return NULL;
+
+ /* Calculate the distance matrix if the user didn't give it */
+ if (ldistmatrix) {
+ /* Set up the ragged array */
+ int i;
+ distmatrix = malloc(nelements*sizeof(double*));
+ if (distmatrix == NULL) return NULL; /* Not enough memory available */
+ distmatrix[0] = NULL;
+ for (i = 1; i < nelements; i++) {
+ distmatrix[i] = malloc(i*sizeof(double));
+ if (distmatrix[i] == NULL) /* Not enough memory available */ {
+ while (--i > 0) free(distmatrix[i]);
+ free(distmatrix);
+ return NULL;
+ }
+ }
+ distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose,
+ distmatrix);
+ }
+
+ switch(method) {
+ case 's':
+ result = pslcluster(nrows, ncolumns, data, mask, weight,
+ distmatrix, dist, transpose);
+ break;
+ case 'm':
+ result = pmlcluster(nelements, distmatrix);
+ break;
+ case 'a':
+ result = palcluster(nelements, distmatrix);
+ break;
+ case 'c':
+ result = pclcluster(nrows, ncolumns, data, mask, weight,
+ distmatrix, dist, transpose);
+ break;
+ }
+
+ /* Deallocate space for distance matrix if allocated by treecluster */
+ if (ldistmatrix) {
+ int i;
+ for (i = 1; i < nelements; i++) free(distmatrix[i]);
+ free(distmatrix);
+ }
+
+ return result;
+}
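+
+/*
+A minimal usage sketch (illustration only, not part of the library): cluster
+the rows of a 4 x 2 data set with pairwise maximum linkage and the Euclidean
+distance, letting treecluster allocate the distance matrix internally.
+
+    double row0[] = {0.0, 0.0}, row1[] = {0.1, 0.0},
+           row2[] = {5.0, 5.0}, row3[] = {5.1, 5.0};
+    double* data[] = {row0, row1, row2, row3};
+    int m0[] = {1, 1}, m1[] = {1, 1}, m2[] = {1, 1}, m3[] = {1, 1};
+    int* mask[] = {m0, m1, m2, m3};
+    double weight[] = {1.0, 1.0};
+    Node* tree = treecluster(4, 2, data, mask, weight, 0, 'e', 'm', NULL);
+    if (tree) {
+        // tree[0..2] describe the three merges; the caller owns the array.
+        free(tree);
+    }
+*/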
+
+/* ******************************************************************* */
+
+int
+sorttree(const int nnodes, Node* tree, const double order[], int indices[])
+/*
+Purpose
+=======
+
+The sorttree routine sorts the items in a hierarchical clustering solution
+based on their order values, while remaining consistent with the hierarchical
+clustering solution.
+
+Arguments
+=========
+
+nnodes (input) int
+The number of nodes in the hierarchical clustering tree.
+
+tree (input) Node[nnodes]
+The hierarchical clustering tree describing the clustering solution.
+
+order (input) double[nnodes+1]
+The preferred order of the items.
+
+indices (output) int[nnodes+1]
+The indices of the items after sorting; indices[i] is the item placed at
+position i in the sorted order.
+
+Return value
+============
+
+If no errors occur, sorttree returns 1.
+If a memory error occurs, sorttree returns 0.
+
+========================================================================
+*/
+
+{
+ int i;
+ int index;
+ int i1, i2;
+ double order1, order2;
+ int counts1, counts2;
+ int* nodecounts;
+
+ nodecounts = malloc(nnodes*sizeof(int));
+ if (!nodecounts) return 0;
+ if (order) {
+ double* nodeorder = malloc(nnodes*sizeof(double));
+ if (!nodeorder) {
+ free(nodecounts);
+ return 0;
+ }
+ for (i = 0; i < nnodes; i++) {
+ i1 = tree[i].left;
+ i2 = tree[i].right;
+ /* i1 and i2 are the elements that are to be joined */
+ if (i1 < 0) {
+ index = -i1-1;
+ order1 = nodeorder[index];
+ counts1 = nodecounts[index];
+ }
+ else {
+ order1 = order[i1];
+ counts1 = 1;
+ }
+ if (i2 < 0) {
+ index = -i2-1;
+ order2 = nodeorder[index];
+ counts2 = nodecounts[index];
+ }
+ else {
+ order2 = order[i2];
+ counts2 = 1;
+ }
+ if (order1 > order2) {
+ tree[i].left = i2;
+ tree[i].right = i1;
+ }
+ nodecounts[i] = counts1 + counts2;
+ nodeorder[i] = (counts1*order1+counts2*order2) / (counts1+counts2);
+ }
+ free(nodeorder);
+ }
+ else {
+ for (i = 0; i < nnodes; i++) {
+ i1 = tree[i].left;
+ i2 = tree[i].right;
+ /* i1 and i2 are the elements that are to be joined */
+ counts1 = (i1 < 0) ? nodecounts[-i1-1] : 1;
+ counts2 = (i2 < 0) ? nodecounts[-i2-1] : 1;
+ nodecounts[i] = counts1 + counts2;
+ }
+ }
+ i--;
+ nodecounts[i] = 0;
+ for ( ; i >= 0; i--) {
+ i1 = tree[i].left;
+ i2 = tree[i].right;
+ counts1 = (i1<0) ? nodecounts[-i1-1] : 1;
+ index = nodecounts[i];
+ if (i1 >= 0) indices[index] = i1;
+ else nodecounts[-i1-1] = index;
+ index += counts1;
+ if (i2 >= 0) indices[index] = i2;
+ else nodecounts[-i2-1] = index;
+ }
+ free(nodecounts);
+ return 1;
+}
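+
+/*
+A minimal usage sketch (illustration only, not part of the library), reusing
+the tree from the treecluster sketch above; for nnodes nodes, indices must
+hold nnodes+1 ints.
+
+    double order[] = {3.0, 0.0, 1.0, 2.0};   // preferred item order
+    int indices[4];
+    if (sorttree(3, tree, order, indices)) {
+        // indices[p] is the item placed at position p
+    }
+*/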
+
+/* ******************************************************************* */
+
+static void
+somworker(int nrows, int ncolumns, double** data, int** mask,
+ const double weights[], int transpose, int nxgrid, int nygrid,
+ double inittau, double*** celldata, int niter, char dist)
+
+{
+ const int nelements = (transpose == 0) ? nrows : ncolumns;
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+ int i, j;
+ int** dummymask;
+ int ix, iy;
+ int* index;
+ int iter;
+ /* Maximum radius in which nodes are adjusted */
+ double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid);
+ double* stddata = calloc(nelements, sizeof(double));
+
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ /* Calculate the standard deviation for each row or column */
+ if (transpose == 0) {
+ for (i = 0; i < nelements; i++) {
+ int n = 0;
+ for (j = 0; j < ndata; j++) {
+ if (mask[i][j]) {
+ double term = data[i][j];
+ term = term * term;
+ stddata[i] += term;
+ n++;
+ }
+ }
+ if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
+ else stddata[i] = 1;
+ }
+ }
+ else {
+ for (i = 0; i < nelements; i++) {
+ int n = 0;
+ for (j = 0; j < ndata; j++) {
+ if (mask[j][i]) {
+ double term = data[j][i];
+ term = term * term;
+ stddata[i] += term;
+ n++;
+ }
+ }
+ if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
+ else stddata[i] = 1;
+ }
+ }
+
+ if (transpose == 0) {
+ dummymask = malloc(nygrid*sizeof(int*));
+ for (i = 0; i < nygrid; i++) {
+ dummymask[i] = malloc(ndata*sizeof(int));
+ for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
+ }
+ }
+ else {
+ dummymask = malloc(ndata*sizeof(int*));
+ for (i = 0; i < ndata; i++) {
+ dummymask[i] = malloc(sizeof(int));
+ dummymask[i][0] = 1;
+ }
+ }
+
+ /* Randomly initialize the nodes */
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ double sum = 0.;
+ for (i = 0; i < ndata; i++) {
+ double term = -1.0 + 2.0*uniform();
+ celldata[ix][iy][i] = term;
+ sum += term * term;
+ }
+ sum = sqrt(sum/ndata);
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
+ }
+ }
+
+ /* Randomize the order in which genes or arrays will be used */
+ index = malloc(nelements*sizeof(int));
+ for (i = 0; i < nelements; i++) index[i] = i;
+ for (i = 0; i < nelements; i++) {
+ j = (int) (i + (nelements-i)*uniform());
+ ix = index[j];
+ index[j] = index[i];
+ index[i] = ix;
+ }
+
+ /* Start the iteration */
+ for (iter = 0; iter < niter; iter++) {
+ int ixbest = 0;
+ int iybest = 0;
+ int iobject = iter % nelements;
+ iobject = index[iobject];
+ if (transpose == 0) {
+ double closest = metric(ndata, data, celldata[ixbest], mask,
+ dummymask, weights, iobject, iybest,
+ transpose);
+ double radius = maxradius * (1. - ((double)iter)/((double)niter));
+ double tau = inittau * (1. - ((double)iter)/((double)niter));
+
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ double distance = metric(ndata, data, celldata[ix], mask,
+ dummymask, weights, iobject, iy,
+ transpose);
+ if (distance < closest) {
+ ixbest = ix;
+ iybest = iy;
+ closest = distance;
+ }
+ }
+ }
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest)) <
+ radius) {
+ double sum = 0.;
+ for (i = 0; i < ndata; i++) {
+ if (mask[iobject][i] == 0) continue;
+ celldata[ix][iy][i] +=
+ tau * (data[iobject][i]/stddata[iobject]
+ -celldata[ix][iy][i]);
+ }
+ for (i = 0; i < ndata; i++) {
+ double term = celldata[ix][iy][i];
+ term = term * term;
+ sum += term;
+ }
+ if (sum>0) {
+ sum = sqrt(sum/ndata);
+ for (i = 0; i < ndata; i++)
+ celldata[ix][iy][i] /= sum;
+ }
+ }
+ }
+ }
+ }
+ else {
+ double closest;
+ double** celldatavector = malloc(ndata*sizeof(double*));
+ double radius = maxradius * (1. - ((double)iter)/((double)niter));
+ double tau = inittau * (1. - ((double)iter)/((double)niter));
+
+ for (i = 0; i < ndata; i++)
+ celldatavector[i] = &(celldata[ixbest][iybest][i]);
+ closest = metric(ndata, data, celldatavector, mask, dummymask,
+ weights, iobject, 0, transpose);
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ double distance;
+ for (i = 0; i < ndata; i++)
+                        celldatavector[i] = &(celldata[ix][iy][i]);
+ distance = metric(ndata, data, celldatavector, mask,
+ dummymask, weights, iobject, 0,
+ transpose);
+ if (distance < closest) {
+ ixbest = ix;
+ iybest = iy;
+ closest = distance;
+ }
+ }
+ }
+ free(celldatavector);
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest)) <
+ radius) {
+ double sum = 0.;
+ for (i = 0; i < ndata; i++) {
+ if (mask[i][iobject] == 0) continue;
+ celldata[ix][iy][i] +=
+ tau * (data[i][iobject]/stddata[iobject]
+ -celldata[ix][iy][i]);
+ }
+ for (i = 0; i < ndata; i++) {
+ double term = celldata[ix][iy][i];
+ term = term * term;
+ sum += term;
+ }
+ if (sum>0) {
+ sum = sqrt(sum/ndata);
+ for (i = 0; i < ndata; i++)
+ celldata[ix][iy][i] /= sum;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (transpose == 0)
+ for (i = 0; i < nygrid; i++) free(dummymask[i]);
+ else
+ for (i = 0; i < ndata; i++) free(dummymask[i]);
+ free(dummymask);
+ free(stddata);
+ free(index);
+}
+
+/* ******************************************************************* */
+
+static void
+somassign(int nrows, int ncolumns, double** data, int** mask,
+ const double weights[], int transpose, int nxgrid, int nygrid,
+ double*** celldata, char dist, int clusterid[][2])
+/* Collect clusterids */
+{
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+ int i, j;
+
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ if (transpose == 0) {
+ int** dummymask = malloc(nygrid*sizeof(int*));
+ for (i = 0; i < nygrid; i++) {
+ dummymask[i] = malloc(ncolumns*sizeof(int));
+ for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
+ }
+ for (i = 0; i < nrows; i++) {
+ int ixbest = 0;
+ int iybest = 0;
+ double closest = metric(ndata, data, celldata[ixbest], mask,
+ dummymask, weights, i, iybest, transpose);
+ int ix, iy;
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ double distance = metric(ndata, data, celldata[ix], mask,
+ dummymask, weights, i, iy,
+ transpose);
+ if (distance < closest) {
+ ixbest = ix;
+ iybest = iy;
+ closest = distance;
+ }
+ }
+ }
+ clusterid[i][0] = ixbest;
+ clusterid[i][1] = iybest;
+ }
+ for (i = 0; i < nygrid; i++) free(dummymask[i]);
+ free(dummymask);
+ }
+ else {
+ double** celldatavector = malloc(ndata*sizeof(double*));
+ int** dummymask = malloc(nrows*sizeof(int*));
+ int ixbest = 0;
+ int iybest = 0;
+ for (i = 0; i < nrows; i++) {
+ dummymask[i] = malloc(sizeof(int));
+ dummymask[i][0] = 1;
+ }
+ for (i = 0; i < ncolumns; i++) {
+ double closest;
+ int ix, iy;
+ for (j = 0; j < ndata; j++)
+ celldatavector[j] = &(celldata[ixbest][iybest][j]);
+ closest = metric(ndata, data, celldatavector, mask, dummymask,
+ weights, i, 0, transpose);
+ for (ix = 0; ix < nxgrid; ix++) {
+ for (iy = 0; iy < nygrid; iy++) {
+ double distance;
+ for (j = 0; j < ndata; j++)
+ celldatavector[j] = &(celldata[ix][iy][j]);
+ distance = metric(ndata, data, celldatavector, mask,
+ dummymask, weights, i, 0, transpose);
+ if (distance < closest) {
+ ixbest = ix;
+ iybest = iy;
+ closest = distance;
+ }
+ }
+ }
+ clusterid[i][0] = ixbest;
+ clusterid[i][1] = iybest;
+ }
+ free(celldatavector);
+ for (i = 0; i < nrows; i++) free(dummymask[i]);
+ free(dummymask);
+ }
+}
+
+/* ******************************************************************* */
+
+void
+somcluster(int nrows, int ncolumns, double** data, int** mask,
+ const double weight[], int transpose, int nxgrid, int nygrid,
+ double inittau, int niter, char dist, double*** celldata,
+ int clusterid[][2])
+/*
+
+Purpose
+=======
+
+The somcluster routine implements a self-organizing map (Kohonen) on a
+rectangular grid, using a given set of vectors. The distance measure to be
+used to find the similarity between genes and nodes is given by dist.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows in the data matrix, equal to the number of genes.
+
+ncolumns (input) int
+The number of columns in the data matrix, equal to the number of samples.
+
+data (input) double[nrows][ncolumns]
+The array containing the gene expression data.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If
+mask[i][j] == 0, then data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0;
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+transpose (input) int
+If transpose == 0, the rows (genes) of the matrix are clustered. Otherwise,
+columns (samples) of the matrix are clustered.
+
+nxgrid (input) int
+The number of grid cells horizontally in the rectangular topology of clusters.
+
+nygrid (input) int
+The number of grid cells vertically in the rectangular topology of clusters.
+
+inittau (input) double
+The initial value of tau, representing the neighborhood function.
+
+niter (input) int
+The number of iterations to be performed.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+celldata (output) double[nxgrid][nygrid][ncolumns] if transpose == 0;
+ double[nxgrid][nygrid][nrows] otherwise
+The gene expression data for each node (cell) in the 2D grid. This can be
+interpreted as the centroid for the cluster corresponding to that cell. If
+celldata is NULL, then the centroids are not returned. If celldata is not
+NULL, enough space should be allocated to store the centroid data before
+calling somcluster.
+
+clusterid (output) int[nrows][2] if transpose == 0;
+ int[ncolumns][2] otherwise
+For each item (gene or microarray) that is clustered, the coordinates of the
+cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
+cluster assignments are not returned. If clusterid is not NULL, enough memory
+should be allocated to store the clustering information before calling
+somcluster.
+
+========================================================================
+*/
+{
+ const int nobjects = (transpose == 0) ? nrows : ncolumns;
+ const int ndata = (transpose == 0) ? ncolumns : nrows;
+ int i, j;
+ const int lcelldata = (celldata == NULL) ? 0 : 1;
+
+ if (nobjects < 2) return;
+
+ if (lcelldata == 0) {
+        /* Allocate the 3D array: nxgrid pointers to rows of nygrid
+         * pointers, each pointing to ndata doubles. */
+        celldata = malloc(nxgrid*sizeof(double**));
+        for (i = 0; i < nxgrid; i++) {
+            celldata[i] = malloc(nygrid*sizeof(double*));
+ for (j = 0; j < nygrid; j++)
+ celldata[i][j] = malloc(ndata*sizeof(double));
+ }
+ }
+
+ somworker(nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
+ inittau, celldata, niter, dist);
+ if (clusterid)
+ somassign(nrows, ncolumns, data, mask, weight, transpose,
+ nxgrid, nygrid, celldata, dist, clusterid);
+ if (lcelldata == 0) {
+ for (i = 0; i < nxgrid; i++)
+ for (j = 0; j < nygrid; j++)
+ free(celldata[i][j]);
+ for (i = 0; i < nxgrid; i++)
+ free(celldata[i]);
+ free(celldata);
+ }
+}
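+
+/*
+A minimal usage sketch (illustration only, not part of the library), with
+data, mask and weight set up as in the treecluster sketch above; 0.02 and
+1000 are example values for inittau and niter.
+
+    int clusterid[4][2];
+    somcluster(4, 2, data, mask, weight, 0,
+               3, 2,            // nxgrid, nygrid
+               0.02, 1000,      // inittau, niter
+               'e', NULL, clusterid);
+    // clusterid[i] holds the grid coordinates assigned to row i
+*/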
+
+/* ******************************************************************** */
+
+double
+clusterdistance(int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int n1, int n2, int index1[], int index2[],
+ char dist, char method, int transpose)
+
+/*
+Purpose
+=======
+
+The clusterdistance routine calculates the distance between two clusters
+containing genes or samples using the measured gene expression vectors. The
+distance between clusters, given the genes/samples in each cluster, can be
+defined in several ways. Several distance measures can be used.
+
+The routine returns the distance in double precision.
+If the parameter transpose is set to a nonzero value, the clusters are
+interpreted as clusters of samples; otherwise, as clusters of genes.
+
+Arguments
+=========
+
+nrows (input) int
+The number of rows (i.e., the number of genes) in the gene expression data
+matrix.
+
+ncolumns (input) int
+The number of columns (i.e., the number of samples) in the gene expression
+data matrix.
+
+data (input) double[nrows][ncolumns]
+The array containing the data of the vectors.
+
+mask (input) int[nrows][ncolumns]
+This array shows which data values are missing. If mask[i][j] == 0, then
+data[i][j] is missing.
+
+weight (input) double[ncolumns] if transpose == 0;
+ double[nrows] otherwise
+The weights that are used to calculate the distance. This is equivalent
+to including the jth data point weight[j] times in the calculation. The
+weights can be non-integer.
+
+n1 (input) int
+The number of elements in the first cluster.
+
+n2 (input) int
+The number of elements in the second cluster.
+
+index1 (input) int[n1]
+Identifies which genes/samples belong to the first cluster.
+
+index2 (input) int[n2]
+Identifies which genes/samples belong to the second cluster.
+
+dist (input) char
+Defines which distance measure is used, as given by the table:
+dist == 'e': Euclidean distance
+dist == 'b': City-block distance
+dist == 'c': correlation
+dist == 'a': absolute value of the correlation
+dist == 'u': uncentered correlation
+dist == 'x': absolute uncentered correlation
+dist == 's': Spearman's rank correlation
+dist == 'k': Kendall's tau
+For other values of dist, the default (Euclidean distance) is used.
+
+method (input) char
+Defines how the distance between two clusters is defined, given which genes
+belong to which cluster:
+method == 'a': the distance between the arithmetic means of the two clusters
+method == 'm': the distance between the medians of the two clusters
+method == 's': the smallest pairwise distance between members of the two
+ clusters
+method == 'x': the largest pairwise distance between members of the two
+ clusters
+method == 'v': average of the pairwise distances between members of the two
+ clusters
+
+transpose (input) int
+If transpose is equal to zero, the distances between the rows are
+calculated. Otherwise, the distances between the columns are calculated.
+The former is needed when genes are being clustered; the latter is used
+when samples are being clustered.
+
+========================================================================
+*/
+{
+ /* Set the metric function as indicated by dist */
+ double (*metric) (int, double**, double**, int**, int**,
+ const double[], int, int, int) = setmetric(dist);
+
+ /* if one or both clusters are empty, return */
+ if (n1 < 1 || n2 < 1) return -1.0;
+ /* Check the indices */
+ if (transpose == 0) {
+ int i;
+ for (i = 0; i < n1; i++) {
+ int index = index1[i];
+ if (index < 0 || index >= nrows) return -1.0;
+ }
+ for (i = 0; i < n2; i++) {
+ int index = index2[i];
+ if (index < 0 || index >= nrows) return -1.0;
+ }
+ }
+ else {
+ int i;
+ for (i = 0; i < n1; i++) {
+ int index = index1[i];
+ if (index < 0 || index >= ncolumns) return -1.0;
+ }
+ for (i = 0; i < n2; i++) {
+ int index = index2[i];
+ if (index < 0 || index >= ncolumns) return -1.0;
+ }
+ }
+
+ switch (method) {
+ case 'a': {
+ /* Find the center */
+ int i, j, k;
+ if (transpose == 0) {
+ double distance;
+ double* cdata[2];
+ int* cmask[2];
+ int* count[2];
+ count[0] = calloc(ncolumns, sizeof(int));
+ count[1] = calloc(ncolumns, sizeof(int));
+ cdata[0] = calloc(ncolumns, sizeof(double));
+ cdata[1] = calloc(ncolumns, sizeof(double));
+ cmask[0] = malloc(ncolumns*sizeof(int));
+ cmask[1] = malloc(ncolumns*sizeof(int));
+ for (i = 0; i < n1; i++) {
+ k = index1[i];
+ for (j = 0; j < ncolumns; j++)
+ if (mask[k][j] != 0) {
+ cdata[0][j] = cdata[0][j] + data[k][j];
+ count[0][j] = count[0][j] + 1;
+ }
+ }
+ for (i = 0; i < n2; i++) {
+ k = index2[i];
+ for (j = 0; j < ncolumns; j++)
+ if (mask[k][j] != 0) {
+ cdata[1][j] = cdata[1][j] + data[k][j];
+ count[1][j] = count[1][j] + 1;
+ }
+ }
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < ncolumns; j++) {
+ if (count[i][j]>0) {
+ cdata[i][j] = cdata[i][j] / count[i][j];
+ cmask[i][j] = 1;
+ }
+ else
+ cmask[i][j] = 0;
+ }
+ distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
+ 0, 1, 0);
+ for (i = 0; i < 2; i++) {
+ free(cdata[i]);
+ free(cmask[i]);
+ free(count[i]);
+ }
+ return distance;
+ }
+ else {
+ double distance;
+ int** count = malloc(nrows*sizeof(int*));
+ double** cdata = malloc(nrows*sizeof(double*));
+ int** cmask = malloc(nrows*sizeof(int*));
+ for (i = 0; i < nrows; i++) {
+ count[i] = calloc(2, sizeof(int));
+ cdata[i] = calloc(2, sizeof(double));
+ cmask[i] = malloc(2*sizeof(int));
+ }
+ for (i = 0; i < n1; i++) {
+ k = index1[i];
+ for (j = 0; j < nrows; j++) {
+ if (mask[j][k] != 0) {
+ cdata[j][0] += data[j][k];
+ count[j][0]++;
+ }
+ }
+ }
+ for (i = 0; i < n2; i++) {
+ k = index2[i];
+ for (j = 0; j < nrows; j++) {
+ if (mask[j][k] != 0) {
+ cdata[j][1] += data[j][k];
+ count[j][1]++;
+ }
+ }
+ }
+ for (i = 0; i < nrows; i++)
+ for (j = 0; j < 2; j++)
+ if (count[i][j]>0) {
+ cdata[i][j] /= count[i][j];
+ cmask[i][j] = 1;
+ }
+ else
+ cmask[i][j] = 0;
+ distance = metric(nrows, cdata, cdata, cmask, cmask, weight,
+ 0, 1, 1);
+ for (i = 0; i < nrows; i++) {
+ free(count[i]);
+ free(cdata[i]);
+ free(cmask[i]);
+ }
+ free(count);
+ free(cdata);
+ free(cmask);
+ return distance;
+ }
+ }
+ case 'm': {
+ int i, j, k;
+ if (transpose == 0) {
+ double distance;
+ double* temp = malloc(nrows*sizeof(double));
+ double* cdata[2];
+ int* cmask[2];
+ for (i = 0; i < 2; i++) {
+ cdata[i] = malloc(ncolumns*sizeof(double));
+ cmask[i] = malloc(ncolumns*sizeof(int));
+ }
+ for (j = 0; j < ncolumns; j++) {
+ int count = 0;
+ for (k = 0; k < n1; k++) {
+ i = index1[k];
+ if (mask[i][j]) {
+ temp[count] = data[i][j];
+ count++;
+ }
+ }
+ if (count>0) {
+ cdata[0][j] = median(count, temp);
+ cmask[0][j] = 1;
+ }
+ else {
+ cdata[0][j] = 0.;
+ cmask[0][j] = 0;
+ }
+ }
+ for (j = 0; j < ncolumns; j++) {
+ int count = 0;
+ for (k = 0; k < n2; k++) {
+ i = index2[k];
+ if (mask[i][j]) {
+ temp[count] = data[i][j];
+ count++;
+ }
+ }
+ if (count>0) {
+ cdata[1][j] = median(count, temp);
+ cmask[1][j] = 1;
+ }
+ else {
+ cdata[1][j] = 0.;
+ cmask[1][j] = 0;
+ }
+ }
+ distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
+ 0, 1, 0);
+ for (i = 0; i < 2; i++) {
+ free(cdata[i]);
+ free(cmask[i]);
+ }
+ free(temp);
+ return distance;
+ }
+ else {
+ double distance;
+ double* temp = malloc(ncolumns*sizeof(double));
+ double** cdata = malloc(nrows*sizeof(double*));
+ int** cmask = malloc(nrows*sizeof(int*));
+ for (i = 0; i < nrows; i++) {
+ cdata[i] = malloc(2*sizeof(double));
+ cmask[i] = malloc(2*sizeof(int));
+ }
+ for (j = 0; j < nrows; j++) {
+ int count = 0;
+ for (k = 0; k < n1; k++) {
+ i = index1[k];
+ if (mask[j][i]) {
+ temp[count] = data[j][i];
+ count++;
+ }
+ }
+ if (count>0) {
+ cdata[j][0] = median(count, temp);
+ cmask[j][0] = 1;
+ }
+ else {
+ cdata[j][0] = 0.;
+ cmask[j][0] = 0;
+ }
+ }
+ for (j = 0; j < nrows; j++) {
+ int count = 0;
+ for (k = 0; k < n2; k++) {
+ i = index2[k];
+ if (mask[j][i]) {
+ temp[count] = data[j][i];
+ count++;
+ }
+ }
+ if (count>0) {
+ cdata[j][1] = median(count, temp);
+ cmask[j][1] = 1;
+ }
+ else {
+ cdata[j][1] = 0.;
+ cmask[j][1] = 0;
+ }
+ }
+ distance = metric(nrows, cdata, cdata, cmask, cmask, weight,
+ 0, 1, 1);
+ for (i = 0; i < nrows; i++) {
+ free(cdata[i]);
+ free(cmask[i]);
+ }
+ free(cdata);
+ free(cmask);
+ free(temp);
+ return distance;
+ }
+ }
+ case 's': {
+ int i1, i2, j1, j2;
+ const int n = (transpose == 0) ? ncolumns : nrows;
+ double mindistance = DBL_MAX;
+ for (i1 = 0; i1 < n1; i1++)
+ for (i2 = 0; i2 < n2; i2++) {
+ double distance;
+ j1 = index1[i1];
+ j2 = index2[i2];
+ distance = metric(n, data, data, mask, mask, weight,
+ j1, j2, transpose);
+ if (distance < mindistance) mindistance = distance;
+ }
+ return mindistance;
+ }
+ case 'x': {
+ int i1, i2, j1, j2;
+ const int n = (transpose == 0) ? ncolumns : nrows;
+ double maxdistance = 0;
+ for (i1 = 0; i1 < n1; i1++)
+ for (i2 = 0; i2 < n2; i2++) {
+ double distance;
+ j1 = index1[i1];
+ j2 = index2[i2];
+ distance = metric(n, data, data, mask, mask, weight,
+ j1, j2, transpose);
+ if (distance > maxdistance) maxdistance = distance;
+ }
+ return maxdistance;
+ }
+ case 'v': {
+ int i1, i2, j1, j2;
+ const int n = (transpose == 0) ? ncolumns : nrows;
+ double distance = 0;
+ for (i1 = 0; i1 < n1; i1++)
+ for (i2 = 0; i2 < n2; i2++) {
+ j1 = index1[i1];
+ j2 = index2[i2];
+ distance += metric(n, data, data, mask, mask, weight,
+ j1, j2, transpose);
+ }
+ distance /= (n1*n2);
+ return distance;
+ }
+ }
+ /* Never get here */
+ return -2.0;
+}
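+
+/*
+A minimal usage sketch (illustration only, not part of the library): the
+distance between cluster {row 0, row 1} and cluster {row 2, row 3}, measured
+between the two cluster means ('a') with the Euclidean metric, with data,
+mask and weight as in the treecluster sketch above.
+
+    int index1[] = {0, 1};
+    int index2[] = {2, 3};
+    double d = clusterdistance(4, 2, data, mask, weight,
+                               2, 2, index1, index2, 'e', 'a', 0);
+    // a negative return value signals invalid input
+*/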
diff --git a/code/lib/Bio/Cluster/cluster.h b/code/lib/Bio/Cluster/cluster.h
new file mode 100644
index 0000000..fbbfd26
--- /dev/null
+++ b/code/lib/Bio/Cluster/cluster.h
@@ -0,0 +1,90 @@
+/******************************************************************************/
+/* The C Clustering Library.
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
+ *
+ * This library was written at the Laboratory of DNA Information Analysis,
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
+ * Contact: michiel.dehoon 'AT' riken.jp
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation with or without modifications and for any purpose and
+ * without fee is hereby granted, provided that any copyright notices
+ * appear in all copies and that both those copyright notices and this
+ * permission notice appear in supporting documentation, and that the
+ * names of the contributors or copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software
+ * without specific prior permission.
+ *
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef min
+#define min(x, y) ((x) < (y) ? (x) : (y))
+#endif
+#ifndef max
+#define max(x, y) ((x) > (y) ? (x) : (y))
+#endif
+
+#define CLUSTERVERSION "1.59"
+
+/* Chapter 2 */
+double clusterdistance(int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int n1, int n2, int index1[], int index2[], char dist,
+ char method, int transpose);
+void distancematrix(int ngenes, int ndata, double** data, int** mask,
+ double* weight, char dist, int transpose, double** distances);
+
+/* Chapter 3 */
+int getclustercentroids(int nclusters, int nrows, int ncolumns,
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
+ int transpose, char method);
+void getclustermedoids(int nclusters, int nelements, double** distance,
+ int clusterid[], int centroids[], double errors[]);
+void kcluster(int nclusters, int ngenes, int ndata, double** data,
+ int** mask, double weight[], int transpose, int npass, char method, char dist,
+ int clusterid[], double* error, int* ifound);
+void kmedoids(int nclusters, int nelements, double** distance,
+ int npass, int clusterid[], double* error, int* ifound);
+
+/* Chapter 4 */
+typedef struct {int left; int right; double distance;} Node;
+/*
+ * A Node struct describes a single node in a tree created by hierarchical
+ * clustering. The tree can be represented by an array of n Node structs,
+ * where n is the number of elements minus one. The integers left and right
+ * in each Node struct refer to the two elements or subnodes that are joined
+ * in this node. The original elements are numbered 0..nelements-1, and the
+ * nodes -1..-(nelements-1). For each node, distance contains the distance
+ * between the two subnodes that were joined.
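+ *
+ * For example, clustering four elements yields three nodes; a tree that
+ * first joins elements 0 and 1 (node 0), then elements 2 and 3 (node 1),
+ * and finally the two subnodes, could be stored as
+ *     {0, 1, d0}, {2, 3, d1}, {-1, -2, d2}
+ * where -1 refers to node 0 and -2 refers to node 1.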
+ */
+
+Node* treecluster(int nrows, int ncolumns, double** data, int** mask,
+ double weight[], int transpose, char dist, char method, double** distmatrix);
+int sorttree(const int nnodes, Node* tree, const double order[], int indices[]);
+int cuttree(int nelements, const Node* tree, int nclusters, int clusterid[]);
+
+/* Chapter 5 */
+void somcluster(int nrows, int ncolumns, double** data, int** mask,
+ const double weight[], int transpose, int nxnodes, int nynodes,
+ double inittau, int niter, char dist, double*** celldata,
+ int clusterid[][2]);
+
+/* Chapter 6 */
+int pca(int m, int n, double** u, double** v, double* w);
+
+/* Utility routines, currently undocumented */
+void sort(int n, const double data[], int index[]);
+double mean(int n, double x[]);
+double median (int n, double x[]);
+
+double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
+ double weights[], int transpose, char dist, double cutoff, double exponent);
diff --git a/code/lib/Bio/Cluster/clustermodule.c b/code/lib/Bio/Cluster/clustermodule.c
new file mode 100644
index 0000000..29b2a5c
--- /dev/null
+++ b/code/lib/Bio/Cluster/clustermodule.c
@@ -0,0 +1,2457 @@
+#include "Python.h"
+#include <math.h>
+#include <float.h>
+#include <string.h>
+#include "cluster.h"
+
+
+/* ========================================================================= */
+/* -- Helper routines ------------------------------------------------------ */
+/* ========================================================================= */
+
+static char
+extract_single_character(PyObject* object, const char variable[],
+ const char allowed[])
+{
+ Py_UCS4 ch;
+ Py_ssize_t n;
+ if (!PyUnicode_Check(object)) {
+ PyErr_Format(PyExc_ValueError, "%s should be a string", variable);
+ return 0;
+ }
+ if (PyUnicode_READY(object) == -1) return 0;
+ n = PyUnicode_GET_LENGTH(object);
+ if (n != 1) {
+ PyErr_Format(PyExc_ValueError,
+ "%s should be a single character", variable);
+ return 0;
+ }
+ ch = PyUnicode_READ_CHAR(object, 0);
+ if (ch < 128) {
+ const char c = ch;
+ if (strchr(allowed, c)) return c;
+ }
+ PyErr_Format(PyExc_ValueError,
+ "unknown %s function specified (should be one of '%s')",
+ variable, allowed);
+ return 0;
+}
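+
+/*
+For example (illustration only): given a Python string "e",
+extract_single_character(obj, "dist", "ebcauxsk") returns 'e'; for any
+string not in the allowed set it sets a ValueError and returns 0.
+*/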
+
+static int
+distance_converter(PyObject* object, void* pointer)
+{
+ char c;
+
+ c = extract_single_character(object, "dist", "ebcauxsk");
+ if (c == 0) return 0;
+ *((char*)pointer) = c;
+ return 1;
+}
+
+static int
+method_treecluster_converter(PyObject* object, void* pointer)
+{
+ char c;
+
+ c = extract_single_character(object, "method", "csma");
+ if (c == 0) return 0;
+ *((char*)pointer) = c;
+ return 1;
+}
+
+static int
+method_kcluster_converter(PyObject* object, void* pointer)
+{
+ char c;
+
+ c = extract_single_character(object, "method", "am");
+ if (c == 0) return 0;
+ *((char*)pointer) = c;
+ return 1;
+}
+
+static int
+method_clusterdistance_converter(PyObject* object, void* pointer)
+{
+ char c;
+
+ c = extract_single_character(object, "method", "amsxv");
+ if (c == 0) return 0;
+ *((char*)pointer) = c;
+ return 1;
+}
+
+/* -- data ----------------------------------------------------------------- */
+
+typedef struct {
+ int nrows;
+ int ncols;
+ double** values;
+ Py_buffer view;
+} Data;
+
+static int
+data_converter(PyObject* object, void* pointer)
+{
+ Data* data = pointer;
+ int nrows;
+ int ncols;
+ int i;
+ double** values = data->values;
+ Py_buffer* view = &data->view;
+ const char* p;
+ Py_ssize_t stride;
+ const int flag = PyBUF_ND | PyBUF_STRIDES;
+
+ if (object == NULL) goto exit;
+ if (object == Py_None) return 1;
+
+ if (PyObject_GetBuffer(object, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "data matrix has unexpected format.");
+ return 0;
+ }
+
+ if (view->ndim != 2) {
+ PyErr_Format(PyExc_RuntimeError,
+ "data matrix has incorrect rank %d (expected 2)",
+ view->ndim);
+ goto exit;
+ }
+ if (view->itemsize != sizeof(double)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "data matrix has incorrect data type");
+ goto exit;
+ }
+ nrows = (int) view->shape[0];
+ ncols = (int) view->shape[1];
+ if (nrows != view->shape[0] || ncols != view->shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "data matrix is too large (dimensions = %zd x %zd)",
+ view->shape[0], view->shape[1]);
+ goto exit;
+ }
+ if (nrows < 1 || ncols < 1) {
+ PyErr_SetString(PyExc_ValueError, "data matrix is empty");
+ goto exit;
+ }
+ stride = view->strides[0];
+ if (view->strides[1] != view->itemsize) {
+ PyErr_SetString(PyExc_RuntimeError, "data is not contiguous");
+ goto exit;
+ }
+ values = PyMem_Malloc(nrows*sizeof(double*));
+ if (!values) {
+ PyErr_NoMemory();
+ goto exit;
+ }
+ for (i = 0, p = view->buf; i < nrows; i++, p += stride)
+ values[i] = (double*)p;
+ data->values = values;
+ data->nrows = nrows;
+ data->ncols = ncols;
+ return Py_CLEANUP_SUPPORTED;
+
+exit:
+ if (values) PyMem_Free(values);
+ PyBuffer_Release(view);
+ return 0;
+}
+
+/* -- mask ----------------------------------------------------------------- */
+
+typedef struct {
+ int** values;
+ Py_buffer view;
+} Mask;
+
+static int
+mask_converter(PyObject* object, void* pointer)
+{
+ Mask* mask = pointer;
+ int nrows;
+ int ncols;
+ int i;
+ int** values = mask->values;
+ Py_buffer* view = &mask->view;
+ const char* p;
+ Py_ssize_t stride;
+ const int flag = PyBUF_ND | PyBUF_STRIDES;
+
+ if (object == NULL) goto exit;
+ if (object == Py_None) return 1;
+
+ if (PyObject_GetBuffer(object, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError, "mask has unexpected format.");
+ return 0;
+ }
+
+ if (view->ndim != 2) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect rank %d (expected 2)", view->ndim);
+ goto exit;
+ }
+ if (view->itemsize != sizeof(int)) {
+ PyErr_SetString(PyExc_RuntimeError, "mask has incorrect data type");
+ goto exit;
+ }
+ nrows = (int) view->shape[0];
+ ncols = (int) view->shape[1];
+ if (nrows != view->shape[0] || ncols != view->shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask is too large (dimensions = %zd x %zd)",
+ view->shape[0], view->shape[1]);
+ goto exit;
+ }
+ stride = view->strides[0];
+ if (view->strides[1] != view->itemsize) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is not contiguous");
+ goto exit;
+ }
+ values = PyMem_Malloc(nrows*sizeof(int*));
+ if (!values) {
+ PyErr_NoMemory();
+ goto exit;
+ }
+ for (i = 0, p = view->buf; i < nrows; i++, p += stride)
+ values[i] = (int*)p;
+ mask->values = values;
+ return Py_CLEANUP_SUPPORTED;
+
+exit:
+ if (values) PyMem_Free(values);
+ PyBuffer_Release(view);
+ return 0;
+}
+
+/* -- 1d array ------------------------------------------------------------- */
+
+static int
+vector_converter(PyObject* object, void* pointer)
+{
+ Py_buffer* view = pointer;
+ int ndata;
+ const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+
+ if (object == NULL) goto exit;
+
+ if (PyObject_GetBuffer(object, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
+ return 0;
+ }
+
+ if (view->ndim != 1) {
+ PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
+ view->ndim);
+ goto exit;
+ }
+ if (view->itemsize != sizeof(double)) {
+ PyErr_SetString(PyExc_RuntimeError, "array has incorrect data type");
+ goto exit;
+ }
+ ndata = (int) view->shape[0];
+ if (ndata != view->shape[0]) {
+ PyErr_Format(PyExc_ValueError,
+ "array is too large (size = %zd)", view->shape[0]);
+ goto exit;
+ }
+ return Py_CLEANUP_SUPPORTED;
+
+exit:
+ PyBuffer_Release(view);
+ return 0;
+}
+
+static int
+vector_none_converter(PyObject* object, void* pointer)
+{
+ if (object == Py_None) return 1;
+ return vector_converter(object, pointer);
+}
+
+/* -- clusterid ------------------------------------------------------------ */
+
+static int
+check_clusterid(Py_buffer clusterid, int nitems) {
+ int i, j;
+ int *p = clusterid.buf;
+ int nclusters = 0;
+ int* number;
+
+ if (nitems != clusterid.shape[0]) {
+ PyErr_Format(PyExc_ValueError, "incorrect size (%zd, expected %d)",
+ clusterid.shape[0], nitems);
+ return 0;
+ }
+ for (i = 0; i < nitems; i++) {
+ j = p[i];
+ if (j > nclusters) nclusters = j;
+ if (j < 0) {
+ PyErr_SetString(PyExc_ValueError, "negative cluster number found");
+ return 0;
+ }
+ }
+ nclusters++;
+ /* -- Count the number of items in each cluster --------------------- */
+ number = calloc(nclusters, sizeof(int));
+ if (!number) {
+ PyErr_NoMemory();
+ return 0;
+ }
+ for (i = 0; i < nitems; i++) {
+ j = p[i];
+ number[j]++;
+ }
+ for (j = 0; j < nclusters; j++) if (number[j] == 0) break;
+    free(number);   /* allocated with calloc, so release with free */
+ if (j < nclusters) {
+ PyErr_Format(PyExc_ValueError, "cluster %d is empty", j);
+ return 0;
+ }
+ return nclusters;
+}
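+
+/*
+For example, a clusterid buffer holding {0, 1, 1, 2} passes the checks and
+returns nclusters = 3, while {0, 2, 2} fails because cluster 1 is empty.
+*/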
+
+/* -- distance ----------------------------------------------------------- */
+
+typedef struct {
+ int n;
+ double** values;
+ Py_buffer* views;
+ Py_buffer view;
+} Distancematrix;
+
+static int
+_convert_list_to_distancematrix(PyObject* list, Distancematrix* distances)
+{
+ int i;
+ double** values;
+ Py_buffer* view;
+ Py_buffer* views;
+ const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+ const int n = (int) PyList_GET_SIZE(list);
+
+ if (n != PyList_GET_SIZE(list)) {
+ PyErr_SetString(PyExc_ValueError, "distance matrix is too large");
+ return 0;
+ }
+ values = PyMem_Malloc(n*sizeof(double*));
+ if (!values) {
+ PyErr_NoMemory();
+ return 0;
+ }
+ distances->values = values;
+ views = PyMem_Malloc(n*sizeof(Py_buffer));
+ if (!views) {
+ PyErr_NoMemory();
+ return 0;
+ }
+ view = views;
+ for (i = 0; i < n; i++, view++) {
+ PyObject* item = PyList_GET_ITEM(list, i);
+ view->len = -1;
+ if (PyObject_GetBuffer(item, view, flag) == -1) {
+ PyErr_Format(PyExc_RuntimeError, "failed to parse row %d.", i);
+ view--;
+ break;
+ }
+ if (view->ndim != 1) {
+ PyErr_Format(PyExc_ValueError,
+ "row %d has incorrect rank (%d expected 1)",
+ i, view->ndim);
+ break;
+ }
+ if (view->itemsize != sizeof(double)) {
+ PyErr_Format(PyExc_RuntimeError,
+ "row %d has incorrect data type", i);
+ break;
+ }
+ if (view->shape[0] != i) {
+ PyErr_Format(PyExc_RuntimeError,
+ "row %d has incorrect size %zd (expected %d)",
+ i, view->shape[0], i);
+ break;
+ }
+ values[i] = view->buf;
+ }
+ if (i < n) {
+ for ( ; view >= views; view--) PyBuffer_Release(view);
+ PyMem_Free(views);
+ return 0;
+ }
+ distances->n = n;
+ distances->view.len = 0;
+ distances->views = views;
+ distances->values = values;
+ return 1;
+}
+
+static int
+_convert_array_to_distancematrix(PyObject* array, Distancematrix* distances)
+{
+ int i;
+ int n;
+ double** values;
+ double* p;
+ Py_buffer* view = &distances->view;
+ const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+
+ if (PyObject_GetBuffer(array, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "distance matrix has unexpected format.");
+ return 0;
+ }
+
+ if (view->len == 0) {
+ PyBuffer_Release(view);
+ PyErr_SetString(PyExc_ValueError, "distance matrix is empty");
+ return 0;
+ }
+ if (view->itemsize != sizeof(double)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "distance matrix has an incorrect data type");
+ return 0;
+ }
+ if (view->ndim == 1) {
+ int m = (int) view->shape[0];
+ if (m != view->shape[0]) {
+ PyErr_Format(PyExc_ValueError,
+ "distance matrix is too large (size = %zd)",
+ view->shape[0]);
+ return 0;
+ }
+ n = (int)(1+sqrt(1+8*m)/2); /* rounds to (1+sqrt(1+8*m))/2 */
+ if (n*n-n != 2 * m) {
+ PyErr_SetString(PyExc_ValueError,
+ "distance matrix has unexpected size.");
+ return 0;
+ }
+ distances->n = n;
+ values = PyMem_Malloc(n*sizeof(double*));
+ if (!values) {
+ PyErr_NoMemory();
+ return 0;
+ }
+ distances->values = values;
+ for (p = view->buf, i = 0; i < n; p += i, i++) values[i] = p;
+ }
+ else if (view->ndim == 2) {
+ n = (int) view->shape[0];
+ if (n != view->shape[0]) {
+ PyErr_Format(PyExc_ValueError,
+ "distance matrix is too large (size = %zd)",
+ view->shape[0]);
+ return 0;
+ }
+ distances->n = n;
+ if (view->shape[1] != n) {
+ PyErr_SetString(PyExc_ValueError,
+ "distance matrix is not square.");
+ return 0;
+ }
+ values = PyMem_Malloc(n*sizeof(double*));
+ if (!values) {
+ PyErr_NoMemory();
+ return 0;
+ }
+ distances->values = values;
+ for (p = view->buf, i = 0; i < n; p += n, i++) values[i] = p;
+ }
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "distance matrix has incorrect rank %d (expected 1 or 2)",
+ view->ndim);
+ return 0;
+ }
+ return 1;
+}
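+
+/*
+Illustration: a 1-D buffer is read as the packed lower-triangular part of
+the matrix. For n items it must hold m = n*(n-1)/2 doubles (m = 6 for
+n = 4), stored as row 1 (one value), then row 2 (two values), and so on.
+*/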
+
+static int
+distancematrix_converter(PyObject* argument, void* pointer)
+{
+ Distancematrix* distances = pointer;
+ double** values;
+
+ if (argument == NULL) goto exit;
+ if (argument == Py_None) return 1;
+ if (PyList_Check(argument)) {
+ if (_convert_list_to_distancematrix(argument, distances))
+ return Py_CLEANUP_SUPPORTED;
+ }
+ else {
+ if (_convert_array_to_distancematrix(argument, distances))
+ return Py_CLEANUP_SUPPORTED;
+ }
+
+exit:
+ values = distances->values;
+ if (values == NULL) return 0;
+ else {
+ int i;
+ const int n = distances->n;
+ Py_buffer* views = distances->views;
+ if (views) {
+ for (i = 0; i < n; i++) PyBuffer_Release(&views[i]);
+ PyMem_Free(views);
+ }
+ else if (distances->view.len) {
+ PyBuffer_Release(&distances->view);
+ }
+ PyMem_Free(values);
+ }
+ return 0;
+}
+
+/* -- celldata ------------------------------------------------------------- */
+
+typedef struct {
+ int nx;
+ int ny;
+ int nz;
+ double*** values;
+ Py_buffer view;
+} Celldata;
+
+static int
+celldata_converter(PyObject* argument, void* pointer)
+{
+ int i, n;
+ double* p;
+ Celldata* celldata = pointer;
+ double*** ppp = celldata->values;
+ double** pp = ppp ? ppp[0] : NULL;
+ int nx;
+ int ny;
+ int nz;
+ Py_buffer* view = &celldata->view;
+ const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+
+ if (argument == NULL) goto exit;
+
+ if (PyObject_GetBuffer(argument, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "celldata array has unexpected format.");
+ return 0;
+ }
+
+ nx = (int) view->shape[0];
+ ny = (int) view->shape[1];
+ nz = (int) view->shape[2];
+ if (nx != view->shape[0] || ny != view->shape[1] || nz != view->shape[2]) {
+ PyErr_SetString(PyExc_RuntimeError, "celldata array too large");
+ goto exit;
+ }
+ if (view->itemsize != sizeof(double)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "celldata array has incorrect data type");
+ goto exit;
+ }
+ pp = PyMem_Malloc(nx*ny*sizeof(double*));
+ ppp = PyMem_Malloc(nx*sizeof(double**));
+ if (!pp || !ppp) {
+ PyErr_NoMemory();
+ goto exit;
+ }
+ p = view->buf;
+ n = nx * ny;
+ for (i = 0; i < n; i++, p += nz) pp[i] = p;
+ for (i = 0; i < nx; i++, pp += ny) ppp[i] = pp;
+ celldata->values = ppp;
+ celldata->nx = nx;
+ celldata->ny = ny;
+ celldata->nz = nz;
+ return Py_CLEANUP_SUPPORTED;
+
+exit:
+ if (pp) PyMem_Free(pp);
+ if (ppp) PyMem_Free(ppp);
+ PyBuffer_Release(view);
+ return 0;
+}
+
+
+/* -- index ---------------------------------------------------------------- */
+
+static int
+index_converter(PyObject* argument, void* pointer)
+{
+ Py_buffer* view = pointer;
+ int n;
+ const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+
+ if (argument == NULL) goto exit;
+
+ if (PyObject_GetBuffer(argument, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
+ return 0;
+ }
+
+ if (view->ndim != 1) {
+ PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
+ view->ndim);
+ goto exit;
+ }
+ if (view->itemsize != sizeof(int)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "argument has incorrect data type");
+ goto exit;
+ }
+ n = (int) view->shape[0];
+ if (n != view->shape[0]) {
+ PyErr_Format(PyExc_ValueError,
+ "array size is too large (size = %zd)", view->shape[0]);
+ goto exit;
+ }
+ return Py_CLEANUP_SUPPORTED;
+
+exit:
+ PyBuffer_Release(view);
+ return 0;
+}
+
+/* -- index2d ------------------------------------------------------------- */
+
+static int
+index2d_converter(PyObject* argument, void* pointer)
+{
+ Py_buffer* view = pointer;
+ int n;
+ const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
+
+ if (argument == NULL) goto exit;
+
+ if (PyObject_GetBuffer(argument, view, flag) == -1) {
+ PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
+ return 0;
+ }
+
+ if (view->ndim != 2) {
+ PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 2)",
+ view->ndim);
+ goto exit;
+ }
+ if (view->itemsize != sizeof(int)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "argument has incorrect data type");
+ goto exit;
+ }
+ n = (int) view->shape[0];
+ if (n != view->shape[0]) {
+ PyErr_Format(PyExc_ValueError,
+ "array size is too large (size = %zd)", view->shape[0]);
+ goto exit;
+ }
+ if (view->shape[1] != 2) {
+ PyErr_Format(PyExc_ValueError,
+ "array has %zd columns (expected 2)", view->shape[1]);
+ goto exit;
+ }
+ return Py_CLEANUP_SUPPORTED;
+
+exit:
+ PyBuffer_Release(view);
+ return 0;
+}
+
+/* ========================================================================= */
+/* -- Classes -------------------------------------------------------------- */
+/* ========================================================================= */
+
+typedef struct {
+ PyObject_HEAD
+ Node node;
+} PyNode;
+
+static int
+PyNode_init(PyNode *self, PyObject *args, PyObject *kwds)
+{
+ int left, right;
+ double distance = 0.0;
+ static char *kwlist[] = {"left", "right", "distance", NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|d", kwlist,
+ &left, &right, &distance))
+ return -1;
+ self->node.left = left;
+ self->node.right = right;
+ self->node.distance = distance;
+ return 0;
+}
+
+static PyObject*
+PyNode_repr(PyNode* self)
+{
+ char string[64];
+
+ sprintf(string, "(%d, %d): %g",
+ self->node.left, self->node.right, self->node.distance);
+ return PyUnicode_FromString(string);
+}
+
+static char PyNode_left__doc__[] =
+"integer representing the first member of this node";
+
+static PyObject*
+PyNode_getleft(PyNode* self, void* closure)
+{
+ int left = self->node.left;
+
+ return PyLong_FromLong((long)left);
+}
+
+static int
+PyNode_setleft(PyNode* self, PyObject* value, void* closure)
+{
+ long left = PyLong_AsLong(value);
+
+ if (PyErr_Occurred()) return -1;
+ self->node.left = (int) left;
+ return 0;
+}
+
+static char PyNode_right__doc__[] =
+"integer representing the second member of this node";
+
+static PyObject*
+PyNode_getright(PyNode* self, void* closure)
+{
+ int right = self->node.right;
+
+ return PyLong_FromLong((long)right);
+}
+
+static int
+PyNode_setright(PyNode* self, PyObject* value, void* closure)
+{
+ long right = PyLong_AsLong(value);
+
+ if (PyErr_Occurred()) return -1;
+ self->node.right = (int) right;
+ return 0;
+}
+
+static PyObject*
+PyNode_getdistance(PyNode* self, void* closure)
+{
+ return PyFloat_FromDouble(self->node.distance);
+}
+
+static int
+PyNode_setdistance(PyNode* self, PyObject* value, void* closure)
+{
+ const double distance = PyFloat_AsDouble(value);
+
+ if (PyErr_Occurred()) return -1;
+ self->node.distance = distance;
+ return 0;
+}
+
+static char PyNode_distance__doc__[] =
+"the distance between the two members of this node\n";
+
+static PyGetSetDef PyNode_getset[] = {
+ {"left",
+ (getter)PyNode_getleft,
+ (setter)PyNode_setleft,
+ PyNode_left__doc__, NULL},
+ {"right",
+ (getter)PyNode_getright,
+ (setter)PyNode_setright,
+ PyNode_right__doc__, NULL},
+ {"distance",
+ (getter)PyNode_getdistance,
+ (setter)PyNode_setdistance,
+ PyNode_distance__doc__, NULL},
+ {NULL} /* Sentinel */
+};
+
+static char PyNode_doc[] =
+"A Node object describes a single node in a hierarchical clustering tree.\n"
+"The integer attributes 'left' and 'right' represent the two members that\n"
+"make up this node; the floating point attribute 'distance' contains the\n"
+"distance between the two members of this node.\n";
+
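+/* A minimal usage sketch from Python, assuming the extension module is
+ * importable as _cluster (its name in the module definition below):
+ *
+ *     >>> from _cluster import Node
+ *     >>> node = Node(2, 3, 0.91)
+ *     >>> node
+ *     (2, 3): 0.91
+ *     >>> node.distance = 1.5
+ */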
+static PyTypeObject PyNodeType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "_cluster.Node", /* tp_name */
+ sizeof(PyNode), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ (reprfunc)PyNode_repr, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
+ PyNode_doc, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ PyNode_getset, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)PyNode_init, /* tp_init */
+};
+
+typedef struct {
+ PyObject_HEAD
+ Node* nodes;
+ int n;
+} PyTree;
+
+static void
+PyTree_dealloc(PyTree* self)
+{
+ if (self->n) PyMem_Free(self->nodes);
+ Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+static PyObject*
+PyTree_new(PyTypeObject *type, PyObject* args, PyObject* kwds)
+{
+ int i, j;
+ int n;
+ Node* nodes;
+ PyObject* arg = NULL;
+ int* flag;
+ PyTree* self;
+
+ self = (PyTree *)type->tp_alloc(type, 0);
+ if (!self) return NULL;
+
+ if (!PyArg_ParseTuple(args, "|O", &arg)) {
+ Py_DECREF(self);
+ return NULL;
+ }
+
+ if (arg == NULL) {
+ self->n = 0;
+ self->nodes = NULL;
+ return (PyObject*)self;
+ }
+
+ if (!PyList_Check(arg)) {
+ Py_DECREF(self);
+ PyErr_SetString(PyExc_TypeError,
+ "Argument should be a list of Node objects");
+ return NULL;
+ }
+
+ n = (int) PyList_GET_SIZE(arg);
+ if (n != PyList_GET_SIZE(arg)) {
+ Py_DECREF(self);
+ PyErr_Format(PyExc_ValueError,
+ "List is too large (size = %zd)", PyList_GET_SIZE(arg));
+ return NULL;
+ }
+ if (n < 1) {
+ Py_DECREF(self);
+ PyErr_SetString(PyExc_ValueError, "List is empty");
+ return NULL;
+ }
+ nodes = PyMem_Malloc(n*sizeof(Node));
+ if (!nodes) {
+ Py_DECREF(self);
+ return PyErr_NoMemory();
+ }
+ for (i = 0; i < n; i++) {
+ PyNode* p;
+ PyObject* row = PyList_GET_ITEM(arg, i);
+ if (!PyType_IsSubtype(Py_TYPE(row), &PyNodeType)) {
+ PyMem_Free(nodes);
+ Py_DECREF(self);
+ PyErr_Format(PyExc_TypeError,
+ "Row %d in list is not a Node object", i);
+ return NULL;
+ }
+ p = (PyNode*)row;
+ nodes[i] = p->node;
+ }
+ /* --- Check if this is a bona fide tree ------------------------------- */
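+    /* Each child reference is either an original item (index >= 0, flag
+     * slots n..2n, since n nodes join n+1 items) or a previously created
+     * node (a negative index j refers to node -j-1, flag slots 0..n-1).
+     * The tree is consistent if no child occurs twice and every node
+     * refers only to nodes created before it. */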
+ flag = PyMem_Malloc((2*n+1)*sizeof(int));
+ if (!flag) {
+ PyMem_Free(nodes);
+ Py_DECREF(self);
+ return PyErr_NoMemory();
+ }
+ for (i = 0; i < 2*n+1; i++) flag[i] = 0;
+ for (i = 0; i < n; i++) {
+ j = nodes[i].left;
+ if (j < 0) {
+ j = -j-1;
+ if (j >= i) break;
+ }
+ else j += n;
+ if (flag[j]) break;
+ flag[j] = 1;
+ j = nodes[i].right;
+ if (j < 0) {
+ j = -j-1;
+ if (j >= i) break;
+ }
+ else j += n;
+ if (flag[j]) break;
+ flag[j] = 1;
+ }
+ PyMem_Free(flag);
+ if (i < n) {
+ /* break encountered */
+ PyMem_Free(nodes);
+ Py_DECREF(self);
+ PyErr_SetString(PyExc_ValueError, "Inconsistent tree");
+ return NULL;
+ }
+ self->n = n;
+ self->nodes = nodes;
+ return (PyObject*)self;
+}
+
+static PyObject*
+PyTree_str(PyTree* self)
+{
+ int i;
+ const int n = self->n;
+ char string[128];
+ Node node;
+ PyObject* line;
+ PyObject* output;
+ PyObject* temp;
+
+ output = PyUnicode_FromString("");
+ for (i = 0; i < n; i++) {
+ node = self->nodes[i];
+ sprintf(string, "(%d, %d): %g", node.left, node.right, node.distance);
+ if (i < n-1) strcat(string, "\n");
+ line = PyUnicode_FromString(string);
+ if (!line) {
+ Py_DECREF(output);
+ return NULL;
+ }
+ temp = PyUnicode_Concat(output, line);
+ if (!temp) {
+ Py_DECREF(output);
+ Py_DECREF(line);
+ return NULL;
+ }
+ output = temp;
+ }
+ return output;
+}
+
+static Py_ssize_t
+PyTree_length(PyTree *self)
+{
+    return self->n;
+}
+
+static PyObject*
+PyTree_subscript(PyTree* self, PyObject* item)
+{
+ if (PyIndex_Check(item)) {
+ PyNode* result;
+ Py_ssize_t i;
+ i = PyNumber_AsSsize_t(item, PyExc_IndexError);
+ if (i == -1 && PyErr_Occurred())
+ return NULL;
+ if (i < 0)
+ i += self->n;
+ if (i < 0 || i >= self->n) {
+ PyErr_SetString(PyExc_IndexError, "tree index out of range");
+ return NULL;
+ }
+ result = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
+ if (!result) return PyErr_NoMemory();
+ result->node = self->nodes[i];
+ return (PyObject*) result;
+ }
+ else if (PySlice_Check(item)) {
+ Py_ssize_t i, j;
+ Py_ssize_t start, stop, step, slicelength;
+ if (PySlice_GetIndicesEx(item, self->n, &start, &stop, &step,
+ &slicelength) == -1) return NULL;
+ if (slicelength == 0) return PyList_New(0);
+ else {
+ PyNode* node;
+ PyObject* result = PyList_New(slicelength);
+ if (!result) return PyErr_NoMemory();
+ for (i = 0, j = start; i < slicelength; i++, j += step) {
+ node = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
+ if (!node) {
+ Py_DECREF(result);
+ return PyErr_NoMemory();
+ }
+ node->node = self->nodes[j];
+ PyList_SET_ITEM(result, i, (PyObject*)node);
+ }
+ return result;
+ }
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "tree indices must be integers, not %.200s",
+ item->ob_type->tp_name);
+ return NULL;
+ }
+}
+
+static PyMappingMethods PyTree_mapping = {
+ (lenfunc)PyTree_length, /* mp_length */
+ (binaryfunc)PyTree_subscript, /* mp_subscript */
+};
+
+static char PyTree_scale__doc__[] =
+"mytree.scale()\n"
+"\n"
+"Scale the node distances in the tree such that they are all between\n"
+"zero and one.\n";
+
+static PyObject*
+PyTree_scale(PyTree* self)
+{
+ int i;
+ const int n = self->n;
+ Node* nodes = self->nodes;
+    /* Distances are non-negative; note that DBL_MIN would be the smallest
+       positive double, not the most negative value. */
+    double maximum = 0.0;
+
+ for (i = 0; i < n; i++) {
+ double distance = nodes[i].distance;
+ if (distance > maximum) maximum = distance;
+ }
+ if (maximum != 0.0)
+ for (i = 0; i < n; i++) nodes[i].distance /= maximum;
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static char PyTree_cut__doc__[] =
+"mytree.cut(indices, nclusters) -> None\n"
+"\n"
+"Divide the elements in a hierarchical clustering result mytree into\n"
+"clusters, and store in the array indices the number of the cluster to\n"
+"which each element was assigned. The number of clusters is given by\n"
+"nclusters.\n";
+
+static PyObject*
+PyTree_cut(PyTree* self, PyObject* args)
+{
+ int ok = -1;
+ int nclusters;
+ const int n = self->n + 1;
+ Py_buffer indices = {0};
+
+ if (!PyArg_ParseTuple(args, "O&i",
+ index_converter, &indices, &nclusters)) goto exit;
+ if (nclusters < 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "requested number of clusters should be positive");
+ goto exit;
+ }
+ if (nclusters > n) {
+ PyErr_SetString(PyExc_ValueError,
+ "more clusters requested than items available");
+ goto exit;
+ }
+ if (indices.shape[0] != n) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "indices array inconsistent with tree");
+ goto exit;
+ }
+ ok = cuttree(n, self->nodes, nclusters, indices.buf);
+
+exit:
+ index_converter(NULL, &indices);
+ if (ok == -1) return NULL;
+ if (ok == 0) return PyErr_NoMemory();
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static char PyTree_sort__doc__[] =
+"mytree.sort(indices, order) -> None\n"
+"\n"
+"Sort a hierarchical clustering tree by switching the left and right\n"
+"subnode of nodes such that the elements in the left-to-right order of the\n"
+"tree tend to have increasing order values.\n"
+"\n"
+"Store in the array indices the indices of the elements in the\n"
+"left-to-right order in the hierarchical clustering tree, such that the\n"
+"element with index indices[i] occurs at position i in the dendrogram.\n";
+
+static PyObject*
+PyTree_sort(PyTree* self, PyObject* args)
+{
+ int ok = -1;
+ Py_buffer indices = {0};
+ const int n = self->n;
+ Py_buffer order = {0};
+
+ if (n == 0) {
+ PyErr_SetString(PyExc_ValueError, "tree is empty");
+ return NULL;
+ }
+ if (!PyArg_ParseTuple(args, "O&O&",
+ index_converter, &indices,
+ vector_converter, &order)) goto exit;
+ if (indices.shape[0] != n + 1) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "indices array inconsistent with tree");
+ goto exit;
+ }
+ if (order.shape[0] != n + 1) {
+ PyErr_Format(PyExc_ValueError,
+ "order array has incorrect size %zd (expected %d)",
+ order.shape[0], n + 1);
+ goto exit;
+ }
+ ok = sorttree(n, self->nodes, order.buf, indices.buf);
+exit:
+ index_converter(NULL, &indices);
+ vector_converter(NULL, &order);
+ if (ok == -1) return NULL;
+ if (ok == 0) return PyErr_NoMemory();
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static PyMethodDef PyTree_methods[] = {
+ {"scale", (PyCFunction)PyTree_scale, METH_NOARGS, PyTree_scale__doc__},
+ {"cut", (PyCFunction)PyTree_cut, METH_VARARGS, PyTree_cut__doc__},
+ {"sort", (PyCFunction)PyTree_sort, METH_VARARGS, PyTree_sort__doc__},
+ {NULL} /* Sentinel */
+};
+
+static char PyTree_doc[] =
+"Tree objects store a hierarchical clustering solution.\n"
+"Individual nodes in the tree can be accessed with tree[i], where i is\n"
+"an integer. Whereas the tree itself is a read-only object, tree[:]\n"
+"returns a list of all the nodes, which can then be modified. To create\n"
+"a new Tree from this list, use Tree(list).\n"
+"See the description of the Node class for more information.";
+
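+/* A usage sketch (hypothetical node values), assuming the extension
+ * module is importable as _cluster:
+ *
+ *     >>> from _cluster import Tree, Node
+ *     >>> tree = Tree([Node(0, 1, 0.2), Node(-1, 2, 0.5)])
+ *     >>> print(tree)
+ *     (0, 1): 0.2
+ *     (-1, 2): 0.5
+ *     >>> nodes = tree[:]    # the read-only tree as a mutable node list
+ */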
+static PyTypeObject PyTreeType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "_cluster.Tree", /* tp_name */
+ sizeof(PyTree), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)PyTree_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ &PyTree_mapping, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ (reprfunc)PyTree_str, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
+ PyTree_doc, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ PyTree_methods, /* tp_methods */
+ NULL, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ (newfunc)PyTree_new, /* tp_new */
+};
+
+/* ========================================================================= */
+/* -- Methods -------------------------------------------------------------- */
+/* ========================================================================= */
+
+/* version */
+static char version__doc__[] =
+"version() -> string\n"
+"\n"
+"Return the version number of the C Clustering Library as a string.\n";
+
+static PyObject*
+py_version(PyObject* self)
+{
+    return PyUnicode_FromString(CLUSTERVERSION);
+}
+
+/* kcluster */
+static char kcluster__doc__[] =
+"kcluster(data, nclusters, mask, weight, transpose, npass, method,\n"
+" dist, clusterid) -> None\n"
+"\n"
+"This function implements k-means clustering.\n"
+"\n"
+"Arguments:\n"
+"\n"
+" - data: nrows x ncols array containing the data to be clustered\n"
+"\n"
+" - nclusters: number of clusters (the 'k' in k-means)\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+" missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
+"\n"
+" - weight: the weights to be used when calculating distances\n"
+" - transpose:\n"
+"\n"
+" - if equal to 0, rows are clustered;\n"
+" - if equal to 1, columns are clustered.\n"
+"\n"
+" - npass: number of times the k-means clustering algorithm is\n"
+" performed, each time with a different (random) initial\n"
+" condition. If npass == 0, then the assignments in clusterid\n"
+" are used as the initial condition.\n"
+"\n"
+" - method: specifies how the center of a cluster is found:\n"
+"\n"
+" - method == 'a': arithmetic mean\n"
+" - method == 'm': median\n"
+"\n"
+" - dist: specifies the distance function to be used:\n"
+"\n"
+" - dist == 'e': Euclidean distance\n"
+" - dist == 'b': City Block distance\n"
+" - dist == 'c': Pearson correlation\n"
+" - dist == 'a': absolute value of the correlation\n"
+" - dist == 'u': uncentered correlation\n"
+" - dist == 'x': absolute uncentered correlation\n"
+" - dist == 's': Spearman's rank correlation\n"
+" - dist == 'k': Kendall's tau\n"
+"\n"
+" - clusterid: array in which the final clustering solution will be\n"
+" stored (output variable). If npass == 0, then clusterid is also used\n"
+" as an input variable, containing the initial condition from which\n"
+" the EM algorithm should start. In this case, the k-means algorithm\n"
+" is fully deterministic.\n"
+"\n";
+
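+/* A usage sketch (hypothetical data), assuming the extension module is
+ * importable as _cluster, that the buffers are C-contiguous numpy arrays
+ * of the dtypes the converters expect (float64 data/weight, C int
+ * mask/clusterid), and that the method/dist converters defined earlier
+ * in this file accept one-character strings:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import kcluster
+ *     >>> data = np.random.rand(5, 3)
+ *     >>> mask = np.ones((5, 3), dtype=np.intc)
+ *     >>> weight = np.ones(3)                      # one weight per column
+ *     >>> clusterid = np.zeros(5, dtype=np.intc)   # output buffer
+ *     >>> error, nfound = kcluster(data, 2, mask, weight, 0, 10,
+ *     ...                          'a', 'e', clusterid)
+ */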
+static PyObject*
+py_kcluster(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ int nclusters = 2;
+ int nrows, ncols;
+ int nitems;
+ int ndata;
+ Data data = {0};
+ Mask mask = {0};
+ Py_buffer weight = {0};
+ int transpose = 0;
+ int npass = 1;
+ char method = 'a';
+ char dist = 'e';
+ Py_buffer clusterid = {0};
+ double error;
+ int ifound = 0;
+
+ static char* kwlist[] = {"data",
+ "nclusters",
+ "mask",
+ "weight",
+ "transpose",
+ "npass",
+ "method",
+ "dist",
+ "clusterid",
+ NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iO&O&iiO&O&O&", kwlist,
+ data_converter, &data,
+ &nclusters,
+ mask_converter, &mask,
+ vector_converter, &weight,
+ &transpose,
+ &npass,
+ method_kcluster_converter, &method,
+ distance_converter, &dist,
+ index_converter, &clusterid)) return NULL;
+ if (!data.values) {
+ PyErr_SetString(PyExc_RuntimeError, "data is None");
+ goto exit;
+ }
+ if (!mask.values) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is None");
+ goto exit;
+ }
+ if (data.nrows != mask.view.shape[0] ||
+ data.ncols != mask.view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect dimensions %zd x %zd (expected %d x %d)",
+ mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ ndata = transpose ? nrows : ncols;
+ nitems = transpose ? ncols : nrows;
+ if (weight.shape[0] != ndata) {
+ PyErr_Format(PyExc_ValueError,
+ "weight has incorrect size %zd (expected %d)",
+ weight.shape[0], ndata);
+ goto exit;
+ }
+ if (nclusters < 1) {
+ PyErr_SetString(PyExc_ValueError, "nclusters should be positive");
+ goto exit;
+ }
+ if (nitems < nclusters) {
+ PyErr_SetString(PyExc_ValueError,
+ "more clusters than items to be clustered");
+ goto exit;
+ }
+ if (npass < 0) {
+ PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
+ goto exit;
+ }
+ else if (npass == 0) {
+ int n = check_clusterid(clusterid, nitems);
+ if (n == 0) goto exit;
+ if (n != nclusters) {
+ PyErr_SetString(PyExc_ValueError,
+ "more clusters requested than found in clusterid");
+ goto exit;
+ }
+ }
+ kcluster(nclusters,
+ nrows,
+ ncols,
+ data.values,
+ mask.values,
+ weight.buf,
+ transpose,
+ npass,
+ method,
+ dist,
+ clusterid.buf,
+ &error,
+ &ifound);
+exit:
+ data_converter(NULL, &data);
+ mask_converter(NULL, &mask);
+ vector_converter(NULL, &weight);
+ index_converter(NULL, &clusterid);
+ if (ifound) return Py_BuildValue("di", error, ifound);
+ return NULL;
+}
+/* end of wrapper for kcluster */
+
+/* kmedoids */
+static char kmedoids__doc__[] =
+"kmedoids(distance, nclusters, npass, clusterid) -> error, nfound\n"
+"\n"
+"This function implements k-medoids clustering.\n"
+"\n"
+"Arguments:\n"
+" - distance: The distance matrix between the elements. There are three\n"
+" ways in which you can pass a distance matrix:\n"
+"\n"
+" 1. a 2D Numerical Python array (in which only the left-lower\n"
+" part of the array will be accessed);\n"
+" 2. a 1D Numerical Python array containing the distances\n"
+" consecutively;\n"
+" 3. a list of rows containing the lower-triangular part of\n"
+" the distance matrix.\n"
+"\n"
+" Examples are:\n"
+"\n"
+" >>> from numpy import array\n"
+" >>> distance = array([[0.0, 1.1, 2.3],\n"
+" ... [1.1, 0.0, 4.5],\n"
+" ... [2.3, 4.5, 0.0]])\n"
+" >>> # (option #1)\n"
+" >>> distance = array([1.1, 2.3, 4.5])\n"
+" >>> # (option #2)\n"
+" >>> distance = [array([]),\n"
+" ... array([1.1]),\n"
+" ... array([2.3, 4.5])]\n"
+" >>> # (option #3)\n"
+"\n"
+" These three correspond to the same distance matrix.\n"
+"\n"
+" - nclusters: number of clusters (the 'k' in k-medoids)\n"
+"\n"
+" - npass: number of times the k-medoids clustering algorithm is\n"
+" performed, each time with a different (random) initial\n"
+" condition. If npass == 0, then the assignments in clusterid\n"
+" are used as the initial condition.\n"
+"\n"
+" - clusterid: array in which the final clustering solution will be\n"
+" stored (output variable). If npass == 0, then clusterid is also used\n"
+" as an input variable, containing the initial condition from which\n"
+" the EM algorithm should start. In this case, the k-medoids algorithm\n"
+" is fully deterministic.\n"
+"\n"
+"Return values:\n"
+" - error: the within-cluster sum of distances for the returned k-means\n"
+" clustering solution;\n"
+" - nfound: the number of times this solution was found.\n";
+
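+/* A usage sketch (hypothetical distances), assuming a condensed 1D numpy
+ * distance matrix for three items and a C int output buffer:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import kmedoids
+ *     >>> distance = np.array([1.1, 2.3, 4.5])
+ *     >>> clusterid = np.zeros(3, dtype=np.intc)
+ *     >>> error, nfound = kmedoids(distance, 2, 10, clusterid)
+ */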
+static PyObject*
+py_kmedoids(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ int nclusters = 2;
+ Distancematrix distances = {0};
+ Py_buffer clusterid = {0};
+ int npass = 1;
+ double error;
+ int ifound = -2;
+
+ static char* kwlist[] = {"distance",
+ "nclusters",
+ "npass",
+ "clusterid",
+ NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iiO&", kwlist,
+ distancematrix_converter, &distances,
+ &nclusters,
+ &npass,
+ index_converter, &clusterid)) return NULL;
+ if (npass < 0) {
+ PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
+ goto exit;
+ }
+ else if (npass == 0) {
+ int n = check_clusterid(clusterid, distances.n);
+ if (n == 0) goto exit;
+ if (n != nclusters) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "more clusters requested than found in clusterid");
+ goto exit;
+ }
+ }
+ if (nclusters <= 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "nclusters should be a positive integer");
+ goto exit;
+ }
+ if (distances.n < nclusters) {
+ PyErr_SetString(PyExc_ValueError,
+ "more clusters requested than items to be clustered");
+ goto exit;
+ }
+ kmedoids(nclusters,
+ distances.n,
+ distances.values,
+ npass,
+ clusterid.buf,
+ &error,
+ &ifound);
+
+exit:
+ distancematrix_converter(NULL, &distances);
+ index_converter(NULL, &clusterid);
+ switch (ifound) {
+ case -2:
+ return NULL;
+ case -1:
+ return PyErr_NoMemory();
+ case 0: /* should not occur */
+ PyErr_SetString(PyExc_RuntimeError,
+ "error in kmedoids input arguments");
+ return NULL;
+ default:
+ return Py_BuildValue("di", error, ifound);
+ }
+}
+/* end of wrapper for kmedoids */
+
+/* treecluster */
+static char treecluster__doc__[] =
+"treecluster(tree, data, mask, weight, transpose, dist, method,\n"
+" distancematrix) -> None\n"
+"\n"
+"This function implements the pairwise single, complete, centroid, and\n"
+"average linkage hierarchical clustering methods.\n"
+"\n"
+"Arguments:\n"
+" - tree: an empty Tree object; its nodes will be filled by treecluster\n"
+" to describe the hierarchical clustering result. See the description\n"
+" of the Tree class for more information.\n"
+"\n"
+" - data: nrows x ncols array containing the data to be clustered.\n"
+" Either data or distancematrix (see below) should be None.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+" missing. If mask[i,j]==0, then data[i,j] is missing.\n"
+"\n"
+" - weight: the weights to be used when calculating distances.\n"
+"\n"
+" - transpose:\n"
+"\n"
+" - if equal to 0, rows are clustered;\n"
+" - if equal to 1, columns are clustered.\n"
+"\n"
+" - dist: specifies the distance function to be used:\n"
+"\n"
+" - dist == 'e': Euclidean distance\n"
+" - dist == 'b': City Block distance\n"
+" - dist == 'c': Pearson correlation\n"
+" - dist == 'a': absolute value of the correlation\n"
+" - dist == 'u': uncentered correlation\n"
+" - dist == 'x': absolute uncentered correlation\n"
+" - dist == 's': Spearman's rank correlation\n"
+" - dist == 'k': Kendall's tau\n"
+"\n"
+" - method: specifies which linkage method is used:\n"
+"\n"
+" - method == 's': Single pairwise linkage\n"
+" - method == 'm': Complete (maximum) pairwise linkage (default)\n"
+" - method == 'c': Centroid linkage\n"
+" - method == 'a': Average pairwise linkage\n"
+"\n"
+" - distancematrix: The distance matrix between the elements.\n"
+" Either data (see above) or distancematrix should be None.\n"
+" There are three ways in which you can pass a distance matrix:\n"
+"\n"
+" 1. a 2D Numerical Python array (in which only the left-lower\n"
+" part of the array will be accessed);\n"
+" 2. a 1D Numerical Python array containing the distances\n"
+" consecutively;\n"
+" 3. a list of rows containing the lower-triangular part of\n"
+" the distance matrix.\n"
+"\n"
+" Examples are:\n"
+"\n"
+" >>> from numpy import array\n"
+" >>> distance = array([[0.0, 1.1, 2.3],\n"
+" ... [1.1, 0.0, 4.5],\n"
+" ... [2.3, 4.5, 0.0]])\n"
+" >>> # option 1.\n"
+" >>> distance = array([1.1, 2.3, 4.5])\n"
+" >>> # option 2.\n"
+" >>> distance = [array([]),\n"
+" ... array([1.1]),\n"
+" ... array([2.3, 4.5])]\n"
+" >>> # option 3.\n"
+"\n"
+" These three correspond to the same distance matrix.\n"
+"\n"
+" PLEASE NOTE:\n"
+" As the treecluster routine may shuffle the values in the\n"
+" distance matrix as part of the clustering algorithm, be sure\n"
+" to save this array in a different variable before calling\n"
+" treecluster if you need it later.\n"
+"\n"
+"Either data or distancematrix should be None. If distancematrix is None,\n"
+"the hierarchical clustering solution is calculated from the values in\n"
+"the argument data. Instead if data is None, the hierarchical clustering\n"
+"solution is calculated from the distance matrix.\n"
+"Pairwise centroid-linkage clustering can be calculated only from the data\n"
+"and not from the distance matrix.\n"
+"Pairwise single-, maximum-, and average-linkage clustering can be\n"
+"calculated from either the data or from the distance matrix.\n";
+
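+/* A usage sketch clustering from a condensed distance matrix with single
+ * linkage; data, mask and weight are None in that case (the error paths
+ * below imply the converters accept None). Note that the routine may
+ * shuffle the distance values, as the docstring warns:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import Tree, treecluster
+ *     >>> tree = Tree()
+ *     >>> distance = np.array([1.1, 2.3, 4.5])
+ *     >>> treecluster(tree, None, None, None, 0, 's', 'e', distance)
+ *     >>> len(tree)          # n items yield n-1 joining nodes
+ *     2
+ */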
+static PyObject*
+py_treecluster(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ Data data = {0};
+ Mask mask = {0};
+ Py_buffer weight = {0};
+ int transpose = 0;
+ char dist = 'e';
+ char method = 'm';
+ Distancematrix distances = {0};
+ PyTree* tree = NULL;
+ Node* nodes;
+ int nitems;
+
+ static char* kwlist[] = {"tree",
+ "data",
+ "mask",
+ "weight",
+ "transpose",
+ "method",
+ "dist",
+ "distancematrix",
+ NULL };
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O!O&O&O&iO&O&O&", kwlist,
+ &PyTreeType, &tree,
+ data_converter, &data,
+ mask_converter, &mask,
+ vector_none_converter, &weight,
+ &transpose,
+ method_treecluster_converter, &method,
+ distance_converter, &dist,
+ distancematrix_converter, &distances))
+ return NULL;
+
+ if (tree->n != 0) {
+ PyErr_SetString(PyExc_RuntimeError, "expected an empty tree");
+ goto exit;
+ }
+ if (data.values != NULL && distances.values != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "use either data or distancematrix, do not use both");
+ goto exit;
+ }
+ if (data.values == NULL && distances.values == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "neither data nor distancematrix was given");
+ goto exit;
+ }
+
+ if (data.values) /* use the values in data, not the distance matrix */ {
+ int nrows;
+ int ncols;
+ int ndata;
+
+ if (!mask.values) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is None");
+ goto exit;
+ }
+ if (!weight.buf) {
+ PyErr_SetString(PyExc_RuntimeError, "weight is None");
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
+ mask.view.shape[0], mask.view.shape[1],
+ data.nrows, data.ncols);
+ goto exit;
+ }
+ ndata = transpose ? nrows : ncols;
+ nitems = transpose ? ncols : nrows;
+ if (weight.shape[0] != ndata) {
+ PyErr_Format(PyExc_RuntimeError,
+ "weight has incorrect size %zd (expected %d)",
+ weight.shape[0], ndata);
+ goto exit;
+ }
+
+ nodes = treecluster(nrows,
+ ncols,
+ data.values,
+ mask.values,
+ weight.buf,
+ transpose,
+ dist,
+ method,
+ NULL);
+ }
+ else { /* use the distance matrix instead of the values in data */
+ if (!strchr("sma", method)) {
+ PyErr_SetString(PyExc_ValueError,
+ "argument method should be 's', 'm', or 'a' "
+ "when specifying the distance matrix");
+ goto exit;
+ }
+ nitems = distances.n;
+ nodes = treecluster(nitems,
+ nitems,
+ 0,
+ 0,
+ 0,
+ transpose,
+ dist,
+ method,
+ distances.values);
+ }
+
+ if (!nodes) {
+ PyErr_NoMemory();
+ goto exit;
+ }
+ tree->nodes = nodes;
+ tree->n = nitems-1;
+
+exit:
+ data_converter(NULL, &data);
+ mask_converter(NULL, &mask);
+ vector_none_converter(NULL, &weight);
+ distancematrix_converter(NULL, &distances);
+ if (tree == NULL || tree->n == 0) return NULL;
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+/* end of wrapper for treecluster */
+
+/* somcluster */
+static char somcluster__doc__[] =
+"somcluster(clusterid, celldata, data, mask, weight, transpose,\n"
+" inittau, niter, dist) -> None\n"
+"\n"
+"This function implements a self-organizing map on a rectangular grid.\n"
+"\n"
+"Arguments:\n"
+" - clusterid: array with two columns, with the number of rows equal\n"
+" to the number of items being clustered. Upon return, each row\n"
+" in the array contains the x and y coordinates of the cell in the\n"
+" the rectangular SOM grid to which the item was assigned.\n"
+"\n"
+" - celldata: array with dimensions nxgrid x nygrid x number of columns\n"
+" if rows are being clustered, or nxgrid x nygrid x number of rows\n"
+" if columns are being clustered, where nxgrid is the horizontal\n"
+" dimension of the rectangular SOM map and nygrid is the vertical\n"
+" dimension of the rectangular SOM map.\n"
+" Upon return, each element [ix, iy] of this array contains the\n"
+" data for the centroid of the cluster in the SOM grid cell with\n"
+" coordinates [ix, iy].\n"
+"\n"
+" - data: nrows x ncols array containing the data to be clustered.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+" missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
+"\n"
+" - weight: the weights to be used when calculating distances\n"
+"\n"
+" - transpose:\n"
+"\n"
+" - if equal to 0, rows are clustered;\n"
+" - if equal to 1, columns are clustered.\n"
+"\n"
+" - inittau: the initial value of tau (the neighborbood function)\n"
+"\n"
+" - niter: the number of iterations\n"
+"\n"
+" - dist: specifies the distance function to be used:\n"
+"\n"
+" - dist == 'e': Euclidean distance\n"
+" - dist == 'b': City Block distance\n"
+" - dist == 'c': Pearson correlation\n"
+" - dist == 'a': absolute value of the correlation\n"
+" - dist == 'u': uncentered correlation\n"
+" - dist == 'x': absolute uncentered correlation\n"
+" - dist == 's': Spearman's rank correlation\n"
+" - dist == 'k': Kendall's tau\n";
+
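+/* A usage sketch (hypothetical data) for a 3 x 3 SOM grid over the rows
+ * of a 10 x 4 data matrix; celldata's last dimension must equal the
+ * number of columns when transpose == 0:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import somcluster
+ *     >>> data = np.random.rand(10, 4)
+ *     >>> mask = np.ones((10, 4), dtype=np.intc)
+ *     >>> weight = np.ones(4)
+ *     >>> clusterids = np.zeros((10, 2), dtype=np.intc)   # output
+ *     >>> celldata = np.zeros((3, 3, 4))                  # output
+ *     >>> somcluster(clusterids, celldata, data, mask, weight, 0,
+ *     ...            0.02, 100, 'e')
+ */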
+static PyObject*
+py_somcluster(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ int nrows;
+ int ncols;
+ int ndata;
+ Data data = {0};
+ Mask mask = {0};
+ Py_buffer weight = {0};
+ int transpose = 0;
+ double inittau = 0.02;
+ int niter = 1;
+ char dist = 'e';
+ Py_buffer indices = {0};
+ Celldata celldata = {0};
+ PyObject* result = NULL;
+
+ static char* kwlist[] = {"clusterids",
+ "celldata",
+ "data",
+ "mask",
+ "weight",
+ "transpose",
+ "inittau",
+ "niter",
+ "dist",
+ NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&idiO&", kwlist,
+ index2d_converter, &indices,
+ celldata_converter, &celldata,
+ data_converter, &data,
+ mask_converter, &mask,
+ vector_converter, &weight,
+ &transpose,
+ &inittau,
+ &niter,
+ distance_converter, &dist)) return NULL;
+ if (niter < 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "number of iterations (niter) should be positive");
+ goto exit;
+ }
+ if (!data.values) {
+ PyErr_SetString(PyExc_RuntimeError, "data is None");
+ goto exit;
+ }
+ if (!mask.values) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is None");
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
+ mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
+ goto exit;
+ }
+ ndata = transpose ? nrows : ncols;
+ if (weight.shape[0] != ndata) {
+ PyErr_Format(PyExc_RuntimeError,
+ "weight has incorrect size %zd (expected %d)",
+ weight.shape[0], ndata);
+ goto exit;
+ }
+ if (celldata.nz != ndata) {
+ PyErr_Format(PyExc_RuntimeError,
+ "the celldata array size is not consistent with the data "
+ "(last dimension is %d; expected %d)", celldata.nz, ndata);
+ goto exit;
+ }
+ somcluster(nrows,
+ ncols,
+ data.values,
+ mask.values,
+ weight.buf,
+ transpose,
+ celldata.nx,
+ celldata.ny,
+ inittau,
+ niter,
+ dist,
+ celldata.values,
+ indices.buf);
+ Py_INCREF(Py_None);
+ result = Py_None;
+
+exit:
+    data_converter(NULL, &data);
+    mask_converter(NULL, &mask);
+    vector_converter(NULL, &weight);
+    index2d_converter(NULL, &indices);
+    celldata_converter(NULL, &celldata);
+    return result;
+}
+/* end of wrapper for somcluster */
+
+/* clusterdistance */
+static char clusterdistance__doc__[] =
+"clusterdistance(data, mask, weight, index1, index2, dist, method,\n"
+" transpose) -> distance between two clusters\n"
+"\n"
+"Arguments:\n"
+"\n"
+" - data: nrows x ncols array containing the data values.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+" missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
+"\n"
+" - weight: the weights to be used when calculating distances\n"
+"\n"
+" - index1: 1D array identifying which items belong to the first\n"
+" cluster.\n"
+"\n"
+" - index2: 1D array identifying which items belong to the second\n"
+" cluster.\n"
+"\n"
+" - dist: specifies the distance function to be used:\n"
+"\n"
+" - dist == 'e': Euclidean distance\n"
+" - dist == 'b': City Block distance\n"
+" - dist == 'c': Pearson correlation\n"
+" - dist == 'a': absolute value of the correlation\n"
+" - dist == 'u': uncentered correlation\n"
+" - dist == 'x': absolute uncentered correlation\n"
+" - dist == 's': Spearman's rank correlation\n"
+" - dist == 'k': Kendall's tau\n"
+"\n"
+" - method: specifies how the distance between two clusters is defined:\n"
+"\n"
+" - method == 'a': the distance between the arithmetic means of the\n"
+" two clusters\n"
+" - method == 'm': the distance between the medians of the two\n"
+" clusters\n"
+" - method == 's': the smallest pairwise distance between members\n"
+" of the two clusters\n"
+" - method == 'x': the largest pairwise distance between members of\n"
+" the two clusters\n"
+" - method == 'v': average of the pairwise distances between\n"
+" members of the clusters\n"
+"\n"
+" - transpose:\n"
+"\n"
+" - if equal to 0: clusters of rows are considered;\n"
+" - if equal to 1: clusters of columns are considered.\n"
+"\n";
+
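+/* A usage sketch (hypothetical data): the distance between the cluster
+ * of items {0, 1} and the cluster of items {2, 3}, comparing arithmetic
+ * means under Euclidean distance:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import clusterdistance
+ *     >>> data = np.random.rand(5, 3)
+ *     >>> mask = np.ones((5, 3), dtype=np.intc)
+ *     >>> weight = np.ones(3)
+ *     >>> index1 = np.array([0, 1], dtype=np.intc)
+ *     >>> index2 = np.array([2, 3], dtype=np.intc)
+ *     >>> d = clusterdistance(data, mask, weight, index1, index2,
+ *     ...                     'a', 'e', 0)
+ */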
+static PyObject*
+py_clusterdistance(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ double distance;
+ int nrows;
+ int ncols;
+ int ndata;
+ Data data = {0};
+ Mask mask = {0};
+ Py_buffer weight = {0};
+ char dist = 'e';
+ char method = 'a';
+ int transpose = 0;
+ Py_buffer index1 = {0};
+ Py_buffer index2 = {0};
+ PyObject* result = NULL;
+
+ static char* kwlist[] = {"data",
+ "mask",
+ "weight",
+ "index1",
+ "index2",
+ "method",
+ "dist",
+ "transpose",
+ NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&O&O&i", kwlist,
+ data_converter, &data,
+ mask_converter, &mask,
+ vector_converter, &weight,
+ index_converter, &index1,
+ index_converter, &index2,
+ method_clusterdistance_converter, &method,
+ distance_converter, &dist,
+ &transpose)) return NULL;
+ if (!data.values) {
+ PyErr_SetString(PyExc_RuntimeError, "data is None");
+ goto exit;
+ }
+ if (!mask.values) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is None");
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ ndata = transpose ? nrows : ncols;
+ if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
+ mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
+ goto exit;
+ }
+ if (weight.shape[0] != ndata) {
+ PyErr_Format(PyExc_RuntimeError,
+ "weight has incorrect size %zd (expected %d)",
+ weight.shape[0], ndata);
+ goto exit;
+ }
+
+ distance = clusterdistance(nrows,
+ ncols,
+ data.values,
+ mask.values,
+ weight.buf,
+ (int) index1.shape[0],
+ (int) index2.shape[0],
+ index1.buf,
+ index2.buf,
+ dist,
+ method,
+ transpose);
+
+ if (distance < -0.5) /* Actually -1.0; avoiding roundoff errors */
+ PyErr_SetString(PyExc_IndexError, "index out of range");
+ else
+ result = PyFloat_FromDouble(distance);
+exit:
+ data_converter(NULL, &data);
+ mask_converter(NULL, &mask);
+ vector_converter(NULL, &weight);
+ index_converter(NULL, &index1);
+ index_converter(NULL, &index2);
+ return result;
+}
+/* end of wrapper for clusterdistance */
+
+/* clustercentroids */
+static char clustercentroids__doc__[] =
+"clustercentroids(data, mask, clusterid, method, transpose) -> cdata, cmask\n"
+"\n"
+"The clustercentroids routine calculates the cluster centroids, given to\n"
+"which cluster each element belongs. The centroid is defined as either\n"
+"the mean or the median over all elements for each dimension.\n"
+"\n"
+"Arguments:\n"
+" - data: nrows x ncols array containing the data values.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+" missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
+"\n"
+" - clusterid: array containing the cluster number for each item.\n"
+" The cluster number should be non-negative.\n"
+"\n"
+" - method: specifies whether the centroid is calculated from the\n"
+" arithmetic mean (method == 'a', default) or the median\n"
+" (method == 'm') over each dimension.\n"
+"\n"
+" - transpose: if equal to 0, row clusters are considered;\n"
+" if equal to 1, column clusters are considered.\n"
+"\n"
+" - cdata: 2D array containing, upon return, the cluster centroids.\n"
+" If transpose == 0, then the dimensions of cdata should be\n"
+" nclusters x ncols.\n"
+" If transpose == 1, then the dimensions of cdata should be \n"
+" nrows x nclusters.\n"
+"\n"
+" - cmask: 2D array of integers describing, upon return, which elements\n"
+" in cdata, if any, are missing.\n";
+
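+/* A usage sketch (hypothetical assignments): centroids of two row
+ * clusters of a 5 x 3 matrix; cdata and cmask are pre-allocated output
+ * buffers of size nclusters x ncols:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import clustercentroids
+ *     >>> data = np.random.rand(5, 3)
+ *     >>> mask = np.ones((5, 3), dtype=np.intc)
+ *     >>> clusterid = np.array([0, 0, 1, 1, 1], dtype=np.intc)
+ *     >>> cdata = np.zeros((2, 3))
+ *     >>> cmask = np.zeros((2, 3), dtype=np.intc)
+ *     >>> clustercentroids(data, mask, clusterid, 'a', 0, cdata, cmask)
+ */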
+static PyObject*
+py_clustercentroids(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ int nrows;
+ int ncols;
+ int nclusters;
+ Data data = {0};
+ Mask mask = {0};
+ Data cdata = {0};
+ Mask cmask = {0};
+ Py_buffer clusterid = {0};
+ char method = 'a';
+ int transpose = 0;
+ int ok = -1;
+
+ static char* kwlist[] = {"data",
+ "mask",
+ "clusterid",
+ "method",
+ "transpose",
+ "cdata",
+ "cmask",
+ NULL };
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&iO&O&", kwlist,
+ data_converter, &data,
+ mask_converter, &mask,
+ index_converter, &clusterid,
+ method_kcluster_converter, &method,
+ &transpose,
+ data_converter, &cdata,
+ mask_converter, &cmask)) return NULL;
+ if (!data.values) {
+ PyErr_SetString(PyExc_RuntimeError, "data is None");
+ goto exit;
+ }
+ if (!mask.values) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is None");
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
+ mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
+ goto exit;
+ }
+ if (transpose == 0) {
+ nclusters = check_clusterid(clusterid, nrows);
+ nrows = nclusters;
+ }
+ else {
+ nclusters = check_clusterid(clusterid, ncols);
+ ncols = nclusters;
+ }
+ if (nclusters == 0) goto exit;
+ if (cdata.nrows != nrows) {
+ PyErr_Format(PyExc_RuntimeError,
+ "cdata has incorrect number of rows (%d, expected %d)",
+ cdata.nrows, nrows);
+ goto exit;
+ }
+ if (cdata.ncols != ncols) {
+ PyErr_Format(PyExc_RuntimeError,
+ "cdata has incorrect number of columns (%d, expected %d)",
+ cdata.ncols, ncols);
+ goto exit;
+ }
+ if (cmask.view.shape[0] != nrows) {
+ PyErr_Format(PyExc_RuntimeError,
+ "cmask has incorrect number of rows (%zd, expected %d)",
+ cmask.view.shape[0], nrows);
+ goto exit;
+ }
+ if (cmask.view.shape[1] != ncols) {
+ PyErr_Format(PyExc_RuntimeError,
+ "cmask has incorrect number of columns "
+ "(%zd, expected %d)", cmask.view.shape[1], ncols);
+ goto exit;
+ }
+ ok = getclustercentroids(nclusters,
+ data.nrows,
+ data.ncols,
+ data.values,
+ mask.values,
+ clusterid.buf,
+ cdata.values,
+ cmask.values,
+ transpose,
+ method);
+exit:
+ data_converter(NULL, &data);
+ mask_converter(NULL, &mask);
+ data_converter(NULL, &cdata);
+ mask_converter(NULL, &cmask);
+ index_converter(NULL, &clusterid);
+ if (ok == -1) return NULL;
+ if (ok == 0) return PyErr_NoMemory();
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+/* end of wrapper for clustercentroids */
+
+/* distancematrix */
+static char distancematrix__doc__[] =
+"distancematrix(data, mask, weight, transpose, dist, distancematrix)\n"
+" -> None\n"
+"\n"
+"This function calculuates the distance matrix between the data values.\n"
+"\n"
+"Arguments:\n"
+"\n"
+" - data: nrows x ncols array containing the data values.\n"
+"\n"
+" - mask: nrows x ncols array of integers, showing which data are\n"
+" missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
+"\n"
+" - weight: the weights to be used when calculating distances.\n"
+"\n"
+" - transpose: if equal to 0: the distances between rows are\n"
+" calculated;\n"
+" if equal to 1, the distances between columns are calculated.\n"
+"\n"
+" - dist: specifies the distance function to be used:\n"
+"\n"
+" - dist == 'e': Euclidean distance\n"
+" - dist == 'b': City Block distance\n"
+" - dist == 'c': Pearson correlation\n"
+" - dist == 'a': absolute value of the correlation\n"
+" - dist == 'u': uncentered correlation\n"
+" - dist == 'x': absolute uncentered correlation\n"
+" - dist == 's': Spearman's rank correlation\n"
+" - dist == 'k': Kendall's tau\n"
+"\n"
+" - distancematrix: Upon return, the distance matrix as a list of 1D\n"
+" arrays. The number of columns in each row is equal to the row number\n"
+" (i.e., len(distancematrix[i]) == i).\n"
+" An example of the return value is:\n"
+"\n"
+" matrix = [[],\n"
+" array([1.]),\n"
+" array([7., 3.]),\n"
+" array([4., 2., 6.])]\n"
+"\n"
+"This corresponds to the distance matrix:\n"
+"\n"
+" [0.\t1.\t7.\t4.]\n"
+" [1.\t0.\t3.\t2.]\n"
+" [7.\t3.\t0.\t6.]\n"
+" [4.\t2.\t6.\t0.]\n";
+
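+/* A usage sketch (hypothetical data): the caller pre-allocates the ragged
+ * list of rows (row i holding i entries) that the routine fills in:
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import distancematrix
+ *     >>> data = np.random.rand(4, 3)
+ *     >>> mask = np.ones((4, 3), dtype=np.intc)
+ *     >>> weight = np.ones(3)
+ *     >>> matrix = [np.zeros(i) for i in range(4)]
+ *     >>> distancematrix(data, mask, weight, 0, 'e', matrix)
+ */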
+static PyObject*
+py_distancematrix(PyObject* self, PyObject* args, PyObject* keywords)
+{
+ PyObject* list;
+ Distancematrix distances = {0};
+ Data data = {0};
+ Mask mask = {0};
+ Py_buffer weight = {0};
+ int transpose = 0;
+ char dist = 'e';
+ int nrows, ncols, ndata;
+ PyObject* result = NULL;
+
+ /* -- Read the input variables --------------------------------------- */
+ static char* kwlist[] = {"data",
+ "mask",
+ "weight",
+ "transpose",
+ "dist",
+ "distancematrix",
+ NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&iO&O!", kwlist,
+ data_converter, &data,
+ mask_converter, &mask,
+ vector_converter, &weight,
+ &transpose,
+ distance_converter, &dist,
+ &PyList_Type, &list)) return NULL;
+ if (!data.values) {
+ PyErr_SetString(PyExc_RuntimeError, "data is None");
+ goto exit;
+ }
+ if (!mask.values) {
+ PyErr_SetString(PyExc_RuntimeError, "mask is None");
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
+ PyErr_Format(PyExc_ValueError,
+ "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
+ mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
+ goto exit;
+ }
+ ndata = (transpose == 0) ? ncols : nrows;
+ if (weight.shape[0] != ndata) {
+ PyErr_Format(PyExc_ValueError,
+ "weight has incorrect size %zd (expected %d)",
+ weight.shape[0], ndata);
+ goto exit;
+ }
+ if (_convert_list_to_distancematrix(list, &distances) == 0) goto exit;
+
+ distancematrix(nrows,
+ ncols,
+ data.values,
+ mask.values,
+ weight.buf,
+ dist,
+ transpose,
+ distances.values);
+
+ Py_INCREF(Py_None);
+ result = Py_None;
+exit:
+ data_converter(NULL, &data);
+ mask_converter(NULL, &mask);
+ vector_converter(NULL, &weight);
+ distancematrix_converter(NULL, &distances);
+ return result;
+}
+/* end of wrapper for distancematrix */
+
+/* pca */
+static char pca__doc__[] =
+"pca(data, columnmean, coordinates, pc, eigenvalues) -> None\n"
+"\n"
+"This function calculates the principal component decomposition\n"
+"of the values in data.\n"
+"\n"
+"Arguments:\n"
+"\n"
+" - data: nrows x ncols array containing the data values.\n"
+"\n"
+" - columnmean: array of size nrows) in which the mean of each column\n"
+" will be sorted.\n"
+"\n"
+" - coordinates: nrows x nmin array in which the coordinates of the\n"
+" data along the principal components will be stored;\n"
+" nmin is min(nrows, ncols).\n"
+"\n"
+" - pc : the principal components as an nmin x ncols array, where nmin\n"
+" is min(nrows, ncols).\n"
+"\n"
+" - eigenvalues: array of size min(nrows, ncols), in which the\n"
+" eigenvalues will be stored, sorted by the magnitude\n"
+" of the eigenvalues, with the largest eigenvalues\n"
+" appearing first.\n"
+"\n"
+"Adding the column means to the dot product of the coordinates and the\n"
+"principal components, i.e.\n"
+"\n"
+" columnmean + dot(coordinates, pc)\n"
+"\n"
+"recreates the data matrix.\n";
+
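+/* A usage sketch (hypothetical data): all arrays except data are
+ * pre-allocated output buffers, with nmin = min(nrows, ncols):
+ *
+ *     >>> import numpy as np
+ *     >>> from _cluster import pca
+ *     >>> nrows, ncols = 5, 3
+ *     >>> nmin = min(nrows, ncols)
+ *     >>> data = np.random.rand(nrows, ncols)
+ *     >>> columnmean = np.zeros(ncols)
+ *     >>> coordinates = np.zeros((nrows, nmin))
+ *     >>> pc = np.zeros((nmin, ncols))
+ *     >>> eigenvalues = np.zeros(nmin)
+ *     >>> pca(data, columnmean, coordinates, pc, eigenvalues)
+ */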
+static PyObject*
+py_pca(PyObject* self, PyObject* args)
+{
+ Py_buffer eigenvalues = {0};
+ double** u;
+ double** v;
+ Data data = {0};
+ Data pc = {0};
+ Data coordinates = {0};
+ Py_buffer mean = {0};
+ int nrows, ncols;
+ int nmin;
+ int error = -2;
+ double* p;
+ double** values;
+ int i, j;
+
+ if (!PyArg_ParseTuple(args, "O&O&O&O&O&",
+ data_converter, &data,
+ vector_converter, &mean,
+ data_converter, &coordinates,
+ data_converter, &pc,
+ vector_converter, &eigenvalues)) return NULL;
+
+ values = data.values;
+ if (!values) {
+ PyErr_SetString(PyExc_RuntimeError, "data is None");
+ goto exit;
+ }
+ nrows = data.nrows;
+ ncols = data.ncols;
+ if (mean.shape[0] != ncols) {
+ PyErr_Format(PyExc_RuntimeError,
+ "columnmean has inconsistent size %zd (expected %d)",
+ mean.shape[0], ncols);
+ goto exit;
+ }
+ nmin = nrows < ncols ? nrows : ncols;
+    if (pc.nrows != nmin || pc.ncols != ncols) {
+        PyErr_Format(PyExc_RuntimeError,
+            "pc has inconsistent size %d x %d (expected %d x %d)",
+            pc.nrows, pc.ncols, nmin, ncols);
+        goto exit;
+    }
+    if (coordinates.nrows != nrows || coordinates.ncols != nmin) {
+        PyErr_Format(PyExc_RuntimeError,
+            "coordinates has inconsistent size %d x %d (expected %d x %d)",
+            coordinates.nrows, coordinates.ncols, nrows, nmin);
+        goto exit;
+    }
+ if (nrows >= ncols) {
+ u = coordinates.values;
+ v = pc.values;
+ }
+ else { /* nrows < ncolums */
+ u = pc.values;
+ v = coordinates.values;
+ }
+ /* -- Calculate the mean of each column ------------------------------ */
+ p = mean.buf;
+ for (j = 0; j < ncols; j++) {
+ p[j] = 0.0;
+ for (i = 0; i < nrows; i++) p[j] += values[i][j];
+ p[j] /= nrows;
+ }
+ /* -- Subtract the mean of each column ----------------------------- */
+ for (i = 0; i < nrows; i++)
+ for (j = 0; j < ncols; j++)
+ u[i][j] = values[i][j] - p[j];
+ /* -- Perform the principal component analysis ----------------------- */
+ error = pca(nrows, ncols, u, v, eigenvalues.buf);
+ /* ------------------------------------------------------------------- */
+exit:
+ data_converter(NULL, &data);
+ vector_converter(NULL, &mean);
+ data_converter(NULL, &pc);
+ data_converter(NULL, &coordinates);
+ vector_converter(NULL, &eigenvalues);
+ if (error == 0) {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ if (error == -1) return PyErr_NoMemory();
+ else if (error > 0)
+ PyErr_SetString(PyExc_RuntimeError,
+ "Singular value decomposition failed to converge");
+ return NULL;
+}
+/* end of wrapper for pca */
+
+/* ========================================================================= */
+/* -- The methods table ---------------------------------------------------- */
+/* ========================================================================= */
+
+
+static struct PyMethodDef cluster_methods[] = {
+ {"version", (PyCFunction) py_version, METH_NOARGS, version__doc__},
+ {"kcluster",
+ (PyCFunction) py_kcluster,
+ METH_VARARGS | METH_KEYWORDS,
+ kcluster__doc__
+ },
+ {"kmedoids",
+ (PyCFunction) py_kmedoids,
+ METH_VARARGS | METH_KEYWORDS,
+ kmedoids__doc__
+ },
+ {"treecluster",
+ (PyCFunction) py_treecluster,
+ METH_VARARGS | METH_KEYWORDS,
+ treecluster__doc__
+ },
+ {"somcluster",
+ (PyCFunction) py_somcluster,
+ METH_VARARGS | METH_KEYWORDS,
+ somcluster__doc__
+ },
+ {"clusterdistance",
+ (PyCFunction) py_clusterdistance,
+ METH_VARARGS | METH_KEYWORDS,
+ clusterdistance__doc__
+ },
+ {"clustercentroids",
+ (PyCFunction) py_clustercentroids,
+ METH_VARARGS | METH_KEYWORDS,
+ clustercentroids__doc__
+ },
+ {"distancematrix",
+ (PyCFunction) py_distancematrix,
+ METH_VARARGS | METH_KEYWORDS,
+ distancematrix__doc__
+ },
+ {"pca",
+ (PyCFunction) py_pca,
+ METH_VARARGS | METH_KEYWORDS,
+ pca__doc__
+ },
+ {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+/* ========================================================================= */
+/* -- Initialization ------------------------------------------------------- */
+/* ========================================================================= */
+
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_cluster",
+ "C Clustering Library",
+ -1,
+ cluster_methods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+
+PyObject *
+PyInit__cluster(void)
+{
+ PyObject *module;
+
+ PyNodeType.tp_new = PyType_GenericNew;
+ if (PyType_Ready(&PyNodeType) < 0)
+ return NULL;
+ if (PyType_Ready(&PyTreeType) < 0)
+ return NULL;
+
+ module = PyModule_Create(&moduledef);
+ if (module == NULL) return NULL;
+
+ Py_INCREF(&PyTreeType);
+ if (PyModule_AddObject(module, "Tree", (PyObject*) &PyTreeType) < 0) {
+ Py_DECREF(module);
+ Py_DECREF(&PyTreeType);
+ return NULL;
+ }
+
+ Py_INCREF(&PyNodeType);
+ if (PyModule_AddObject(module, "Node", (PyObject*) &PyNodeType) < 0) {
+ Py_DECREF(module);
+ Py_DECREF(&PyNodeType);
+ return NULL;
+ }
+
+ return module;
+}
diff --git a/code/lib/Bio/Compass/__init__.py b/code/lib/Bio/Compass/__init__.py
new file mode 100644
index 0000000..3d5e37a
--- /dev/null
+++ b/code/lib/Bio/Compass/__init__.py
@@ -0,0 +1,223 @@
+# Copyright 2004 by James Casbon. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to deal with COMPASS output, a program for profile/profile comparison.
+
+Compass is described in:
+
+Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
+alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
+7;326(1):317-36.
+
+Tested with COMPASS 1.24.
+"""
+
+import re
+
+
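+# A minimal usage sketch (hypothetical file name), assuming COMPASS 1.24
+# output on disk:
+#
+#     from Bio import Compass
+#     with open("output.compass") as handle:
+#         for record in Compass.parse(handle):
+#             print(record.query, record.hit, record.evalue)
+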
+def read(handle):
+ """Read a COMPASS file containing one COMPASS record."""
+ record = None
+ try:
+ line = next(handle)
+ record = Record()
+ __read_names(record, line)
+ line = next(handle)
+ __read_threshold(record, line)
+ line = next(handle)
+ __read_lengths(record, line)
+ line = next(handle)
+ __read_profilewidth(record, line)
+ line = next(handle)
+ __read_scores(record, line)
+ except StopIteration:
+ if not record:
+ raise ValueError("No record found in handle") from None
+ else:
+ raise ValueError("Unexpected end of stream.") from None
+ for line in handle:
+ if not line.strip(): # skip empty lines
+ continue
+ __read_query_alignment(record, line)
+ try:
+ line = next(handle)
+ __read_positive_alignment(record, line)
+ line = next(handle)
+ __read_hit_alignment(record, line)
+ except StopIteration:
+ raise ValueError("Unexpected end of stream.") from None
+ return record
+
+
+def parse(handle):
+ """Iterate over records in a COMPASS file."""
+ record = None
+ try:
+ line = next(handle)
+ except StopIteration:
+ return
+ while True:
+ try:
+ record = Record()
+ __read_names(record, line)
+ line = next(handle)
+ __read_threshold(record, line)
+ line = next(handle)
+ __read_lengths(record, line)
+ line = next(handle)
+ __read_profilewidth(record, line)
+ line = next(handle)
+ __read_scores(record, line)
+ except StopIteration:
+ raise ValueError("Unexpected end of stream.") from None
+ for line in handle:
+ if not line.strip():
+ continue
+ if "Ali1:" in line:
+ yield record
+ break
+ __read_query_alignment(record, line)
+ try:
+ line = next(handle)
+ __read_positive_alignment(record, line)
+ line = next(handle)
+ __read_hit_alignment(record, line)
+ except StopIteration:
+ raise ValueError("Unexpected end of stream.") from None
+ else:
+ yield record
+ break
+
+
+class Record:
+ """Hold information from one compass hit.
+
+ Ali1 is the query, Ali2 the hit.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.query = ""
+ self.hit = ""
+ self.gap_threshold = 0
+ self.query_length = 0
+ self.query_filtered_length = 0
+ self.query_nseqs = 0
+ self.query_neffseqs = 0
+ self.hit_length = 0
+ self.hit_filtered_length = 0
+ self.hit_nseqs = 0
+ self.hit_neffseqs = 0
+ self.sw_score = 0
+ self.evalue = -1
+ self.query_start = -1
+ self.hit_start = -1
+ self.query_aln = ""
+ self.hit_aln = ""
+ self.positives = ""
+
+ def query_coverage(self):
+ """Return the length of the query covered in the alignment."""
+ s = self.query_aln.replace("=", "")
+ return len(s)
+
+ def hit_coverage(self):
+ """Return the length of the hit covered in the alignment."""
+ s = self.hit_aln.replace("=", "")
+ return len(s)
+
+
+# Everything below is private
+
+__regex = {
+ "names": re.compile(r"Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+"),
+ "threshold": re.compile(r"Threshold of effective gap content in columns: (\S+)"),
+ "lengths": re.compile(
+ r"length1=(\S+)\s+filtered_length1=(\S+)"
+ r"\s+length2=(\S+)\s+filtered_length2=(\S+)"
+ ),
+ "profilewidth": re.compile(
+ r"Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)"
+ ),
+ "scores": re.compile(r"Smith-Waterman score = (\S+)\s+Evalue = (\S+)"),
+ "start": re.compile(r"(\d+)"),
+ "align": re.compile(r"^.{15}(\S+)"),
+ "positive_alignment": re.compile(r"^.{15}(.+)"),
+}
+
+
+def __read_names(record, line):
+ # Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
+ # ------query----- -------hit-------------
+ if "Ali1:" not in line:
+ raise ValueError("Line does not contain 'Ali1:':\n%s" % line)
+ m = __regex["names"].search(line)
+ record.query = m.group(1)
+ record.hit = m.group(2)
+
+
+def __read_threshold(record, line):
+ if not line.startswith("Threshold"):
+ raise ValueError("Line does not start with 'Threshold':\n%s" % line)
+ m = __regex["threshold"].search(line)
+ record.gap_threshold = float(m.group(1))
+
+
+def __read_lengths(record, line):
+ if not line.startswith("length1="):
+ raise ValueError("Line does not start with 'length1=':\n%s" % line)
+ m = __regex["lengths"].search(line)
+ record.query_length = int(m.group(1))
+ record.query_filtered_length = float(m.group(2))
+ record.hit_length = int(m.group(3))
+ record.hit_filtered_length = float(m.group(4))
+
+
+def __read_profilewidth(record, line):
+ if "Nseqs1" not in line:
+ raise ValueError("Line does not contain 'Nseqs1':\n%s" % line)
+ m = __regex["profilewidth"].search(line)
+ record.query_nseqs = int(m.group(1))
+ record.query_neffseqs = float(m.group(2))
+ record.hit_nseqs = int(m.group(3))
+ record.hit_neffseqs = float(m.group(4))
+
+
+def __read_scores(record, line):
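+ # Expected line, e.g. (values illustrative):
+ # Smith-Waterman score = 37 Evalue = 5.75e+02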
+ if not line.startswith("Smith-Waterman"):
+ raise ValueError("Line does not start with 'Smith-Waterman':\n%s" % line)
+ m = __regex["scores"].search(line)
+ if m:
+ record.sw_score = int(m.group(1))
+ record.evalue = float(m.group(2))
+ else:
+ record.sw_score = 0
+ record.evalue = -1.0
+
+
+def __read_query_alignment(record, line):
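+ # Alignment lines have a fixed 15-character prefix (which may include the
+ # sequence start position) followed by the aligned residues; see the
+ # "start" and "align" regexes above.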
+ m = __regex["start"].search(line)
+ if m:
+ record.query_start = int(m.group(1))
+ m = __regex["align"].match(line)
+ assert m is not None, "invalid match"
+ record.query_aln += m.group(1)
+
+
+def __read_positive_alignment(record, line):
+ m = __regex["positive_alignment"].match(line)
+ assert m is not None, "invalid match"
+ record.positives += m.group(1)
+
+
+def __read_hit_alignment(record, line):
+ m = __regex["start"].search(line)
+ if m:
+ record.hit_start = int(m.group(1))
+ m = __regex["align"].match(line)
+ assert m is not None, "invalid match"
+ record.hit_aln += m.group(1)
diff --git a/code/lib/Bio/Data/CodonTable.py b/code/lib/Bio/Data/CodonTable.py
new file mode 100644
index 0000000..bc006ee
--- /dev/null
+++ b/code/lib/Bio/Data/CodonTable.py
@@ -0,0 +1,1313 @@
+# Copyright 2000 Andrew Dalke. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Codon tables based on those from the NCBI.
+
+These tables are based on parsing the NCBI file
+ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+using Scripts/update_ncbi_codon_table.py
+
+Last updated at Version 4.4 (May 2019)
+"""
+
+from Bio.Data import IUPACData
+
+
+unambiguous_dna_by_name = {}
+unambiguous_dna_by_id = {}
+unambiguous_rna_by_name = {}
+unambiguous_rna_by_id = {}
+generic_by_name = {} # unambiguous DNA or RNA
+generic_by_id = {} # unambiguous DNA or RNA
+
+ambiguous_dna_by_name = {}
+ambiguous_dna_by_id = {}
+ambiguous_rna_by_name = {}
+ambiguous_rna_by_id = {}
+ambiguous_generic_by_name = {} # ambiguous DNA or RNA
+ambiguous_generic_by_id = {} # ambiguous DNA or RNA
+
+# standard IUPAC unambiguous codons
+standard_dna_table = None
+standard_rna_table = None
+
+
+# In the future, the back_table could return a statistically
+# appropriate distribution of codons, so do not cache the results of
+# back_table lookups!
+
+
+class TranslationError(Exception):
+ """Container for translation specific exceptions."""
+
+ pass
+
+
+class CodonTable:
+ """A codon-table, or genetic code."""
+
+ forward_table = {} # only includes codons which actually code
+ back_table = {} # for back translations
+ start_codons = []
+ stop_codons = []
+
+ # Not always called from derived classes!
+ def __init__(
+ self,
+ nucleotide_alphabet=None,
+ protein_alphabet=None,
+ forward_table=forward_table,
+ back_table=back_table,
+ start_codons=start_codons,
+ stop_codons=stop_codons,
+ ):
+ """Initialize the class."""
+ self.nucleotide_alphabet = nucleotide_alphabet
+ self.protein_alphabet = protein_alphabet
+ self.forward_table = forward_table
+ self.back_table = back_table
+ self.start_codons = start_codons
+ self.stop_codons = stop_codons
+
+ def __str__(self):
+ """Return a simple text representation of the codon table.
+
+ e.g.::
+
+ >>> import Bio.Data.CodonTable
+ >>> print(Bio.Data.CodonTable.standard_dna_table)
+ Table 1 Standard, SGC0
+
+ | T | C | A | G |
+ --+---------+---------+---------+---------+--
+ T | TTT F | TCT S | TAT Y | TGT C | T
+ T | TTC F | TCC S | TAC Y | TGC C | C
+ ...
+ G | GTA V | GCA A | GAA E | GGA G | A
+ G | GTG V | GCG A | GAG E | GGG G | G
+ --+---------+---------+---------+---------+--
+ >>> print(Bio.Data.CodonTable.generic_by_id[1])
+ Table 1 Standard, SGC0
+
+ | U | C | A | G |
+ --+---------+---------+---------+---------+--
+ U | UUU F | UCU S | UAU Y | UGU C | U
+ U | UUC F | UCC S | UAC Y | UGC C | C
+ ...
+ G | GUA V | GCA A | GAA E | GGA G | A
+ G | GUG V | GCG A | GAG E | GGG G | G
+ --+---------+---------+---------+---------+--
+ """
+ if self.id:
+ answer = "Table %i" % self.id
+ else:
+ answer = "Table ID unknown"
+ if self.names:
+ answer += " " + ", ".join([x for x in self.names if x])
+
+ # Use the main four letters (and the conventional ordering)
+ # even for ambiguous tables
+ letters = self.nucleotide_alphabet
+ if letters is not None and "T" in letters:
+ letters = "TCAG"
+ else:
+ # Should be either RNA or generic nucleotides,
+ # e.g. Bio.Data.CodonTable.generic_by_id[1]
+ letters = "UCAG"
+
+ # Build the table...
+ answer += "\n\n"
+ answer += " |" + "|".join(" %s " % c2 for c2 in letters) + "|"
+ answer += "\n--+" + "+".join("---------" for c2 in letters) + "+--"
+ for c1 in letters:
+ for c3 in letters:
+ line = c1 + " |"
+ for c2 in letters:
+ codon = c1 + c2 + c3
+ line += " %s" % codon
+ if codon in self.stop_codons:
+ line += " Stop|"
+ else:
+ try:
+ amino = self.forward_table[codon]
+ except KeyError:
+ amino = "?"
+ except TranslationError:
+ amino = "?"
+ if codon in self.start_codons:
+ line += " %s(s)|" % amino
+ else:
+ line += " %s |" % amino
+ line += " " + c3
+ answer += "\n" + line
+ answer += "\n--+" + "+".join("---------" for c2 in letters) + "+--"
+ return answer
+
+
+def make_back_table(table, default_stop_codon):
+ """Back a back-table (naive single codon mapping).
+
+ ONLY RETURNS A SINGLE CODON, chosen from the possible alternatives
+ based on their sort order.
+ """
+ # Do the sort so changes in the hash implementation won't affect
+ # the result when one amino acid is coded by more than one codon.
+ back_table = {}
+ for key in sorted(table):
+ back_table[table[key]] = key
+ back_table[None] = default_stop_codon
+ return back_table
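+
+
+# For example, with the standard DNA table, unambiguous codings come back
+# exactly ("M" -> "ATG", "W" -> "TGG"); amino acids coded by several codons
+# get whichever codon sorts last, and back_table[None] holds the default
+# stop codon supplied by the caller.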
+
+
+class NCBICodonTable(CodonTable):
+ """Codon table for generic nucleotide sequences."""
+
+ nucleotide_alphabet = None
+ protein_alphabet = IUPACData.protein_letters
+
+ def __init__(self, id, names, table, start_codons, stop_codons):
+ """Initialize the class."""
+ self.id = id
+ self.names = names
+ self.forward_table = table
+ self.back_table = make_back_table(table, stop_codons[0])
+ self.start_codons = start_codons
+ self.stop_codons = stop_codons
+
+ def __repr__(self):
+ """Represent the NCBI codon table class as a string for debugging."""
+ return "%s(id=%r, names=%r, ...)" % (
+ self.__class__.__name__,
+ self.id,
+ self.names,
+ )
+
+
+class NCBICodonTableDNA(NCBICodonTable):
+ """Codon table for unambiguous DNA sequences."""
+
+ nucleotide_alphabet = IUPACData.unambiguous_dna_letters
+
+
+class NCBICodonTableRNA(NCBICodonTable):
+ """Codon table for unambiguous RNA sequences."""
+
+ nucleotide_alphabet = IUPACData.unambiguous_rna_letters
+
+
+# ######## Deal with ambiguous forward translations
+
+
+class AmbiguousCodonTable(CodonTable):
+ """Base codon table for ambiguous sequences."""
+
+ def __init__(
+ self,
+ codon_table,
+ ambiguous_nucleotide_alphabet,
+ ambiguous_nucleotide_values,
+ ambiguous_protein_alphabet,
+ ambiguous_protein_values,
+ ):
+ """Initialize the class."""
+ CodonTable.__init__(
+ self,
+ ambiguous_nucleotide_alphabet,
+ ambiguous_protein_alphabet,
+ AmbiguousForwardTable(
+ codon_table.forward_table,
+ ambiguous_nucleotide_values,
+ ambiguous_protein_values,
+ ),
+ codon_table.back_table,
+ # These two are WRONG! I need to get the
+ # list of ambiguous codons which code for
+ # the stop codons XXX
+ list_ambiguous_codons(
+ codon_table.start_codons, ambiguous_nucleotide_values
+ ),
+ list_ambiguous_codons(codon_table.stop_codons, ambiguous_nucleotide_values),
+ )
+ self._codon_table = codon_table
+
+ # Be sneaky and forward attribute lookups to the original table.
+ # This lets us get the names, if the original table is an NCBI
+ # table.
+ def __getattr__(self, name):
+ """Forward attribute lookups to the original table."""
+ return getattr(self._codon_table, name)
+
+
+def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
+ """Return all possible encoded amino acids for ambiguous codon."""
+ c1, c2, c3 = codon
+ x1 = ambiguous_nucleotide_values[c1]
+ x2 = ambiguous_nucleotide_values[c2]
+ x3 = ambiguous_nucleotide_values[c3]
+ possible = {}
+ stops = []
+ for y1 in x1:
+ for y2 in x2:
+ for y3 in x3:
+ try:
+ possible[forward_table[y1 + y2 + y3]] = 1
+ except KeyError:
+ # If tripping over a stop codon
+ stops.append(y1 + y2 + y3)
+ if stops:
+ if possible:
+ raise TranslationError(
+ "ambiguous codon %r codes for both proteins and stop codons" % codon
+ )
+ # This is a true stop codon - tell the caller about it
+ raise KeyError(codon)
+ return list(possible)
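+
+
+# For example, with the standard DNA forward table, "RAT" (R = A or G)
+# expands to "AAT" (N) and "GAT" (D), so this returns ["N", "D"], while
+# "TAR" (TAA or TAG) only matches stop codons and raises KeyError.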
+
+
+def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
+ """Extend a codon list to include all possible ambigous codons.
+
+ e.g.::
+
+ ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
+ ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA']
+
+ Note that ['TAG', 'TGA'] -> ['TAG', 'TGA']; this does not add 'TRR'
+ (which could also mean 'TAA' or 'TGG').
+ Thus only two more codons are added in the following:
+
+ e.g.::
+
+ ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TRA', 'TAR']
+
+ Returns a new (longer) list of codon strings.
+ """
+ # Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
+ # This will generate things like 'TRR' from ['TAG', 'TGA'], which
+ # we don't want to include:
+ c1_list = sorted(
+ letter
+ for letter, meanings in ambiguous_nucleotide_values.items()
+ if {codon[0] for codon in codons}.issuperset(set(meanings))
+ )
+ c2_list = sorted(
+ letter
+ for letter, meanings in ambiguous_nucleotide_values.items()
+ if {codon[1] for codon in codons}.issuperset(set(meanings))
+ )
+ c3_list = sorted(
+ letter
+ for letter, meanings in ambiguous_nucleotide_values.items()
+ if {codon[2] for codon in codons}.issuperset(set(meanings))
+ )
+ # candidates is a list (not a set) to preserve the iteration order
+ candidates = []
+ for c1 in c1_list:
+ for c2 in c2_list:
+ for c3 in c3_list:
+ codon = c1 + c2 + c3
+ if codon not in candidates and codon not in codons:
+ candidates.append(codon)
+ answer = codons[:] # copy
+ # print("Have %i new candidates" % len(candidates))
+ for ambig_codon in candidates:
+ wanted = True
+ # e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
+ for codon in [
+ c1 + c2 + c3
+ for c1 in ambiguous_nucleotide_values[ambig_codon[0]]
+ for c2 in ambiguous_nucleotide_values[ambig_codon[1]]
+ for c3 in ambiguous_nucleotide_values[ambig_codon[2]]
+ ]:
+ if codon not in codons:
+ # This ambiguous codon can code for a non-stop, exclude it!
+ wanted = False
+ # print("Rejecting %s" % ambig_codon)
+ continue
+ if wanted:
+ answer.append(ambig_codon)
+ return answer
+
+
+assert list_ambiguous_codons(["TGA", "TAA"], IUPACData.ambiguous_dna_values) == [
+ "TGA",
+ "TAA",
+ "TRA",
+]
+assert list_ambiguous_codons(["TAG", "TGA"], IUPACData.ambiguous_dna_values) == [
+ "TAG",
+ "TGA",
+]
+assert list_ambiguous_codons(["TAG", "TAA"], IUPACData.ambiguous_dna_values) == [
+ "TAG",
+ "TAA",
+ "TAR",
+]
+assert list_ambiguous_codons(["UAG", "UAA"], IUPACData.ambiguous_rna_values) == [
+ "UAG",
+ "UAA",
+ "UAR",
+]
+assert list_ambiguous_codons(["TGA", "TAA", "TAG"], IUPACData.ambiguous_dna_values) == [
+ "TGA",
+ "TAA",
+ "TAG",
+ "TAR",
+ "TRA",
+]
+
+# Forward translation is "onto", that is, any given codon always maps
+# to the same protein, or it doesn't map at all. Thus, I can build
+# off of an existing table to produce the ambiguous mappings.
+#
+# This handles the general case. Perhaps it's overkill?
+# >>> t = CodonTable.ambiguous_dna_by_id[1]
+# >>> t.forward_table["AAT"]
+# 'N'
+# >>> t.forward_table["GAT"]
+# 'D'
+# >>> t.forward_table["RAT"]
+# 'B'
+# >>> t.forward_table["YTA"]
+# 'L'
+
+
+class AmbiguousForwardTable:
+ """Forward table for translation of ambiguous nucleotide sequences."""
+
+ def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
+ """Initialize the class."""
+ self.forward_table = forward_table
+
+ self.ambiguous_nucleotide = ambiguous_nucleotide
+ self.ambiguous_protein = ambiguous_protein
+
+ inverted = {}
+ for name, val in ambiguous_protein.items():
+ for c in val:
+ x = inverted.get(c, {})
+ x[name] = 1
+ inverted[c] = x
+ for name, val in inverted.items():
+ inverted[name] = list(val)
+ self._inverted = inverted
+
+ self._cache = {}
+
+ def __contains__(self, codon):
+ """Check if codon works as key for ambiguous forward_table.
+
+ Only returns 'True' if forward_table[codon] returns a value.
+ """
+ try:
+ self.__getitem__(codon)
+ return True
+ except (KeyError, TranslationError):
+ return False
+
+ def get(self, codon, failobj=None):
+ """Implement get for dictionary-like behaviour."""
+ try:
+ return self.__getitem__(codon)
+ except KeyError:
+ return failobj
+
+ def __getitem__(self, codon):
+ """Implement dictionary-like behaviour for AmbiguousForwardTable.
+
+ forward_table[codon] will either return an amino acid letter,
+ or throws a KeyError (if codon does not encode an amino acid)
+ or a TranslationError (if codon does encode for an amino acid,
+ but either is also a stop codon or does encode several amino acids,
+ for which no unique letter is available in the given alphabet.
+ """
+ try:
+ x = self._cache[codon]
+ except KeyError:
+ pass
+ else:
+ if x is TranslationError:
+ raise TranslationError(codon) # no unique translation
+ if x is KeyError:
+ raise KeyError(codon) # it's a stop codon
+ return x
+ try:
+ x = self.forward_table[codon]
+ self._cache[codon] = x
+ return x
+ except KeyError:
+ pass
+
+ # XXX Need to make part of this into a method which returns
+ # a list of all possible encodings for a codon!
+ try:
+ possible = list_possible_proteins(
+ codon, self.forward_table, self.ambiguous_nucleotide
+ )
+ except KeyError:
+ self._cache[codon] = KeyError
+ raise KeyError(codon) from None # stop codon
+ except TranslationError:
+ self._cache[codon] = TranslationError
+ raise TranslationError(codon) # does not code
+ assert len(possible) > 0, "unambiguous codons must code"
+
+ # Hah! Only one possible protein, so use it
+ if len(possible) == 1:
+ self._cache[codon] = possible[0]
+ return possible[0]
+
+ # See if there's an ambiguous protein encoding for the multiples.
+ # Find residues which exist in every coding set.
+ ambiguous_possible = {}
+ for amino in possible:
+ for term in self._inverted[amino]:
+ ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1
+
+ n = len(possible)
+ possible = []
+ for amino, val in ambiguous_possible.items():
+ if val == n:
+ possible.append(amino)
+
+ # No amino acid encoding for the results
+ if len(possible) == 0:
+ self._cache[codon] = TranslationError
+ raise TranslationError(codon) # no valid translation
+
+ # All of these are valid, so choose one. To be unique, sort by
+ # smallest ambiguity, then alphabetically (multiple candidates can
+ # occur when e.g. "X" encodes for everything).
+ possible.sort(key=lambda x: (len(self.ambiguous_protein[x]), x))
+
+ x = possible[0]
+ self._cache[codon] = x
+ return x
+
+
+def register_ncbi_table(name, alt_name, id, table, start_codons, stop_codons):
+ """Turn codon table data into objects (PRIVATE).
+
+ The data is stored in the dictionaries.
+ """
+ # In most cases names are divided by "; ", however there is also
+ # Table 11 'Bacterial, Archaeal and Plant Plastid Code', previously
+ # 'Bacterial and Plant Plastid' which used to be just 'Bacterial'
+ names = [
+ x.strip() for x in name.replace(" and ", "; ").replace(", ", "; ").split("; ")
+ ]
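+ # e.g. "Bacterial, Archaeal and Plant Plastid" becomes
+ # ["Bacterial", "Archaeal", "Plant Plastid"]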
+
+ dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons, stop_codons)
+ ambig_dna = AmbiguousCodonTable(
+ dna,
+ IUPACData.ambiguous_dna_letters,
+ IUPACData.ambiguous_dna_values,
+ IUPACData.extended_protein_letters,
+ IUPACData.extended_protein_values,
+ )
+
+ # replace all T's with U's for the RNA tables
+ rna_table = {}
+ generic_table = {}
+ for codon, val in table.items():
+ generic_table[codon] = val
+ codon = codon.replace("T", "U")
+ generic_table[codon] = val
+ rna_table[codon] = val
+ rna_start_codons = []
+ generic_start_codons = []
+ for codon in start_codons:
+ generic_start_codons.append(codon)
+ # We need to check if 'T' is in the codon, otherwise
+ # generic_start_codons may contain duplicates
+ if "T" in codon:
+ codon = codon.replace("T", "U")
+ generic_start_codons.append(codon)
+ rna_start_codons.append(codon)
+ rna_stop_codons = []
+ generic_stop_codons = []
+ for codon in stop_codons:
+ generic_stop_codons.append(codon)
+ if "T" in codon:
+ codon = codon.replace("T", "U")
+ generic_stop_codons.append(codon)
+ rna_stop_codons.append(codon)
+
+ generic = NCBICodonTable(
+ id, names + [alt_name], generic_table, generic_start_codons, generic_stop_codons
+ )
+
+ # The following isn't very elegant, but seems to work nicely.
+ _merged_values = dict(IUPACData.ambiguous_rna_values.items())
+ _merged_values["T"] = "U"
+ ambig_generic = AmbiguousCodonTable(
+ generic,
+ None,
+ _merged_values,
+ IUPACData.extended_protein_letters,
+ IUPACData.extended_protein_values,
+ )
+
+ rna = NCBICodonTableRNA(
+ id, names + [alt_name], rna_table, rna_start_codons, rna_stop_codons
+ )
+
+ ambig_rna = AmbiguousCodonTable(
+ rna,
+ IUPACData.ambiguous_rna_letters,
+ IUPACData.ambiguous_rna_values,
+ IUPACData.extended_protein_letters,
+ IUPACData.extended_protein_values,
+ )
+
+ if id == 1:
+ global standard_dna_table, standard_rna_table
+ standard_dna_table = dna
+ standard_rna_table = rna
+
+ unambiguous_dna_by_id[id] = dna
+ unambiguous_rna_by_id[id] = rna
+ generic_by_id[id] = generic
+ ambiguous_dna_by_id[id] = ambig_dna
+ ambiguous_rna_by_id[id] = ambig_rna
+ ambiguous_generic_by_id[id] = ambig_generic
+
+ if alt_name is not None:
+ names.append(alt_name)
+
+ for name in names:
+ unambiguous_dna_by_name[name] = dna
+ unambiguous_rna_by_name[name] = rna
+ generic_by_name[name] = generic
+ ambiguous_dna_by_name[name] = ambig_dna
+ ambiguous_rna_by_name[name] = ambig_rna
+ ambiguous_generic_by_name[name] = ambig_generic
+
+
+# The rest of this file is automatically generated, here we turn off
+# black formatting in order to keep the codon tables compact.
+#
+# fmt: off
+
+##########################################################################
+# Start of auto-generated output from Scripts/update_ncbi_codon_table.py #
+##########################################################################
+
+# Data from NCBI genetic code table version 4.5
+
+register_ncbi_table(
+ name="Standard",
+ alt_name="SGC0",
+ id=1,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG", "TGA"],
+ start_codons=["TTG", "CTG", "ATG"],
+)
+
+register_ncbi_table(
+ name="Vertebrate Mitochondrial",
+ alt_name="SGC1",
+ id=2,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", # noqa: E241
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG", "AGA", "AGG"],
+ start_codons=["ATT", "ATC", "ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Yeast Mitochondrial",
+ alt_name="SGC2",
+ id=3,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "T", "CTC": "T", "CTA": "T", "CTG": "T",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate "
+ "Mitochondrial; Mycoplasma; Spiroplasma",
+ alt_name="SGC3",
+ id=4,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["TTA", "TTG", "CTG", "ATT", "ATC", "ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Invertebrate Mitochondrial",
+ alt_name="SGC4",
+ id=5,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["TTG", "ATT", "ATC", "ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear",
+ alt_name="SGC5",
+ id=6,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "Q", "TAG": "Q",
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Echinoderm Mitochondrial; Flatworm Mitochondrial",
+ alt_name="SGC8",
+ id=9,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "N", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Euplotid Nuclear",
+ alt_name="SGC9",
+ id=10,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "C", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Bacterial, Archaeal and Plant Plastid",
+ alt_name=None,
+ id=11,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG", "TGA"],
+ start_codons=["TTG", "CTG", "ATT", "ATC", "ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Alternative Yeast Nuclear",
+ alt_name=None,
+ id=12,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "S",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG", "TGA"],
+ start_codons=["CTG", "ATG"],
+)
+
+register_ncbi_table(
+ name="Ascidian Mitochondrial",
+ alt_name=None,
+ id=13,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "G", "AGG": "G",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["TTG", "ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Alternative Flatworm Mitochondrial",
+ alt_name=None,
+ id=14,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "N", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAG"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Blepharisma Macronuclear",
+ alt_name=None,
+ id=15,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAG": "Q", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Chlorophycean Mitochondrial",
+ alt_name=None,
+ id=16,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAG": "L", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Trematode Mitochondrial",
+ alt_name=None,
+ id=21,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "M", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "N", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "S",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Scenedesmus obliquus Mitochondrial",
+ alt_name=None,
+ id=22,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCG": "S", # noqa: E241
+ "TAT": "Y", "TAC": "Y", "TAG": "L", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TCA", "TAA", "TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Thraustochytrium Mitochondrial",
+ alt_name=None,
+ id=23,
+ table={
+ "TTT": "F", "TTC": "F", "TTG": "L", # noqa: E241
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TTA", "TAA", "TAG", "TGA"],
+ start_codons=["ATT", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Pterobranchia Mitochondrial",
+ alt_name=None,
+ id=24,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "K",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["TTG", "CTG", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Candidate Division SR1 and Gracilibacteria",
+ alt_name=None,
+ id=25,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "G", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["TTG", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Pachysolen tannophilus Nuclear",
+ alt_name=None,
+ id=26,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "A",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG", "TGA"],
+ start_codons=["CTG", "ATG"],
+)
+
+register_ncbi_table(
+ name="Karyorelict Nuclear",
+ alt_name=None,
+ id=27,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "Q", "TAG": "Q",
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Condylostoma Nuclear",
+ alt_name=None,
+ id=28,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "Q", "TAG": "Q",
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG", "TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Mesodinium Nuclear",
+ alt_name=None,
+ id=29,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "Y", "TAG": "Y",
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Peritrich Nuclear",
+ alt_name=None,
+ id=30,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "E", "TAG": "E",
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TGA"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Blastocrithidia Nuclear",
+ alt_name=None,
+ id=31,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "E", "TAG": "E",
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TAG"],
+ start_codons=["ATG"],
+)
+
+register_ncbi_table(
+ name="Balanophoraceae Plastid",
+ alt_name=None,
+ id=32,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAG": "W", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGG": "W", # noqa: E241
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAA", "TGA"],
+ start_codons=["TTG", "CTG", "ATT", "ATC", "ATA", "ATG", "GTG"],
+)
+
+register_ncbi_table(
+ name="Cephalodiscidae Mitochondrial",
+ alt_name=None,
+ id=33,
+ table={
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+ "TAT": "Y", "TAC": "Y", "TAA": "Y", # noqa: E241
+ "TGT": "C", "TGC": "C", "TGA": "W", "TGG": "W",
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
+ "AGT": "S", "AGC": "S", "AGA": "S", "AGG": "K",
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+ },
+ stop_codons=["TAG"],
+ start_codons=["TTG", "CTG", "ATG", "GTG"],
+)
+
+########################################################################
+# End of auto-generated output from Scripts/update_ncbi_codon_table.py #
+########################################################################
diff --git a/code/lib/Bio/Data/IUPACData.py b/code/lib/Bio/Data/IUPACData.py
new file mode 100644
index 0000000..42bf7a9
--- /dev/null
+++ b/code/lib/Bio/Data/IUPACData.py
@@ -0,0 +1,423 @@
+# Copyright 2000 Andrew Dalke. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Information about the IUPAC alphabets."""
+
+
+protein_letters = "ACDEFGHIKLMNPQRSTVWY"
+extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
+# B = "Asx"; aspartic acid or asparagine (D or N)
+# X = "Xxx"; unknown or 'other' amino acid
+# Z = "Glx"; glutamic acid or glutamine (E or Q)
+# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
+#
+# J = "Xle"; leucine or isoleucine (L or I, used in NMR)
+# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
+# Also the International Nucleotide Sequence Database Collaboration (INSDC)
+# (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
+# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
+#
+# Xle (J); Leucine or Isoleucine
+# The residue abbreviations, Xle (the three-letter abbreviation) and J
+# (the one-letter abbreviation) are reserved for the case that cannot
+# experimentally distinguish leucine from isoleucine.
+#
+# U = "Sec"; selenocysteine
+# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
+#
+# O = "Pyl"; pyrrolysine
+# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35
+
+protein_letters_1to3 = {
+ "A": "Ala",
+ "C": "Cys",
+ "D": "Asp",
+ "E": "Glu",
+ "F": "Phe",
+ "G": "Gly",
+ "H": "His",
+ "I": "Ile",
+ "K": "Lys",
+ "L": "Leu",
+ "M": "Met",
+ "N": "Asn",
+ "P": "Pro",
+ "Q": "Gln",
+ "R": "Arg",
+ "S": "Ser",
+ "T": "Thr",
+ "V": "Val",
+ "W": "Trp",
+ "Y": "Tyr",
+}
+protein_letters_1to3_extended = {
+ **protein_letters_1to3,
+ "B": "Asx", "X": "Xaa", "Z": "Glx", "J": "Xle", "U": "Sec", "O": "Pyl",
+}
+
+protein_letters_3to1 = {value: key for key, value in protein_letters_1to3.items()}
+protein_letters_3to1_extended = {
+ value: key for key, value in protein_letters_1to3_extended.items()
+}
+
+ambiguous_dna_letters = "GATCRYWSMKHBVDN"
+unambiguous_dna_letters = "GATC"
+ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
+unambiguous_rna_letters = "GAUC"
+
+# B == 5-bromouridine
+# D == 5,6-dihydrouridine
+# S == thiouridine
+# W == wyosine
+extended_dna_letters = "GATCBDSW"
+
+# are there extended forms?
+# extended_rna_letters = "GAUCBDSW"
+
+# "X" is included in the following _values and _complement dictionaries,
+# for historical reasons although it is not an IUPAC nucleotide,
+# and so is not in the corresponding _letters strings above
+ambiguous_dna_values = {
+ "A": "A",
+ "C": "C",
+ "G": "G",
+ "T": "T",
+ "M": "AC",
+ "R": "AG",
+ "W": "AT",
+ "S": "CG",
+ "Y": "CT",
+ "K": "GT",
+ "V": "ACG",
+ "H": "ACT",
+ "D": "AGT",
+ "B": "CGT",
+ "X": "GATC",
+ "N": "GATC",
+}
+ambiguous_rna_values = {
+ "A": "A",
+ "C": "C",
+ "G": "G",
+ "U": "U",
+ "M": "AC",
+ "R": "AG",
+ "W": "AU",
+ "S": "CG",
+ "Y": "CU",
+ "K": "GU",
+ "V": "ACG",
+ "H": "ACU",
+ "D": "AGU",
+ "B": "CGU",
+ "X": "GAUC",
+ "N": "GAUC",
+}
+
+ambiguous_dna_complement = {
+ "A": "T",
+ "C": "G",
+ "G": "C",
+ "T": "A",
+ "M": "K",
+ "R": "Y",
+ "W": "W",
+ "S": "S",
+ "Y": "R",
+ "K": "M",
+ "V": "B",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "X": "X",
+ "N": "N",
+}
+
+ambiguous_rna_complement = {
+ "A": "U",
+ "C": "G",
+ "G": "C",
+ "U": "A",
+ "M": "K",
+ "R": "Y",
+ "W": "W",
+ "S": "S",
+ "Y": "R",
+ "K": "M",
+ "V": "B",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "X": "X",
+ "N": "N",
+}
+
+
+def _make_ranges(mydict):
+ d = {}
+ for key, value in mydict.items():
+ d[key] = (value, value)
+ return d
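+
+
+# e.g. _make_ranges({"A": 331.2218}) returns {"A": (331.2218, 331.2218)};
+# each exact weight becomes a degenerate (min, max) range.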
+
+
+# Mass data taken from PubChem
+
+
+# Average masses of monophosphate deoxy nucleotides
+unambiguous_dna_weights = {"A": 331.2218, "C": 307.1971, "G": 347.2212, "T": 322.2085}
+
+# Monoisotopic masses of monophosphate deoxy nucleotides
+monoisotopic_unambiguous_dna_weights = {
+ "A": 331.06817,
+ "C": 307.056936,
+ "G": 347.063084,
+ "T": 322.056602,
+}
+
+unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
+
+unambiguous_rna_weights = {"A": 347.2212, "C": 323.1965, "G": 363.2206, "U": 324.1813}
+
+monoisotopic_unambiguous_rna_weights = {
+ "A": 347.063084,
+ "C": 323.051851,
+ "G": 363.057999,
+ "U": 324.035867,
+}
+
+unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
+
+
+def _make_ambiguous_ranges(mydict, weight_table):
+ range_d = {}
+ avg_d = {}
+ for letter, values in mydict.items():
+ # Following line is a quick hack to skip undefined weights for U and O
+ if len(values) == 1 and values[0] not in weight_table:
+ continue
+
+ weights = [weight_table.get(x) for x in values]
+ range_d[letter] = (min(weights), max(weights))
+ avg_d[letter] = sum(weights) / len(weights)
+ return range_d, avg_d
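+
+
+# For example, for DNA the ambiguous letter "R" (A or G) gets the weight
+# range (331.2218, 347.2212) and the average of the A and G weights.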
+
+
+ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
+ ambiguous_dna_values, unambiguous_dna_weights
+)
+
+ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
+ ambiguous_rna_values, unambiguous_rna_weights
+)
+
+protein_weights = {
+ "A": 89.0932,
+ "C": 121.1582,
+ "D": 133.1027,
+ "E": 147.1293,
+ "F": 165.1891,
+ "G": 75.0666,
+ "H": 155.1546,
+ "I": 131.1729,
+ "K": 146.1876,
+ "L": 131.1729,
+ "M": 149.2113,
+ "N": 132.1179,
+ "O": 255.3134,
+ "P": 115.1305,
+ "Q": 146.1445,
+ "R": 174.201,
+ "S": 105.0926,
+ "T": 119.1192,
+ "U": 168.0532,
+ "V": 117.1463,
+ "W": 204.2252,
+ "Y": 181.1885,
+}
+
+monoisotopic_protein_weights = {
+ "A": 89.047678,
+ "C": 121.019749,
+ "D": 133.037508,
+ "E": 147.053158,
+ "F": 165.078979,
+ "G": 75.032028,
+ "H": 155.069477,
+ "I": 131.094629,
+ "K": 146.105528,
+ "L": 131.094629,
+ "M": 149.051049,
+ "N": 132.053492,
+ "O": 255.158292,
+ "P": 115.063329,
+ "Q": 146.069142,
+ "R": 174.111676,
+ "S": 105.042593,
+ "T": 119.058243,
+ "U": 168.964203,
+ "V": 117.078979,
+ "W": 204.089878,
+ "Y": 181.073893,
+}
+
+extended_protein_values = {
+ "A": "A",
+ "B": "ND",
+ "C": "C",
+ "D": "D",
+ "E": "E",
+ "F": "F",
+ "G": "G",
+ "H": "H",
+ "I": "I",
+ "J": "IL",
+ "K": "K",
+ "L": "L",
+ "M": "M",
+ "N": "N",
+ "O": "O",
+ "P": "P",
+ "Q": "Q",
+ "R": "R",
+ "S": "S",
+ "T": "T",
+ "U": "U",
+ "V": "V",
+ "W": "W",
+ "X": "ACDEFGHIKLMNPQRSTVWY",
+ # TODO - Include U and O in the possible values of X?
+ # This could alter the extended_protein_weight_ranges ...
+ # by MP: Won't do this, because they are so rare.
+ "Y": "Y",
+ "Z": "QE",
+}
+
+protein_weight_ranges = _make_ranges(protein_weights)
+
+extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
+ extended_protein_values, protein_weights
+)
+
+
+# For Center of Mass Calculation.
+# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
+atom_weights = {
+ "H": 1.00794,
+ "D": 2.01410,
+ "He": 4.002602,
+ "Li": 6.941,
+ "Be": 9.012182,
+ "B": 10.811,
+ "C": 12.0107,
+ "N": 14.0067,
+ "O": 15.9994,
+ "F": 18.9984032,
+ "Ne": 20.1797,
+ "Na": 22.989770,
+ "Mg": 24.3050,
+ "Al": 26.981538,
+ "Si": 28.0855,
+ "P": 30.973761,
+ "S": 32.065,
+ "Cl": 35.453,
+ "Ar": 39.948,
+ "K": 39.0983,
+ "Ca": 40.078,
+ "Sc": 44.955910,
+ "Ti": 47.867,
+ "V": 50.9415,
+ "Cr": 51.9961,
+ "Mn": 54.938049,
+ "Fe": 55.845,
+ "Co": 58.933200,
+ "Ni": 58.6934,
+ "Cu": 63.546,
+ "Zn": 65.39,
+ "Ga": 69.723,
+ "Ge": 72.64,
+ "As": 74.92160,
+ "Se": 78.96,
+ "Br": 79.904,
+ "Kr": 83.80,
+ "Rb": 85.4678,
+ "Sr": 87.62,
+ "Y": 88.90585,
+ "Zr": 91.224,
+ "Nb": 92.90638,
+ "Mo": 95.94,
+ "Tc": 98.0,
+ "Ru": 101.07,
+ "Rh": 102.90550,
+ "Pd": 106.42,
+ "Ag": 107.8682,
+ "Cd": 112.411,
+ "In": 114.818,
+ "Sn": 118.710,
+ "Sb": 121.760,
+ "Te": 127.60,
+ "I": 126.90447,
+ "Xe": 131.293,
+ "Cs": 132.90545,
+ "Ba": 137.327,
+ "La": 138.9055,
+ "Ce": 140.116,
+ "Pr": 140.90765,
+ "Nd": 144.24,
+ "Pm": 145.0,
+ "Sm": 150.36,
+ "Eu": 151.964,
+ "Gd": 157.25,
+ "Tb": 158.92534,
+ "Dy": 162.50,
+ "Ho": 164.93032,
+ "Er": 167.259,
+ "Tm": 168.93421,
+ "Yb": 173.04,
+ "Lu": 174.967,
+ "Hf": 178.49,
+ "Ta": 180.9479,
+ "W": 183.84,
+ "Re": 186.207,
+ "Os": 190.23,
+ "Ir": 192.217,
+ "Pt": 195.078,
+ "Au": 196.96655,
+ "Hg": 200.59,
+ "Tl": 204.3833,
+ "Pb": 207.2,
+ "Bi": 208.98038,
+ "Po": 208.98,
+ "At": 209.99,
+ "Rn": 222.02,
+ "Fr": 223.02,
+ "Ra": 226.03,
+ "Ac": 227.03,
+ "Th": 232.0381,
+ "Pa": 231.03588,
+ "U": 238.02891,
+ "Np": 237.05,
+ "Pu": 244.06,
+ "Am": 243.06,
+ "Cm": 247.07,
+ "Bk": 247.07,
+ "Cf": 251.08,
+ "Es": 252.08,
+ "Fm": 257.10,
+ "Md": 258.10,
+ "No": 259.10,
+ "Lr": 262.11,
+ "Rf": 261.11,
+ "Db": 262.11,
+ "Sg": 266.12,
+ "Bh": 264.12,
+ "Hs": 269.13,
+ "Mt": 268.14,
+}
diff --git a/code/lib/Bio/Data/SCOPData.py b/code/lib/Bio/Data/SCOPData.py
new file mode 100644
index 0000000..79cfd4e
--- /dev/null
+++ b/code/lib/Bio/Data/SCOPData.py
@@ -0,0 +1,277 @@
+# Copyright Lenna Peterson (2012)
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Additional protein alphabets used in the SCOP database and PDB files.
+
+See Bio.SCOP for more information about SCOP and Biopython's SCOP module.
+"""
+
+# This file was automatically generated from PDB data.
+# Black would reformat this to one entry per line, so tell it not to:
+# fmt: off
+protein_letters_3to1 = {
+ "00C": "C", "01W": "X", "02K": "A", "03Y": "C", "07O": "C",
+ "08P": "C", "0A0": "D", "0A1": "Y", "0A2": "K", "0A8": "C",
+ "0AA": "V", "0AB": "V", "0AC": "G", "0AD": "G", "0AF": "W",
+ "0AG": "L", "0AH": "S", "0AK": "D", "0AM": "A", "0AP": "C",
+ "0AU": "U", "0AV": "A", "0AZ": "P", "0BN": "F", "0C ": "C",
+ "0CS": "A", "0DC": "C", "0DG": "G", "0DT": "T", "0FL": "A",
+ "0G ": "G", "0NC": "A", "0SP": "A", "0U ": "U", "0YG": "YG",
+ "10C": "C", "125": "U", "126": "U", "127": "U", "128": "N",
+ "12A": "A", "143": "C", "175": "ASG", "193": "X", "1AP": "A",
+ "1MA": "A", "1MG": "G", "1PA": "F", "1PI": "A", "1PR": "N",
+ "1SC": "C", "1TQ": "W", "1TY": "Y", "1X6": "S", "200": "F",
+ "23F": "F", "23S": "X", "26B": "T", "2AD": "X", "2AG": "A",
+ "2AO": "X", "2AR": "A", "2AS": "X", "2AT": "T", "2AU": "U",
+ "2BD": "I", "2BT": "T", "2BU": "A", "2CO": "C", "2DA": "A",
+ "2DF": "N", "2DM": "N", "2DO": "X", "2DT": "T", "2EG": "G",
+ "2FE": "N", "2FI": "N", "2FM": "M", "2GT": "T", "2HF": "H",
+ "2LU": "L", "2MA": "A", "2MG": "G", "2ML": "L", "2MR": "R",
+ "2MT": "P", "2MU": "U", "2NT": "T", "2OM": "U", "2OT": "T",
+ "2PI": "X", "2PR": "G", "2SA": "N", "2SI": "X", "2ST": "T",
+ "2TL": "T", "2TY": "Y", "2VA": "V", "2XA": "C", "32S": "X",
+ "32T": "X", "3AH": "H", "3AR": "X", "3CF": "F", "3DA": "A",
+ "3DR": "N", "3GA": "A", "3MD": "D", "3ME": "U", "3NF": "Y",
+ "3QN": "K", "3TY": "X", "3XH": "G", "4AC": "N", "4BF": "Y",
+ "4CF": "F", "4CY": "M", "4DP": "W", "4F3": "GYG", "4FB": "P",
+ "4FW": "W", "4HT": "W", "4IN": "W", "4MF": "N", "4MM": "X",
+ "4OC": "C", "4PC": "C", "4PD": "C", "4PE": "C", "4PH": "F",
+ "4SC": "C", "4SU": "U", "4TA": "N", "4U7": "A", "56A": "H",
+ "5AA": "A", "5AB": "A", "5AT": "T", "5BU": "U", "5CG": "G",
+ "5CM": "C", "5CS": "C", "5FA": "A", "5FC": "C", "5FU": "U",
+ "5HP": "E", "5HT": "T", "5HU": "U", "5IC": "C", "5IT": "T",
+ "5IU": "U", "5MC": "C", "5MD": "N", "5MU": "U", "5NC": "C",
+ "5PC": "C", "5PY": "T", "5SE": "U", "5ZA": "TWG", "64T": "T",
+ "6CL": "K", "6CT": "T", "6CW": "W", "6HA": "A", "6HC": "C",
+ "6HG": "G", "6HN": "K", "6HT": "T", "6IA": "A", "6MA": "A",
+ "6MC": "A", "6MI": "N", "6MT": "A", "6MZ": "N", "6OG": "G",
+ "70U": "U", "7DA": "A", "7GU": "G", "7JA": "I", "7MG": "G",
+ "8AN": "A", "8FG": "G", "8MG": "G", "8OG": "G", "9NE": "E",
+ "9NF": "F", "9NR": "R", "9NV": "V", "A ": "A", "A1P": "N",
+ "A23": "A", "A2L": "A", "A2M": "A", "A34": "A", "A35": "A",
+ "A38": "A", "A39": "A", "A3A": "A", "A3P": "A", "A40": "A",
+ "A43": "A", "A44": "A", "A47": "A", "A5L": "A", "A5M": "C",
+ "A5N": "N", "A5O": "A", "A66": "X", "AA3": "A", "AA4": "A",
+ "AAR": "R", "AB7": "X", "ABA": "A", "ABR": "A", "ABS": "A",
+ "ABT": "N", "ACB": "D", "ACL": "R", "AD2": "A", "ADD": "X",
+ "ADX": "N", "AEA": "X", "AEI": "D", "AET": "A", "AFA": "N",
+ "AFF": "N", "AFG": "G", "AGM": "R", "AGT": "C", "AHB": "N",
+ "AHH": "X", "AHO": "A", "AHP": "A", "AHS": "X", "AHT": "X",
+ "AIB": "A", "AKL": "D", "AKZ": "D", "ALA": "A", "ALC": "A",
+ "ALM": "A", "ALN": "A", "ALO": "T", "ALQ": "X", "ALS": "A",
+ "ALT": "A", "ALV": "A", "ALY": "K", "AN8": "A", "AP7": "A",
+ "APE": "X", "APH": "A", "API": "K", "APK": "K", "APM": "X",
+ "APP": "X", "AR2": "R", "AR4": "E", "AR7": "R", "ARG": "R",
+ "ARM": "R", "ARO": "R", "ARV": "X", "AS ": "A", "AS2": "D",
+ "AS9": "X", "ASA": "D", "ASB": "D", "ASI": "D", "ASK": "D",
+ "ASL": "D", "ASM": "X", "ASN": "N", "ASP": "D", "ASQ": "D",
+ "ASU": "N", "ASX": "B", "ATD": "T", "ATL": "T", "ATM": "T",
+ "AVC": "A", "AVN": "X", "AYA": "A", "AYG": "AYG", "AZK": "K",
+ "AZS": "S", "AZY": "Y", "B1F": "F", "B1P": "N", "B2A": "A",
+ "B2F": "F", "B2I": "I", "B2V": "V", "B3A": "A", "B3D": "D",
+ "B3E": "E", "B3K": "K", "B3L": "X", "B3M": "X", "B3Q": "X",
+ "B3S": "S", "B3T": "X", "B3U": "H", "B3X": "N", "B3Y": "Y",
+ "BB6": "C", "BB7": "C", "BB8": "F", "BB9": "C", "BBC": "C",
+ "BCS": "C", "BE2": "X", "BFD": "D", "BG1": "S", "BGM": "G",
+ "BH2": "D", "BHD": "D", "BIF": "F", "BIL": "X", "BIU": "I",
+ "BJH": "X", "BLE": "L", "BLY": "K", "BMP": "N", "BMT": "T",
+ "BNN": "F", "BNO": "X", "BOE": "T", "BOR": "R", "BPE": "C",
+ "BRU": "U", "BSE": "S", "BT5": "N", "BTA": "L", "BTC": "C",
+ "BTR": "W", "BUC": "C", "BUG": "V", "BVP": "U", "BZG": "N",
+ "C ": "C", "C12": "TYG", "C1X": "K", "C25": "C", "C2L": "C",
+ "C2S": "C", "C31": "C", "C32": "C", "C34": "C", "C36": "C",
+ "C37": "C", "C38": "C", "C3Y": "C", "C42": "C", "C43": "C",
+ "C45": "C", "C46": "C", "C49": "C", "C4R": "C", "C4S": "C",
+ "C5C": "C", "C66": "X", "C6C": "C", "C99": "TFG", "CAF": "C",
+ "CAL": "X", "CAR": "C", "CAS": "C", "CAV": "X", "CAY": "C",
+ "CB2": "C", "CBR": "C", "CBV": "C", "CCC": "C", "CCL": "K",
+ "CCS": "C", "CCY": "CYG", "CDE": "X", "CDV": "X", "CDW": "C",
+ "CEA": "C", "CFL": "C", "CFY": "FCYG", "CG1": "G", "CGA": "E",
+ "CGU": "E", "CH ": "C", "CH6": "MYG", "CH7": "KYG", "CHF": "X",
+ "CHG": "X", "CHP": "G", "CHS": "X", "CIR": "R", "CJO": "GYG",
+ "CLE": "L", "CLG": "K", "CLH": "K", "CLV": "AFG", "CM0": "N",
+ "CME": "C", "CMH": "C", "CML": "C", "CMR": "C", "CMT": "C",
+ "CNU": "U", "CP1": "C", "CPC": "X", "CPI": "X", "CQR": "GYG",
+ "CR0": "TLG", "CR2": "GYG", "CR5": "G", "CR7": "KYG", "CR8": "HYG",
+ "CRF": "TWG", "CRG": "THG", "CRK": "MYG", "CRO": "GYG", "CRQ": "QYG",
+ "CRU": "EYG", "CRW": "ASG", "CRX": "ASG", "CS0": "C", "CS1": "C",
+ "CS3": "C", "CS4": "C", "CS8": "N", "CSA": "C", "CSB": "C",
+ "CSD": "C", "CSE": "C", "CSF": "C", "CSH": "SHG", "CSI": "G",
+ "CSJ": "C", "CSL": "C", "CSO": "C", "CSP": "C", "CSR": "C",
+ "CSS": "C", "CSU": "C", "CSW": "C", "CSX": "C", "CSY": "SYG",
+ "CSZ": "C", "CTE": "W", "CTG": "T", "CTH": "T", "CUC": "X",
+ "CWR": "S", "CXM": "M", "CY0": "C", "CY1": "C", "CY3": "C",
+ "CY4": "C", "CYA": "C", "CYD": "C", "CYF": "C", "CYG": "C",
+ "CYJ": "X", "CYM": "C", "CYQ": "C", "CYR": "C", "CYS": "C",
+ "CZ2": "C", "CZO": "GYG", "CZZ": "C", "D11": "T", "D1P": "N",
+ "D3 ": "N", "D33": "N", "D3P": "G", "D3T": "T", "D4M": "T",
+ "D4P": "X", "DA ": "A", "DA2": "X", "DAB": "A", "DAH": "F",
+ "DAL": "A", "DAR": "R", "DAS": "D", "DBB": "T", "DBM": "N",
+ "DBS": "S", "DBU": "T", "DBY": "Y", "DBZ": "A", "DC ": "C",
+ "DC2": "C", "DCG": "G", "DCI": "X", "DCL": "X", "DCT": "C",
+ "DCY": "C", "DDE": "H", "DDG": "G", "DDN": "U", "DDX": "N",
+ "DFC": "C", "DFG": "G", "DFI": "X", "DFO": "X", "DFT": "N",
+ "DG ": "G", "DGH": "G", "DGI": "G", "DGL": "E", "DGN": "Q",
+ "DHA": "S", "DHI": "H", "DHL": "X", "DHN": "V", "DHP": "X",
+ "DHU": "U", "DHV": "V", "DI ": "I", "DIL": "I", "DIR": "R",
+ "DIV": "V", "DLE": "L", "DLS": "K", "DLY": "K", "DM0": "K",
+ "DMH": "N", "DMK": "D", "DMT": "X", "DN ": "N", "DNE": "L",
+ "DNG": "L", "DNL": "K", "DNM": "L", "DNP": "A", "DNR": "C",
+ "DNS": "K", "DOA": "X", "DOC": "C", "DOH": "D", "DON": "L",
+ "DPB": "T", "DPH": "F", "DPL": "P", "DPP": "A", "DPQ": "Y",
+ "DPR": "P", "DPY": "N", "DRM": "U", "DRP": "N", "DRT": "T",
+ "DRZ": "N", "DSE": "S", "DSG": "N", "DSN": "S", "DSP": "D",
+ "DT ": "T", "DTH": "T", "DTR": "W", "DTY": "Y", "DU ": "U",
+ "DVA": "V", "DXD": "N", "DXN": "N", "DYG": "DYG", "DYS": "C",
+ "DZM": "A", "E ": "A", "E1X": "A", "ECC": "Q", "EDA": "A",
+ "EFC": "C", "EHP": "F", "EIT": "T", "ENP": "N", "ESB": "Y",
+ "ESC": "M", "EXB": "X", "EXY": "L", "EY5": "N", "EYS": "X",
+ "F2F": "F", "FA2": "A", "FA5": "N", "FAG": "N", "FAI": "N",
+ "FB5": "A", "FB6": "A", "FCL": "F", "FFD": "N", "FGA": "E",
+ "FGL": "G", "FGP": "S", "FHL": "X", "FHO": "K", "FHU": "U",
+ "FLA": "A", "FLE": "L", "FLT": "Y", "FME": "M", "FMG": "G",
+ "FMU": "N", "FOE": "C", "FOX": "G", "FP9": "P", "FPA": "F",
+ "FRD": "X", "FT6": "W", "FTR": "W", "FTY": "Y", "FVA": "V",
+ "FZN": "K", "G ": "G", "G25": "G", "G2L": "G", "G2S": "G",
+ "G31": "G", "G32": "G", "G33": "G", "G36": "G", "G38": "G",
+ "G42": "G", "G46": "G", "G47": "G", "G48": "G", "G49": "G",
+ "G4P": "N", "G7M": "G", "GAO": "G", "GAU": "E", "GCK": "C",
+ "GCM": "X", "GDP": "G", "GDR": "G", "GFL": "G", "GGL": "E",
+ "GH3": "G", "GHG": "Q", "GHP": "G", "GL3": "G", "GLH": "Q",
+ "GLJ": "E", "GLK": "E", "GLM": "X", "GLN": "Q", "GLQ": "E",
+ "GLU": "E", "GLX": "Z", "GLY": "G", "GLZ": "G", "GMA": "E",
+ "GMS": "G", "GMU": "U", "GN7": "G", "GND": "X", "GNE": "N",
+ "GOM": "G", "GPL": "K", "GS ": "G", "GSC": "G", "GSR": "G",
+ "GSS": "G", "GSU": "E", "GT9": "C", "GTP": "G", "GVL": "X",
+ "GYC": "CYG", "GYS": "SYG", "H2U": "U", "H5M": "P", "HAC": "A",
+ "HAR": "R", "HBN": "H", "HCS": "X", "HDP": "U", "HEU": "U",
+ "HFA": "X", "HGL": "X", "HHI": "H", "HHK": "AK", "HIA": "H",
+ "HIC": "H", "HIP": "H", "HIQ": "H", "HIS": "H", "HL2": "L",
+ "HLU": "L", "HMR": "R", "HOL": "N", "HPC": "F", "HPE": "F",
+ "HPH": "F", "HPQ": "F", "HQA": "A", "HRG": "R", "HRP": "W",
+ "HS8": "H", "HS9": "H", "HSE": "S", "HSL": "S", "HSO": "H",
+ "HTI": "C", "HTN": "N", "HTR": "W", "HV5": "A", "HVA": "V",
+ "HY3": "P", "HYP": "P", "HZP": "P", "I ": "I", "I2M": "I",
+ "I58": "K", "I5C": "C", "IAM": "A", "IAR": "R", "IAS": "D",
+ "IC ": "C", "IEL": "K", "IEY": "HYG", "IG ": "G", "IGL": "G",
+ "IGU": "G", "IIC": "SHG", "IIL": "I", "ILE": "I", "ILG": "E",
+ "ILX": "I", "IMC": "C", "IML": "I", "IOY": "F", "IPG": "G",
+ "IPN": "N", "IRN": "N", "IT1": "K", "IU ": "U", "IYR": "Y",
+ "IYT": "T", "IZO": "M", "JJJ": "C", "JJK": "C", "JJL": "C",
+ "JW5": "N", "K1R": "C", "KAG": "G", "KCX": "K", "KGC": "K",
+ "KNB": "A", "KOR": "M", "KPI": "K", "KST": "K", "KYQ": "K",
+ "L2A": "X", "LA2": "K", "LAA": "D", "LAL": "A", "LBY": "K",
+ "LC ": "C", "LCA": "A", "LCC": "N", "LCG": "G", "LCH": "N",
+ "LCK": "K", "LCX": "K", "LDH": "K", "LED": "L", "LEF": "L",
+ "LEH": "L", "LEI": "V", "LEM": "L", "LEN": "L", "LET": "X",
+ "LEU": "L", "LEX": "L", "LG ": "G", "LGP": "G", "LHC": "X",
+ "LHU": "U", "LKC": "N", "LLP": "K", "LLY": "K", "LME": "E",
+ "LMF": "K", "LMQ": "Q", "LMS": "N", "LP6": "K", "LPD": "P",
+ "LPG": "G", "LPL": "X", "LPS": "S", "LSO": "X", "LTA": "X",
+ "LTR": "W", "LVG": "G", "LVN": "V", "LYF": "K", "LYK": "K",
+ "LYM": "K", "LYN": "K", "LYR": "K", "LYS": "K", "LYX": "K",
+ "LYZ": "K", "M0H": "C", "M1G": "G", "M2G": "G", "M2L": "K",
+ "M2S": "M", "M30": "G", "M3L": "K", "M5M": "C", "MA ": "A",
+ "MA6": "A", "MA7": "A", "MAA": "A", "MAD": "A", "MAI": "R",
+ "MBQ": "Y", "MBZ": "N", "MC1": "S", "MCG": "X", "MCL": "K",
+ "MCS": "C", "MCY": "C", "MD3": "C", "MD6": "G", "MDH": "X",
+ "MDO": "ASG", "MDR": "N", "MEA": "F", "MED": "M", "MEG": "E",
+ "MEN": "N", "MEP": "U", "MEQ": "Q", "MET": "M", "MEU": "G",
+ "MF3": "X", "MFC": "GYG", "MG1": "G", "MGG": "R", "MGN": "Q",
+ "MGQ": "A", "MGV": "G", "MGY": "G", "MHL": "L", "MHO": "M",
+ "MHS": "H", "MIA": "A", "MIS": "S", "MK8": "L", "ML3": "K",
+ "MLE": "L", "MLL": "L", "MLY": "K", "MLZ": "K", "MME": "M",
+ "MMO": "R", "MMT": "T", "MND": "N", "MNL": "L", "MNU": "U",
+ "MNV": "V", "MOD": "X", "MP8": "P", "MPH": "X", "MPJ": "X",
+ "MPQ": "G", "MRG": "G", "MSA": "G", "MSE": "M", "MSL": "M",
+ "MSO": "M", "MSP": "X", "MT2": "M", "MTR": "T", "MTU": "A",
+ "MTY": "Y", "MVA": "V", "N ": "N", "N10": "S", "N2C": "X",
+ "N5I": "N", "N5M": "C", "N6G": "G", "N7P": "P", "NA8": "A",
+ "NAL": "A", "NAM": "A", "NB8": "N", "NBQ": "Y", "NC1": "S",
+ "NCB": "A", "NCX": "N", "NCY": "X", "NDF": "F", "NDN": "U",
+ "NEM": "H", "NEP": "H", "NF2": "N", "NFA": "F", "NHL": "E",
+ "NIT": "X", "NIY": "Y", "NLE": "L", "NLN": "L", "NLO": "L",
+ "NLP": "L", "NLQ": "Q", "NMC": "G", "NMM": "R", "NMS": "T",
+ "NMT": "T", "NNH": "R", "NP3": "N", "NPH": "C", "NPI": "A",
+ "NRP": "LYG", "NRQ": "MYG", "NSK": "X", "NTY": "Y", "NVA": "V",
+ "NYC": "TWG", "NYG": "NYG", "NYM": "N", "NYS": "C", "NZH": "H",
+ "O12": "X", "O2C": "N", "O2G": "G", "OAD": "N", "OAS": "S",
+ "OBF": "X", "OBS": "X", "OCS": "C", "OCY": "C", "ODP": "N",
+ "OHI": "H", "OHS": "D", "OIC": "X", "OIP": "I", "OLE": "X",
+ "OLT": "T", "OLZ": "S", "OMC": "C", "OMG": "G", "OMT": "M",
+ "OMU": "U", "ONE": "U", "ONH": "A", "ONL": "X", "OPR": "R",
+ "ORN": "A", "ORQ": "R", "OSE": "S", "OTB": "X", "OTH": "T",
+ "OTY": "Y", "OXX": "D", "P ": "G", "P1L": "C", "P1P": "N",
+ "P2T": "T", "P2U": "U", "P2Y": "P", "P5P": "A", "PAQ": "Y",
+ "PAS": "D", "PAT": "W", "PAU": "A", "PBB": "C", "PBF": "F",
+ "PBT": "N", "PCA": "E", "PCC": "P", "PCE": "X", "PCS": "F",
+ "PDL": "X", "PDU": "U", "PEC": "C", "PF5": "F", "PFF": "F",
+ "PFX": "X", "PG1": "S", "PG7": "G", "PG9": "G", "PGL": "X",
+ "PGN": "G", "PGP": "G", "PGY": "G", "PHA": "F", "PHD": "D",
+ "PHE": "F", "PHI": "F", "PHL": "F", "PHM": "F", "PIA": "AYG",
+ "PIV": "X", "PLE": "L", "PM3": "F", "PMT": "C", "POM": "P",
+ "PPN": "F", "PPU": "A", "PPW": "G", "PQ1": "N", "PR3": "C",
+ "PR5": "A", "PR9": "P", "PRN": "A", "PRO": "P", "PRS": "P",
+ "PSA": "F", "PSH": "H", "PST": "T", "PSU": "U", "PSW": "C",
+ "PTA": "X", "PTH": "Y", "PTM": "Y", "PTR": "Y", "PU ": "A",
+ "PUY": "N", "PVH": "H", "PVL": "X", "PYA": "A", "PYO": "U",
+ "PYX": "C", "PYY": "N", "QLG": "QLG", "QMM": "Q", "QPA": "C",
+ "QPH": "F", "QUO": "G", "R ": "A", "R1A": "C", "R4K": "W",
+ "RC7": "HYG", "RE0": "W", "RE3": "W", "RIA": "A", "RMP": "A",
+ "RON": "X", "RT ": "T", "RTP": "N", "S1H": "S", "S2C": "C",
+ "S2D": "A", "S2M": "T", "S2P": "A", "S4A": "A", "S4C": "C",
+ "S4G": "G", "S4U": "U", "S6G": "G", "SAC": "S", "SAH": "C",
+ "SAR": "G", "SBL": "S", "SC ": "C", "SCH": "C", "SCS": "C",
+ "SCY": "C", "SD2": "X", "SDG": "G", "SDP": "S", "SEB": "S",
+ "SEC": "A", "SEG": "A", "SEL": "S", "SEM": "S", "SEN": "S",
+ "SEP": "S", "SER": "S", "SET": "S", "SGB": "S", "SHC": "C",
+ "SHP": "G", "SHR": "K", "SIB": "C", "SIC": "DC", "SLA": "P",
+ "SLR": "P", "SLZ": "K", "SMC": "C", "SME": "M", "SMF": "F",
+ "SMP": "A", "SMT": "T", "SNC": "C", "SNN": "N", "SOC": "C",
+ "SOS": "N", "SOY": "S", "SPT": "T", "SRA": "A", "SSU": "U",
+ "STY": "Y", "SUB": "X", "SUI": "DG", "SUN": "S", "SUR": "U",
+ "SVA": "S", "SVV": "S", "SVW": "S", "SVX": "S", "SVY": "S",
+ "SVZ": "X", "SWG": "SWG", "SYS": "C", "T ": "T", "T11": "F",
+ "T23": "T", "T2S": "T", "T2T": "N", "T31": "U", "T32": "T",
+ "T36": "T", "T37": "T", "T38": "T", "T39": "T", "T3P": "T",
+ "T41": "T", "T48": "T", "T49": "T", "T4S": "T", "T5O": "U",
+ "T5S": "T", "T66": "X", "T6A": "A", "TA3": "T", "TA4": "X",
+ "TAF": "T", "TAL": "N", "TAV": "D", "TBG": "V", "TBM": "T",
+ "TC1": "C", "TCP": "T", "TCQ": "Y", "TCR": "W", "TCY": "A",
+ "TDD": "L", "TDY": "T", "TFE": "T", "TFO": "A", "TFQ": "F",
+ "TFT": "T", "TGP": "G", "TH6": "T", "THC": "T", "THO": "X",
+ "THR": "T", "THX": "N", "THZ": "R", "TIH": "A", "TLB": "N",
+ "TLC": "T", "TLN": "U", "TMB": "T", "TMD": "T", "TNB": "C",
+ "TNR": "S", "TOX": "W", "TP1": "T", "TPC": "C", "TPG": "G",
+ "TPH": "X", "TPL": "W", "TPO": "T", "TPQ": "Y", "TQI": "W",
+ "TQQ": "W", "TRF": "W", "TRG": "K", "TRN": "W", "TRO": "W",
+ "TRP": "W", "TRQ": "W", "TRW": "W", "TRX": "W", "TS ": "N",
+ "TST": "X", "TT ": "N", "TTD": "T", "TTI": "U", "TTM": "T",
+ "TTQ": "W", "TTS": "Y", "TY1": "Y", "TY2": "Y", "TY3": "Y",
+ "TY5": "Y", "TYB": "Y", "TYI": "Y", "TYJ": "Y", "TYN": "Y",
+ "TYO": "Y", "TYQ": "Y", "TYR": "Y", "TYS": "Y", "TYT": "Y",
+ "TYU": "N", "TYW": "Y", "TYX": "X", "TYY": "Y", "TZB": "X",
+ "TZO": "X", "U ": "U", "U25": "U", "U2L": "U", "U2N": "U",
+ "U2P": "U", "U31": "U", "U33": "U", "U34": "U", "U36": "U",
+ "U37": "U", "U8U": "U", "UAR": "U", "UCL": "U", "UD5": "U",
+ "UDP": "N", "UFP": "N", "UFR": "U", "UFT": "U", "UMA": "A",
+ "UMP": "U", "UMS": "U", "UN1": "X", "UN2": "X", "UNK": "X",
+ "UR3": "U", "URD": "U", "US1": "U", "US2": "U", "US3": "T",
+ "US5": "U", "USM": "U", "VAD": "V", "VAF": "V", "VAL": "V",
+ "VB1": "K", "VDL": "X", "VLL": "X", "VLM": "X", "VMS": "X",
+ "VOL": "X", "WCR": "GYG", "X ": "G", "X2W": "E", "X4A": "N",
+ "X9Q": "AFG", "XAD": "A", "XAE": "N", "XAL": "A", "XAR": "N",
+ "XCL": "C", "XCN": "C", "XCP": "X", "XCR": "C", "XCS": "N",
+ "XCT": "C", "XCY": "C", "XGA": "N", "XGL": "G", "XGR": "G",
+ "XGU": "G", "XPR": "P", "XSN": "N", "XTH": "T", "XTL": "T",
+ "XTR": "T", "XTS": "G", "XTY": "N", "XUA": "A", "XUG": "G",
+ "XX1": "K", "XXY": "THG", "XYG": "DYG", "Y ": "A", "YCM": "C",
+ "YG ": "G", "YOF": "Y", "YRR": "N", "YYG": "G", "Z ": "C",
+ "Z01": "A", "ZAD": "A", "ZAL": "A", "ZBC": "C", "ZBU": "U",
+ "ZCL": "F", "ZCY": "C", "ZDU": "U", "ZFB": "X", "ZGU": "G",
+ "ZHP": "N", "ZTH": "T", "ZU0": "T", "ZZJ": "A"}
diff --git a/code/lib/Bio/Data/__init__.py b/code/lib/Bio/Data/__init__.py
new file mode 100644
index 0000000..568286c
--- /dev/null
+++ b/code/lib/Bio/Data/__init__.py
@@ -0,0 +1,8 @@
+# Copyright 2000 Andrew Dalke. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Collections of various bits of useful biological data."""
diff --git a/code/lib/Bio/Data/__pycache__/CodonTable.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/CodonTable.cpython-37.pyc
new file mode 100644
index 0000000..7566625
Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/CodonTable.cpython-37.pyc differ
diff --git a/code/lib/Bio/Data/__pycache__/IUPACData.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/IUPACData.cpython-37.pyc
new file mode 100644
index 0000000..f47d34a
Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/IUPACData.cpython-37.pyc differ
diff --git a/code/lib/Bio/Data/__pycache__/SCOPData.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/SCOPData.cpython-37.pyc
new file mode 100644
index 0000000..ce87c43
Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/SCOPData.cpython-37.pyc differ
diff --git a/code/lib/Bio/Data/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Data/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..1ff1640
Binary files /dev/null and b/code/lib/Bio/Data/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Emboss/Applications.py b/code/lib/Bio/Emboss/Applications.py
new file mode 100644
index 0000000..c3eab72
--- /dev/null
+++ b/code/lib/Bio/Emboss/Applications.py
@@ -0,0 +1,1251 @@
+# Copyright 2001-2009 Brad Chapman.
+# Revisions copyright 2009-2016 by Peter Cock.
+# Revisions copyright 2009 by David Winter.
+# Revisions copyright 2009-2010 by Leighton Pritchard.
+# All rights reserved.
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code to interact with and run various EMBOSS programs (OBSOLETE).
+
+These classes follow the AbstractCommandline interfaces for running
+programs.
+
+We have decided to remove this module in future, and instead recommend
+building your command and invoking it via the subprocess module directly.
+"""
+
+
+from Bio.Application import _Option, _Switch, AbstractCommandline
+
+
+class _EmbossMinimalCommandLine(AbstractCommandline):
+ """Base Commandline object for EMBOSS wrappers (PRIVATE).
+
+    This is provided for subclassing; it deals with shared options
+ common to all the EMBOSS tools:
+
+ Attributes:
+ - auto Turn off prompts
+ - stdout Write standard output
+ - filter Read standard input, write standard output
+ - options Prompt for standard and additional values
+ - debug Write debug output to program.dbg
+ - verbose Report some/full command line options
+ - help Report command line options. More
+ information on associated and general
+ qualifiers can be found with -help -verbose
+ - warning Report warnings
+ - error Report errors
+ - fatal Report fatal errors
+ - die Report dying program messages
+
+ """
+
+ def __init__(self, cmd=None, **kwargs):
+ assert cmd is not None
+ extra_parameters = [
+ _Switch(
+ ["-auto", "auto"],
+ "Turn off prompts.\n\n"
+ "Automatic mode disables prompting, so we recommend you set this "
+ "argument all the time when calling an EMBOSS tool from Biopython.",
+ ),
+ _Switch(["-stdout", "stdout"], "Write standard output."),
+ _Switch(
+ ["-filter", "filter"], "Read standard input, write standard output."
+ ),
+ _Switch(
+ ["-options", "options"],
+ "Prompt for standard and additional values.\n\n"
+ "If you are calling an EMBOSS tool from within Biopython, "
+ "we DO NOT recommend using this option.",
+ ),
+ _Switch(["-debug", "debug"], "Write debug output to program.dbg."),
+ _Switch(["-verbose", "verbose"], "Report some/full command line options"),
+ _Switch(
+ ["-help", "help"],
+ "Report command line options.\n\n"
+ "More information on associated and general qualifiers "
+ "can be found with -help -verbose",
+ ),
+ _Switch(["-warning", "warning"], "Report warnings."),
+ _Switch(["-error", "error"], "Report errors."),
+ _Switch(["-die", "die"], "Report dying program messages."),
+ ]
+ try:
+ # Insert extra parameters - at the start just in case there
+ # are any arguments which must come last:
+ self.parameters = extra_parameters + self.parameters
+ except AttributeError:
+ # Should we raise an error? The subclass should have set this up!
+ self.parameters = extra_parameters
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+class _EmbossCommandLine(_EmbossMinimalCommandLine):
+ """Base Commandline object for EMBOSS wrappers (PRIVATE).
+
+    This is provided for subclassing; it deals with shared options
+ common to all the EMBOSS tools plus:
+
+ - outfile Output filename
+
+ """
+
+ def __init__(self, cmd=None, **kwargs):
+ assert cmd is not None
+ extra_parameters = [
+ _Option(["-outfile", "outfile"], "Output filename", filename=True)
+ ]
+ try:
+ # Insert extra parameters - at the start just in case there
+ # are any arguments which must come last:
+ self.parameters = extra_parameters + self.parameters
+ except AttributeError:
+ # Should we raise an error? The subclass should have set this up!
+ self.parameters = extra_parameters
+ _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ # Check the outfile, filter, or stdout option has been set.
+ # We can't simply do this via the required flag for the outfile
+ # output - this seems the simplest solution.
+ if not (self.outfile or self.filter or self.stdout):
+ raise ValueError(
+ "You must either set outfile (output filename), "
+ "or enable filter or stdout (output to stdout)."
+ )
+ return _EmbossMinimalCommandLine._validate(self)
+
+
+class Primer3Commandline(_EmbossCommandLine):
+ """Commandline object for the Primer3 interface from EMBOSS.
+
+ The precise set of supported arguments depends on your version of EMBOSS.
+ This version accepts arguments current at EMBOSS 6.1.0:
+
+ >>> cline = Primer3Commandline(sequence="mysequence.fas", auto=True, hybridprobe=True)
+ >>> cline.explainflag = True
+ >>> cline.osizeopt=20
+ >>> cline.psizeopt=200
+ >>> cline.outfile = "myresults.out"
+ >>> cline.bogusparameter = 1967 # Invalid parameter
+ Traceback (most recent call last):
+ ...
+ ValueError: Option name bogusparameter was not found.
+ >>> print(cline)
+ eprimer3 -auto -outfile=myresults.out -sequence=mysequence.fas -hybridprobe=True -psizeopt=200 -osizeopt=20 -explainflag=True
+
+ """
+
+ def __init__(self, cmd="eprimer3", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "Sequence to choose primers from.",
+ is_required=True,
+ ),
+ _Option(["-task", "task"], "Tell eprimer3 what task to perform."),
+ _Option(
+ ["-hybridprobe", "hybridprobe"],
+ "Find an internal oligo to use as a hyb probe.",
+ ),
+ _Option(
+ ["-numreturn", "numreturn"], "Maximum number of primer pairs to return."
+ ),
+ _Option(
+ ["-includedregion", "includedregion"],
+ "Subregion of the sequence in which to pick primers.",
+ ),
+ _Option(["-target", "target"], "Sequence to target for flanking primers."),
+ _Option(
+ ["-excludedregion", "excludedregion"],
+ "Regions to exclude from primer picking.",
+ ),
+ _Option(
+ ["-forwardinput", "forwardinput"],
+ "Sequence of a forward primer to check.",
+ ),
+ _Option(
+ ["-reverseinput", "reverseinput"],
+ "Sequence of a reverse primer to check.",
+ ),
+ _Option(
+ ["-gcclamp", "gcclamp"],
+ "The required number of Gs and Cs at the 3' of each primer.",
+ ),
+ _Option(["-osize", "osize"], "Optimum length of a primer oligo."),
+ _Option(["-minsize", "minsize"], "Minimum length of a primer oligo."),
+ _Option(["-maxsize", "maxsize"], "Maximum length of a primer oligo."),
+ _Option(
+ ["-otm", "otm"],
+ "Melting temperature for primer oligo (OBSOLETE).\n\n"
+ "Option replaced in EMBOSS 6.6.0 by -opttm",
+ ),
+ _Option(
+ ["-opttm", "opttm"],
+ "Optimum melting temperature for a primer oligo.\n\n"
+ "Option added in EMBOSS 6.6.0, replacing -otm",
+ ),
+ _Option(
+ ["-mintm", "mintm"], "Minimum melting temperature for a primer oligo."
+ ),
+ _Option(
+ ["-maxtm", "maxtm"], "Maximum melting temperature for a primer oligo."
+ ),
+ _Option(
+ ["-maxdifftm", "maxdifftm"],
+ "Maximum difference in melting temperatures between "
+ "forward and reverse primers.",
+ ),
+ _Option(["-ogcpercent", "ogcpercent"], "Optimum GC% for a primer."),
+ _Option(["-mingc", "mingc"], "Minimum GC% for a primer."),
+ _Option(["-maxgc", "maxgc"], "Maximum GC% for a primer."),
+ _Option(
+ ["-saltconc", "saltconc"], "Millimolar salt concentration in the PCR."
+ ),
+ _Option(
+ ["-dnaconc", "dnaconc"],
+ "Nanomolar concentration of annealing oligos in the PCR.",
+ ),
+ _Option(
+ ["-maxpolyx", "maxpolyx"],
+ "Maximum allowable mononucleotide repeat length in a primer.",
+ ),
+ # Primer length:
+ _Option(["-psizeopt", "psizeopt"], "Optimum size for the PCR product."),
+ _Option(
+ ["-prange", "prange"], "Acceptable range of length for the PCR product."
+ ),
+ # Primer temperature:
+ _Option(
+ ["-ptmopt", "ptmopt"],
+ "Optimum melting temperature for the PCR product.",
+ ),
+ _Option(
+ ["-ptmmin", "ptmmin"],
+ "Minimum allowed melting temperature for the amplicon.",
+ ),
+ _Option(
+ ["-ptmmax", "ptmmax"],
+ "Maximum allowed melting temperature for the amplicon.",
+ ),
+ # Note to self, should be -oexcludedregion not -oexcluderegion
+ _Option(
+ ["-oexcludedregion", "oexcludedregion"],
+ "Do not pick internal oligos in this region.",
+ ),
+ _Option(["-oligoinput", "oligoinput"], "Sequence of the internal oligo."),
+ # Oligo length:
+ _Option(["-osizeopt", "osizeopt"], "Optimum length of internal oligo."),
+ _Option(["-ominsize", "ominsize"], "Minimum length of internal oligo."),
+ _Option(["-omaxsize", "omaxsize"], "Maximum length of internal oligo."),
+ # Oligo GC temperature:
+ _Option(
+ ["-otmopt", "otmopt"], "Optimum melting temperature of internal oligo."
+ ),
+ _Option(
+ ["-otmmin", "otmmin"], "Minimum melting temperature of internal oligo."
+ ),
+ _Option(
+ ["-otmmax", "otmmax"], "Maximum melting temperature of internal oligo."
+ ),
+ # Oligo GC percent:
+ _Option(["-ogcopt", "ogcopt"], "Optimum GC% for internal oligo."),
+ _Option(["-ogcmin", "ogcmin"], "Minimum GC% for internal oligo."),
+ _Option(["-ogcmax", "ogcmax"], "Maximum GC% for internal oligo."),
+ # Oligo salt concentration:
+ _Option(
+ ["-osaltconc", "osaltconc"],
+ "Millimolar concentration of salt in the hybridisation.",
+ ),
+ _Option(
+ ["-odnaconc", "odnaconc"],
+ "Nanomolar concentration of internal oligo in the hybridisation.",
+ ),
+ # Oligo self complementarity
+ _Option(
+ ["-oanyself", "oanyself"],
+ "Maximum allowable alignment score for self-complementarity.",
+ ),
+ _Option(
+ ["-oendself", "oendself"],
+ "Max 3'-anchored self-complementarity global alignment score.",
+ ),
+ _Option(
+ ["-opolyxmax", "opolyxmax"],
+ "Maximum length of mononucleotide repeat in internal oligo.",
+ ),
+ _Option(
+ ["-mispriminglibraryfile", "mispriminglibraryfile"],
+ "File containing library of sequences to avoid amplifying",
+ ),
+ _Option(
+ ["-maxmispriming", "maxmispriming"],
+ "Maximum allowed similarity of primers to sequences in "
+ "library specified by -mispriminglibrary",
+ ),
+ _Option(
+ ["-omishybmax", "omishybmax"],
+ "Maximum alignment score for hybridisation of internal oligo to "
+ "library specified by -mishyblibraryfile.",
+ ),
+ _Option(
+ ["-mishyblibraryfile", "mishyblibraryfile"],
+ "Library file of seqs to avoid internal oligo hybridisation.",
+ ),
+ _Option(
+ ["-explainflag", "explainflag"],
+ "Produce output tags with eprimer3 statistics",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class PrimerSearchCommandline(_EmbossCommandLine):
+ """Commandline object for the primersearch program from EMBOSS."""
+
+ def __init__(self, cmd="primersearch", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-seqall", "-sequences", "sequences", "seqall"],
+ "Sequence to look for the primer pairs in.",
+ is_required=True,
+ ),
+ # When this wrapper was written primersearch used -sequences
+ # as the argument name. Since at least EMBOSS 5.0 (and
+ # perhaps earlier) this has been -seqall instead.
+ _Option(
+ ["-infile", "-primers", "primers", "infile"],
+ "File containing the primer pairs to search for.",
+ filename=True,
+ is_required=True,
+ ),
+ # When this wrapper was written primersearch used -primers
+ # as the argument name. Since at least EMBOSS 5.0 (and
+ # perhaps earlier) this has been -infile instead.
+ _Option(
+ ["-mismatchpercent", "mismatchpercent"],
+ "Allowed percentage mismatch (any integer value, default 0).",
+ is_required=True,
+ ),
+ _Option(
+ ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)"
+ ),
+ _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FDNADistCommandline(_EmbossCommandLine):
+ """Commandline object for the fdnadist program from EMBOSS.
+
+ fdnadist is an EMBOSS wrapper for the PHYLIP program dnadist for
+    calculating distance matrices from DNA sequence files.
+ """
+
+ def __init__(self, cmd="fdnadist", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "seq file to use (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-method", "method"], "sub. model [f,k,j,l,s]", is_required=True),
+ _Option(["-gamma", "gamma"], "gamma [g, i,n]"),
+ _Option(
+ ["-ncategories", "ncategories"], "number of rate catergories (1-9)"
+ ),
+ _Option(["-rate", "rate"], "rate for each category"),
+ _Option(
+ ["-categories", "categories"], "File of substitution rate categories"
+ ),
+ _Option(["-weights", "weights"], "weights file"),
+ _Option(
+ ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)"
+ ),
+ _Option(["-invarfrac", "invarfrac"], "proportoin of invariant sites"),
+ _Option(["-ttratio", "ttratio"], "ts/tv ratio"),
+ _Option(["-freqsfrom", "freqsfrom"], "use emprical base freqs"),
+ _Option(["-basefreq", "basefreq"], "specify basefreqs"),
+ _Option(["-lower", "lower"], "lower triangle matrix (y/N)"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FTreeDistCommandline(_EmbossCommandLine):
+ """Commandline object for the ftreedist program from EMBOSS.
+
+ ftreedist is an EMBOSS wrapper for the PHYLIP program treedist used for
+    calculating distance measures between phylogenetic trees.
+ """
+
+ def __init__(self, cmd="ftreedist", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-intreefile", "intreefile"],
+ "tree file to score (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-dtype", "dtype"], "distance type ([S]ymetric, [b]ranch score)"),
+ _Option(
+ ["-pairing", "pairing"],
+ "tree pairing method ([A]djacent pairs, all [p]ossible pairs)",
+ ),
+ _Option(["-style", "style"], "output style - [V]erbose, [f]ill, [s]parse"),
+ _Option(["-noroot", "noroot"], "treat trees as rooted [N/y]"),
+ _Option(
+ ["-outgrno", "outgrno"],
+ "which taxon to root the trees with (starts from 0)",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FNeighborCommandline(_EmbossCommandLine):
+ """Commandline object for the fneighbor program from EMBOSS.
+
+ fneighbor is an EMBOSS wrapper for the PHYLIP program neighbor used for
+    calculating neighbor-joining or UPGMA trees from distance matrices.
+ """
+
+ def __init__(self, cmd="fneighbor", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-datafile", "datafile"],
+ "dist file to use (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-matrixtype", "matrixtype"],
+ "is martrix [S]quare pr [u]pper or [l]ower",
+ ),
+ _Option(["-treetype", "treetype"], "nj or UPGMA tree (n/u)"),
+ _Option(["-outgrno", "outgrno"], "taxon to use as OG"),
+ _Option(["-jumble", "jumble"], "randommise input order (Y/n)"),
+ _Option(["-seed", "seed"], "provide a random seed"),
+ _Option(["-trout", "trout"], "write tree (Y/n)"),
+ _Option(["-outtreefile", "outtreefile"], "filename for output tree"),
+ _Option(["-progress", "progress"], "print progress (Y/n)"),
+ _Option(["-treeprint", "treeprint"], "print tree (Y/n)"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FSeqBootCommandline(_EmbossCommandLine):
+ """Commandline object for the fseqboot program from EMBOSS.
+
+ fseqboot is an EMBOSS wrapper for the PHYLIP program seqboot used to
+ pseudo-sample alignment files.
+ """
+
+ def __init__(self, cmd="fseqboot", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "seq file to sample (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-categories", "catergories"], "file of input categories"),
+ _Option(["-weights", "weights"], " weights file"),
+ _Option(["-test", "test"], "specify operation, default is bootstrap"),
+ _Option(["-regular", "regular"], "absolute number to resample"),
+ _Option(["-fracsample", "fracsample"], "fraction to resample"),
+ _Option(
+ ["-rewriteformat", "rewriteformat"],
+ "output format ([P]hyilp, [n]exus, [x]ml",
+ ),
+ _Option(["-seqtype", "seqtype"], "output format ([D]na, [p]rotein, [r]na"),
+ _Option(["-blocksize", "blocksize"], "print progress (Y/n)"),
+ _Option(["-reps", "reps"], "how many replicates, defaults to 100)"),
+ _Option(
+ ["-justweights", "jusweights"],
+ "what to write out [D]atasets of just [w]eights",
+ ),
+ _Option(["-seed", "seed"], "specify random seed"),
+ _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FDNAParsCommandline(_EmbossCommandLine):
+ """Commandline object for the fdnapars program from EMBOSS.
+
+ fdnapars is an EMBOSS version of the PHYLIP program dnapars, for
+    estimating trees from DNA sequences using parsimony. Calling this command
+ without providing a value for the option "-intreefile" will invoke
+ "interactive mode" (and as a result fail if called with subprocess) if
+ "-auto" is not set to true.
+ """
+
+ def __init__(self, cmd="fdnapars", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "seq file to use (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-intreefile", "intreefile"], "Phylip tree file"),
+ _Option(["-weights", "weights"], "weights file"),
+ _Option(["-maxtrees", "maxtrees"], "max trees to save during run"),
+ _Option(["-thorough", "thorough"], "more thorough search (Y/n)"),
+ _Option(["-rearrange", "rearrange"], "Rearrange on just 1 best tree (Y/n)"),
+ _Option(
+ ["-transversion", "transversion"], "Use tranversion parsimony (y/N)"
+ ),
+ _Option(
+ ["-njumble", "njumble"],
+ "number of times to randomise input order (default is 0)",
+ ),
+ _Option(["-seed", "seed"], "provide random seed"),
+ _Option(["-outgrno", "outgrno"], "Specify outgroup"),
+ _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"),
+ _Option(["-threshold", "threshold"], "Threshold value"),
+ _Option(["-trout", "trout"], "Write trees to file (Y/n)"),
+ _Option(["-outtreefile", "outtreefile"], "filename for output tree"),
+ _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FProtParsCommandline(_EmbossCommandLine):
+ """Commandline object for the fdnapars program from EMBOSS.
+
+ fprotpars is an EMBOSS version of the PHYLIP program protpars, for
+    estimating trees from protein sequences using parsimony. Calling this
+ command without providing a value for the option "-intreefile" will invoke
+ "interactive mode" (and as a result fail if called with subprocess) if
+ "-auto" is not set to true.
+ """
+
+ def __init__(self, cmd="fprotpars", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "seq file to use (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-intreefile", "intreefile"], "Phylip tree file to score"),
+ _Option(
+ ["-outtreefile", "outtreefile"],
+ "phylip tree output file",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-weights", "weights"], "weights file"),
+ _Option(["-whichcode", "whichcode"], "which genetic code, [U,M,V,F,Y]]"),
+ _Option(
+ ["-njumble", "njumble"],
+ "number of times to randomise input order (default is 0)",
+ ),
+ _Option(["-seed", "seed"], "provide random seed"),
+ _Option(["-outgrno", "outgrno"], "Specify outgroup"),
+ _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"),
+ _Option(["-threshold", "threshold"], "Threshold value"),
+ _Option(["-trout", "trout"], "Write trees to file (Y/n)"),
+ _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FProtDistCommandline(_EmbossCommandLine):
+ """Commandline object for the fprotdist program from EMBOSS.
+
+    fprotdist is an EMBOSS wrapper for the PHYLIP program protdist used to
+    calculate distance matrices from protein sequences.
+ """
+
+ def __init__(self, cmd="fprotdist", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "seq file to use (phylip)",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-ncategories", "ncategories"], "number of rate catergories (1-9)"
+ ),
+ _Option(["-rate", "rate"], "rate for each category"),
+ _Option(["-catergories", "catergories"], "file of rates"),
+ _Option(["-weights", "weights"], "weights file"),
+ _Option(["-method", "method"], "sub. model [j,h,d,k,s,c]"),
+ _Option(["-gamma", "gamma"], "gamma [g, i,c]"),
+ _Option(
+ ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)"
+ ),
+ _Option(
+ ["-invarcoefficient", "invarcoefficient"],
+ "float for variation of substitution rate among sites",
+ ),
+ _Option(["-aacateg", "aacateg"], "Choose the category to use [G,C,H]"),
+ _Option(["-whichcode", "whichcode"], "genetic code [c,m,v,f,y]"),
+ _Option(["-ease", "ease"], "Pob change catergory (float between -0 and 1)"),
+ _Option(["-ttratio", "ttratio"], "Transition/transversion ratio (0-1)"),
+ _Option(
+ ["-basefreq", "basefreq"], "DNA base frequencies (space separated list)"
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FConsenseCommandline(_EmbossCommandLine):
+ """Commandline object for the fconsense program from EMBOSS.
+
+ fconsense is an EMBOSS wrapper for the PHYLIP program consense used to
+ calculate consensus trees.
+ """
+
+ def __init__(self, cmd="fconsense", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-intreefile", "intreefile"],
+ "file with phylip trees to make consensus from",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-method", "method"], "consensus method [s, mr, MRE, ml]"),
+ _Option(
+ ["-mlfrac", "mlfrac"],
+ "cut-off freq for branch to appear in consensus (0.5-1.0)",
+ ),
+ _Option(["-root", "root"], "treat trees as rooted (YES, no)"),
+ _Option(["-outgrno", "outgrno"], "OTU to use as outgroup (starts from 0)"),
+ _Option(["-trout", "trout"], "treat trees as rooted (YES, no)"),
+ _Option(
+ ["-outtreefile", "outtreefile"], "Phylip tree output file (optional)"
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class WaterCommandline(_EmbossCommandLine):
+ """Commandline object for the water program from EMBOSS."""
+
+ def __init__(self, cmd="water", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-asequence", "asequence"],
+ "First sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-bsequence", "bsequence"],
+ "Second sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True),
+ _Option(
+ ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True
+ ),
+ _Option(["-datafile", "datafile"], "Matrix file", filename=True),
+ _Switch(
+ ["-nobrief", "nobrief"], "Display extended identity and similarity"
+ ),
+ _Switch(["-brief", "brief"], "Display brief identity and similarity"),
+ _Option(
+ ["-similarity", "similarity"], "Display percent identity and similarity"
+ ),
+ _Option(
+ ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)"
+ ),
+ _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"),
+ _Option(
+ ["-aformat", "aformat"],
+ "Display output in a different specified output format",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class NeedleCommandline(_EmbossCommandLine):
+ """Commandline object for the needle program from EMBOSS."""
+
+ def __init__(self, cmd="needle", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-asequence", "asequence"],
+ "First sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-bsequence", "bsequence"],
+ "Second sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True),
+ _Option(
+ ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True
+ ),
+ _Option(["-datafile", "datafile"], "Matrix file", filename=True),
+ _Option(["-endweight", "endweight"], "Apply And gap penalties"),
+ _Option(
+ ["-endopen", "endopen"],
+ "The score taken away when an end gap is created.",
+ ),
+ _Option(
+ ["-endextend", "endextend"],
+ "The score added to the end gap penality for each base or "
+ "residue in the end gap.",
+ ),
+ _Switch(
+ ["-nobrief", "nobrief"], "Display extended identity and similarity"
+ ),
+ _Switch(["-brief", "brief"], "Display brief identity and similarity"),
+ _Option(
+ ["-similarity", "similarity"], "Display percent identity and similarity"
+ ),
+ _Option(
+ ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)"
+ ),
+ _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"),
+ _Option(
+ ["-aformat", "aformat"],
+ "Display output in a different specified output format",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class NeedleallCommandline(_EmbossCommandLine):
+ """Commandline object for the needleall program from EMBOSS."""
+
+ def __init__(self, cmd="needleall", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-asequence", "asequence"],
+ "First sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-bsequence", "bsequence"],
+ "Second sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True),
+ _Option(
+ ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True
+ ),
+ _Option(["-datafile", "datafile"], "Matrix file", filename=True),
+ _Option(
+ ["-minscore", "minscore"],
+ "Exclude alignments with scores below this threshold score.",
+ ),
+ _Option(["-errorfile", "errorfile"], "Error file to be written to."),
+ _Option(["-endweight", "endweight"], "Apply And gap penalties"),
+ _Option(
+ ["-endopen", "endopen"],
+ "The score taken away when an end gap is created.",
+ ),
+ _Option(
+ ["-endextend", "endextend"],
+ "The score added to the end gap penality for each base or "
+ "residue in the end gap.",
+ ),
+ _Switch(
+ ["-nobrief", "nobrief"], "Display extended identity and similarity"
+ ),
+ _Switch(["-brief", "brief"], "Display brief identity and similarity"),
+ _Option(
+ ["-similarity", "similarity"], "Display percent identity and similarity"
+ ),
+ _Option(
+ ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)"
+ ),
+ _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"),
+ _Option(
+ ["-aformat", "aformat"],
+ "Display output in a different specified output format",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class StretcherCommandline(_EmbossCommandLine):
+ """Commandline object for the stretcher program from EMBOSS."""
+
+ def __init__(self, cmd="stretcher", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-asequence", "asequence"],
+ "First sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-bsequence", "bsequence"],
+ "Second sequence to align",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-gapopen", "gapopen"],
+ "Gap open penalty",
+ is_required=True,
+ checker_function=lambda value: isinstance(value, int),
+ ),
+ _Option(
+ ["-gapextend", "gapextend"],
+ "Gap extension penalty",
+ is_required=True,
+ checker_function=lambda value: isinstance(value, int),
+ ),
+ _Option(["-datafile", "datafile"], "Matrix file", filename=True),
+ _Option(
+ ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)"
+ ),
+ _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"),
+ _Option(
+ ["-aformat", "aformat"],
+ "Display output in a different specified output format",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FuzznucCommandline(_EmbossCommandLine):
+ """Commandline object for the fuzznuc program from EMBOSS."""
+
+ def __init__(self, cmd="fuzznuc", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"], "Sequence database USA", is_required=True
+ ),
+ _Option(
+ ["-pattern", "pattern"],
+ "Search pattern, using standard IUPAC one-letter codes",
+ is_required=True,
+ ),
+ _Option(["-pmismatch", "pmismatch"], "Number of mismatches"),
+ _Option(["-complement", "complement"], "Search complementary strand"),
+ _Option(["-rformat", "rformat"], "Specify the report format to output in."),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class FuzzproCommandline(_EmbossCommandLine):
+ """Commandline object for the fuzzpro program from EMBOSS."""
+
+ def __init__(self, cmd="fuzzpro", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"], "Sequence database USA", is_required=True
+ ),
+ _Option(
+ ["-pattern", "pattern"],
+ "Search pattern, using standard IUPAC one-letter codes",
+ is_required=True,
+ ),
+ _Option(["-pmismatch", "pmismatch"], "Number of mismatches"),
+ _Option(["-rformat", "rformat"], "Specify the report format to output in."),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class Est2GenomeCommandline(_EmbossCommandLine):
+ """Commandline object for the est2genome program from EMBOSS."""
+
+ def __init__(self, cmd="est2genome", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(["-est", "est"], "EST sequence(s)", is_required=True),
+ _Option(["-genome", "genome"], "Genomic sequence", is_required=True),
+ _Option(["-match", "match"], "Score for matching two bases"),
+ _Option(["-mismatch", "mismatch"], "Cost for mismatching two bases"),
+ _Option(
+ ["-gappenalty", "gappenalty"],
+ "Cost for deleting a single base in either sequence, "
+ "excluding introns",
+ ),
+ _Option(
+ ["-intronpenalty", "intronpenalty"],
+ "Cost for an intron, independent of length.",
+ ),
+ _Option(
+ ["-splicepenalty", "splicepenalty"],
+ "Cost for an intron, independent of length "
+ "and starting/ending on donor-acceptor sites",
+ ),
+ _Option(
+ ["-minscore", "minscore"],
+ "Exclude alignments with scores below this threshold score.",
+ ),
+ _Option(
+ ["-reverse", "reverse"], "Reverse the orientation of the EST sequence"
+ ),
+ _Option(["-splice", "splice"], "Use donor and acceptor splice sites."),
+ _Option(
+ ["-mode", "mode"],
+ "This determines the comparion mode. 'both', 'forward', or 'reverse'",
+ ),
+ _Option(
+ ["-best", "best"],
+ "You can print out all comparisons instead of just the best",
+ ),
+ _Option(["-space", "space"], "for linear-space recursion."),
+ _Option(["-shuffle", "shuffle"], "Shuffle"),
+ _Option(["-seed", "seed"], "Random number seed"),
+ _Option(["-align", "align"], "Show the alignment."),
+ _Option(["-width", "width"], "Alignment width"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class ETandemCommandline(_EmbossCommandLine):
+ """Commandline object for the etandem program from EMBOSS."""
+
+ def __init__(self, cmd="etandem", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"], "Sequence", filename=True, is_required=True
+ ),
+ _Option(
+ ["-minrepeat", "minrepeat"], "Minimum repeat size", is_required=True
+ ),
+ _Option(
+ ["-maxrepeat", "maxrepeat"], "Maximum repeat size", is_required=True
+ ),
+ _Option(["-threshold", "threshold"], "Threshold score"),
+ _Option(["-mismatch", "mismatch"], "Allow N as a mismatch"),
+ _Option(["-uniform", "uniform"], "Allow uniform consensus"),
+ _Option(["-rformat", "rformat"], "Output report format"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class EInvertedCommandline(_EmbossCommandLine):
+ """Commandline object for the einverted program from EMBOSS."""
+
+ def __init__(self, cmd="einverted", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"], "Sequence", filename=True, is_required=True
+ ),
+ _Option(["-gap", "gap"], "Gap penalty", filename=True, is_required=True),
+ _Option(
+ ["-threshold", "threshold"], "Minimum score threshold", is_required=True
+ ),
+ _Option(["-match", "match"], "Match score", is_required=True),
+ _Option(["-mismatch", "mismatch"], "Mismatch score", is_required=True),
+ _Option(
+ ["-maxrepeat", "maxrepeat"],
+ "Maximum separation between the start and end of repeat",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class PalindromeCommandline(_EmbossCommandLine):
+ """Commandline object for the palindrome program from EMBOSS."""
+
+ def __init__(self, cmd="palindrome", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"], "Sequence", filename=True, is_required=True
+ ),
+ _Option(
+ ["-minpallen", "minpallen"],
+ "Minimum palindrome length",
+ is_required=True,
+ ),
+ _Option(
+ ["-maxpallen", "maxpallen"],
+ "Maximum palindrome length",
+ is_required=True,
+ ),
+ _Option(
+ ["-gaplimit", "gaplimit"],
+ "Maximum gap between repeats",
+ is_required=True,
+ ),
+ _Option(
+ ["-nummismatches", "nummismatches"],
+ "Number of mismatches allowed",
+ is_required=True,
+ ),
+ _Option(
+ ["-overlap", "overlap"], "Report overlapping matches", is_required=True
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class TranalignCommandline(_EmbossCommandLine):
+ """Commandline object for the tranalign program from EMBOSS."""
+
+ def __init__(self, cmd="tranalign", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-asequence", "asequence"],
+ "Nucleotide sequences to be aligned.",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-bsequence", "bsequence"],
+ "Protein sequence alignment",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-outseq", "outseq"],
+ "Output sequence file.",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-table", "table"], "Code to use"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class DiffseqCommandline(_EmbossCommandLine):
+ """Commandline object for the diffseq program from EMBOSS."""
+
+ def __init__(self, cmd="diffseq", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-asequence", "asequence"],
+ "First sequence to compare",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-bsequence", "bsequence"],
+ "Second sequence to compare",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-wordsize", "wordsize"],
+ "Word size to use for comparisons (10 default)",
+ is_required=True,
+ ),
+ _Option(
+ ["-aoutfeat", "aoutfeat"],
+ "File for output of first sequence's features",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-boutfeat", "boutfeat"],
+ "File for output of second sequence's features",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(["-rformat", "rformat"], "Output report file format"),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+class IepCommandline(_EmbossCommandLine):
+ """Commandline for EMBOSS iep: calculated isoelectric point and charge.
+
+ Examples
+ --------
+ >>> from Bio.Emboss.Applications import IepCommandline
+ >>> iep_cline = IepCommandline(sequence="proteins.faa",
+ ... outfile="proteins.txt")
+ >>> print(iep_cline)
+ iep -outfile=proteins.txt -sequence=proteins.faa
+
+ You would typically run the command line with iep_cline() or via the
+ Python subprocess module, as described in the Biopython tutorial.
+
+ """
+
+ def __init__(self, cmd="iep", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "Protein sequence(s) filename",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-amino", "amino"],
+ """Number of N-termini
+
+ Integer 0 (default) or more.
+ """,
+ ),
+ _Option(
+ ["-carboxyl", "carboxyl"],
+ """Number of C-termini
+
+ Integer 0 (default) or more.
+ """,
+ ),
+ _Option(
+ ["-lysinemodified", "lysinemodified"],
+ """Number of modified lysines
+
+ Integer 0 (default) or more.
+ """,
+ ),
+ _Option(
+ ["-disulphides", "disulphides"],
+ """Number of disulphide bridges
+
+ Integer 0 (default) or more.
+ """,
+ ),
+ # Should we implement the -termini switch as well?
+ _Option(
+ ["-notermini", "notermini"],
+ "Exclude (True) or include (False) charge at N and C terminus.",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+# seqret uses -outseq, not -outfile, so use the base class:
+class SeqretCommandline(_EmbossMinimalCommandLine):
+ """Commandline object for the seqret program from EMBOSS.
+
+ This tool allows you to interconvert between different sequence file
+ formats (e.g. GenBank to FASTA). Combining Biopython's Bio.SeqIO module
+ with seqret using a suitable intermediate file format can allow you to
+ read/write to an even wider range of file formats.
+
+ This wrapper currently only supports the core functionality, things like
+ feature tables (in EMBOSS 6.1.0 onwards) are not yet included.
+ """
+
+ def __init__(self, cmd="seqret", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"], "Input sequence(s) filename", filename=True
+ ),
+ _Option(["-outseq", "outseq"], "Output sequence file.", filename=True),
+ _Option(
+ ["-sformat", "sformat"],
+ "Input sequence(s) format (e.g. fasta, genbank)",
+ ),
+ _Option(
+ ["-osformat", "osformat"],
+ "Output sequence(s) format (e.g. fasta, genbank)",
+ ),
+ ]
+ _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs)
+
+ def _validate(self):
+ # Check the outfile, filter, or stdout option has been set.
+ # We can't simply do this via the required flag for the outfile
+ # output - this seems the simplest solution.
+ if not (self.outseq or self.filter or self.stdout):
+ raise ValueError(
+ "You must either set outfile (output filename), "
+ "or enable filter or stdout (output to stdout)."
+ )
+        if not (self.sequence or self.filter or self.stdin):
+ raise ValueError(
+ "You must either set sequence (input filename), "
+ "or enable filter or stdin (input from stdin)."
+ )
+ return _EmbossMinimalCommandLine._validate(self)
+
+
+class SeqmatchallCommandline(_EmbossCommandLine):
+ """Commandline object for the seqmatchall program from EMBOSS.
+
+ e.g.
+ >>> cline = SeqmatchallCommandline(sequence="opuntia.fasta", outfile="opuntia.txt")
+ >>> cline.auto = True
+ >>> cline.wordsize = 18
+ >>> cline.aformat = "pair"
+ >>> print(cline)
+ seqmatchall -auto -outfile=opuntia.txt -sequence=opuntia.fasta -wordsize=18 -aformat=pair
+
+ """
+
+ def __init__(self, cmd="seqmatchall", **kwargs):
+ """Initialize the class."""
+ self.parameters = [
+ _Option(
+ ["-sequence", "sequence"],
+ "Readable set of sequences",
+ filename=True,
+ is_required=True,
+ ),
+ _Option(
+ ["-wordsize", "wordsize"], "Word size (Integer 2 or more, default 4)"
+ ),
+ _Option(
+ ["-aformat", "aformat"],
+ "Display output in a different specified output format",
+ ),
+ ]
+ _EmbossCommandLine.__init__(self, cmd, **kwargs)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Emboss/Primer3.py b/code/lib/Bio/Emboss/Primer3.py
new file mode 100644
index 0000000..0e210a5
--- /dev/null
+++ b/code/lib/Bio/Emboss/Primer3.py
@@ -0,0 +1,191 @@
+# Copyright 2008 Michiel de Hoon.
+# Revisions copyright 2009 Leighton Pritchard.
+# Revisions copyright 2010 Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code to parse output from the EMBOSS eprimer3 program.
+
+As elsewhere in Biopython there are two input functions, read and parse,
+for single record output and multi-record output. For primer3, a single
+record object is created for each target sequence and may contain
+multiple primers.
+
+i.e. If you ran eprimer3 with a single target sequence, use the read
+function. If you ran eprimer3 with multiple targets, use the parse
+function to iterate over the results.
+"""
+
+
+# --- primer3
+
+
+class Record:
+ """Represent information from a primer3 run finding primers.
+
+ Members:
+
+ - primers - list of Primer objects describing primer pairs for
+ this target sequence.
+ - comments - the comment line(s) for the record
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.comments = ""
+ self.primers = []
+
+
+class Primers:
+ """A primer set designed by Primer3.
+
+ Members:
+
+ - size - length of product, note you can use len(primer) as an
+ alternative to primer.size
+
+ - forward_seq
+ - forward_start
+ - forward_length
+ - forward_tm
+ - forward_gc
+
+ - reverse_seq
+ - reverse_start
+ - reverse_length
+ - reverse_tm
+ - reverse_gc
+
+ - internal_seq
+ - internal_start
+ - internal_length
+ - internal_tm
+ - internal_gc
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.size = 0
+ self.forward_seq = ""
+ self.forward_start = 0
+ self.forward_length = 0
+ self.forward_tm = 0.0
+ self.forward_gc = 0.0
+ self.reverse_seq = ""
+ self.reverse_start = 0
+ self.reverse_length = 0
+ self.reverse_tm = 0.0
+ self.reverse_gc = 0.0
+ self.internal_seq = ""
+ self.internal_start = 0
+ self.internal_length = 0
+ self.internal_tm = 0.0
+ self.internal_gc = 0.0
+
+ def __len__(self):
+ """Length of the primer product (i.e. product size)."""
+ return self.size
+
+
+def parse(handle):
+ """Iterate over primer3 output as Bio.Emboss.Primer3.Record objects."""
+ # Skip blank lines at head of file
+ while True:
+ line = handle.readline()
+ if line.strip():
+ break # Starting a record
+
+ # Read each record
+ record = None
+ primer = None
+ while True:
+ if line.startswith("# EPRIMER3") or line.startswith("# PRIMER3"):
+ # Record data
+ if record is not None:
+ yield record
+ record = Record()
+ record.comments += line
+ primer = None
+ elif line.startswith("#"):
+ if (
+ line.strip()
+ != "# Start Len Tm GC% Sequence"
+ ):
+ record.comments += line
+ elif not line.strip():
+ pass
+ elif line[5:19] == "PRODUCT SIZE: ":
+ primer = Primers()
+ primer.size = int(line[19:])
+ record.primers.append(primer)
+ elif line[5:19] == "FORWARD PRIMER":
+ words = line.split()
+ if not primer or primer.size == 0:
+ primer = Primers()
+ record.primers.append(primer)
+ primer.forward_start = int(words[2])
+ primer.forward_length = int(words[3])
+ primer.forward_tm = float(words[4])
+ primer.forward_gc = float(words[5])
+ primer.forward_seq = words[6]
+ elif line[5:19] == "REVERSE PRIMER":
+ words = line.split()
+ if not primer or primer.size == 0:
+ primer = Primers()
+ record.primers.append(primer)
+ primer.reverse_start = int(words[2])
+ primer.reverse_length = int(words[3])
+ primer.reverse_tm = float(words[4])
+ primer.reverse_gc = float(words[5])
+ primer.reverse_seq = words[6]
+ elif line[5:19] == "INTERNAL OLIGO":
+ words = line.split()
+ if not primer or primer.size == 0:
+ primer = Primers()
+ record.primers.append(primer)
+ primer.internal_start = int(words[2])
+ primer.internal_length = int(words[3])
+ primer.internal_tm = float(words[4])
+ primer.internal_gc = float(words[5])
+ try:
+ primer.internal_seq = words[6]
+ except IndexError: # eprimer3 reports oligo without sequence
+ primer.internal_seq = ""
+ try:
+ line = next(handle)
+ except StopIteration:
+ break
+ if record:
+ yield record
+
+
+def read(handle):
+ """Parse primer3 output into a Bio.Emboss.Primer3.Record object.
+
+ This is for when there is one and only one target sequence. If
+ designing primers for multiple sequences, use the parse function.
+ """
+ iterator = parse(handle)
+ try:
+ record = next(iterator)
+ except StopIteration:
+ raise ValueError("No records found in handle") from None
+ try:
+ next(iterator)
+ raise ValueError("More than one record found in handle")
+ except StopIteration:
+ pass
+ return record
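+
+
+# A minimal usage sketch for this module. The filename "primers.txt" is
+# an illustrative assumption, not something eprimer3 or this parser fixes:
+#
+#     from Bio.Emboss import Primer3
+#     with open("primers.txt") as handle:
+#         record = Primer3.read(handle)  # one target sequence
+#     for primer in record.primers:
+#         # len(primer) is equivalent to primer.size (the product length)
+#         print(len(primer), primer.forward_seq, primer.reverse_seq)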
diff --git a/code/lib/Bio/Emboss/PrimerSearch.py b/code/lib/Bio/Emboss/PrimerSearch.py
new file mode 100644
index 0000000..3a7fb7a
--- /dev/null
+++ b/code/lib/Bio/Emboss/PrimerSearch.py
@@ -0,0 +1,80 @@
+# Copyright 2008 Michiel de Hoon. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to interact with the primersearch program from EMBOSS."""
+
+
+class InputRecord:
+ """Represent the input file into the primersearch program.
+
+ This makes it easy to add primer information and write it out to the
+ simple primer file format.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.primer_info = []
+
+ def __str__(self):
+ """Summarize the primersearch input record as a string."""
+ output = ""
+ for name, primer1, primer2 in self.primer_info:
+ output += "%s %s %s\n" % (name, primer1, primer2)
+ return output
+
+ def add_primer_set(self, primer_name, first_primer_seq, second_primer_seq):
+ """Add primer information to the record."""
+ self.primer_info.append((primer_name, first_primer_seq, second_primer_seq))
+
+
+class OutputRecord:
+ """Represent the information from a primersearch job.
+
+ amplifiers is a dictionary where the keys are the primer names and
+ the values are a list of PrimerSearchAmplifier objects.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.amplifiers = {}
+
+
+class Amplifier:
+ """Represent a single amplification from a primer."""
+
+ def __init__(self):
+ """Initialize the class."""
+ self.hit_info = ""
+ self.length = 0
+
+
+def read(handle):
+ """Get output from primersearch into a PrimerSearchOutputRecord."""
+ record = OutputRecord()
+
+ for line in handle:
+ if not line.strip():
+ continue
+ elif line.startswith("Primer name"):
+ name = line.split()[-1]
+ record.amplifiers[name] = []
+ elif line.startswith("Amplimer"):
+ amplifier = Amplifier()
+ record.amplifiers[name].append(amplifier)
+ elif line.startswith("\tSequence: "):
+ amplifier.hit_info = line.replace("\tSequence: ", "")
+ elif line.startswith("\tAmplimer length: "):
+ length = line.split()[-2]
+ amplifier.length = int(length)
+ else:
+ amplifier.hit_info += line
+
+ for name in record.amplifiers:
+ for amplifier in record.amplifiers[name]:
+ amplifier.hit_info = amplifier.hit_info.rstrip()
+
+ return record
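+
+
+# A minimal round-trip sketch (the file names are illustrative assumptions):
+#
+#     from Bio.Emboss import PrimerSearch
+#     inp = PrimerSearch.InputRecord()
+#     inp.add_primer_set("pair1", "ACGTACGTACGT", "AATTCCGGAATT")
+#     with open("primers.in", "w") as out:
+#         out.write(str(inp))  # writes the simple primersearch input format
+#     # ...run EMBOSS primersearch yourself, then parse its output:
+#     with open("primersearch.out") as handle:
+#         result = PrimerSearch.read(handle)
+#     for name, amplifiers in result.amplifiers.items():
+#         for amp in amplifiers:
+#             print(name, amp.length, amp.hit_info)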
diff --git a/code/lib/Bio/Emboss/__init__.py b/code/lib/Bio/Emboss/__init__.py
new file mode 100644
index 0000000..630780d
--- /dev/null
+++ b/code/lib/Bio/Emboss/__init__.py
@@ -0,0 +1,8 @@
+# Copyright 2001 Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to interact with the ever-so-useful EMBOSS programs."""
diff --git a/code/lib/Bio/Emboss/__pycache__/Applications.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/Applications.cpython-37.pyc
new file mode 100644
index 0000000..56718f6
Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/Applications.cpython-37.pyc differ
diff --git a/code/lib/Bio/Emboss/__pycache__/Primer3.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/Primer3.cpython-37.pyc
new file mode 100644
index 0000000..6302571
Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/Primer3.cpython-37.pyc differ
diff --git a/code/lib/Bio/Emboss/__pycache__/PrimerSearch.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/PrimerSearch.cpython-37.pyc
new file mode 100644
index 0000000..808a8cb
Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/PrimerSearch.cpython-37.pyc differ
diff --git a/code/lib/Bio/Emboss/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Emboss/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..3515132
Binary files /dev/null and b/code/lib/Bio/Emboss/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_0.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.dtd
new file mode 100644
index 0000000..43a18d0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%Docsum_3_0_module;
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_0.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.mod.dtd
new file mode 100644
index 0000000..64a4549
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_0.mod.dtd
@@ -0,0 +1,1054 @@
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_1.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.dtd
new file mode 100644
index 0000000..a82d6a8
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%Docsum_3_1_module;
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_1.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.mod.dtd
new file mode 100644
index 0000000..ce57767
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_1.mod.dtd
@@ -0,0 +1,1055 @@
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_2.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_2.dtd
new file mode 100644
index 0000000..0c04c5e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_2.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%Docsum_3_2_module;
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_2.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_2.mod.dtd
new file mode 100644
index 0000000..fd17c81
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_2.mod.dtd
@@ -0,0 +1,1418 @@
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_3.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_3.dtd
new file mode 100644
index 0000000..36da5a0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_3.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%Docsum_3_3_module;
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_3.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_3.mod.dtd
new file mode 100644
index 0000000..c1c1169
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_3.mod.dtd
@@ -0,0 +1,1585 @@
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_4.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_4.dtd
new file mode 100644
index 0000000..89ba0bc
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_4.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%Docsum_3_4_module;
diff --git a/code/lib/Bio/Entrez/DTDs/Docsum_3_4.mod.dtd b/code/lib/Bio/Entrez/DTDs/Docsum_3_4.mod.dtd
new file mode 100644
index 0000000..51899a8
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/Docsum_3_4.mod.dtd
@@ -0,0 +1,1594 @@
diff --git a/code/lib/Bio/Entrez/DTDs/EMBL_General.dtd b/code/lib/Bio/Entrez/DTDs/EMBL_General.dtd
new file mode 100644
index 0000000..267f9e1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/EMBL_General.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%NCBI_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/EMBL_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/EMBL_General.mod.dtd
new file mode 100644
index 0000000..1f6f6fa
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/EMBL_General.mod.dtd
@@ -0,0 +1,133 @@
diff --git a/code/lib/Bio/Entrez/DTDs/GenBank_General.dtd b/code/lib/Bio/Entrez/DTDs/GenBank_General.dtd
new file mode 100644
index 0000000..c8707a9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/GenBank_General.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%GenBank_General_module;
+%NCBI_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/GenBank_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/GenBank_General.mod.dtd
new file mode 100644
index 0000000..0cba454
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/GenBank_General.mod.dtd
@@ -0,0 +1,65 @@
diff --git a/code/lib/Bio/Entrez/DTDs/HomoloGene.dtd b/code/lib/Bio/Entrez/DTDs/HomoloGene.dtd
new file mode 100644
index 0000000..82262e4
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/HomoloGene.dtd
@@ -0,0 +1,89 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%HomoloGene_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/HomoloGene.mod.dtd b/code/lib/Bio/Entrez/DTDs/HomoloGene.mod.dtd
new file mode 100644
index 0000000..c88a5cd
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/HomoloGene.mod.dtd
@@ -0,0 +1,293 @@
diff --git a/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.dtd b/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.dtd
new file mode 100644
index 0000000..fdf3b96
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%INSD_INSDSeq_module;
diff --git a/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.mod.dtd b/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.mod.dtd
new file mode 100644
index 0000000..308423d
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/INSD_INSDSeq.mod.dtd
@@ -0,0 +1,491 @@
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB.dtd b/code/lib/Bio/Entrez/DTDs/MMDB.dtd
new file mode 100644
index 0000000..738efa9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB.dtd
@@ -0,0 +1,98 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB.mod.dtd
new file mode 100644
index 0000000..8424533
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB.mod.dtd
@@ -0,0 +1,259 @@
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.dtd
new file mode 100644
index 0000000..fd56bf7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.dtd
@@ -0,0 +1,98 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.mod.dtd
new file mode 100644
index 0000000..5763354
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB_Chemical_graph.mod.dtd
@@ -0,0 +1,561 @@
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Features.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Features.dtd
new file mode 100644
index 0000000..b8eb295
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB_Features.dtd
@@ -0,0 +1,98 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Features.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Features.mod.dtd
new file mode 100644
index 0000000..160fb02
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB_Features.mod.dtd
@@ -0,0 +1,932 @@
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.dtd
new file mode 100644
index 0000000..a5a075e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.dtd
@@ -0,0 +1,98 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.mod.dtd b/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.mod.dtd
new file mode 100644
index 0000000..aa9f16c
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/MMDB_Structural_model.mod.dtd
@@ -0,0 +1,676 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Access.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Access.dtd
new file mode 100644
index 0000000..5dcedf0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Access.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_Access_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Access.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Access.mod.dtd
new file mode 100644
index 0000000..e83ad2a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Access.mod.dtd
@@ -0,0 +1,49 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.dtd
new file mode 100644
index 0000000..17edc09
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.mod.dtd
new file mode 100644
index 0000000..e0b7e1d
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Biblio.mod.dtd
@@ -0,0 +1,690 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.dtd
new file mode 100644
index 0000000..2bfea5c
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.dtd
@@ -0,0 +1,23 @@
+%NCBI_Entity_module;
+%NCBI_BioSource_module;
+%NCBI_General_module;
+%NCBI_Organism_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.mod.dtd
new file mode 100644
index 0000000..6b5c5da
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BioSource.mod.dtd
@@ -0,0 +1,200 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.dtd
new file mode 100644
index 0000000..26ba5d9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_BioTree_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.mod.dtd
new file mode 100644
index 0000000..7a2ad1a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BioTree.mod.dtd
@@ -0,0 +1,109 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.dtd
new file mode 100644
index 0000000..24437c5
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.dtd
@@ -0,0 +1,95 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Blast4_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_ScoreMat_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.mod.dtd
new file mode 100644
index 0000000..f001a47
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Blast4.mod.dtd
@@ -0,0 +1,1498 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.dtd
new file mode 100644
index 0000000..451e782
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.dtd
@@ -0,0 +1,89 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_BlastDL_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.mod.dtd
new file mode 100644
index 0000000..78a99bb
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BlastDL.mod.dtd
@@ -0,0 +1,138 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.dtd
new file mode 100644
index 0000000..307176a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_BlastOutput_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.mod.dtd
new file mode 100644
index 0000000..7b0f47f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_BlastOutput.mod.dtd
@@ -0,0 +1,273 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.dtd
new file mode 100644
index 0000000..b2d06ad
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.dtd
@@ -0,0 +1,110 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Cdd_module;
+%NCBI_Cn3d_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_ScoreMat_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.mod.dtd
new file mode 100644
index 0000000..7cf68d6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Cdd.mod.dtd
@@ -0,0 +1,1088 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.dtd
new file mode 100644
index 0000000..9558045
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.dtd
@@ -0,0 +1,101 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Cn3d_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.mod.dtd
new file mode 100644
index 0000000..16a815f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Cn3d.mod.dtd
@@ -0,0 +1,534 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entity.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entity.mod.dtd
new file mode 100644
index 0000000..3919c3e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entity.mod.dtd
@@ -0,0 +1,13 @@
new file mode 100644
index 0000000..a08a907
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.dtd
@@ -0,0 +1,17 @@
+
+%NCBI_Entity_module;
+%NCBI_Entrez2_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.mod.dtd
new file mode 100644
index 0000000..5eb72c9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entrez2.mod.dtd
@@ -0,0 +1,747 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.dtd
new file mode 100644
index 0000000..36e206f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.dtd
@@ -0,0 +1,89 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Entrezgene_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.mod.dtd
new file mode 100644
index 0000000..c75d32f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Entrezgene.mod.dtd
@@ -0,0 +1,394 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.dtd
new file mode 100644
index 0000000..a3ce559
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_FeatDef_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.mod.dtd
new file mode 100644
index 0000000..65fbf90
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_FeatDef.mod.dtd
@@ -0,0 +1,97 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.dtd
new file mode 100644
index 0000000..d317e96
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_GBSeq_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd
new file mode 100644
index 0000000..95be4f3
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd
@@ -0,0 +1,407 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Gene.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.dtd
new file mode 100644
index 0000000..cd6d122
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Gene.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.mod.dtd
new file mode 100644
index 0000000..be703af
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Gene.mod.dtd
@@ -0,0 +1,97 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_General.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_General.dtd
new file mode 100644
index 0000000..a8bb6c1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_General.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_General.mod.dtd
new file mode 100644
index 0000000..c573ca5
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_General.mod.dtd
@@ -0,0 +1,333 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.dtd
new file mode 100644
index 0000000..8e57ced
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.dtd
@@ -0,0 +1,92 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_ID1Access_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.mod.dtd
new file mode 100644
index 0000000..b489907
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID1Access.mod.dtd
@@ -0,0 +1,218 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.dtd
new file mode 100644
index 0000000..4adbfa5
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.dtd
@@ -0,0 +1,95 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_ID2Access_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_Seq_split_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.mod.dtd
new file mode 100644
index 0000000..5d5ecf7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ID2Access.mod.dtd
@@ -0,0 +1,759 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.dtd
new file mode 100644
index 0000000..1082302
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.dtd
@@ -0,0 +1,35 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
+%NCBI_MedArchive_module;
+%NCBI_Medlars_module;
+%NCBI_Medline_module;
+%NCBI_Pub_module;
+%NCBI_PubMed_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.mod.dtd
new file mode 100644
index 0000000..b4c2701
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_MedArchive.mod.dtd
@@ -0,0 +1,271 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.dtd
new file mode 100644
index 0000000..6d1410c
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.dtd
@@ -0,0 +1,23 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
+%NCBI_Medlars_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.mod.dtd
new file mode 100644
index 0000000..cb8d48a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medlars.mod.dtd
@@ -0,0 +1,58 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medline.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.dtd
new file mode 100644
index 0000000..9495345
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.dtd
@@ -0,0 +1,23 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Medline.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.mod.dtd
new file mode 100644
index 0000000..b05a78e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Medline.mod.dtd
@@ -0,0 +1,245 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mim.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.dtd
new file mode 100644
index 0000000..7dc862b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_Mim_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mim.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.mod.dtd
new file mode 100644
index 0000000..664a851
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mim.mod.dtd
@@ -0,0 +1,354 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mime.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.dtd
new file mode 100644
index 0000000..a7be929
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.dtd
@@ -0,0 +1,113 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Cdd_module;
+%NCBI_Cn3d_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Mime_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_ScoreMat_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Mime.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.mod.dtd
new file mode 100644
index 0000000..a7f8ef4
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Mime.mod.dtd
@@ -0,0 +1,251 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.dtd
new file mode 100644
index 0000000..ae5196a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_ObjPrt_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.mod.dtd
new file mode 100644
index 0000000..23f916a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ObjPrt.mod.dtd
@@ -0,0 +1,133 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Organism.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.dtd
new file mode 100644
index 0000000..b06e17e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%NCBI_General_module;
+%NCBI_Organism_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Organism.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.mod.dtd
new file mode 100644
index 0000000..9c36c43
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Organism.mod.dtd
@@ -0,0 +1,226 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.dtd
new file mode 100644
index 0000000..7b35bb2
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.dtd
@@ -0,0 +1,38 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_PCAssay_module;
+%NCBI_PCSubstance_module;
+%NCBI_Pub_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.mod.dtd
new file mode 100644
index 0000000..020ab07
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCAssay.mod.dtd
@@ -0,0 +1,1006 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.dtd
new file mode 100644
index 0000000..0efe6fd
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.dtd
@@ -0,0 +1,29 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_PCSubstance_module;
+%NCBI_Pub_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.mod.dtd
new file mode 100644
index 0000000..479ed86
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_PCSubstance.mod.dtd
@@ -0,0 +1,1628 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Project.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Project.dtd
new file mode 100644
index 0000000..4d42013
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Project.dtd
@@ -0,0 +1,95 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Project_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_PubMed_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Project.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Project.mod.dtd
new file mode 100644
index 0000000..e2215fe
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Project.mod.dtd
@@ -0,0 +1,158 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Protein.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.dtd
new file mode 100644
index 0000000..e8279ea
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%NCBI_General_module;
+%NCBI_Protein_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Protein.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.mod.dtd
new file mode 100644
index 0000000..e833a5d
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Protein.mod.dtd
@@ -0,0 +1,75 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Pub.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.dtd
new file mode 100644
index 0000000..6a52954
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.dtd
@@ -0,0 +1,26 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Pub_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Pub.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.mod.dtd
new file mode 100644
index 0000000..ca92c18
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Pub.mod.dtd
@@ -0,0 +1,120 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.dtd
new file mode 100644
index 0000000..b272da6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.dtd
@@ -0,0 +1,26 @@
+%NCBI_Entity_module;
+%NCBI_Biblio_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_PubMed_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.mod.dtd
new file mode 100644
index 0000000..4313a76
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_PubMed.mod.dtd
@@ -0,0 +1,64 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_RNA.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.dtd
new file mode 100644
index 0000000..c64fad9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_RNA.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.mod.dtd
new file mode 100644
index 0000000..b1c7991
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_RNA.mod.dtd
@@ -0,0 +1,144 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Remap.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Remap.dtd
new file mode 100644
index 0000000..4696a50
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Remap.dtd
@@ -0,0 +1,89 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Remap_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Remap.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Remap.mod.dtd
new file mode 100644
index 0000000..9f14d35
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Remap.mod.dtd
@@ -0,0 +1,158 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Rsite.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Rsite.dtd
new file mode 100644
index 0000000..7bcf4de
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Rsite.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%NCBI_General_module;
+%NCBI_Rsite_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Rsite.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Rsite.mod.dtd
new file mode 100644
index 0000000..1758ab1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Rsite.mod.dtd
@@ -0,0 +1,38 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ScoreMat.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ScoreMat.dtd
new file mode 100644
index 0000000..64a676b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ScoreMat.dtd
@@ -0,0 +1,92 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_ScoreMat_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_ScoreMat.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_ScoreMat.mod.dtd
new file mode 100644
index 0000000..d79b8e0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_ScoreMat.mod.dtd
@@ -0,0 +1,579 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_SeqCode.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_SeqCode.dtd
new file mode 100644
index 0000000..3e754ac
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_SeqCode.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_SeqCode_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_SeqCode.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_SeqCode.mod.dtd
new file mode 100644
index 0000000..1e60966
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_SeqCode.mod.dtd
@@ -0,0 +1,150 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_SeqTable.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_SeqTable.dtd
new file mode 100644
index 0000000..f47d9ba
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_SeqTable.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_SeqTable.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_SeqTable.mod.dtd
new file mode 100644
index 0000000..54232b3
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_SeqTable.mod.dtd
@@ -0,0 +1,390 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seq_split.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seq_split.dtd
new file mode 100644
index 0000000..d705b7c
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seq_split.dtd
@@ -0,0 +1,92 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_Seq_split_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seq_split.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seq_split.mod.dtd
new file mode 100644
index 0000000..4ab94ad
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seq_split.mod.dtd
@@ -0,0 +1,559 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqalign.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqalign.dtd
new file mode 100644
index 0000000..08e37eb
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqalign.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqalign.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqalign.mod.dtd
new file mode 100644
index 0000000..721a351
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqalign.mod.dtd
@@ -0,0 +1,570 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqfeat.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqfeat.dtd
new file mode 100644
index 0000000..48af83f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqfeat.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqfeat.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqfeat.mod.dtd
new file mode 100644
index 0000000..fb7dc75
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqfeat.mod.dtd
@@ -0,0 +1,772 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqloc.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqloc.dtd
new file mode 100644
index 0000000..a0464a4
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqloc.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqloc.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqloc.mod.dtd
new file mode 100644
index 0000000..7daa894
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqloc.mod.dtd
@@ -0,0 +1,325 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqres.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqres.dtd
new file mode 100644
index 0000000..353ff75
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqres.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqres.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqres.mod.dtd
new file mode 100644
index 0000000..27dba62
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqres.mod.dtd
@@ -0,0 +1,134 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqset.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqset.dtd
new file mode 100644
index 0000000..69de4db
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqset.dtd
@@ -0,0 +1,89 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Seqset.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Seqset.mod.dtd
new file mode 100644
index 0000000..549d8d6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Seqset.mod.dtd
@@ -0,0 +1,138 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Sequence.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Sequence.dtd
new file mode 100644
index 0000000..78673f6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Sequence.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Sequence.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Sequence.mod.dtd
new file mode 100644
index 0000000..d09ae02
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Sequence.mod.dtd
@@ -0,0 +1,1112 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Submit.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Submit.dtd
new file mode 100644
index 0000000..b81b188
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Submit.dtd
@@ -0,0 +1,92 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_Submit_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Submit.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Submit.mod.dtd
new file mode 100644
index 0000000..64885f0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Submit.mod.dtd
@@ -0,0 +1,156 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Systems.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Systems.dtd
new file mode 100644
index 0000000..d1f8c2a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Systems.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_Systems_module;
+%NCBI_TxInit_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_TSeq.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_TSeq.dtd
new file mode 100644
index 0000000..9463557
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_TSeq.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NCBI_TSeq_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_TSeq.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_TSeq.mod.dtd
new file mode 100644
index 0000000..e7fc3f4
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_TSeq.mod.dtd
@@ -0,0 +1,66 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_TxInit.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_TxInit.dtd
new file mode 100644
index 0000000..fe9a6c1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_TxInit.dtd
@@ -0,0 +1,29 @@
+%NCBI_Entity_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_TxInit_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_TxInit.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_TxInit.mod.dtd
new file mode 100644
index 0000000..d5d97c1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_TxInit.mod.dtd
@@ -0,0 +1,184 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Variation.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Variation.dtd
new file mode 100644
index 0000000..7a992b1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Variation.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_Variation.mod.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_Variation.mod.dtd
new file mode 100644
index 0000000..ee7f020
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_Variation.mod.dtd
@@ -0,0 +1,944 @@
diff --git a/code/lib/Bio/Entrez/DTDs/NCBI_all.dtd b/code/lib/Bio/Entrez/DTDs/NCBI_all.dtd
new file mode 100644
index 0000000..3ed101d
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NCBI_all.dtd
@@ -0,0 +1,202 @@
+%NCBI_Entity_module;
+%Docsum_3_0_module;
+%Docsum_3_1_module;
+%Docsum_3_2_module;
+%Docsum_3_3_module;
+%Docsum_3_4_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%HomoloGene_module;
+%INSD_INSDSeq_module;
+%MMDB_module;
+%MMDB_Chemical_graph_module;
+%MMDB_Features_module;
+%MMDB_Structural_model_module;
+%NCBI_Access_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_BioTree_module;
+%NCBI_Blast4_module;
+%NCBI_BlastDL_module;
+%NCBI_BlastOutput_module;
+%NCBI_Cdd_module;
+%NCBI_Cn3d_module;
+%NCBI_Entrez2_module;
+%NCBI_Entrezgene_module;
+%NCBI_FeatDef_module;
+%NCBI_GBSeq_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_ID1Access_module;
+%NCBI_ID2Access_module;
+%NCBI_MedArchive_module;
+%NCBI_Medlars_module;
+%NCBI_Medline_module;
+%NCBI_Mim_module;
+%NCBI_Mime_module;
+%NCBI_ObjPrt_module;
+%NCBI_Organism_module;
+%NCBI_PCAssay_module;
+%NCBI_PCSubstance_module;
+%NCBI_Project_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_PubMed_module;
+%NCBI_RNA_module;
+%NCBI_Remap_module;
+%NCBI_Rsite_module;
+%NCBI_ScoreMat_module;
+%NCBI_SeqCode_module;
+%NCBI_SeqTable_module;
+%NCBI_Seq_split_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Seqset_module;
+%NCBI_Sequence_module;
+%NCBI_Submit_module;
+%NCBI_TSeq_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%NSE_module;
+%OMSSA_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NSE.dtd b/code/lib/Bio/Entrez/DTDs/NSE.dtd
new file mode 100644
index 0000000..74f0075
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NSE.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%NSE_module;
diff --git a/code/lib/Bio/Entrez/DTDs/NSE.mod.dtd b/code/lib/Bio/Entrez/DTDs/NSE.mod.dtd
new file mode 100644
index 0000000..4327f48
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/NSE.mod.dtd
@@ -0,0 +1,895 @@
diff --git a/code/lib/Bio/Entrez/DTDs/OMSSA.dtd b/code/lib/Bio/Entrez/DTDs/OMSSA.dtd
new file mode 100644
index 0000000..4ab5adc
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/OMSSA.dtd
@@ -0,0 +1,89 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%OMSSA_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/OMSSA.mod.dtd b/code/lib/Bio/Entrez/DTDs/OMSSA.mod.dtd
new file mode 100644
index 0000000..26f050e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/OMSSA.mod.dtd
@@ -0,0 +1,1361 @@
diff --git a/code/lib/Bio/Entrez/DTDs/PDB_General.dtd b/code/lib/Bio/Entrez/DTDs/PDB_General.dtd
new file mode 100644
index 0000000..cb2b663
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/PDB_General.dtd
@@ -0,0 +1,20 @@
+%NCBI_Entity_module;
+%NCBI_General_module;
+%PDB_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/PDB_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/PDB_General.mod.dtd
new file mode 100644
index 0000000..1e5aae9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/PDB_General.mod.dtd
@@ -0,0 +1,70 @@
diff --git a/code/lib/Bio/Entrez/DTDs/PIR_General.dtd b/code/lib/Bio/Entrez/DTDs/PIR_General.dtd
new file mode 100644
index 0000000..15f7879
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/PIR_General.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/PIR_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/PIR_General.mod.dtd
new file mode 100644
index 0000000..bedc2ed
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/PIR_General.mod.dtd
@@ -0,0 +1,78 @@
diff --git a/code/lib/Bio/Entrez/DTDs/PRF_General.dtd b/code/lib/Bio/Entrez/DTDs/PRF_General.dtd
new file mode 100644
index 0000000..b6cf457
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/PRF_General.dtd
@@ -0,0 +1,17 @@
+%NCBI_Entity_module;
+%PRF_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/PRF_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/PRF_General.mod.dtd
new file mode 100644
index 0000000..e09cc60
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/PRF_General.mod.dtd
@@ -0,0 +1,56 @@
diff --git a/code/lib/Bio/Entrez/DTDs/SP_General.dtd b/code/lib/Bio/Entrez/DTDs/SP_General.dtd
new file mode 100644
index 0000000..8a981ec
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/SP_General.dtd
@@ -0,0 +1,86 @@
+%NCBI_Entity_module;
+%EMBL_General_module;
+%GenBank_General_module;
+%NCBI_Biblio_module;
+%NCBI_BioSource_module;
+%NCBI_Gene_module;
+%NCBI_General_module;
+%NCBI_Medline_module;
+%NCBI_Organism_module;
+%NCBI_Protein_module;
+%NCBI_Pub_module;
+%NCBI_RNA_module;
+%NCBI_Rsite_module;
+%NCBI_SeqTable_module;
+%NCBI_Seqalign_module;
+%NCBI_Seqfeat_module;
+%NCBI_Seqloc_module;
+%NCBI_Seqres_module;
+%NCBI_Sequence_module;
+%NCBI_TxInit_module;
+%NCBI_Variation_module;
+%PDB_General_module;
+%PIR_General_module;
+%PRF_General_module;
+%SP_General_module;
diff --git a/code/lib/Bio/Entrez/DTDs/SP_General.mod.dtd b/code/lib/Bio/Entrez/DTDs/SP_General.mod.dtd
new file mode 100644
index 0000000..cab7937
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/SP_General.mod.dtd
@@ -0,0 +1,94 @@
diff --git a/code/lib/Bio/Entrez/DTDs/XHTMLtablesetup.ent b/code/lib/Bio/Entrez/DTDs/XHTMLtablesetup.ent
new file mode 100644
index 0000000..1ebe3ce
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/XHTMLtablesetup.ent
@@ -0,0 +1,309 @@
+%htmltable.dtd;
diff --git a/code/lib/Bio/Entrez/DTDs/archivearticle.dtd b/code/lib/Bio/Entrez/DTDs/archivearticle.dtd
new file mode 100644
index 0000000..0b81d6a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/archivearticle.dtd
@@ -0,0 +1,952 @@
+%archivecustom-modules.ent;
+%modules.ent;
+%archivecustom-classes.ent;
+%default-classes.ent;
+%archivecustom-mixes.ent;
+%default-mixes.ent;
+%archivecustom-models.ent;
+%common.ent;
+%articlemeta.ent;
+%backmatter.ent;
+%display.ent;
+%format.ent;
+%journalmeta.ent;
+%link.ent;
+%list.ent;
+%math.ent;
+%para.ent;
+%phrase.ent;
+%references.ent;
+%section.ent;
+%mathmlsetup.ent;
+%XHTMLtablesetup.ent;
+%xmlspecchars.ent;
+%chars.ent;
+%notat.ent;
diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-classes.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-classes.ent
new file mode 100644
index 0000000..3d665ad
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/archivecustom-classes.ent
@@ -0,0 +1,157 @@
diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-mixes.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-mixes.ent
new file mode 100644
index 0000000..a5e1b05
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/archivecustom-mixes.ent
@@ -0,0 +1,306 @@
diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-models.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-models.ent
new file mode 100644
index 0000000..eb494c0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/archivecustom-models.ent
@@ -0,0 +1,756 @@
diff --git a/code/lib/Bio/Entrez/DTDs/archivecustom-modules.ent b/code/lib/Bio/Entrez/DTDs/archivecustom-modules.ent
new file mode 100644
index 0000000..36fbc43
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/archivecustom-modules.ent
@@ -0,0 +1,116 @@
diff --git a/code/lib/Bio/Entrez/DTDs/articlemeta.ent b/code/lib/Bio/Entrez/DTDs/articlemeta.ent
new file mode 100644
index 0000000..f594afe
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/articlemeta.ent
@@ -0,0 +1,1811 @@
diff --git a/code/lib/Bio/Entrez/DTDs/backmatter.ent b/code/lib/Bio/Entrez/DTDs/backmatter.ent
new file mode 100644
index 0000000..1ece324
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/backmatter.ent
@@ -0,0 +1,277 @@
diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_100301.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_100301.dtd
new file mode 100644
index 0000000..78e2bae
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/bookdoc_100301.dtd
@@ -0,0 +1,78 @@
diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_110101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_110101.dtd
new file mode 100644
index 0000000..78e2bae
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/bookdoc_110101.dtd
@@ -0,0 +1,78 @@
diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_120101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_120101.dtd
new file mode 100644
index 0000000..78e2bae
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/bookdoc_120101.dtd
@@ -0,0 +1,78 @@
diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_130101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_130101.dtd
new file mode 100644
index 0000000..8a4f338
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/bookdoc_130101.dtd
@@ -0,0 +1,82 @@
diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_140101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_140101.dtd
new file mode 100644
index 0000000..8a4f338
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/bookdoc_140101.dtd
@@ -0,0 +1,82 @@
diff --git a/code/lib/Bio/Entrez/DTDs/bookdoc_150101.dtd b/code/lib/Bio/Entrez/DTDs/bookdoc_150101.dtd
new file mode 100644
index 0000000..8a4f338
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/bookdoc_150101.dtd
@@ -0,0 +1,82 @@
diff --git a/code/lib/Bio/Entrez/DTDs/chars.ent b/code/lib/Bio/Entrez/DTDs/chars.ent
new file mode 100644
index 0000000..19b6313
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/chars.ent
@@ -0,0 +1,359 @@
diff --git a/code/lib/Bio/Entrez/DTDs/common.ent b/code/lib/Bio/Entrez/DTDs/common.ent
new file mode 100644
index 0000000..c1907d6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/common.ent
@@ -0,0 +1,2790 @@
diff --git a/code/lib/Bio/Entrez/DTDs/default-classes.ent b/code/lib/Bio/Entrez/DTDs/default-classes.ent
new file mode 100644
index 0000000..81d1155
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/default-classes.ent
@@ -0,0 +1,704 @@
diff --git a/code/lib/Bio/Entrez/DTDs/default-mixes.ent b/code/lib/Bio/Entrez/DTDs/default-mixes.ent
new file mode 100644
index 0000000..2f28bd9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/default-mixes.ent
@@ -0,0 +1,357 @@
diff --git a/code/lib/Bio/Entrez/DTDs/display.ent b/code/lib/Bio/Entrez/DTDs/display.ent
new file mode 100644
index 0000000..ce81f7f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/display.ent
@@ -0,0 +1,1468 @@
diff --git a/code/lib/Bio/Entrez/DTDs/eInfo_020511.dtd b/code/lib/Bio/Entrez/DTDs/eInfo_020511.dtd
new file mode 100644
index 0000000..ac4a59f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/eInfo_020511.dtd
@@ -0,0 +1,60 @@
diff --git a/code/lib/Bio/Entrez/DTDs/eLink_090910.dtd b/code/lib/Bio/Entrez/DTDs/eLink_090910.dtd
new file mode 100644
index 0000000..6aa4c47
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/eLink_090910.dtd
@@ -0,0 +1,79 @@
diff --git a/code/lib/Bio/Entrez/DTDs/eLink_101123.dtd b/code/lib/Bio/Entrez/DTDs/eLink_101123.dtd
new file mode 100644
index 0000000..934b5a9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/eLink_101123.dtd
@@ -0,0 +1,88 @@
diff --git a/code/lib/Bio/Entrez/DTDs/ePost_020511.dtd b/code/lib/Bio/Entrez/DTDs/ePost_020511.dtd
new file mode 100644
index 0000000..3da7498
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/ePost_020511.dtd
@@ -0,0 +1,14 @@
diff --git a/code/lib/Bio/Entrez/DTDs/eSearch_020511.dtd b/code/lib/Bio/Entrez/DTDs/eSearch_020511.dtd
new file mode 100644
index 0000000..15e734b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/eSearch_020511.dtd
@@ -0,0 +1,64 @@
diff --git a/code/lib/Bio/Entrez/DTDs/eSpell.dtd b/code/lib/Bio/Entrez/DTDs/eSpell.dtd
new file mode 100644
index 0000000..18b6265
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/eSpell.dtd
@@ -0,0 +1,20 @@
diff --git a/code/lib/Bio/Entrez/DTDs/eSummary_041029.dtd b/code/lib/Bio/Entrez/DTDs/eSummary_041029.dtd
new file mode 100644
index 0000000..a10572a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/eSummary_041029.dtd
@@ -0,0 +1,20 @@
diff --git a/code/lib/Bio/Entrez/DTDs/egquery.dtd b/code/lib/Bio/Entrez/DTDs/egquery.dtd
new file mode 100644
index 0000000..ff53342
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/egquery.dtd
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/einfo.dtd b/code/lib/Bio/Entrez/DTDs/einfo.dtd
new file mode 100644
index 0000000..f42e108
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/einfo.dtd
@@ -0,0 +1,62 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/elink_020122.dtd b/code/lib/Bio/Entrez/DTDs/elink_020122.dtd
new file mode 100644
index 0000000..6f93374
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/elink_020122.dtd
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/esearch.dtd b/code/lib/Bio/Entrez/DTDs/esearch.dtd
new file mode 100644
index 0000000..bd11e35
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/esearch.dtd
@@ -0,0 +1,103 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/esummary-v1.dtd b/code/lib/Bio/Entrez/DTDs/esummary-v1.dtd
new file mode 100644
index 0000000..a10572a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/esummary-v1.dtd
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/format.ent b/code/lib/Bio/Entrez/DTDs/format.ent
new file mode 100644
index 0000000..b702a9b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/format.ent
@@ -0,0 +1,412 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/htmltable.dtd b/code/lib/Bio/Entrez/DTDs/htmltable.dtd
new file mode 100644
index 0000000..f4432ad
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/htmltable.dtd
@@ -0,0 +1,334 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+]]>
+
+
+
+]]>
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isoamsa.ent b/code/lib/Bio/Entrez/DTDs/isoamsa.ent
new file mode 100644
index 0000000..c413168
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isoamsa.ent
@@ -0,0 +1,167 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isoamsb.ent b/code/lib/Bio/Entrez/DTDs/isoamsb.ent
new file mode 100644
index 0000000..b74414b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isoamsb.ent
@@ -0,0 +1,143 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isoamsc.ent b/code/lib/Bio/Entrez/DTDs/isoamsc.ent
new file mode 100644
index 0000000..46ea221
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isoamsc.ent
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isoamsn.ent b/code/lib/Bio/Entrez/DTDs/isoamsn.ent
new file mode 100644
index 0000000..a1df8b7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isoamsn.ent
@@ -0,0 +1,114 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isoamso.ent b/code/lib/Bio/Entrez/DTDs/isoamso.ent
new file mode 100644
index 0000000..f99cf11
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isoamso.ent
@@ -0,0 +1,73 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isoamsr.ent b/code/lib/Bio/Entrez/DTDs/isoamsr.ent
new file mode 100644
index 0000000..2251ef1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isoamsr.ent
@@ -0,0 +1,204 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isobox.ent b/code/lib/Bio/Entrez/DTDs/isobox.ent
new file mode 100644
index 0000000..05e2b13
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isobox.ent
@@ -0,0 +1,61 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isocyr1.ent b/code/lib/Bio/Entrez/DTDs/isocyr1.ent
new file mode 100644
index 0000000..b4149c7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isocyr1.ent
@@ -0,0 +1,88 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isocyr2.ent b/code/lib/Bio/Entrez/DTDs/isocyr2.ent
new file mode 100644
index 0000000..b038bd9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isocyr2.ent
@@ -0,0 +1,47 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isodia.ent b/code/lib/Bio/Entrez/DTDs/isodia.ent
new file mode 100644
index 0000000..39ccfcd
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isodia.ent
@@ -0,0 +1,35 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isogrk1.ent b/code/lib/Bio/Entrez/DTDs/isogrk1.ent
new file mode 100644
index 0000000..a5f52ef
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isogrk1.ent
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/code/lib/Bio/Entrez/DTDs/isogrk2.ent b/code/lib/Bio/Entrez/DTDs/isogrk2.ent
new file mode 100644
index 0000000..d27cc30
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isogrk2.ent
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/code/lib/Bio/Entrez/DTDs/isogrk3.ent b/code/lib/Bio/Entrez/DTDs/isogrk3.ent
new file mode 100644
index 0000000..0cbde88
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isogrk3.ent
@@ -0,0 +1,64 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isogrk4.ent b/code/lib/Bio/Entrez/DTDs/isogrk4.ent
new file mode 100644
index 0000000..07c4d06
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isogrk4.ent
@@ -0,0 +1,69 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isolat1.ent b/code/lib/Bio/Entrez/DTDs/isolat1.ent
new file mode 100644
index 0000000..43ae764
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isolat1.ent
@@ -0,0 +1,83 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isolat2.ent b/code/lib/Bio/Entrez/DTDs/isolat2.ent
new file mode 100644
index 0000000..c29b828
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isolat2.ent
@@ -0,0 +1,142 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isomfrk.ent b/code/lib/Bio/Entrez/DTDs/isomfrk.ent
new file mode 100644
index 0000000..0e1a943
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isomfrk.ent
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isomopf.ent b/code/lib/Bio/Entrez/DTDs/isomopf.ent
new file mode 100644
index 0000000..4b26425
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isomopf.ent
@@ -0,0 +1,49 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isomscr.ent b/code/lib/Bio/Entrez/DTDs/isomscr.ent
new file mode 100644
index 0000000..a2174f0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isomscr.ent
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isonum.ent b/code/lib/Bio/Entrez/DTDs/isonum.ent
new file mode 100644
index 0000000..79f4380
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isonum.ent
@@ -0,0 +1,97 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isopub.ent b/code/lib/Bio/Entrez/DTDs/isopub.ent
new file mode 100644
index 0000000..9b27b63
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isopub.ent
@@ -0,0 +1,105 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/isotech.ent b/code/lib/Bio/Entrez/DTDs/isotech.ent
new file mode 100644
index 0000000..d94c775
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/isotech.ent
@@ -0,0 +1,182 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/journalmeta.ent b/code/lib/Bio/Entrez/DTDs/journalmeta.ent
new file mode 100644
index 0000000..c615e2f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/journalmeta.ent
@@ -0,0 +1,341 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/link.ent b/code/lib/Bio/Entrez/DTDs/link.ent
new file mode 100644
index 0000000..5481464
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/link.ent
@@ -0,0 +1,510 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/list.ent b/code/lib/Bio/Entrez/DTDs/list.ent
new file mode 100644
index 0000000..ab18cd9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/list.ent
@@ -0,0 +1,465 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/math.ent b/code/lib/Bio/Entrez/DTDs/math.ent
new file mode 100644
index 0000000..1aa543b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/math.ent
@@ -0,0 +1,329 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/mathml-in-pubmed.mod b/code/lib/Bio/Entrez/DTDs/mathml-in-pubmed.mod
new file mode 100644
index 0000000..ce95673
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mathml-in-pubmed.mod
@@ -0,0 +1,151 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%ent-mmlextra;
+
+
+
+%ent-mmlalias;
+
+
+%isobox;
+%isocyr1;
+%isocyr2;
+%isodia;
+%isolat1;
+%isolat2;
+%isonum;
+%isopub;
+%isoamsa;
+%isoamsb;
+%isoamsc;
+%isoamsn;
+%isoamso;
+%isoamsr;
+%isogrk3;
+%isomfrk;
+%isomopf;
+%isomscr;
+%isotech;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%mathml.dtd;
diff --git a/code/lib/Bio/Entrez/DTDs/mathml2-qname-1.mod b/code/lib/Bio/Entrez/DTDs/mathml2-qname-1.mod
new file mode 100644
index 0000000..92a7621
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mathml2-qname-1.mod
@@ -0,0 +1 @@
+
+]]>
+]]>
+]]>
\ No newline at end of file
diff --git a/code/lib/Bio/Entrez/DTDs/mathml2.dtd b/code/lib/Bio/Entrez/DTDs/mathml2.dtd
new file mode 100644
index 0000000..ddd60eb
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mathml2.dtd
@@ -0,0 +1,1960 @@
+
+
+
+
+
+
+
+
+
+%mathml-qname.mod;]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%ent-mmlextra;
+
+
+
+
+%ent-mmlalias;
+
+]]>
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/mathml3-qname1.mod b/code/lib/Bio/Entrez/DTDs/mathml3-qname1.mod
new file mode 100644
index 0000000..254bdb2
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mathml3-qname1.mod
@@ -0,0 +1,294 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+]]>
+
+
+
+
+]]>
+
+
+
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/mathml3.dtd b/code/lib/Bio/Entrez/DTDs/mathml3.dtd
new file mode 100644
index 0000000..3a8886e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mathml3.dtd
@@ -0,0 +1,1682 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%mathml-qname.mod;]]>
+
+
+
+]]>
+
+
+
+
+
+
+%isobox;
+
+%isocyr1;
+
+%isocyr2;
+
+%isodia;
+
+%isolat1;
+
+%isolat2;
+
+%isonum;
+
+%isopub;
+
+%isoamsa;
+
+%isoamsb;
+
+%isoamsc;
+
+%isoamsn;
+
+%isoamso;
+
+%isoamsr;
+
+%isogrk3;
+
+%isomfrk;
+
+%isomopf;
+
+%isomscr;
+
+%isotech;
+
+%mmlextra;
+
+%mmlalias;
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/mathmlsetup.ent b/code/lib/Bio/Entrez/DTDs/mathmlsetup.ent
new file mode 100644
index 0000000..76215a5
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mathmlsetup.ent
@@ -0,0 +1,191 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+%mathml.dtd;
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/mmlalias.ent b/code/lib/Bio/Entrez/DTDs/mmlalias.ent
new file mode 100644
index 0000000..1371af3
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mmlalias.ent
@@ -0,0 +1,564 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/mmlextra.ent b/code/lib/Bio/Entrez/DTDs/mmlextra.ent
new file mode 100644
index 0000000..850c7e7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/mmlextra.ent
@@ -0,0 +1,122 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/modules.ent b/code/lib/Bio/Entrez/DTDs/modules.ent
new file mode 100644
index 0000000..5d8b7a6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/modules.ent
@@ -0,0 +1,417 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlm-articleset-2.0.dtd b/code/lib/Bio/Entrez/DTDs/nlm-articleset-2.0.dtd
new file mode 100644
index 0000000..f82c149
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlm-articleset-2.0.dtd
@@ -0,0 +1,271 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%archive-article;
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmcatalogrecordset_170601.dtd b/code/lib/Bio/Entrez/DTDs/nlmcatalogrecordset_170601.dtd
new file mode 100644
index 0000000..85f6cbe
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmcatalogrecordset_170601.dtd
@@ -0,0 +1,280 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmcommon_011101.dtd b/code/lib/Bio/Entrez/DTDs/nlmcommon_011101.dtd
new file mode 100644
index 0000000..a092651
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmcommon_011101.dtd
@@ -0,0 +1,175 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmcommon_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmcommon_080101.dtd
new file mode 100644
index 0000000..ac0ae02
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmcommon_080101.dtd
@@ -0,0 +1,201 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmcommon_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmcommon_090101.dtd
new file mode 100644
index 0000000..787129b
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmcommon_090101.dtd
@@ -0,0 +1,220 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedline_011101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedline_011101.dtd
new file mode 100644
index 0000000..1c5aa06
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedline_011101.dtd
@@ -0,0 +1,60 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%MedlineCitation;
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedline_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedline_080101.dtd
new file mode 100644
index 0000000..1f935d1
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedline_080101.dtd
@@ -0,0 +1,71 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%MedlineCitation;
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedline_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedline_090101.dtd
new file mode 100644
index 0000000..d903ebd
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedline_090101.dtd
@@ -0,0 +1,74 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%MedlineCitation;
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_011101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_011101.dtd
new file mode 100644
index 0000000..6b8a447
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_011101.dtd
@@ -0,0 +1,178 @@
+
+
+
+
+
+
+
+
+
+
+%NlmCommon;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_080101.dtd
new file mode 100644
index 0000000..670005a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_080101.dtd
@@ -0,0 +1,107 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%NlmSharedCatCit;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_090101.dtd
new file mode 100644
index 0000000..1987031
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitation_090101.dtd
@@ -0,0 +1,112 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%NlmSharedCatCit;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100101.dtd
new file mode 100644
index 0000000..16fc7fa
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100101.dtd
@@ -0,0 +1,194 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100301.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100301.dtd
new file mode 100644
index 0000000..e6b4c48
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_100301.dtd
@@ -0,0 +1,201 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_110101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_110101.dtd
new file mode 100644
index 0000000..c520c6d
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_110101.dtd
@@ -0,0 +1,197 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_120101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_120101.dtd
new file mode 100644
index 0000000..6489a8c
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_120101.dtd
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130101.dtd
new file mode 100644
index 0000000..cda3746
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130101.dtd
@@ -0,0 +1,191 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130501.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130501.dtd
new file mode 100644
index 0000000..9566d38
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_130501.dtd
@@ -0,0 +1,191 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd
new file mode 100644
index 0000000..d8238d9
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd
@@ -0,0 +1,190 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_150101.dtd b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_150101.dtd
new file mode 100644
index 0000000..2d90743
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmmedlinecitationset_150101.dtd
@@ -0,0 +1,189 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmserials_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmserials_080101.dtd
new file mode 100644
index 0000000..32968d4
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmserials_080101.dtd
@@ -0,0 +1,134 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%NlmCommon;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmserials_100101.dtd b/code/lib/Bio/Entrez/DTDs/nlmserials_100101.dtd
new file mode 100644
index 0000000..d4693b0
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmserials_100101.dtd
@@ -0,0 +1,157 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_080101.dtd b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_080101.dtd
new file mode 100644
index 0000000..f7397ea
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_080101.dtd
@@ -0,0 +1,80 @@
+
+
+
+
+
+
+
+
+
+
+%NlmCommon;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_090101.dtd b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_090101.dtd
new file mode 100644
index 0000000..2a41d3e
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/nlmsharedcatcit_090101.dtd
@@ -0,0 +1,80 @@
+
+
+
+
+
+
+
+
+
+
+%NlmCommon;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/notat.ent b/code/lib/Bio/Entrez/DTDs/notat.ent
new file mode 100644
index 0000000..6294521
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/notat.ent
@@ -0,0 +1,172 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/para.ent b/code/lib/Bio/Entrez/DTDs/para.ent
new file mode 100644
index 0000000..9838a43
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/para.ent
@@ -0,0 +1,420 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/phrase.ent b/code/lib/Bio/Entrez/DTDs/phrase.ent
new file mode 100644
index 0000000..b08987a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/phrase.ent
@@ -0,0 +1,278 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pmc-1.dtd b/code/lib/Bio/Entrez/DTDs/pmc-1.dtd
new file mode 100644
index 0000000..db84036
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pmc-1.dtd
@@ -0,0 +1,900 @@
+
+
+
+
+
+
+
+
+
+%PMCEntities; %ISO8879ent; %ISO9573ent;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%supp_data_dtd;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%mathmlsetup.ent;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_020114.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_020114.dtd
new file mode 100644
index 0000000..1538918
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_020114.dtd
@@ -0,0 +1,61 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_080101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_080101.dtd
new file mode 100644
index 0000000..11d6184
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_080101.dtd
@@ -0,0 +1,71 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_090101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_090101.dtd
new file mode 100644
index 0000000..ea1ea8f
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_090101.dtd
@@ -0,0 +1,71 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_100101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_100101.dtd
new file mode 100644
index 0000000..62b71f8
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_100101.dtd
@@ -0,0 +1,72 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_100301.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_100301.dtd
new file mode 100644
index 0000000..adc5272
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_100301.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_110101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_110101.dtd
new file mode 100644
index 0000000..6298eb6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_110101.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_120101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_120101.dtd
new file mode 100644
index 0000000..dacafb8
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_120101.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_130101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_130101.dtd
new file mode 100644
index 0000000..82bd9c6
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_130101.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_130501.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_130501.dtd
new file mode 100644
index 0000000..a3c640c
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_130501.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_140101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_140101.dtd
new file mode 100644
index 0000000..4570815
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_140101.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_150101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_150101.dtd
new file mode 100644
index 0000000..7c0933d
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_150101.dtd
@@ -0,0 +1,79 @@
+
+
+
+
+
+%Medline;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%Bookdoc;
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_180101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_180101.dtd
new file mode 100644
index 0000000..48bec8a
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_180101.dtd
@@ -0,0 +1,434 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_180601.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_180601.dtd
new file mode 100644
index 0000000..aab61e7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_180601.dtd
@@ -0,0 +1,454 @@
+
+
+
+
+
+
+
+%mathml-in-pubmed;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/pubmed_190101.dtd b/code/lib/Bio/Entrez/DTDs/pubmed_190101.dtd
new file mode 100644
index 0000000..a1cd167
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/pubmed_190101.dtd
@@ -0,0 +1,478 @@
+
+
+
+
+
+
+
+%mathml-in-pubmed;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/references.ent b/code/lib/Bio/Entrez/DTDs/references.ent
new file mode 100644
index 0000000..9e63a18
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/references.ent
@@ -0,0 +1,726 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/section.ent b/code/lib/Bio/Entrez/DTDs/section.ent
new file mode 100644
index 0000000..1623ac7
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/section.ent
@@ -0,0 +1,220 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/taxon.dtd b/code/lib/Bio/Entrez/DTDs/taxon.dtd
new file mode 100644
index 0000000..fadf481
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/taxon.dtd
@@ -0,0 +1,131 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/DTDs/xmlspecchars.ent b/code/lib/Bio/Entrez/DTDs/xmlspecchars.ent
new file mode 100644
index 0000000..d9914bf
--- /dev/null
+++ b/code/lib/Bio/Entrez/DTDs/xmlspecchars.ent
@@ -0,0 +1,290 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+%ISOlat1;
+%ISOlat2;
+%ISObox;
+%ISOdia;
+%ISOnum;
+%ISOpub;
+%ISOtech;
+%ISOgrk1;
+%ISOgrk2;
+%ISOgrk3;
+%ISOgrk4;
+%ISOcyr1;
+%ISOcyr2;
+%ISOamsa;
+%ISOamsb;
+%ISOamsc;
+%ISOamsn;
+%ISOamso;
+%ISOamsr;
+%ISOmscr;
+%ISOmfrk;
+%ISOmopf;
+
+
+
+
+
+
diff --git a/code/lib/Bio/Entrez/Parser.py b/code/lib/Bio/Entrez/Parser.py
new file mode 100644
index 0000000..98ed876
--- /dev/null
+++ b/code/lib/Bio/Entrez/Parser.py
@@ -0,0 +1,1005 @@
+# Copyright 2008-2014 by Michiel de Hoon. All rights reserved.
+# Revisions copyright 2008-2015 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Parser for XML results returned by NCBI's Entrez Utilities.
+
+This parser is used by the read() function in Bio.Entrez, and is not
+intended to be used directly.
+
+The question is how to represent an XML file as Python objects. Some
+XML files returned by NCBI look like lists, others look like dictionaries,
+and others look like a mix of lists and dictionaries.
+
+My approach is to classify each possible element in the XML as a plain
+string, an integer, a list, a dictionary, or a structure. The latter is a
+dictionary where the same key can occur multiple times; in Python, it is
+represented as a dictionary where that key occurs once, pointing to a list
+of values found in the XML file.
+
+The parser then goes through the XML and creates the appropriate Python
+object for each element. The different levels encountered in the XML are
+preserved on the Python side. So a subelement of a subelement of an element
+is a value in a dictionary that is stored in a list which is a value in
+some other dictionary (or a value in a list which itself belongs to a list
+which is a value in a dictionary, and so on). Attributes encountered in
+the XML are stored as a dictionary in a member .attributes of each element,
+and the tag name is saved in a member .tag.
+
+To decide which kind of Python object corresponds to each element in the
+XML, the parser analyzes the DTD referred to at the top of (almost) every
+XML file returned by the Entrez Utilities. This is preferred over a hand-
+written solution, since the number of DTDs is rather large and their
+contents may change over time. About half the code in this parser deals
+with parsing the DTD, and the other half with the XML itself.
+"""
+import os
+import warnings
+from collections import Counter
+from xml.parsers import expat
+from io import BytesIO
+import xml.etree.ElementTree as ET
+from xml.sax.saxutils import escape
+
+from urllib.request import urlopen, urlparse
+
+
+# The following four classes are used to add a member .attributes to integers,
+# strings, lists, and dictionaries, respectively.
+
+
+class NoneElement:
+ """NCBI Entrez XML element mapped to None."""
+
+ def __init__(self, tag, attributes, key=None):
+ """Create a NoneElement."""
+ self.tag = tag
+ if key is None:
+ self.key = tag
+ else:
+ self.key = key
+ self.attributes = attributes
+
+ def __eq__(self, other):
+ """Define equality with other None objects."""
+ if other is None:
+ return True
+ elif other.__eq__(None):
+ return True
+ else:
+ return False
+
+ def __ne__(self, other):
+ """Define non-equality."""
+ if other is None:
+ return False
+ elif other.__eq__(None):
+ return False
+ else:
+ return True
+
+ def __repr__(self):
+ """Return a string representation of the object."""
+ try:
+ attributes = self.attributes
+ except AttributeError:
+ return "NoneElement"
+ return "NoneElement(attributes=%r)" % attributes
+
+
+class IntegerElement(int):
+ """NCBI Entrez XML element mapped to an integer."""
+
+ def __new__(cls, value, tag, attributes, key=None):
+ """Create an IntegerElement."""
+ self = int.__new__(cls, value)
+ self.tag = tag
+ if key is None:
+ self.key = tag
+ else:
+ self.key = key
+ self.attributes = attributes
+ return self
+
+ def __repr__(self):
+ """Return a string representation of the object."""
+ text = int.__repr__(self)
+ try:
+ attributes = self.attributes
+ except AttributeError:
+ return text
+ return "IntegerElement(%s, attributes=%r)" % (text, attributes)
+
+
+class StringElement(str):
+ """NCBI Entrez XML element mapped to a string."""
+
+ def __new__(cls, value, tag, attributes, key=None):
+ """Create a StringElement."""
+ self = str.__new__(cls, value)
+ self.tag = tag
+ if key is None:
+ self.key = tag
+ else:
+ self.key = key
+ self.attributes = attributes
+ return self
+
+ def __repr__(self):
+ """Return a string representation of the object."""
+ text = str.__repr__(self)
+ attributes = self.attributes
+ if not attributes:
+ return text
+ return "StringElement(%s, attributes=%r)" % (text, attributes)
+
+
+class ListElement(list):
+ """NCBI Entrez XML element mapped to a list."""
+
+ def __init__(self, tag, attributes, allowed_tags, key=None):
+ """Create a ListElement."""
+ self.tag = tag
+ if key is None:
+ self.key = tag
+ else:
+ self.key = key
+ self.attributes = attributes
+ self.allowed_tags = allowed_tags
+
+ def __repr__(self):
+ """Return a string representation of the object."""
+ text = list.__repr__(self)
+ attributes = self.attributes
+ if not attributes:
+ return text
+ return "ListElement(%s, attributes=%r)" % (text, attributes)
+
+ def store(self, value):
+ """Append an element to the list, checking tags."""
+ key = value.key
+ if self.allowed_tags is not None and key not in self.allowed_tags:
+ raise ValueError("Unexpected item '%s' in list" % key)
+ self.append(value)
+
+
+class DictionaryElement(dict):
+ """NCBI Entrez XML element mapped to a dictionaray."""
+
+ def __init__(self, tag, attrs, allowed_tags, repeated_tags=None, key=None):
+ """Create a DictionaryElement."""
+ self.tag = tag
+ if key is None:
+ self.key = tag
+ else:
+ self.key = key
+ self.attributes = attrs
+ self.allowed_tags = allowed_tags
+ self.repeated_tags = repeated_tags
+ if repeated_tags:
+ for key in repeated_tags:
+ self[key] = []
+
+ def __repr__(self):
+ """Return a string representation of the object."""
+ text = dict.__repr__(self)
+ attributes = self.attributes
+ if not attributes:
+ return text
+ return "DictElement(%s, attributes=%r)" % (text, attributes)
+
+ def store(self, value):
+ """Add an entry to the dictionary, checking tags."""
+ key = value.key
+ tag = value.tag
+ if self.allowed_tags is not None and tag not in self.allowed_tags:
+ raise ValueError("Unexpected item '%s' in dictionary" % key)
+ if self.repeated_tags and key in self.repeated_tags:
+ self[key].append(value)
+ else:
+ self[key] = value
+
+
+class NotXMLError(ValueError):
+ """Failed to parse file as XML."""
+
+ def __init__(self, message):
+ """Initialize the class."""
+ self.msg = message
+
+ def __str__(self):
+ """Return a string summary of the exception."""
+ return (
+ "Failed to parse the XML data (%s). Please make sure that the input data "
+ "are in XML format." % self.msg
+ )
+
+
+class CorruptedXMLError(ValueError):
+ """Corrupted XML."""
+
+ def __init__(self, message):
+ """Initialize the class."""
+ self.msg = message
+
+ def __str__(self):
+ """Return a string summary of the exception."""
+ return (
+ "Failed to parse the XML data (%s). Please make sure that the input data "
+ "are not corrupted." % self.msg
+ )
+
+
+class ValidationError(ValueError):
+ """XML tag found which was not defined in the DTD.
+
+ Validating parsers raise this error if the parser finds a tag in the XML
+ that is not defined in the DTD. Non-validating parsers do not raise this
+ error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
+ parsers by default (see those functions for more information).
+ """
+
+ def __init__(self, name):
+ """Initialize the class."""
+ self.name = name
+
+ def __str__(self):
+ """Return a string summary of the exception."""
+ return (
+ "Failed to find tag '%s' in the DTD. To skip all tags that "
+ "are not represented in the DTD, please call Bio.Entrez.read "
+ "or Bio.Entrez.parse with validate=False." % self.name
+ )
+
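+# A usage sketch (`handle` is a hypothetical open binary file): callers
+# typically recover from a ValidationError by re-reading the data with
+# validation disabled, e.g.
+#
+#     records = Bio.Entrez.read(handle, validate=False)
+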
+
+class DataHandlerMeta(type):
+ """A metaclass is needed until Python supports @classproperty."""
+
+ def __init__(cls, *args, **kwargs):
+ """Initialize the class."""
+ cls._directory = None
+
+ @property
+ def directory(cls):
+ """Directory for caching XSD and DTD files."""
+ return cls._directory
+
+ @directory.setter
+ def directory(cls, value):
+ """Set a custom directory for the local DTD/XSD directories."""
+ if value is None:
+ import platform
+
+ if platform.system() == "Windows":
+ value = os.path.join(os.getenv("APPDATA"), "biopython")
+ else: # Unix/Linux/Mac
+ home = os.path.expanduser("~")
+ value = os.path.join(home, ".config", "biopython")
+ cls._directory = value
+ # Create DTD local directory
+ cls.local_dtd_dir = os.path.join(cls._directory, "Bio", "Entrez", "DTDs")
+ os.makedirs(cls.local_dtd_dir, exist_ok=True)
+ # Create XSD local directory
+ cls.local_xsd_dir = os.path.join(cls._directory, "Bio", "Entrez", "XSDs")
+ os.makedirs(cls.local_xsd_dir, exist_ok=True)
+
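+# A usage sketch (not in the upstream source): `directory` is a class-level
+# property served by this metaclass, so the DTD/XSD cache location can be
+# overridden before any parsing happens, e.g.
+#
+#     DataHandler.directory = "/tmp/entrez-cache"   # hypothetical path
+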
+
+class DataHandler(metaclass=DataHandlerMeta):
+ """Data handler for parsing NCBI XML from Entrez."""
+
+ from lib.Bio import Entrez
+
+ global_dtd_dir = os.path.join(Entrez.__path__[0], "DTDs")
+ global_xsd_dir = os.path.join(Entrez.__path__[0], "XSDs")
+ local_dtd_dir = ""
+ local_xsd_dir = ""
+
+ del Entrez
+
+ def __init__(self, validate, escape):
+ """Create a DataHandler object."""
+ self.dtd_urls = []
+ self.element = None
+ self.level = 0
+ self.data = []
+ self.attributes = None
+ self.allowed_tags = None
+ self.strings = {}
+ self.lists = {}
+ self.dictionaries = {}
+ self.items = set()
+ self.errors = set()
+ self.validating = validate
+ self.parser = expat.ParserCreate(namespace_separator=" ")
+ self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
+ self.parser.XmlDeclHandler = self.xmlDeclHandler
+ self.schema_namespace = None
+ self.namespace_level = Counter()
+ self.namespace_prefix = {}
+ if escape:
+ self.characterDataHandler = self.characterDataHandlerEscape
+ else:
+ self.characterDataHandler = self.characterDataHandlerRaw
+
+ def read(self, handle):
+ """Set up the parser and let it parse the XML results."""
+ # Expat's parser.ParseFile function only accepts binary data;
+ # see also the comment below for Entrez.parse.
+ if handle.read(0) != b"":
+ raise TypeError("file should be opened in binary mode")
+ try:
+ self.parser.ParseFile(handle)
+ except expat.ExpatError as e:
+ if self.parser.StartElementHandler:
+                # We saw the initial <?xml declaration, so we can be sure
+                # that we are parsing XML data. Most likely, the XML file
+                # is corrupted.
+                raise CorruptedXMLError(e) from None
+            else:
+                # We have not seen the initial <?xml declaration, so the
+                # input data are probably not in XML format.
+                raise NotXMLError(e) from None
+        try:
+            return self.record
+        except AttributeError:
+            raise NotXMLError("XML declaration not found") from None
+
+    def parse(self, handle):
+        """Parse the XML in the given file handle, yielding complete records."""
+        BLOCK = 1024
+        while True:
+            # Read in another block of data from the file.
+            text = handle.read(BLOCK)
+            try:
+                self.parser.Parse(text, False)
+            except expat.ExpatError as e:
+                if self.parser.StartElementHandler:
+                    # We are parsing XML data; most likely the file is corrupted.
+                    raise CorruptedXMLError(e) from None
+                else:
+                    # The input data are probably not in XML format.
+                    raise NotXMLError(e) from None
+            try:
+                records = self.record
+            except AttributeError:
+                raise NotXMLError("XML declaration not found") from None
+            if not isinstance(records, list):
+                raise ValueError(
+                    "The XML file does not represent a list. Please use "
+                    "Entrez.read instead of Entrez.parse"
+                )
+            if not text:
+                break
+            while len(records) >= 2:
+ # Then the first record is finished, while the second record
+ # is still a work in progress.
+ record = records.pop(0)
+ yield record
+
+ # We have reached the end of the XML file
+ self.parser = None
+ if self.element is not None:
+ # No more XML data, but there is still some unfinished business
+ raise CorruptedXMLError("Premature end of data")
+
+ # Send out the remaining records
+ yield from records
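+
+    # A usage sketch (`handle` is a hypothetical binary file object): parse()
+    # is a generator, so records stream out as soon as each one is complete:
+    #
+    #     for record in DataHandler(validate=True, escape=False).parse(handle):
+    #         process(record)  # process() is a hypothetical consumer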
+
+ def xmlDeclHandler(self, version, encoding, standalone):
+ """Set XML handlers when an XML declaration is found."""
+ self.parser.CharacterDataHandler = self.characterDataHandler
+ self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
+ self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
+ self.parser.EndNamespaceDeclHandler = self.endNamespaceDeclHandler
+ self.parser.StartElementHandler = self.handleMissingDocumentDefinition
+
+ def handleMissingDocumentDefinition(self, tag, attrs):
+ """Raise an Exception if neither a DTD nor an XML Schema is found."""
+ raise ValueError(
+ "As the XML data contained neither a Document Type Definition (DTD) nor an XML Schema, Bio.Entrez is unable to parse these data. We recommend using a generic XML parser from the Python standard library instead, for example ElementTree."
+ )
+
+ def startNamespaceDeclHandler(self, prefix, uri):
+ """Handle start of an XML namespace declaration."""
+ if prefix == "xsi":
+ # This is an xml schema
+ self.schema_namespace = uri
+ self.parser.StartElementHandler = self.schemaHandler
+ else:
+ # Note that the DTD for MathML specifies a default attribute
+ # that declares the namespace for each MathML element. This means
+ # that MathML element in the XML has an invisible MathML namespace
+ # declaration that triggers a call to startNamespaceDeclHandler
+ # and endNamespaceDeclHandler. Therefore we need to count how often
+ # startNamespaceDeclHandler and endNamespaceDeclHandler were called
+ # to find out their first and last invocation for each namespace.
+ if prefix == "mml":
+ assert uri == "http://www.w3.org/1998/Math/MathML"
+ elif prefix == "xlink":
+ assert uri == "http://www.w3.org/1999/xlink"
+ else:
+ raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
+ self.namespace_level[prefix] += 1
+ self.namespace_prefix[uri] = prefix
+
+ def endNamespaceDeclHandler(self, prefix):
+ """Handle end of an XML namespace declaration."""
+ if prefix != "xsi":
+ self.namespace_level[prefix] -= 1
+ if self.namespace_level[prefix] == 0:
+ for key, value in self.namespace_prefix.items():
+ if value == prefix:
+ break
+ else:
+ raise RuntimeError("Failed to find namespace prefix")
+ del self.namespace_prefix[key]
+
+ def schemaHandler(self, name, attrs):
+ """Process the XML schema (before processing the element)."""
+ key = "%s noNamespaceSchemaLocation" % self.schema_namespace
+ schema = attrs[key]
+ handle = self.open_xsd_file(os.path.basename(schema))
+ # if there is no local xsd file grab the url and parse the file
+ if not handle:
+ handle = urlopen(schema)
+ text = handle.read()
+ self.save_xsd_file(os.path.basename(schema), text)
+ handle.close()
+ self.parse_xsd(ET.fromstring(text))
+ else:
+ self.parse_xsd(ET.fromstring(handle.read()))
+ handle.close()
+ # continue handling the element
+ self.startElementHandler(name, attrs)
+ # reset the element handler
+ self.parser.StartElementHandler = self.startElementHandler
+
+ def startElementHandler(self, tag, attrs):
+ """Handle start of an XML element."""
+ if tag in self.items:
+ assert tag == "Item"
+ name = attrs["Name"]
+ itemtype = attrs["Type"]
+ del attrs["Type"]
+ if itemtype == "Structure":
+ del attrs["Name"]
+ element = DictionaryElement(
+ name, attrs, allowed_tags=None, repeated_tags=None
+ )
+ parent = self.element
+ element.parent = parent
+ # For consistency with lists below, store the element here
+ if parent is None:
+ self.record = element
+ else:
+ parent.store(element)
+ self.element = element
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ elif name in ("ArticleIds", "History"):
+ del attrs["Name"]
+ allowed_tags = None # allowed tags are unknown
+ repeated_tags = frozenset(["pubmed", "medline"])
+ element = DictionaryElement(
+ tag,
+ attrs,
+ allowed_tags=allowed_tags,
+ repeated_tags=repeated_tags,
+ key=name,
+ )
+ parent = self.element
+ element.parent = parent
+ # For consistency with lists below, store the element here
+ if parent is None:
+ self.record = element
+ else:
+ parent.store(element)
+ self.element = element
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ elif itemtype == "List":
+ del attrs["Name"]
+ allowed_tags = None # allowed tags are unknown
+ element = ListElement(tag, attrs, allowed_tags, name)
+ parent = self.element
+ element.parent = parent
+ if self.element is None:
+ # Set self.record here to let Entrez.parse iterate over it
+ self.record = element
+ else:
+ parent.store(element)
+ self.element = element
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ elif itemtype == "Integer":
+ self.parser.EndElementHandler = self.endIntegerElementHandler
+ self.parser.CharacterDataHandler = self.characterDataHandler
+ self.attributes = attrs
+ elif itemtype in ("String", "Unknown", "Date", "Enumerator"):
+ assert self.attributes is None
+ self.attributes = attrs
+ self.parser.StartElementHandler = self.startRawElementHandler
+ self.parser.EndElementHandler = self.endStringElementHandler
+ self.parser.CharacterDataHandler = self.characterDataHandler
+ else:
+ raise ValueError("Unknown item type %s" % name)
+ elif tag in self.errors:
+ self.parser.EndElementHandler = self.endErrorElementHandler
+ self.parser.CharacterDataHandler = self.characterDataHandler
+ elif tag in self.strings:
+ self.parser.StartElementHandler = self.startRawElementHandler
+ self.parser.EndElementHandler = self.endStringElementHandler
+ self.parser.CharacterDataHandler = self.characterDataHandler
+ assert self.allowed_tags is None
+ self.allowed_tags = self.strings[tag]
+ assert self.attributes is None
+ self.attributes = attrs
+ elif tag in self.dictionaries:
+ allowed_tags, repeated_tags = self.dictionaries[tag]
+ element = DictionaryElement(tag, attrs, allowed_tags, repeated_tags)
+ parent = self.element
+ element.parent = parent
+ # For consistency with lists below, store the element here
+ if parent is None:
+ self.record = element
+ else:
+ parent.store(element)
+ self.element = element
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ elif tag in self.lists:
+ allowed_tags = self.lists[tag]
+ element = ListElement(tag, attrs, allowed_tags)
+ parent = self.element
+ element.parent = parent
+ if parent is None:
+ # Set self.record here to let Entrez.parse iterate over it
+ self.record = element
+ else:
+ parent.store(element)
+ self.element = element
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ else:
+ # Element not found in DTD
+ if self.validating:
+ raise ValidationError(tag)
+ else:
+ # this will not be stored in the record
+ self.parser.StartElementHandler = self.startSkipElementHandler
+ self.parser.EndElementHandler = self.endSkipElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ self.level = 1
+
+ def startRawElementHandler(self, name, attrs):
+ """Handle start of an XML raw element."""
+ # check if the name is in a namespace
+ prefix = None
+ if self.namespace_prefix:
+ try:
+ uri, name = name.split()
+ except ValueError:
+ pass
+ else:
+ prefix = self.namespace_prefix[uri]
+ if self.namespace_level[prefix] == 1:
+ attrs = {"xmlns": uri}
+ if prefix:
+ key = "%s:%s" % (prefix, name)
+ else:
+ key = name
+ # self.allowed_tags is ignored for now. Anyway we know what to do
+ # with this tag.
+ tag = "<%s" % name
+ for key, value in attrs.items():
+ tag += ' %s="%s"' % (key, value)
+ tag += ">"
+ self.data.append(tag)
+ self.parser.EndElementHandler = self.endRawElementHandler
+ self.level += 1
+
+ def startSkipElementHandler(self, name, attrs):
+ """Handle start of an XML skip element."""
+ self.level += 1
+
+ def endStringElementHandler(self, tag):
+ """Handle end of an XML string element."""
+ element = self.element
+ if element is not None:
+ self.parser.StartElementHandler = self.startElementHandler
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ value = "".join(self.data)
+ self.data = []
+ attributes = self.attributes
+ self.attributes = None
+ if tag in self.items:
+ assert tag == "Item"
+ key = attributes["Name"]
+ del attributes["Name"]
+ else:
+ key = tag
+ value = StringElement(value, tag, attributes, key)
+ if element is None:
+            self.record = value
+ else:
+ element.store(value)
+ self.allowed_tags = None
+
+ def endRawElementHandler(self, name):
+ """Handle start of an XML raw element."""
+ self.level -= 1
+ if self.level == 0:
+ self.parser.EndElementHandler = self.endStringElementHandler
+ if self.namespace_prefix:
+ try:
+ uri, name = name.split()
+ except ValueError:
+ pass
+ tag = "%s>" % name
+ self.data.append(tag)
+
+ def endSkipElementHandler(self, name):
+ """Handle start of an XML skip element."""
+ self.level -= 1
+ if self.level == 0:
+ self.parser.StartElementHandler = self.startElementHandler
+ self.parser.EndElementHandler = self.endElementHandler
+
+ def endErrorElementHandler(self, name):
+ """Handle start of an XML error element."""
+ if self.data:
+ # error found:
+ value = "".join(self.data)
+ raise RuntimeError(value)
+ # no error found:
+ if self.element is not None:
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+
+ def endElementHandler(self, name):
+ """Handle end of an XML element."""
+ element = self.element
+ self.element = element.parent
+ del element.parent
+
+ def endIntegerElementHandler(self, tag):
+ """Handle end of an XML integer element."""
+ attributes = self.attributes
+ self.attributes = None
+ assert tag == "Item"
+ key = attributes["Name"]
+ del attributes["Name"]
+ if self.data:
+ value = int("".join(self.data))
+ self.data = []
+ value = IntegerElement(value, tag, attributes, key)
+ else:
+ value = NoneElement(tag, attributes, key)
+ element = self.element
+ if element is None:
+ self.record = value
+ else:
+ self.parser.EndElementHandler = self.endElementHandler
+ self.parser.CharacterDataHandler = self.skipCharacterDataHandler
+ if value is None:
+ return
+ element.store(value)
+
+ def characterDataHandlerRaw(self, content):
+ """Handle character data as-is (raw)."""
+ self.data.append(content)
+
+ def characterDataHandlerEscape(self, content):
+ """Handle character data by encoding it."""
+ content = escape(content)
+ self.data.append(content)
+
+ def skipCharacterDataHandler(self, content):
+ """Handle character data by skipping it."""
+
+ def parse_xsd(self, root):
+ """Parse an XSD file."""
+ prefix = "{http://www.w3.org/2001/XMLSchema}"
+ for element in root:
+ isSimpleContent = False
+ attribute_keys = []
+ keys = []
+ multiple = []
+ assert element.tag == prefix + "element"
+ name = element.attrib["name"]
+ assert len(element) == 1
+ complexType = element[0]
+ assert complexType.tag == prefix + "complexType"
+ for component in complexType:
+ tag = component.tag
+ if tag == prefix + "attribute":
+ # we could distinguish by type; keeping string for now
+ attribute_keys.append(component.attrib["name"])
+ elif tag == prefix + "sequence":
+ maxOccurs = component.attrib.get("maxOccurs", "1")
+ for key in component:
+ assert key.tag == prefix + "element"
+ ref = key.attrib["ref"]
+ keys.append(ref)
+ if maxOccurs != "1" or key.attrib.get("maxOccurs", "1") != "1":
+ multiple.append(ref)
+ elif tag == prefix + "simpleContent":
+ assert len(component) == 1
+ extension = component[0]
+ assert extension.tag == prefix + "extension"
+ assert extension.attrib["base"] == "xs:string"
+ for attribute in extension:
+ assert attribute.tag == prefix + "attribute"
+ # we could distinguish by type; keeping string for now
+ attribute_keys.append(attribute.attrib["name"])
+ isSimpleContent = True
+ allowed_tags = frozenset(keys)
+ if len(keys) == 1 and keys == multiple:
+ assert not isSimpleContent
+ self.lists[name] = allowed_tags
+ elif len(keys) >= 1:
+ assert not isSimpleContent
+ repeated_tags = frozenset(multiple)
+ self.dictionaries[name] = (allowed_tags, repeated_tags)
+ else:
+ self.strings[name] = allowed_tags
+
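+    # Illustration (hypothetical XSD fragment): an element declared as
+    #     <xs:element name="Report"><xs:complexType><xs:sequence>
+    #         <xs:element ref="Product" maxOccurs="unbounded"/>
+    #     </xs:sequence></xs:complexType></xs:element>
+    # has a single repeatable child, so parse_xsd files it under self.lists;
+    # several distinct children would make it a dictionary, and simpleContent
+    # with attributes would make it a string.
+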
+ def elementDecl(self, name, model):
+ """Call a call-back function for each element declaration in a DTD.
+
+        This is used for each element declaration in a DTD like::
+
+            <!ELEMENT        name          (...)>
+
+ The purpose of this function is to determine whether this element
+ should be regarded as a string, integer, list, dictionary, structure,
+ or error.
+ """
+ if name.upper() == "ERROR":
+ self.errors.add(name)
+ return
+ if name == "Item" and model == (
+ expat.model.XML_CTYPE_MIXED,
+ expat.model.XML_CQUANT_REP,
+ None,
+ ((expat.model.XML_CTYPE_NAME, expat.model.XML_CQUANT_NONE, "Item", ()),),
+ ):
+ # Special case. As far as I can tell, this only occurs in the
+ # eSummary DTD.
+ self.items.add(name)
+ return
+ # First, remove ignorable parentheses around declarations
+ while (
+ model[0] in (expat.model.XML_CTYPE_SEQ, expat.model.XML_CTYPE_CHOICE)
+ and model[1] in (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)
+ and len(model[3]) == 1
+ ):
+ model = model[3][0]
+ # PCDATA declarations correspond to strings
+ if model[0] in (expat.model.XML_CTYPE_MIXED, expat.model.XML_CTYPE_EMPTY):
+ if model[1] == expat.model.XML_CQUANT_REP:
+ children = model[3]
+ allowed_tags = frozenset(child[2] for child in children)
+ else:
+ allowed_tags = frozenset()
+ self.strings[name] = allowed_tags
+ return
+ # List-type elements
+ if model[0] in (
+ expat.model.XML_CTYPE_CHOICE,
+ expat.model.XML_CTYPE_SEQ,
+ ) and model[1] in (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP):
+ children = model[3]
+ if model[0] == expat.model.XML_CTYPE_SEQ:
+ assert len(children) == 1
+ allowed_tags = frozenset(child[2] for child in children)
+ self.lists[name] = allowed_tags
+ return
+ # This is the tricky case. Check which keys can occur multiple
+ # times. If only one key is possible, and it can occur multiple
+ # times, then this is a list. If more than one key is possible,
+ # but none of them can occur multiple times, then this is a
+ # dictionary. Otherwise, this is a structure.
+ # In 'single' and 'multiple', we keep track which keys can occur
+ # only once, and which can occur multiple times.
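+        # For example (illustrative content models):
+        #     <!ELEMENT A (B*)>     -> list (one key, repeatable)
+        #     <!ELEMENT A (B?, C)>  -> dictionary (several keys, each once)
+        #     <!ELEMENT A (B, C+)>  -> dictionary in which C holds a list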
+ single = []
+ multiple = []
+ # The 'count' function is called recursively to make sure all the
+ # children in this model are counted. Error keys are ignored;
+ # they raise an exception in Python.
+
+ def count(model):
+ quantifier, key, children = model[1:]
+ if key is None:
+ if quantifier in (
+ expat.model.XML_CQUANT_PLUS,
+ expat.model.XML_CQUANT_REP,
+ ):
+ for child in children:
+ multiple.append(child[2])
+ else:
+ for child in children:
+ count(child)
+ elif key.upper() != "ERROR":
+ if quantifier in (
+ expat.model.XML_CQUANT_NONE,
+ expat.model.XML_CQUANT_OPT,
+ ):
+ single.append(key)
+ elif quantifier in (
+ expat.model.XML_CQUANT_PLUS,
+ expat.model.XML_CQUANT_REP,
+ ):
+ multiple.append(key)
+
+ count(model)
+ if len(single) == 0 and len(multiple) == 1:
+ allowed_tags = frozenset(multiple)
+ self.lists[name] = allowed_tags
+ else:
+ allowed_tags = frozenset(single + multiple)
+ repeated_tags = frozenset(multiple)
+ self.dictionaries[name] = (allowed_tags, repeated_tags)
+
+ def open_dtd_file(self, filename):
+ """Open specified DTD file."""
+ path = os.path.join(DataHandler.local_dtd_dir, filename)
+ try:
+ handle = open(path, "rb")
+ except FileNotFoundError:
+ pass
+ else:
+ return handle
+ path = os.path.join(DataHandler.global_dtd_dir, filename)
+ try:
+ handle = open(path, "rb")
+ except FileNotFoundError:
+ pass
+ else:
+ return handle
+ return None
+
+ def open_xsd_file(self, filename):
+ """Open specified XSD file."""
+ path = os.path.join(DataHandler.local_xsd_dir, filename)
+ try:
+ handle = open(path, "rb")
+ except FileNotFoundError:
+ pass
+ else:
+ return handle
+ path = os.path.join(DataHandler.global_xsd_dir, filename)
+ try:
+ handle = open(path, "rb")
+ except FileNotFoundError:
+ pass
+ else:
+ return handle
+ return None
+
+ def save_dtd_file(self, filename, text):
+ """Save DTD file to cache."""
+ path = os.path.join(DataHandler.local_dtd_dir, filename)
+ try:
+ handle = open(path, "wb")
+ except OSError:
+ warnings.warn("Failed to save %s at %s" % (filename, path))
+ else:
+ handle.write(text)
+ handle.close()
+
+ def save_xsd_file(self, filename, text):
+ """Save XSD file to cache."""
+ path = os.path.join(DataHandler.local_xsd_dir, filename)
+ try:
+ handle = open(path, "wb")
+ except OSError:
+ warnings.warn("Failed to save %s at %s" % (filename, path))
+ else:
+ handle.write(text)
+ handle.close()
+
+ def externalEntityRefHandler(self, context, base, systemId, publicId):
+ """Handle external entity reference in order to cache DTD locally.
+
+ The purpose of this function is to load the DTD locally, instead
+ of downloading it from the URL specified in the XML. Using the local
+ DTD results in much faster parsing. If the DTD is not found locally,
+ we try to download it. If new DTDs become available from NCBI,
+ putting them in Bio/Entrez/DTDs will allow the parser to see them.
+ """
+ urlinfo = urlparse(systemId)
+ if urlinfo.scheme in ["http", "https", "ftp"]:
+ # Then this is an absolute path to the DTD.
+ url = systemId
+ elif urlinfo.scheme == "":
+ # Then this is a relative path to the DTD.
+ # Look at the parent URL to find the full path.
+ try:
+ source = self.dtd_urls[-1]
+ except IndexError:
+ # Assume the default URL for DTDs if the top parent
+ # does not contain an absolute path
+ source = "http://www.ncbi.nlm.nih.gov/dtd/"
+ else:
+ source = os.path.dirname(source)
+ # urls always have a forward slash, don't use os.path.join
+ url = source.rstrip("/") + "/" + systemId
+ else:
+ raise ValueError("Unexpected URL scheme %r" % urlinfo.scheme)
+ self.dtd_urls.append(url)
+ # First, try to load the local version of the DTD file
+ location, filename = os.path.split(systemId)
+ handle = self.open_dtd_file(filename)
+ if not handle:
+ # DTD is not available as a local file. Try accessing it through
+ # the internet instead.
+ try:
+ handle = urlopen(url)
+ except OSError:
+ raise RuntimeError(
+ "Failed to access %s at %s" % (filename, url)
+ ) from None
+ text = handle.read()
+ handle.close()
+ self.save_dtd_file(filename, text)
+ handle = BytesIO(text)
+
+ parser = self.parser.ExternalEntityParserCreate(context)
+ parser.ElementDeclHandler = self.elementDecl
+ parser.ParseFile(handle)
+ handle.close()
+ self.dtd_urls.pop()
+ self.parser.StartElementHandler = self.startElementHandler
+ return 1
diff --git a/code/lib/Bio/Entrez/XSDs/IPGReportSet.xsd b/code/lib/Bio/Entrez/XSDs/IPGReportSet.xsd
new file mode 100644
index 0000000..6194a26
--- /dev/null
+++ b/code/lib/Bio/Entrez/XSDs/IPGReportSet.xsd
@@ -0,0 +1,97 @@
+<!-- XML Schema (XSD) for NCBI IPGReportSet documents -->
diff --git a/code/lib/Bio/Entrez/__init__.py b/code/lib/Bio/Entrez/__init__.py
new file mode 100644
index 0000000..03f2805
--- /dev/null
+++ b/code/lib/Bio/Entrez/__init__.py
@@ -0,0 +1,696 @@
+# Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
+# Copyright 2008-2013 by Michiel de Hoon. All rights reserved.
+# Revisions copyright 2011-2016 by Peter Cock. All rights reserved.
+# Revisions copyright 2015 by Eric Rasche. All rights reserved.
+# Revisions copyright 2015 by Carlos Pena. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Provides code to access NCBI over the WWW.
+
+The main Entrez web page is available at:
+http://www.ncbi.nlm.nih.gov/Entrez/
+
+Entrez Programming Utilities web page is available at:
+http://www.ncbi.nlm.nih.gov/books/NBK25501/
+
+This module provides a number of functions like ``efetch`` (short for
+Entrez Fetch) which will return the data as a handle object. This is
+a standard interface used in Python for reading data from a file, or
+in this case a remote network connection, and provides methods like
+``.read()`` or offers iteration over the contents line by line. See
+also "What the heck is a handle?" in the Biopython Tutorial and
+Cookbook: http://biopython.org/DIST/docs/tutorial/Tutorial.html
+http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
+The handle returned by these functions can be either in text mode or
+in binary mode, depending on the data requested and the results
+returned by NCBI Entrez. Typically, XML data will be in binary mode
+while other data will be in text mode, as required by the downstream
+parser to parse the data.
+
+Unlike a handle to a file on disk from the ``open(filename)`` function,
+which has a ``.name`` attribute giving the filename, the handles from
+``Bio.Entrez`` all have a ``.url`` attribute instead giving the URL
+used to connect to the NCBI Entrez API.
+
+All the functions that send requests to the NCBI Entrez API will
+automatically respect the NCBI rate limit (of 3 requests per second
+without an API key, or 10 requests per second with an API key) and
+will automatically retry when encountering transient failures
+(i.e. connection failures or HTTP 5XX codes). By default, Biopython
+does a maximum of three tries before giving up, and sleeps for 15
+seconds between tries. You can tweak these parameters by setting
+``Bio.Entrez.max_tries`` and ``Bio.Entrez.sleep_between_tries``.
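+
+For example, to be more patient with transient failures, one might raise
+both settings (the values shown are illustrative)::
+
+    Bio.Entrez.max_tries = 5
+    Bio.Entrez.sleep_between_tries = 30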
+
+The Entrez module also provides an XML parser which takes a handle
+as input.
+
+Variables:
+
+ - email Set the Entrez email parameter (default is not set).
+ - tool Set the Entrez tool parameter (default is ``biopython``).
+ - api_key Personal API key from NCBI. If not set, only 3 queries per
+ second are allowed; with a valid API key, up to 10 queries per
+ second are allowed.
+ - max_tries Configures how many times failed requests will be
+ automatically retried on error (default is 3).
+ - sleep_between_tries The delay, in seconds, before retrying a request on
+ error (default is 15).
+
+Functions:
+
+ - efetch Retrieves records in the requested format from a list of one or
+ more primary IDs or from the user's environment
+ - epost Posts a file containing a list of primary IDs for future use in
+ the user's environment to use with subsequent search strategies
+ - esearch Searches and retrieves primary IDs (for use in EFetch, ELink,
+ and ESummary) and term translations and optionally retains
+ results for future use in the user's environment.
+ - elink Checks for the existence of an external or Related Articles link
+ from a list of one or more primary IDs. Retrieves primary IDs
+ and relevancy scores for links to Entrez databases or Related
+ Articles; creates a hyperlink to the primary LinkOut provider
+ for a specific ID and database, or lists LinkOut URLs
+ and Attributes for multiple IDs.
+ - einfo Provides field index term counts, last update, and available
+ links for each database.
+ - esummary Retrieves document summaries from a list of primary IDs or from
+ the user's environment.
+ - egquery Provides Entrez database counts in XML for a single search
+ using Global Query.
+ - espell Retrieves spelling suggestions.
+ - ecitmatch Retrieves PubMed IDs (PMIDs) that correspond to a set of
+ input citation strings.
+
+ - read Parses the XML results returned by any of the above functions.
+ Alternatively, the XML data can be read from a file opened in binary mode.
+ Typical usage is:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> handle = Entrez.einfo() # or esearch, efetch, ...
+ >>> record = Entrez.read(handle)
+ >>> handle.close()
+
+ where record is now a Python dictionary or list.
+
+ - parse Parses the XML results returned by those of the above functions
+ which can return multiple records - such as efetch, esummary
+ and elink. Typical usage is:
+
+ >>> handle = Entrez.esummary(db="pubmed", id="19304878,14630660", retmode="xml")
+ >>> records = Entrez.parse(handle)
+ >>> for record in records:
+ ... # each record is a Python dictionary or list.
+ ... print(record['Title'])
+ Biopython: freely available Python tools for computational molecular biology and bioinformatics.
+ PDB file parser and structure class implemented in Python.
+ >>> handle.close()
+
+ This function is appropriate only if the XML file contains
+ multiple records, and is particularly useful for large files.
+
+ - _open Internally used function.
+
+"""
+
+import time
+import warnings
+import io
+from urllib.error import URLError, HTTPError
+from urllib.parse import urlencode
+from urllib.request import urlopen
+
+
+email = None
+max_tries = 3
+sleep_between_tries = 15
+tool = "biopython"
+api_key = None
+
+
+# XXX retmode?
+def epost(db, **keywds):
+ """Post a file of identifiers for future use.
+
+ Posts a file containing a list of UIs for future use in the user's
+ environment to use with subsequent search strategies.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EPost
+
+ Return a handle to the results.
+
+ Raises an IOError exception if there's a network error.
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi"
+ variables = {"db": db}
+ variables.update(keywds)
+ return _open(cgi, variables, post=True)
+
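+# Example (sketch, not executed): post IDs to the Entrez History server and
+# read back the WebEnv/QueryKey pair for use in later E-utility calls.
+#
+#     handle = epost("pubmed", id="19304878,14630660")
+#     result = read(handle)
+#     webenv, query_key = result["WebEnv"], result["QueryKey"]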
+
+def efetch(db, **keywords):
+ """Fetch Entrez results which are returned as a handle.
+
+ EFetch retrieves records in the requested format from a list or set of one or
+    more UIs or from the user's environment.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+
+ Return a handle to the results.
+
+ Raises an IOError exception if there's a network error.
+
+ Short example:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> handle = Entrez.efetch(db="nucleotide", id="AY851612", rettype="gb", retmode="text")
+ >>> print(handle.readline().strip())
+ LOCUS AY851612 892 bp DNA linear PLN 10-APR-2007
+ >>> handle.close()
+
+ This will automatically use an HTTP POST rather than HTTP GET if there
+ are over 200 identifiers as recommended by the NCBI.
+
+ **Warning:** The NCBI changed the default retmode in Feb 2012, so many
+ databases which previously returned text output now give XML.
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+ variables = {"db": db}
+ variables.update(keywords)
+ post = False
+ try:
+ ids = variables["id"]
+ except KeyError:
+ pass
+ else:
+ try:
+ # ids is a single integer or a string representing a single integer
+ ids = str(int(ids))
+ except TypeError:
+ # ids was not a string; try an iterable:
+ ids = ",".join(map(str, ids))
+ except ValueError:
+ # string with commas or string not representing an integer
+ ids = ",".join(map(str, (id.strip() for id in ids.split(","))))
+
+ variables["id"] = ids
+ if ids.count(",") >= 200:
+ # NCBI prefers an HTTP POST instead of an HTTP GET if there are
+ # more than about 200 IDs
+ post = True
+ return _open(cgi, variables, post=post)
+
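+# Example (sketch, not executed): "id" may be a single ID, a comma-separated
+# string, or any iterable of IDs; iterables are joined automatically, and an
+# HTTP POST is used once more than about 200 IDs are supplied.
+#
+#     handle = efetch(db="protein", id=["15718680", "157427902"],
+#                     rettype="fasta", retmode="text")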
+
+def esearch(db, term, **keywds):
+ """Run an Entrez search and return a handle to the results.
+
+ ESearch searches and retrieves primary IDs (for use in EFetch, ELink
+ and ESummary) and term translations, and optionally retains results
+ for future use in the user's environment.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+
+ Return a handle to the results which are always in XML format.
+
+ Raises an IOError exception if there's a network error.
+
+ Short example:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD", idtype="acc")
+ >>> record = Entrez.read(handle)
+ >>> handle.close()
+ >>> int(record["Count"]) >= 2
+ True
+ >>> "EF590893.1" in record["IdList"]
+ True
+ >>> "EF590892.1" in record["IdList"]
+ True
+
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+ variables = {"db": db, "term": term}
+ variables.update(keywds)
+ return _open(cgi, variables)
+
+
+def elink(**keywds):
+ """Check for linked external articles and return a handle.
+
+ ELink checks for the existence of an external or Related Articles link
+ from a list of one or more primary IDs; retrieves IDs and relevancy
+ scores for links to Entrez databases or Related Articles; creates a
+ hyperlink to the primary LinkOut provider for a specific ID and
+ database, or lists LinkOut URLs and attributes for multiple IDs.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink
+
+ Return a handle to the results, by default in XML format.
+
+ Raises an IOError exception if there's a network error.
+
+ This example finds articles related to the Biopython application
+ note's entry in the PubMed database:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> pmid = "19304878"
+ >>> handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed")
+ >>> record = Entrez.read(handle)
+ >>> handle.close()
+ >>> print(record[0]["LinkSetDb"][0]["LinkName"])
+ pubmed_pubmed
+ >>> linked = [link["Id"] for link in record[0]["LinkSetDb"][0]["Link"]]
+ >>> "17121776" in linked
+ True
+
+ This is explained in much more detail in the Biopython Tutorial.
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
+ variables = {}
+ variables.update(keywds)
+ return _open(cgi, variables)
+
+
+def einfo(**keywds):
+ """Return a summary of the Entrez databases as a results handle.
+
+ EInfo provides field names, index term counts, last update, and
+ available links for each Entrez database.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EInfo
+
+ Return a handle to the results, by default in XML format.
+
+ Raises an IOError exception if there's a network error.
+
+ Short example:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> record = Entrez.read(Entrez.einfo())
+ >>> 'pubmed' in record['DbList']
+ True
+
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi"
+ variables = {}
+ variables.update(keywds)
+ return _open(cgi, variables)
+
+
+def esummary(**keywds):
+ """Retrieve document summaries as a results handle.
+
+ ESummary retrieves document summaries from a list of primary IDs or
+ from the user's environment.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
+
+ Return a handle to the results, by default in XML format.
+
+ Raises an IOError exception if there's a network error.
+
+ This example discovers more about entry 19923 in the structure
+ database:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> handle = Entrez.esummary(db="structure", id="19923")
+ >>> record = Entrez.read(handle)
+ >>> handle.close()
+ >>> print(record[0]["Id"])
+ 19923
+ >>> print(record[0]["PdbDescr"])
+ Crystal Structure Of E. Coli Aconitase B
+
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+ variables = {}
+ variables.update(keywds)
+ return _open(cgi, variables)
+
+
+def egquery(**keywds):
+ """Provide Entrez database counts for a global search.
+
+ EGQuery provides Entrez database counts in XML for a single search
+ using Global Query.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EGQuery
+
+ Return a handle to the results in XML format.
+
+ Raises an IOError exception if there's a network error.
+
+ This quick example based on a longer version from the Biopython
+ Tutorial just checks there are over 60 matches for 'Biopython'
+ in PubMedCentral:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> handle = Entrez.egquery(term="biopython")
+ >>> record = Entrez.read(handle)
+ >>> handle.close()
+ >>> for row in record["eGQueryResult"]:
+ ... if "pmc" in row["DbName"]:
+ ... print(int(row["Count"]) > 60)
+ True
+
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi"
+ variables = {}
+ variables.update(keywds)
+ return _open(cgi, variables)
+
+
+def espell(**keywds):
+ """Retrieve spelling suggestions as a results handle.
+
+ ESpell retrieves spelling suggestions, if available.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESpell
+
+ Return a handle to the results, by default in XML format.
+
+ Raises an IOError exception if there's a network error.
+
+ Short example:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
+ >>> print(record["Query"])
+ biopythooon
+ >>> print(record["CorrectedQuery"])
+ biopython
+
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi"
+ variables = {}
+ variables.update(keywds)
+ return _open(cgi, variables)
+
+
+def _update_ecitmatch_variables(keywds):
+ # XML is the only supported value, and it actually returns TXT.
+ variables = {"retmode": "xml"}
+ citation_keys = (
+ "journal_title",
+ "year",
+ "volume",
+ "first_page",
+ "author_name",
+ "key",
+ )
+
+ # Accept pre-formatted strings
+ if isinstance(keywds["bdata"], str):
+ variables.update(keywds)
+ else:
+ # Alternatively accept a nicer interface
+ variables["db"] = keywds["db"]
+ bdata = []
+ for citation in keywds["bdata"]:
+ formatted_citation = "|".join(
+ [citation.get(key, "") for key in citation_keys]
+ )
+ bdata.append(formatted_citation)
+ variables["bdata"] = "\r".join(bdata)
+ return variables
+
+
+def ecitmatch(**keywds):
+ """Retrieve PMIDs for input citation strings, returned as a handle.
+
+ ECitMatch retrieves PubMed IDs (PMIDs) that correspond to a set of input
+ citation strings.
+
+ See the online documentation for an explanation of the parameters:
+ http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ECitMatch
+
+ Return a handle to the results, by default in plain text
+
+ Raises an IOError exception if there's a network error.
+
+ Short example:
+
+ >>> from Bio import Entrez
+ >>> Entrez.email = "Your.Name.Here@example.org"
+ >>> citation_1 = {"journal_title": "proc natl acad sci u s a",
+ ... "year": "1991", "volume": "88", "first_page": "3248",
+ ... "author_name": "mann bj", "key": "citation_1"}
+ >>> handle = Entrez.ecitmatch(db="pubmed", bdata=[citation_1])
+ >>> print(handle.read().strip().split("|"))
+ ['proc natl acad sci u s a', '1991', '88', '3248', 'mann bj', 'citation_1', '2014248']
+ >>> handle.close()
+
+ """
+ cgi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi"
+ variables = _update_ecitmatch_variables(keywds)
+ return _open(cgi, variables, ecitmatch=True)
+
+
+def read(handle, validate=True, escape=False):
+ """Parse an XML file from the NCBI Entrez Utilities into python objects.
+
+ This function parses an XML file created by NCBI's Entrez Utilities,
+ returning a multilevel data structure of Python lists and dictionaries.
+ Most XML files returned by NCBI's Entrez Utilities can be parsed by
+ this function, provided its DTD is available. Biopython includes the
+ DTDs for most commonly used Entrez Utilities.
+
+ The handle must be in binary mode. This allows the parser to detect the
+ encoding from the XML file, and to use it to convert all text in the XML
+ to the correct Unicode string. The functions in Bio.Entrez to access NCBI
+ Entrez will automatically return XML data in binary mode. For files,
+ please use mode "rb" when opening the file, as in
+
+ >>> from Bio import Entrez
+ >>> handle = open("Entrez/esearch1.xml", "rb") # opened in binary mode
+ >>> record = Entrez.read(handle)
+ >>> print(record['QueryTranslation'])
+ biopython[All Fields]
+ >>> handle.close()
+
+ If validate is True (default), the parser will validate the XML file
+ against the DTD, and raise an error if the XML file contains tags that
+ are not represented in the DTD. If validate is False, the parser will
+ simply skip such tags.
+
+ If escape is True, all characters that are not valid HTML are replaced
+ by HTML escape characters to guarantee that the returned strings are
+ valid HTML fragments. For example, a less-than sign (<) is replaced by
+    &lt;. If escape is False (default), the string is returned as is.
+
+ Whereas the data structure seems to consist of generic Python lists,
+ dictionaries, strings, and so on, each of these is actually a class
+ derived from the base type. This allows us to store the attributes
+ (if any) of each element in a dictionary my_element.attributes, and
+ the tag name in my_element.tag.
+ """
+ from .Parser import DataHandler
+
+ handler = DataHandler(validate, escape)
+ record = handler.read(handle)
+ return record
+
+
+def parse(handle, validate=True, escape=False):
+ """Parse an XML file from the NCBI Entrez Utilities into python objects.
+
+ This function parses an XML file created by NCBI's Entrez Utilities,
+ returning a multilevel data structure of Python lists and dictionaries.
+ This function is suitable for XML files that (in Python) can be represented
+ as a list of individual records. Whereas 'read' reads the complete file
+ and returns a single Python list, 'parse' is a generator function that
+ returns the records one by one. This function is therefore particularly
+ useful for parsing large files.
+
+ Most XML files returned by NCBI's Entrez Utilities can be parsed by
+ this function, provided its DTD is available. Biopython includes the
+ DTDs for most commonly used Entrez Utilities.
+
+ The handle must be in binary mode. This allows the parser to detect the
+ encoding from the XML file, and to use it to convert all text in the XML
+ to the correct Unicode string. The functions in Bio.Entrez to access NCBI
+ Entrez will automatically return XML data in binary mode. For files,
+ please use mode "rb" when opening the file, as in
+
+ >>> from Bio import Entrez
+ >>> handle = open("Entrez/pubmed1.xml", "rb") # opened in binary mode
+ >>> records = Entrez.parse(handle)
+ >>> for record in records:
+ ... print(record['MedlineCitation']['Article']['Journal']['Title'])
+ ...
+ Social justice (San Francisco, Calif.)
+ Biochimica et biophysica acta
+ >>> handle.close()
+
+ If validate is True (default), the parser will validate the XML file
+ against the DTD, and raise an error if the XML file contains tags that
+ are not represented in the DTD. If validate is False, the parser will
+ simply skip such tags.
+
+ If escape is True, all characters that are not valid HTML are replaced
+ by HTML escape characters to guarantee that the returned strings are
+ valid HTML fragments. For example, a less-than sign (<) is replaced by
+    &lt;. If escape is False (default), the string is returned as is.
+
+ Whereas the data structure seems to consist of generic Python lists,
+ dictionaries, strings, and so on, each of these is actually a class
+ derived from the base type. This allows us to store the attributes
+ (if any) of each element in a dictionary my_element.attributes, and
+ the tag name in my_element.tag.
+ """
+ from .Parser import DataHandler
+
+ handler = DataHandler(validate, escape)
+ records = handler.parse(handle)
+ return records
+
+
+def _open(cgi, params=None, post=None, ecitmatch=False):
+ """Build the URL and open a handle to it (PRIVATE).
+
+ Open a handle to Entrez. cgi is the URL for the cgi script to access.
+ params is a dictionary with the options to pass to it. Does some
+ simple error checking, and will raise an IOError if it encounters one.
+
+    The argument post should be a boolean to explicitly control whether an
+    HTTP POST is used rather than an HTTP GET based on the query length.
+ By default (post=None), POST is used if the URL encoded parameters would
+ be over 1000 characters long.
+
+ This function also enforces the "up to three queries per second rule"
+ to avoid abusing the NCBI servers.
+ """
+ # NCBI requirement: At most three queries per second if no API key is provided.
+    # Equivalently, at least a third of a second between queries.
+ params = _construct_params(params)
+ options = _encode_options(ecitmatch, params)
+    # Using just 0.333333334 seconds sometimes hit the NCBI rate limit;
+    # the slightly longer pause of 0.37 seconds has been more reliable.
+ delay = 0.1 if api_key else 0.37
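+    # For example, if the previous request went out 0.10 s ago and delay is
+    # 0.37 s, we sleep a further 0.27 s before issuing this request.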
+ current = time.time()
+ wait = _open.previous + delay - current
+ if wait > 0:
+ time.sleep(wait)
+ _open.previous = current + wait
+ else:
+ _open.previous = current
+
+ # By default, post is None. Set to a boolean to over-ride length choice:
+ if post is None and len(options) > 1000:
+ post = True
+ cgi = _construct_cgi(cgi, post, options)
+
+ for i in range(max_tries):
+ try:
+ if post:
+ handle = urlopen(cgi, data=options.encode("utf8"))
+ else:
+ handle = urlopen(cgi)
+ except HTTPError as exception:
+ # Reraise if the final try fails
+ if i >= max_tries - 1:
+ raise
+ # Reraise if the exception is triggered by a HTTP 4XX error
+ # indicating some kind of bad request, UNLESS it's specifically a
+ # 429 "Too Many Requests" response. NCBI seems to sometimes
+ # erroneously return 429s even when their rate limit is
+ # honored (and indeed even with the rate-limit-related fudging
+ # higher up in this function in place), so the best we can do is
+ # treat them as a serverside error and try again after sleeping
+ # for a bit.
+            if exception.code // 100 == 4 and exception.code != 429:
+                raise
+            time.sleep(sleep_between_tries)
+ except URLError:
+ # Reraise if the final try fails
+ if i >= max_tries - 1:
+ raise
+ # Treat as a transient error and try again after a brief delay:
+ time.sleep(sleep_between_tries)
+ else:
+ break
+
+ subtype = handle.headers.get_content_subtype()
+ if subtype == "plain":
+ url = handle.url
+ handle = io.TextIOWrapper(handle, encoding="UTF-8")
+ handle.url = url
+ return handle
+
+
+_open.previous = 0
+
+
+def _construct_params(params):
+ if params is None:
+ params = {}
+
+ # Remove None values from the parameters
+ for key, value in list(params.items()):
+ if value is None:
+ del params[key]
+ # Tell Entrez that we are using Biopython (or whatever the user has
+ # specified explicitly in the parameters or by changing the default)
+ if "tool" not in params:
+ params["tool"] = tool
+ # Tell Entrez who we are
+ if "email" not in params:
+ if email is not None:
+ params["email"] = email
+ else:
+ warnings.warn(
+ """
+Email address is not specified.
+
+To make use of NCBI's E-utilities, NCBI requires you to specify your
+email address with each request. As an example, if your email address
+is A.N.Other@example.com, you can specify it as follows:
+ from Bio import Entrez
+ Entrez.email = 'A.N.Other@example.com'
+In case of excessive usage of the E-utilities, NCBI will attempt to contact
+a user at the email address provided before blocking access to the
+E-utilities.""",
+ UserWarning,
+ )
+ if api_key and "api_key" not in params:
+ params["api_key"] = api_key
+ return params
+
+
+def _encode_options(ecitmatch, params):
+ # Open a handle to Entrez.
+ options = urlencode(params, doseq=True)
+ # urlencode encodes pipes, which NCBI expects in ECitMatch
+ if ecitmatch:
+ options = options.replace("%7C", "|")
+ return options
+
+
+def _construct_cgi(cgi, post, options):
+ if not post:
+ # HTTP GET
+ cgi += "?" + options
+ return cgi
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/Entrez/__pycache__/Parser.cpython-311.pyc b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-311.pyc
new file mode 100644
index 0000000..56c87a0
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-311.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/Parser.cpython-312.pyc b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-312.pyc
new file mode 100644
index 0000000..877b4ba
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-312.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/Parser.cpython-37.pyc b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-37.pyc
new file mode 100644
index 0000000..acf7c7a
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/Parser.cpython-37.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/__init__.cpython-311.pyc b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..7fa3a2a
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-311.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/__init__.cpython-312.pyc b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..d217589
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-312.pyc differ
diff --git a/code/lib/Bio/Entrez/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..a5a9670
Binary files /dev/null and b/code/lib/Bio/Entrez/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/Enzyme.py b/code/lib/Bio/ExPASy/Enzyme.py
new file mode 100644
index 0000000..6c1e5ef
--- /dev/null
+++ b/code/lib/Bio/ExPASy/Enzyme.py
@@ -0,0 +1,157 @@
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# Copyright 2009 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Parse the enzyme.dat file from Enzyme at ExPASy.
+
+See https://www.expasy.org/enzyme/
+
+Tested with the release of 03-Mar-2009.
+
+Functions:
+ - read Reads a file containing one ENZYME entry
+ - parse Reads a file containing multiple ENZYME entries
+
+Classes:
+ - Record Holds ENZYME data.
+
+"""
+
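+# Example (sketch, not executed): iterate over all entries in enzyme.dat.
+#
+#     with open("enzyme.dat") as handle:
+#         for record in parse(handle):
+#             print(record["ID"], record["DE"])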
+
+def parse(handle):
+ """Parse ENZYME records.
+
+ This function is for parsing ENZYME files containing multiple
+ records.
+
+ Arguments:
+ - handle - handle to the file.
+
+ """
+ while True:
+ record = __read(handle)
+ if not record:
+ break
+ yield record
+
+
+def read(handle):
+ """Read one ENZYME record.
+
+ This function is for parsing ENZYME files containing
+ exactly one record.
+
+ Arguments:
+ - handle - handle to the file.
+
+ """
+ record = __read(handle)
+ # We should have reached the end of the record by now
+ remainder = handle.read()
+ if remainder:
+ raise ValueError("More than one ENZYME record found")
+ return record
+
+
+class Record(dict):
+ """Holds information from an ExPASy ENZYME record as a Python dictionary.
+
+ Each record contains the following keys:
+
+ - ID: EC number
+ - DE: Recommended name
+ - AN: Alternative names (if any)
+ - CA: Catalytic activity
+ - CF: Cofactors (if any)
+    - PR: Pointers to the Prosite documentation entry (or entries) that
+        correspond to the enzyme (if any)
+    - DR: Pointers to the Swiss-Prot protein sequence entry (or entries)
+        that correspond to the enzyme (if any)
+ - CC: Comments
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ dict.__init__(self)
+ self["ID"] = ""
+ self["DE"] = ""
+ self["AN"] = []
+ self["CA"] = ""
+ self["CF"] = ""
+ self["CC"] = [] # one comment per line
+ self["PR"] = []
+ self["DR"] = []
+
+ def __repr__(self):
+ if self["ID"]:
+ if self["DE"]:
+ return "%s (%s, %s)" % (self.__class__.__name__, self["ID"], self["DE"])
+ else:
+ return "%s (%s)" % (self.__class__.__name__, self["ID"])
+ else:
+ return "%s ( )" % (self.__class__.__name__)
+
+ def __str__(self):
+ output = [
+ "ID: " + self["ID"],
+ "DE: " + self["DE"],
+ "AN: " + repr(self["AN"]),
+ "CA: '" + self["CA"] + "'",
+ "CF: " + self["CF"],
+ "CC: " + repr(self["CC"]),
+ "PR: " + repr(self["PR"]),
+ "DR: %d Records" % len(self["DR"]),
+ ]
+ return "\n".join(output)
+
+
+# Everything below is private
+
+
+def __read(handle):
+ record = None
+ for line in handle:
+ key, value = line[:2], line[5:].rstrip()
+ if key == "ID":
+ record = Record()
+ record["ID"] = value
+ elif key == "DE":
+ record["DE"] += value
+ elif key == "AN":
+ if record["AN"] and not record["AN"][-1].endswith("."):
+ record["AN"][-1] += " " + value
+ else:
+ record["AN"].append(value)
+ elif key == "CA":
+ record["CA"] += value
+ elif key == "DR":
+ pair_data = value.rstrip(";").split(";")
+ for pair in pair_data:
+ t1, t2 = pair.split(",")
+ row = [t1.strip(), t2.strip()]
+ record["DR"].append(row)
+ elif key == "CF":
+ if record["CF"]:
+ record["CF"] += " " + value
+ else:
+ record["CF"] = value
+ elif key == "PR":
+ assert value.startswith("PROSITE; ")
+ value = value[9:].rstrip(";")
+ record["PR"].append(value)
+ elif key == "CC":
+ if value.startswith("-!- "):
+ record["CC"].append(value[4:])
+ elif value.startswith(" ") and record["CC"]:
+ record["CC"][-1] += value[3:]
+ # copyright notice is silently skipped
+ elif key == "//":
+ if record:
+ return record
+ else: # This was the copyright notice
+ continue
+ if record:
+ raise ValueError("Unexpected end of stream")
diff --git a/code/lib/Bio/ExPASy/Prodoc.py b/code/lib/Bio/ExPASy/Prodoc.py
new file mode 100644
index 0000000..52981a0
--- /dev/null
+++ b/code/lib/Bio/ExPASy/Prodoc.py
@@ -0,0 +1,173 @@
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code to work with the prosite.doc file from Prosite.
+
+See https://www.expasy.org/prosite/
+
+Tested with:
+ - Release 15.0, July 1998
+ - Release 16.0, July 1999
+ - Release 20.22, 13 November 2007
+ - Release 20.43, 10 February 2009
+
+Functions:
+ - read Read a Prodoc file containing exactly one Prodoc entry.
+ - parse Iterates over entries in a Prodoc file.
+
+Classes:
+ - Record Holds Prodoc data.
+ - Reference Holds data from a Prodoc reference.
+
+"""
+
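+# Example (sketch, not executed): iterate over all entries in prosite.doc.
+#
+#     with open("prosite.doc") as handle:
+#         for record in parse(handle):
+#             print(record.accession, len(record.references))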
+
+def read(handle):
+ """Read in a record from a file with exactly one Prodoc record."""
+ record = __read(handle)
+ # We should have reached the end of the record by now
+ line = handle.readline()
+ if line:
+ raise ValueError("More than one Prodoc record found")
+ return record
+
+
+def parse(handle):
+ """Iterate over the records in a Prodoc file."""
+ while True:
+ record = __read(handle)
+ if not record:
+ return
+ yield record
+
+
+class Record:
+ """Holds information from a Prodoc record.
+
+ Attributes:
+ - accession Accession number of the record.
+ - prosite_refs List of tuples (prosite accession, prosite name).
+ - text Free format text.
+ - references List of reference objects.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.accession = ""
+ self.prosite_refs = []
+ self.text = ""
+ self.references = []
+
+
+class Reference:
+ """Holds information from a Prodoc citation.
+
+ Attributes:
+ - number Number of the reference. (string)
+ - authors Names of the authors.
+ - citation Describes the citation.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.number = ""
+ self.authors = ""
+ self.citation = ""
+
+
+# Below are private functions
+
+
+def __read_prosite_reference_line(record, line):
+ line = line.rstrip()
+ if line[-1] != "}":
+ raise ValueError("I don't understand the Prosite reference on line\n%s" % line)
+ acc, name = line[1:-1].split("; ")
+ record.prosite_refs.append((acc, name))
+
+
+def __read_text_line(record, line):
+ record.text += line
+ return True
+
+
+def __read_reference_start(record, line):
+ # Read the references
+ reference = Reference()
+ reference.number = line[1:3].strip()
+ if line[1] == "E":
+ # If it's an electronic reference, then the URL is on the
+ # line, instead of the author.
+ reference.citation = line[4:].strip()
+ else:
+ reference.authors = line[4:].strip()
+ record.references.append(reference)
+
+
+def __read_reference_line(record, line):
+ if not line.strip():
+ return False
+ reference = record.references[-1]
+ if line.startswith(" "):
+ if reference.authors[-1] == ",":
+ reference.authors += line[4:].rstrip()
+ else:
+ reference.citation += line[5:]
+ return True
+ raise Exception("I don't understand the reference line\n%s" % line)
+
+
+def __read_copyright_line(record, line):
+ # Skip the copyright statement
+ if line.startswith("+----"):
+ return False
+ return True
+
+
+def __read(handle):
+ # Skip blank lines between records
+ for line in handle:
+ line = line.rstrip()
+ if line and not line.startswith("//"):
+ break
+ else:
+ return None
+ record = Record()
+ # Read the accession number
+ if not line.startswith("{PDOC"):
+ raise ValueError("Line does not start with '{PDOC':\n%s" % line)
+ if line[-1] != "}":
+ raise ValueError("I don't understand accession line\n%s" % line)
+ record.accession = line[1:-1]
+ # Read the Prosite references
+ for line in handle:
+ if line.startswith("{PS"):
+ __read_prosite_reference_line(record, line)
+ else:
+ break
+ else:
+ raise ValueError("Unexpected end of stream.")
+ # Read the actual text
+ if not line.startswith("{BEGIN"):
+ raise ValueError("Line does not start with '{BEGIN':\n%s" % line)
+ read_line = __read_text_line
+ for line in handle:
+ if line.startswith("{END}"):
+ # Clean up the record and return
+ for reference in record.references:
+ reference.citation = reference.citation.rstrip()
+ reference.authors = reference.authors.rstrip()
+ return record
+ elif line[0] == "[" and line[3] == "]" and line[4] == " ":
+ __read_reference_start(record, line)
+ read_line = __read_reference_line
+ elif line.startswith("+----"):
+ read_line = __read_copyright_line
+ elif read_line:
+ if not read_line(record, line):
+ read_line = None
+ raise ValueError("Unexpected end of stream.")
diff --git a/code/lib/Bio/ExPASy/Prosite.py b/code/lib/Bio/ExPASy/Prosite.py
new file mode 100644
index 0000000..9174db8
--- /dev/null
+++ b/code/lib/Bio/ExPASy/Prosite.py
@@ -0,0 +1,314 @@
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+# Revisions Copyright 2007 by Peter Cock. All rights reserved.
+# Revisions Copyright 2009 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Parser for the prosite dat file from Prosite at ExPASy.
+
+See https://www.expasy.org/prosite/
+
+Tested with:
+ - Release 20.43, 10-Feb-2009
+ - Release 2017_03 of 15-Mar-2017.
+
+Functions:
+ - read Reads a Prosite file containing one Prosite record
+ - parse Iterates over records in a Prosite file.
+
+Classes:
+ - Record Holds Prosite data.
+
+"""
+
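+# Example (sketch, not executed): print the accession and pattern of each
+# PATTERN entry in prosite.dat.
+#
+#     with open("prosite.dat") as handle:
+#         for record in parse(handle):
+#             if record.type == "PATTERN":
+#                 print(record.accession, record.pattern)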
+
+def parse(handle):
+ """Parse Prosite records.
+
+ This function is for parsing Prosite files containing multiple
+ records.
+
+ Arguments:
+ - handle - handle to the file.
+
+ """
+ while True:
+ record = __read(handle)
+ if not record:
+ break
+ yield record
+
+
+def read(handle):
+ """Read one Prosite record.
+
+ This function is for parsing Prosite files containing
+ exactly one record.
+
+ Arguments:
+ - handle - handle to the file.
+
+ """
+ record = __read(handle)
+ # We should have reached the end of the record by now
+ remainder = handle.read()
+ if remainder:
+ raise ValueError("More than one Prosite record found")
+ return record
+
+
+class Record:
+ """Holds information from a Prosite record.
+
+ Main attributes:
+ - name ID of the record. e.g. ADH_ZINC
+ - type Type of entry. e.g. PATTERN, MATRIX, or RULE
+ - accession e.g. PS00387
+ - created Date the entry was created. (MMM-YYYY for releases
+ before January 2017, DD-MMM-YYYY since January 2017)
+ - data_update Date the 'primary' data was last updated.
+ - info_update Date data other than 'primary' data was last updated.
+ - pdoc ID of the PROSITE DOCumentation.
+ - description Free-format description.
+ - pattern The PROSITE pattern. See docs.
+ - matrix List of strings that describes a matrix entry.
+ - rules List of rule definitions (from RU lines). (strings)
+ - prorules List of prorules (from PR lines). (strings)
+
+ NUMERICAL RESULTS:
+ - nr_sp_release SwissProt release.
+ - nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
+ - nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
+ - nr_positive True positives. tuple of (hits, seqs)
+ - nr_unknown Could be positives. tuple of (hits, seqs)
+ - nr_false_pos False positives. tuple of (hits, seqs)
+ - nr_false_neg False negatives. (int)
+ - nr_partial False negatives, because they are fragments. (int)
+
+ COMMENTS:
+ - cc_taxo_range Taxonomic range. See docs for format
+ - cc_max_repeat Maximum number of repetitions in a protein
+ - cc_site Interesting site. list of tuples (pattern pos, desc.)
+ - cc_skip_flag Can this entry be ignored?
+ - cc_matrix_type
+ - cc_scaling_db
+ - cc_author
+ - cc_ft_key
+ - cc_ft_desc
+ - cc_version version number (introduced in release 19.0)
+
+    The following are all lists of tuples (swiss-prot accession, swiss-prot name).
+
+ DATA BANK REFERENCES:
+ - dr_positive
+ - dr_false_neg
+ - dr_false_pos
+ - dr_potential Potential hits, but fingerprint region not yet available.
+ - dr_unknown Could possibly belong
+ - pdb_structs List of PDB entries.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.name = ""
+ self.type = ""
+ self.accession = ""
+ self.created = ""
+ self.data_update = ""
+ self.info_update = ""
+ self.pdoc = ""
+
+ self.description = ""
+ self.pattern = ""
+ self.matrix = []
+ self.rules = []
+ self.prorules = []
+ self.postprocessing = []
+
+ self.nr_sp_release = ""
+ self.nr_sp_seqs = ""
+ self.nr_total = (None, None)
+ self.nr_positive = (None, None)
+ self.nr_unknown = (None, None)
+ self.nr_false_pos = (None, None)
+ self.nr_false_neg = None
+ self.nr_partial = None
+
+ self.cc_taxo_range = ""
+ self.cc_max_repeat = ""
+ self.cc_site = []
+ self.cc_skip_flag = ""
+
+ self.dr_positive = []
+ self.dr_false_neg = []
+ self.dr_false_pos = []
+ self.dr_potential = []
+ self.dr_unknown = []
+
+ self.pdb_structs = []
+
+
+# Everything below are private functions
+
+
+def __read(handle):
+ import re
+
+ record = None
+ for line in handle:
+ keyword, value = line[:2], line[5:].rstrip()
+ if keyword == "ID":
+ record = Record()
+ cols = value.split("; ")
+ if len(cols) != 2:
+ raise ValueError("I don't understand identification line\n%s" % line)
+ record.name = cols[0]
+ record.type = cols[1].rstrip(".") # don't want '.'
+ elif keyword == "AC":
+ record.accession = value.rstrip(";")
+ elif keyword == "DT":
+ # e.g. from January 2017,
+ # DT 01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
+ # Older files had brackets round the date descriptions and used MMM-YYYY
+ dates = value.rstrip(".").split("; ")
+ if dates[0].endswith((" (CREATED)", " CREATED")):
+ # Remove last word
+ record.created = dates[0].rsplit(" ", 1)[0]
+ else:
+ raise ValueError("I don't understand date line\n%s" % line)
+ if dates[1].endswith((" (DATA UPDATE)", " DATA UPDATE")):
+ # Remove last two words
+ record.data_update = dates[1].rsplit(" ", 2)[0]
+ else:
+ raise ValueError("I don't understand date line\n%s" % line)
+ if dates[2].endswith((" (INFO UPDATE)", " INFO UPDATE")):
+ # Remove last two words
+ record.info_update = dates[2].rsplit(" ", 2)[0]
+ else:
+ raise ValueError("I don't understand date line\n%s" % line)
+ elif keyword == "DE":
+ record.description = value
+ elif keyword == "PA":
+ record.pattern += value
+ elif keyword == "MA":
+ record.matrix.append(value)
+ elif keyword == "PP":
+ record.postprocessing.extend(value.split(";"))
+ elif keyword == "RU":
+ record.rules.append(value)
+ elif keyword == "NR":
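+            # NR lines look like this (illustrative):
+            #     NR   /RELEASE=40.7,103373;
+            #     NR   /TOTAL=434(434); /POSITIVE=330(330); /FALSE_POS=104(104);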
+ cols = value.split(";")
+ for col in cols:
+ if not col:
+ continue
+ qual, data = [word.lstrip() for word in col.split("=")]
+ if qual == "/RELEASE":
+ release, seqs = data.split(",")
+ record.nr_sp_release = release
+ record.nr_sp_seqs = int(seqs)
+ elif qual == "/FALSE_NEG":
+ record.nr_false_neg = int(data)
+ elif qual == "/PARTIAL":
+ record.nr_partial = int(data)
+ elif qual in ["/TOTAL", "/POSITIVE", "/UNKNOWN", "/FALSE_POS"]:
+ m = re.match(r"(\d+)\((\d+)\)", data)
+ if not m:
+ raise Exception(
+ "Broken data %s in comment line\n%r" % (data, line)
+ )
+ hits = tuple(map(int, m.groups()))
+ if qual == "/TOTAL":
+ record.nr_total = hits
+ elif qual == "/POSITIVE":
+ record.nr_positive = hits
+ elif qual == "/UNKNOWN":
+ record.nr_unknown = hits
+ elif qual == "/FALSE_POS":
+ record.nr_false_pos = hits
+ else:
+ raise ValueError(
+ "Unknown qual %s in comment line\n%r" % (qual, line)
+ )
+ elif keyword == "CC":
+ # Expect CC lines like this:
+ # CC /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
+ # Can (normally) split on ";" and then on "="
+ cols = value.split(";")
+ for col in cols:
+ if not col or col[:17] == "Automatic scaling":
+ # DNAJ_2 in Release 15 has a non-standard comment line:
+ # CC Automatic scaling using reversed database
+ # Throw it away. (Should I keep it?)
+ continue
+ if col.count("=") == 0:
+ # Missing qualifier! Can we recover gracefully?
+ # For example, from Bug 2403, in PS50293 have:
+ # CC /AUTHOR=K_Hofmann; N_Hulo
+ continue
+ qual, data = [word.lstrip() for word in col.split("=")]
+ if qual == "/TAXO-RANGE":
+ record.cc_taxo_range = data
+ elif qual == "/MAX-REPEAT":
+ record.cc_max_repeat = data
+ elif qual == "/SITE":
+ pos, desc = data.split(",")
+ record.cc_site.append((int(pos), desc))
+ elif qual == "/SKIP-FLAG":
+ record.cc_skip_flag = data
+ elif qual == "/MATRIX_TYPE":
+ record.cc_matrix_type = data
+ elif qual == "/SCALING_DB":
+ record.cc_scaling_db = data
+ elif qual == "/AUTHOR":
+ record.cc_author = data
+ elif qual == "/FT_KEY":
+ record.cc_ft_key = data
+ elif qual == "/FT_DESC":
+ record.cc_ft_desc = data
+ elif qual == "/VERSION":
+ record.cc_version = data
+ else:
+ raise ValueError(
+ "Unknown qual %s in comment line\n%r" % (qual, line)
+ )
+ elif keyword == "DR":
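+        # DR lines look like this (illustrative):
+        #     DR   P20905, 5HT1A_DROME, T; P28285, 5HT1B_DROME, T;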
+ refs = value.split(";")
+ for ref in refs:
+ if not ref:
+ continue
+ acc, name, type = [word.strip() for word in ref.split(",")]
+ if type == "T":
+ record.dr_positive.append((acc, name))
+ elif type == "F":
+ record.dr_false_pos.append((acc, name))
+ elif type == "N":
+ record.dr_false_neg.append((acc, name))
+ elif type == "P":
+ record.dr_potential.append((acc, name))
+ elif type == "?":
+ record.dr_unknown.append((acc, name))
+ else:
+ raise ValueError("I don't understand type flag %s" % type)
+ elif keyword == "3D":
+ cols = value.split()
+ for id in cols:
+ record.pdb_structs.append(id.rstrip(";"))
+ elif keyword == "PR":
+ rules = value.split(";")
+ record.prorules.extend(rules)
+ elif keyword == "DO":
+ record.pdoc = value.rstrip(";")
+ elif keyword == "//":
+ if not record:
+ # Then this was the copyright statement
+ continue
+ break
+ else:
+ raise ValueError("Unknown keyword %s found" % keyword)
+ else:
+ return
+ if not record:
+ raise ValueError("Unexpected end of stream.")
+ return record
diff --git a/code/lib/Bio/ExPASy/ScanProsite.py b/code/lib/Bio/ExPASy/ScanProsite.py
new file mode 100644
index 0000000..3403703
--- /dev/null
+++ b/code/lib/Bio/ExPASy/ScanProsite.py
@@ -0,0 +1,145 @@
+# Copyright 2009 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code for calling and parsing ScanProsite from ExPASy."""
+
+# Standard library imports for building the query URL and parsing the XML reply
+from urllib.request import urlopen
+from urllib.parse import urlencode
+
+from xml.sax import handler
+from xml.sax.expatreader import ExpatParser
+
+
+class Record(list):
+ """Represents search results returned by ScanProsite.
+
+ This record is a list containing the search results returned by
+ ScanProsite. The record also contains the data members n_match,
+ n_seq, capped, and warning.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.n_match = None
+ self.n_seq = None
+ self.capped = None
+ self.warning = None
+
+
+def scan(seq="", mirror="https://www.expasy.org", output="xml", **keywords):
+ """Execute a ScanProsite search.
+
+ Arguments:
+ - mirror: The ScanProsite mirror to be used
+ (default: https://www.expasy.org).
+ - seq: The query sequence, or UniProtKB (Swiss-Prot,
+ TrEMBL) accession
+ - output: Format of the search results
+ (default: xml)
+
+ Further search parameters can be passed as keywords; see the
+ documentation for programmatic access to ScanProsite at
+ https://www.expasy.org/tools/scanprosite/ScanPrositeREST.html
+ for a description of such parameters.
+
+ This function returns a handle to the search results returned by
+ ScanProsite. Search results in the XML format can be parsed into a
+ Python object, by using the Bio.ExPASy.ScanProsite.read function.
+
+ """
+ parameters = {"seq": seq, "output": output}
+ for key, value in keywords.items():
+ if value is not None:
+ parameters[key] = value
+ command = urlencode(parameters)
+ url = "%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, command)
+ handle = urlopen(url)
+ return handle
+
+
+def read(handle):
+ """Parse search results returned by ScanProsite into a Python object."""
+ content_handler = ContentHandler()
+ saxparser = Parser()
+ saxparser.setContentHandler(content_handler)
+ saxparser.parse(handle)
+ record = content_handler.record
+ return record
+
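+# Example (not run): a minimal round trip against the live ExPASy server
+# using the scan() and read() functions above; the query sequence is
+# illustrative only.
+#
+#     handle = scan(seq="MEHKEVVLLLLLFLKSGQG")
+#     record = read(handle)
+#     print(record.n_match)
+#     for match in record:
+#         print(match["signature_ac"], match["start"], match["stop"])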
+
+# The classes below are considered private
+
+
+class Parser(ExpatParser):
+ """Process the result from a ScanProsite search (PRIVATE)."""
+
+ def __init__(self):
+ """Initialize the class."""
+ ExpatParser.__init__(self)
+ self.firsttime = True
+
+ def feed(self, data, isFinal=0):
+ """Raise an Error if plain text is received in the data.
+
+ This is to show the Error messages returned by ScanProsite.
+ """
+        # Error messages returned by the ScanProsite server are formatted
+        # as plain text instead of an XML document. To catch such error
+ # messages, we override the feed method of the Expat parser.
+ # The error message is (hopefully) contained in the data that was just
+ # fed to the parser.
+ if self.firsttime:
+            if data[:5].decode("utf-8") != "<?xml":
+                raise ValueError(data)
+        self.firsttime = False
+        return ExpatParser.feed(self, data, isFinal)
+
+
+class ContentHandler(handler.ContentHandler):
+    """Process and fill in the records, results of the search (PRIVATE)."""
+
+    integers = ("start", "stop")
+    strings = (
+        "sequence_ac",
+        "sequence_id",
+        "sequence_db",
+        "signature_ac",
+        "level",
+        "level_tag",
+    )
+
+    def __init__(self):
+        """Initialize the class."""
+        self.element = []
+
+    def startElement(self, name, attrs):
+        """Define the beginning of a record and store the search record."""
+        self.element.append(name)
+        self.content = ""
+        if self.element == ["matchset"]:
+            self.record = Record()
+            self.record.n_match = int(attrs["n_match"])
+            self.record.n_seq = int(attrs["n_seq"])
+        elif self.element == ["matchset", "match"]:
+            match = {}
+            self.record.append(match)
+
+    def endElement(self, name):
+        """Define the end of the search record."""
+        assert name == self.element.pop()
+        if self.element == ["matchset", "match"]:
+            match = self.record[-1]
+            if name in ContentHandler.integers:
+                match[name] = int(self.content)
+            elif name in ContentHandler.strings:
+                match[name] = self.content
+            else:
+                # Unknown element type; store it as a string
+                match[name] = self.content
+
+    def characters(self, content):
+        """Store the record content."""
+        self.content += content
diff --git a/code/lib/Bio/ExPASy/__init__.py b/code/lib/Bio/ExPASy/__init__.py
new file mode 100644
--- /dev/null
+++ b/code/lib/Bio/ExPASy/__init__.py
+# Copyright 2000 by Jeffrey Chang.  All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Code to access resources at ExPASy over the WWW.
+
+See https://www.expasy.org/
+
+Functions:
+ - get_prodoc_entry  Interface to the get-prodoc-entry CGI script.
+ - get_prosite_entry Interface to the get-prosite-entry CGI script.
+ - get_prosite_raw   Interface to get-prosite-raw CGI script.
+ - get_sprot_raw     Interface to get-sprot-raw CGI script.
+
+"""
+
+import io
+
+from urllib.request import urlopen
+from urllib.error import HTTPError
+
+
+def get_prodoc_entry(
+    id, cgi="https://prosite.expasy.org/cgi-bin/prosite/get-prodoc-entry"
+):
+    """Get a text handle to a PRODOC entry at ExPASy in HTML format.
+
+    >>> from Bio import ExPASy
+ >>> import os
+ >>> with ExPASy.get_prodoc_entry('PDOC00001') as in_handle:
+ ... html = in_handle.read()
+ ...
+ >>> with open("myprodocrecord.html", "w") as out_handle:
+ ... length = out_handle.write(html)
+ ...
+ >>> os.remove("myprodocrecord.html") # tidy up
+
+ For a non-existing key XXX, ExPASy returns an HTML-formatted page
+ containing this text: 'There is currently no PROSITE entry for'
+ """
+ return _open("%s?%s" % (cgi, id))
+
+
+def get_prosite_entry(
+ id, cgi="https://prosite.expasy.org/cgi-bin/prosite/get-prosite-entry"
+):
+ """Get a text handle to a PROSITE entry at ExPASy in HTML format.
+
+ >>> from Bio import ExPASy
+ >>> import os
+ >>> with ExPASy.get_prosite_entry('PS00001') as in_handle:
+ ... html = in_handle.read()
+ ...
+ >>> with open("myprositerecord.html", "w") as out_handle:
+ ... length = out_handle.write(html)
+ ...
+ >>> os.remove("myprositerecord.html") # tidy up
+
+ For a non-existing key XXX, ExPASy returns an HTML-formatted page
+ containing this text: 'There is currently no PROSITE entry for'
+ """
+ return _open("%s?%s" % (cgi, id))
+
+
+def get_prosite_raw(id, cgi=None):
+ """Get a text handle to a raw PROSITE or PRODOC record at ExPASy.
+
+ The cgi argument is deprecated due to changes in the ExPASy
+ website.
+
+ >>> from Bio import ExPASy
+ >>> from Bio.ExPASy import Prosite
+ >>> with ExPASy.get_prosite_raw('PS00001') as handle:
+ ... record = Prosite.read(handle)
+ ...
+ >>> print(record.accession)
+ PS00001
+
+ This function raises a ValueError if the identifier does not exist:
+
+ >>> handle = ExPASy.get_prosite_raw("DOES_NOT_EXIST")
+ Traceback (most recent call last):
+ ...
+ ValueError: Failed to find entry 'DOES_NOT_EXIST' on ExPASy
+
+ """
+ handle = _open("https://prosite.expasy.org/%s.txt" % id)
+ if handle.url == "https://www.expasy.org/":
+ raise ValueError("Failed to find entry '%s' on ExPASy" % id) from None
+ return handle
+
+
+def get_sprot_raw(id):
+ """Get a text handle to a raw SwissProt entry at ExPASy.
+
+ For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
+ (as per the https://www.expasy.org/expasy_urls.html documentation).
+
+ >>> from Bio import ExPASy
+ >>> from Bio import SwissProt
+ >>> with ExPASy.get_sprot_raw("O23729") as handle:
+ ... record = SwissProt.read(handle)
+ ...
+ >>> print(record.entry_name)
+ CHS3_BROFI
+
+ This function raises a ValueError if the identifier does not exist:
+
+ >>> ExPASy.get_sprot_raw("DOES_NOT_EXIST")
+ Traceback (most recent call last):
+ ...
+ ValueError: Failed to find SwissProt entry 'DOES_NOT_EXIST'
+
+ """
+ try:
+ handle = _open("http://www.uniprot.org/uniprot/%s.txt" % id)
+ except HTTPError as exception:
+ if exception.code == 404:
+ raise ValueError("Failed to find SwissProt entry '%s'" % id) from None
+ else:
+ raise
+ return handle
+
+
+def _open(url):
+ """Open URL and convert to text assuming UTF-8 encoding (PRIVATE)."""
+ handle = urlopen(url)
+ text_handle = io.TextIOWrapper(handle, encoding="UTF-8")
+ text_handle.url = handle.url
+ return text_handle
diff --git a/code/lib/Bio/ExPASy/__pycache__/Enzyme.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/Enzyme.cpython-37.pyc
new file mode 100644
index 0000000..095f873
Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/Enzyme.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/__pycache__/Prodoc.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/Prodoc.cpython-37.pyc
new file mode 100644
index 0000000..90a27a5
Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/Prodoc.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/__pycache__/Prosite.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/Prosite.cpython-37.pyc
new file mode 100644
index 0000000..4390c2b
Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/Prosite.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/__pycache__/ScanProsite.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/ScanProsite.cpython-37.pyc
new file mode 100644
index 0000000..3b41129
Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/ScanProsite.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..1e75ef7
Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/__pycache__/cellosaurus.cpython-37.pyc b/code/lib/Bio/ExPASy/__pycache__/cellosaurus.cpython-37.pyc
new file mode 100644
index 0000000..6930c02
Binary files /dev/null and b/code/lib/Bio/ExPASy/__pycache__/cellosaurus.cpython-37.pyc differ
diff --git a/code/lib/Bio/ExPASy/cellosaurus.py b/code/lib/Bio/ExPASy/cellosaurus.py
new file mode 100644
index 0000000..8794cd3
--- /dev/null
+++ b/code/lib/Bio/ExPASy/cellosaurus.py
@@ -0,0 +1,188 @@
+# Copyright 2016 by Stephen Marshall. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Parser for the cellosaurus.txt file from ExPASy.
+
+See https://web.expasy.org/cellosaurus/
+
+Tested with the release of Version 18 (July 2016).
+
+Functions:
+ - read Reads a file containing one cell line entry
+ - parse Reads a file containing multiple cell line entries
+
+Classes:
+ - Record Holds cell line data.
+
+Examples
+--------
+You need to download the Cellosaurus database for these examples to
+run, e.g. from ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt
+
+ >> from Bio.ExPASy import cellosaurus
+ >> with open('cellosaurus.txt') as handle:
+ ... records = cellosaurus.parse(handle)
+ ... for record in records:
+ ... if 'Homo sapiens' in record['OX'][0]:
+ ... print(record['ID'])
+ ...
+ #15310-LN
+ #W7079
+ (L)PC6
+ 00136
+ ...
+
+"""
+
+
+def parse(handle):
+ """Parse cell line records.
+
+ This function is for parsing cell line files containing multiple
+ records.
+
+ Arguments:
+ - handle - handle to the file.
+
+ """
+ while True:
+ record = __read(handle)
+ if not record:
+ break
+ yield record
+
+
+def read(handle):
+ """Read one cell line record.
+
+ This function is for parsing cell line files containing
+ exactly one record.
+
+ Arguments:
+ - handle - handle to the file.
+
+ """
+ record = __read(handle)
+ # We should have reached the end of the record by now
+ remainder = handle.read()
+ if remainder:
+ raise ValueError("More than one cell line record found")
+ return record
+
+
+class Record(dict):
+ """Holds information from an ExPASy Cellosaurus record as a Python dictionary.
+
+ Each record contains the following keys:
+
+ --------- --------------------------- ----------------------
+ Line code Content Occurrence in an entry
+ --------- --------------------------- ----------------------
+ ID Identifier (cell line name) Once; starts an entry
+ AC Accession (CVCL_xxxx) Once
+ AS Secondary accession number(s) Optional; once
+ SY Synonyms Optional; once
+ DR Cross-references Optional; once or more
+    RX         Reference identifiers             Optional; once or more
+ WW Web pages Optional; once or more
+ CC Comments Optional; once or more
+ ST STR profile data Optional; once or more
+ DI Diseases Optional; once or more
+ OX Species of origin Once or more
+ HI Hierarchy Optional; once or more
+ OI Originate from same individual Optional; once or more
+ SX Sex (gender) of cell Optional; once
+ CA Category Once
+ // Terminator Once; ends an entry
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ dict.__init__(self)
+ self["ID"] = ""
+ self["AC"] = ""
+ self["AS"] = ""
+ self["SY"] = ""
+ self["DR"] = []
+ self["RX"] = []
+ self["WW"] = []
+ self["CC"] = []
+ self["ST"] = []
+ self["DI"] = []
+ self["OX"] = []
+ self["HI"] = []
+ self["OI"] = []
+ self["SX"] = ""
+ self["CA"] = ""
+
+ def __repr__(self):
+ if self["ID"]:
+ if self["AC"]:
+ return "%s (%s, %s)" % (self.__class__.__name__, self["ID"], self["AC"])
+ else:
+ return "%s (%s)" % (self.__class__.__name__, self["ID"])
+ else:
+ return "%s ( )" % (self.__class__.__name__)
+
+ def __str__(self):
+ output = "ID: " + self["ID"]
+ output += " AC: " + self["AC"]
+ output += " AS: " + self["AS"]
+ output += " SY: " + self["SY"]
+ output += " DR: " + repr(self["DR"])
+ output += " RX: " + repr(self["RX"])
+ output += " WW: " + repr(self["WW"])
+ output += " CC: " + repr(self["CC"])
+ output += " ST: " + repr(self["ST"])
+ output += " DI: " + repr(self["DI"])
+ output += " OX: " + repr(self["OX"])
+ output += " HI: " + repr(self["HI"])
+ output += " OI: " + repr(self["OI"])
+ output += " SX: " + self["SX"]
+ output += " CA: " + self["CA"]
+ return output
+
+
+# Everything below is private
+
+
+def __read(handle):
+ record = None
+
+ for line in handle:
+ key, value = line[:2], line[5:].rstrip()
+ if key == "ID":
+ record = Record()
+ record["ID"] = value
+ elif key in ["AC", "AS", "SY", "SX", "CA"]:
+ record[key] += value
+        elif key in ["RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI"]:
+ record[key].append(value)
+ elif key == "DR":
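+            # e.g. "DR   CLO; CLO_0009043" -> ("CLO", "CLO_0009043")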
+ k, v = value.split(";")
+ record["DR"].append((k.strip(), v.strip()))
+ elif key == "//":
+ if record:
+ return record
+ else:
+ continue
+ if record:
+ raise ValueError("Unexpected end of stream")
diff --git a/code/lib/Bio/File.py b/code/lib/Bio/File.py
new file mode 100644
index 0000000..5edec51
--- /dev/null
+++ b/code/lib/Bio/File.py
@@ -0,0 +1,609 @@
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# Copyright 2009-2018 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Code for more fancy file handles.
+
+Bio.File defines private classes used in Bio.SeqIO and Bio.SearchIO for
+indexing files. These are not intended for direct use.
+"""
+
+import os
+import contextlib
+import itertools
+import collections.abc
+
+from abc import ABC, abstractmethod
+
+try:
+ import sqlite3
+except ImportError:
+ # May be missing if Python was compiled from source without its dependencies
+ sqlite3 = None
+
+
+@contextlib.contextmanager
+def as_handle(handleish, mode="r", **kwargs):
+ r"""Context manager to ensure we are using a handle.
+
+ Context manager for arguments that can be passed to SeqIO and AlignIO read, write,
+ and parse methods: either file objects or path-like objects (strings, pathlib.Path
+ instances, or more generally, anything that can be handled by the builtin 'open'
+ function).
+
+ When given a path-like object, returns an open file handle to that path, with provided
+ mode, which will be closed when the manager exits.
+
+ All other inputs are returned, and are *not* closed.
+
+ Arguments:
+ - handleish - Either a file handle or path-like object (anything which can be
+ passed to the builtin 'open' function, such as str, bytes,
+ pathlib.Path, and os.DirEntry objects)
+ - mode - Mode to open handleish (used only if handleish is a string)
+ - kwargs - Further arguments to pass to open(...)
+
+ Examples
+ --------
+ >>> from Bio import File
+ >>> import os
+ >>> with File.as_handle('seqs.fasta', 'w') as fp:
+ ... fp.write('>test\nACGT')
+ ...
+ 10
+ >>> fp.closed
+ True
+
+ >>> handle = open('seqs.fasta', 'w')
+ >>> with File.as_handle(handle) as fp:
+ ... fp.write('>test\nACGT')
+ ...
+ 10
+ >>> fp.closed
+ False
+ >>> fp.close()
+ >>> os.remove("seqs.fasta") # tidy up
+
+ """
+ try:
+ with open(handleish, mode, **kwargs) as fp:
+ yield fp
+ except TypeError:
+ yield handleish
+
+
+def _open_for_random_access(filename):
+ """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).
+
+ This functionality is used by the Bio.SeqIO and Bio.SearchIO index
+ and index_db functions.
+
+ If the file is gzipped but not BGZF, a specific ValueError is raised.
+ """
+ handle = open(filename, "rb")
+ magic = handle.read(2)
+ handle.seek(0)
+
+ if magic == b"\x1f\x8b":
+ # This is a gzipped file, but is it BGZF?
+ from . import bgzf
+
+ try:
+ # If it is BGZF, we support that
+ return bgzf.BgzfReader(mode="rb", fileobj=handle)
+ except ValueError as e:
+ assert "BGZF" in str(e)
+ # Not a BGZF file after all,
+ handle.close()
+ raise ValueError(
+ "Gzipped files are not suitable for indexing, "
+ "please use BGZF (blocked gzip format) instead."
+ ) from None
+
+ return handle
+
+
+# The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
+# for indexing
+
+
+class _IndexedSeqFileProxy(ABC):
+ """Abstract base class for file format specific random access (PRIVATE).
+
+    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
+ objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+ Subclasses for each file format should define '__iter__', 'get'
+ and optionally 'get_raw' methods.
+ """
+
+ @abstractmethod
+ def __iter__(self):
+ """Return (identifier, offset, length in bytes) tuples.
+
+ The length can be zero where it is not implemented or not
+ possible for a particular file format.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def get(self, offset):
+ """Return parsed object for this entry."""
+ # Most file formats with self contained records can be handled by
+ # parsing StringIO(self.get_raw(offset).decode())
+ raise NotImplementedError
+
+ def get_raw(self, offset):
+ """Return the raw record from the file as a bytes string (if implemented).
+
+ If the key is not found, a KeyError exception is raised.
+
+ This may not have been implemented for all file formats.
+ """
+ # Should be done by each sub-class (if possible)
+ raise NotImplementedError("Not available for this file format.")
+
+
+class _IndexedSeqFileDict(collections.abc.Mapping):
+ """Read only dictionary interface to a sequential record file.
+
+ This code is used in both Bio.SeqIO for indexing as SeqRecord
+ objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+ Keeps the keys and associated file offsets in memory, reads the file
+    to access entries as objects, parsing them on demand. This approach
+ is memory limited, but will work even with millions of records.
+
+ Note duplicate keys are not allowed. If this happens, a ValueError
+ exception is raised.
+
+ As used in Bio.SeqIO, by default the SeqRecord's id string is used
+ as the dictionary key. In Bio.SearchIO, the query's id string is
+ used. This can be changed by supplying an optional key_function,
+ a callback function which will be given the record id and must
+ return the desired key. For example, this allows you to parse
+ NCBI style FASTA identifiers, and extract the GI number to use
+ as the dictionary key.
+
+ Note that this dictionary is essentially read only. You cannot
+ add or change values, pop values, nor clear the dictionary.
+ """
+
+ def __init__(self, random_access_proxy, key_function, repr, obj_repr):
+ """Initialize the class."""
+ # Use key_function=None for default value
+ self._proxy = random_access_proxy
+ self._key_function = key_function
+ self._repr = repr
+ self._obj_repr = obj_repr
+ if key_function:
+ offset_iter = ((key_function(k), o, l) for (k, o, l) in random_access_proxy)
+ else:
+ offset_iter = random_access_proxy
+ offsets = {}
+ for key, offset, length in offset_iter:
+ # Note - we don't store the length because I want to minimise the
+ # memory requirements. With the SQLite backend the length is kept
+ # and is used to speed up the get_raw method (by about 3 times).
+ # The length should be provided by all the current backends except
+ # SFF where there is an existing Roche index we can reuse (very fast
+ # but lacks the record lengths)
+ # assert length or format in ["sff", "sff-trim"], \
+ # "%s at offset %i given length %r (%s format %s)" \
+ # % (key, offset, length, filename, format)
+ if key in offsets:
+ self._proxy._handle.close()
+ raise ValueError("Duplicate key '%s'" % key)
+ else:
+ offsets[key] = offset
+ self._offsets = offsets
+
+ def __repr__(self):
+ """Return a string representation of the File object."""
+ return self._repr
+
+ def __str__(self):
+ """Create a string representation of the File object."""
+ # TODO - How best to handle the __str__ for SeqIO and SearchIO?
+ if self:
+ return "{%r : %s(...), ...}" % (list(self.keys())[0], self._obj_repr)
+ else:
+ return "{}"
+
+ def __len__(self):
+ """Return the number of records."""
+ return len(self._offsets)
+
+ def __iter__(self):
+ """Iterate over the keys."""
+ return iter(self._offsets)
+
+ def __getitem__(self, key):
+ """Return record for the specified key."""
+ # Pass the offset to the proxy
+ record = self._proxy.get(self._offsets[key])
+ if self._key_function:
+ key2 = self._key_function(record.id)
+ else:
+ key2 = record.id
+ if key != key2:
+ raise ValueError("Key did not match (%s vs %s)" % (key, key2))
+ return record
+
+ def get_raw(self, key):
+ """Return the raw record from the file as a bytes string.
+
+ If the key is not found, a KeyError exception is raised.
+ """
+ # Pass the offset to the proxy
+ return self._proxy.get_raw(self._offsets[key])
+
+ def close(self):
+ """Close the file handle being used to read the data.
+
+ Once called, further use of the index won't work. The sole purpose
+ of this method is to allow explicit handle closure - for example
+ if you wish to delete the file, on Windows you must first close
+ all open handles to that file.
+ """
+ self._proxy._handle.close()
+
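+# Example of the key_function hook described in _IndexedSeqFileDict (not
+# run): extract the GI number from NCBI-style FASTA identifiers such as
+# "gi|123456|emb|X56734.1|", so records can be fetched by GI number via
+# Bio.SeqIO.index:
+#
+#     def get_gi(identifier):
+#         return identifier.split("|")[1]
+#
+#     indexed = SeqIO.index("ncbi_style.fasta", "fasta", key_function=get_gi)
+#     record = indexed["123456"]
+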
+
+class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
+ """Read only dictionary interface to many sequential record files.
+
+ This code is used in both Bio.SeqIO for indexing as SeqRecord
+ objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+ Keeps the keys, file-numbers and offsets in an SQLite database. To access
+ a record by key, reads from the offset in the appropriate file and then
+ parses the record into an object.
+
+ There are OS limits on the number of files that can be open at once,
+    so a pool of open handles is kept. If a record is required from a file
+    that is not currently open, one of the open handles is closed first.
+ """
+
+ def __init__(
+ self,
+ index_filename,
+ filenames,
+ proxy_factory,
+ fmt,
+ key_function,
+ repr,
+ max_open=10,
+ ):
+ """Initialize the class."""
+ # TODO? - Don't keep filename list in memory (just in DB)?
+ # Should save a chunk of memory if dealing with 1000s of files.
+ # Furthermore could compare a generator to the DB on reloading
+ # (no need to turn it into a list)
+
+ if sqlite3 is None:
+ # Python was compiled without sqlite3 support
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError(
+ "Python was compiled without the sqlite3 module"
+ )
+ if filenames is not None:
+ filenames = list(filenames) # In case it was a generator
+
+ # Cache the arguments as private variables
+ self._index_filename = index_filename
+ self._filenames = filenames
+ self._format = fmt
+ self._key_function = key_function
+ self._proxy_factory = proxy_factory
+ self._repr = repr
+ self._max_open = max_open
+ self._proxies = {}
+
+ # Note if using SQLite :memory: trick index filename, this will
+ # give $PWD as the relative path (which is fine).
+ self._relative_path = os.path.abspath(os.path.dirname(index_filename))
+
+ if os.path.isfile(index_filename):
+ self._load_index()
+ else:
+ self._build_index()
+
+ def _load_index(self):
+ """Call from __init__ to re-use an existing index (PRIVATE)."""
+ index_filename = self._index_filename
+ relative_path = self._relative_path
+ filenames = self._filenames
+ fmt = self._format
+ proxy_factory = self._proxy_factory
+
+ con = sqlite3.dbapi2.connect(index_filename, check_same_thread=False)
+ self._con = con
+ # Check the count...
+ try:
+ (count,) = con.execute(
+ "SELECT value FROM meta_data WHERE key=?;", ("count",)
+ ).fetchone()
+ self._length = int(count)
+ if self._length == -1:
+ con.close()
+ raise ValueError("Unfinished/partial database") from None
+
+ # use MAX(_ROWID_) to obtain the number of sequences in the database
+ # using COUNT(key) is quite slow in SQLITE
+ # (https://stackoverflow.com/questions/8988915/sqlite-count-slow-on-big-tables)
+ (count,) = con.execute("SELECT MAX(_ROWID_) FROM offset_data;").fetchone()
+ if self._length != int(count):
+ con.close()
+ raise ValueError(
+ "Corrupt database? %i entries not %i" % (int(count), self._length)
+ ) from None
+ (self._format,) = con.execute(
+ "SELECT value FROM meta_data WHERE key=?;", ("format",)
+ ).fetchone()
+ if fmt and fmt != self._format:
+ con.close()
+ raise ValueError(
+ "Index file says format %s, not %s" % (self._format, fmt)
+ ) from None
+ try:
+ (filenames_relative_to_index,) = con.execute(
+ "SELECT value FROM meta_data WHERE key=?;",
+ ("filenames_relative_to_index",),
+ ).fetchone()
+ filenames_relative_to_index = (
+ filenames_relative_to_index.upper() == "TRUE"
+ )
+ except TypeError:
+ # Original behaviour, assume if meta_data missing
+ filenames_relative_to_index = False
+ self._filenames = [
+ row[0]
+ for row in con.execute(
+ "SELECT name FROM file_data ORDER BY file_number;"
+ ).fetchall()
+ ]
+ if filenames_relative_to_index:
+ # Not implicitly relative to $PWD, explicitly relative to index file
+ relative_path = os.path.abspath(os.path.dirname(index_filename))
+ tmp = []
+ for f in self._filenames:
+ if os.path.isabs(f):
+ tmp.append(f)
+ else:
+ # Would be stored with Unix / path separator, so convert
+ # it to the local OS path separator here:
+ tmp.append(
+ os.path.join(relative_path, f.replace("/", os.path.sep))
+ )
+ self._filenames = tmp
+ del tmp
+ if filenames and len(filenames) != len(self._filenames):
+ con.close()
+ raise ValueError(
+ "Index file says %i files, not %i"
+ % (len(self._filenames), len(filenames))
+ ) from None
+ if filenames and filenames != self._filenames:
+ for old, new in zip(self._filenames, filenames):
+ # Want exact match (after making relative to the index above)
+ if os.path.abspath(old) != os.path.abspath(new):
+ con.close()
+ if filenames_relative_to_index:
+ raise ValueError(
+ "Index file has different filenames, e.g. %r != %r"
+ % (os.path.abspath(old), os.path.abspath(new))
+ ) from None
+ else:
+ raise ValueError(
+ "Index file has different filenames "
+ "[This is an old index where any relative paths "
+ "were relative to the original working directory]. "
+ "e.g. %r != %r"
+ % (os.path.abspath(old), os.path.abspath(new))
+ ) from None
+ # Filenames are equal (after imposing abspath)
+ except sqlite3.OperationalError as err:
+ con.close()
+ raise ValueError("Not a Biopython index database? %s" % err) from None
+ # Now we have the format (from the DB if not given to us),
+ if not proxy_factory(self._format):
+ con.close()
+ raise ValueError("Unsupported format '%s'" % self._format)
+
+ def _build_index(self):
+ """Call from __init__ to create a new index (PRIVATE)."""
+ index_filename = self._index_filename
+ relative_path = self._relative_path
+ filenames = self._filenames
+ fmt = self._format
+ key_function = self._key_function
+ proxy_factory = self._proxy_factory
+ max_open = self._max_open
+ random_access_proxies = self._proxies
+
+ if not fmt or not filenames:
+ raise ValueError(
+ "Filenames to index and format required to build %r" % index_filename
+ )
+ if not proxy_factory(fmt):
+ raise ValueError("Unsupported format '%s'" % fmt)
+ # Create the index
+ con = sqlite3.dbapi2.connect(index_filename)
+ self._con = con
+ # print("Creating index")
+ # Sqlite PRAGMA settings for speed
+ con.execute("PRAGMA synchronous=OFF")
+ con.execute("PRAGMA locking_mode=EXCLUSIVE")
+ # Don't index the key column until the end (faster)
+ # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
+ # "offset INTEGER);")
+ con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
+ con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("count", -1))
+ con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", ("format", fmt))
+ con.execute(
+ "INSERT INTO meta_data (key, value) VALUES (?,?);",
+ ("filenames_relative_to_index", "True"),
+ )
+ # TODO - Record the file size and modified date?
+ con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
+ con.execute(
+ "CREATE TABLE offset_data (key TEXT, "
+ "file_number INTEGER, offset INTEGER, length INTEGER);"
+ )
+ count = 0
+ for i, filename in enumerate(filenames):
+ # Default to storing as an absolute path,
+ f = os.path.abspath(filename)
+ if not os.path.isabs(filename) and not os.path.isabs(index_filename):
+ # Since user gave BOTH filename & index as relative paths,
+ # we will store this relative to the index file even though
+                # it may now start with ../ (meaning up a level)
+ # Note for cross platform use (e.g. shared drive over SAMBA),
+ # convert any Windows slash into Unix style for rel paths.
+ f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
+ elif (os.path.dirname(os.path.abspath(filename)) + os.path.sep).startswith(
+ relative_path + os.path.sep
+ ):
+ # Since sequence file is in same directory or sub directory,
+ # might as well make this into a relative path:
+ f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
+ assert not f.startswith("../"), f
+ # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f))
+ con.execute(
+ "INSERT INTO file_data (file_number, name) VALUES (?,?);", (i, f)
+ )
+ random_access_proxy = proxy_factory(fmt, filename)
+ if key_function:
+ offset_iter = (
+ (key_function(k), i, o, l) for (k, o, l) in random_access_proxy
+ )
+ else:
+ offset_iter = ((k, i, o, l) for (k, o, l) in random_access_proxy)
+ while True:
+ batch = list(itertools.islice(offset_iter, 100))
+ if not batch:
+ break
+ # print("Inserting batch of %i offsets, %s ... %s"
+ # % (len(batch), batch[0][0], batch[-1][0]))
+ con.executemany(
+ "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
+ batch,
+ )
+ con.commit()
+ count += len(batch)
+ if len(random_access_proxies) < max_open:
+ random_access_proxies[i] = random_access_proxy
+ else:
+ random_access_proxy._handle.close()
+ self._length = count
+ # print("About to index %i entries" % count)
+ try:
+ con.execute(
+ "CREATE UNIQUE INDEX IF NOT EXISTS key_index ON offset_data(key);"
+ )
+ except sqlite3.IntegrityError as err:
+ self._proxies = random_access_proxies
+ self.close()
+ con.close()
+ raise ValueError("Duplicate key? %s" % err) from None
+ con.execute("PRAGMA locking_mode=NORMAL")
+ con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", (count, "count"))
+ con.commit()
+ # print("Index created")
+
+ def __repr__(self):
+ return self._repr
+
+ def __contains__(self, key):
+ return bool(
+ self._con.execute(
+ "SELECT key FROM offset_data WHERE key=?;", (key,)
+ ).fetchone()
+ )
+
+ def __len__(self):
+ """Return the number of records indexed."""
+ return self._length
+ # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]
+
+ def __iter__(self):
+ """Iterate over the keys."""
+ for row in self._con.execute(
+ "SELECT key FROM offset_data ORDER BY file_number, offset;"
+ ):
+ yield str(row[0])
+
+ def __getitem__(self, key):
+ """Return record for the specified key."""
+ # Pass the offset to the proxy
+ row = self._con.execute(
+ "SELECT file_number, offset FROM offset_data WHERE key=?;", (key,)
+ ).fetchone()
+ if not row:
+ raise KeyError
+ file_number, offset = row
+ proxies = self._proxies
+ if file_number in proxies:
+ record = proxies[file_number].get(offset)
+ else:
+ if len(proxies) >= self._max_open:
+ # Close an old handle...
+ proxies.popitem()[1]._handle.close()
+ # Open a new handle...
+ proxy = self._proxy_factory(self._format, self._filenames[file_number])
+ record = proxy.get(offset)
+ proxies[file_number] = proxy
+ if self._key_function:
+ key2 = self._key_function(record.id)
+ else:
+ key2 = record.id
+ if key != key2:
+ raise ValueError("Key did not match (%s vs %s)" % (key, key2))
+ return record
+
+ def get_raw(self, key):
+ """Return the raw record from the file as a bytes string.
+
+ If the key is not found, a KeyError exception is raised.
+ """
+ # Pass the offset to the proxy
+ row = self._con.execute(
+ "SELECT file_number, offset, length FROM offset_data WHERE key=?;", (key,)
+ ).fetchone()
+ if not row:
+ raise KeyError
+ file_number, offset, length = row
+ proxies = self._proxies
+ if file_number in proxies:
+ if length:
+ # Shortcut if we have the length
+ h = proxies[file_number]._handle
+ h.seek(offset)
+ return h.read(length)
+ else:
+ return proxies[file_number].get_raw(offset)
+ else:
+ # This code is duplicated from __getitem__ to avoid a function call
+ if len(proxies) >= self._max_open:
+ # Close an old handle...
+ proxies.popitem()[1]._handle.close()
+ # Open a new handle...
+ proxy = self._proxy_factory(self._format, self._filenames[file_number])
+ proxies[file_number] = proxy
+ if length:
+ # Shortcut if we have the length
+ h = proxy._handle
+ h.seek(offset)
+ return h.read(length)
+ else:
+ return proxy.get_raw(offset)
+
+ def close(self):
+ """Close any open file handles."""
+ proxies = self._proxies
+ while proxies:
+ proxies.popitem()[1]._handle.close()
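+
+
+# Example (not run): _SQLiteManySeqFilesDict is the backend behind
+# Bio.SeqIO.index_db, e.g.
+#
+#     indexed = SeqIO.index_db("seqs.idx", ["a.gb", "b.gb"], "genbank")
+#     record = indexed["AB012345"]  # hypothetical accession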
diff --git a/code/lib/Bio/GenBank/Record.py b/code/lib/Bio/GenBank/Record.py
new file mode 100644
index 0000000..268efa8
--- /dev/null
+++ b/code/lib/Bio/GenBank/Record.py
@@ -0,0 +1,669 @@
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+
+"""Hold GenBank data in a straightforward format.
+
+Classes:
+ - Record - All of the information in a GenBank record.
+    - Reference - Hold reference data for a record.
+ - Feature - Hold the information in a Feature Table.
+ - Qualifier - Qualifiers on a Feature.
+
+"""
+
+import Bio.GenBank
+
+
+def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
+ """Write a line of GenBank info that can wrap over multiple lines (PRIVATE).
+
+ This takes a line of information which can potentially wrap over
+ multiple lines, and breaks it up with carriage returns and
+ indentation so it fits properly into a GenBank record.
+
+ Arguments:
+ - information - The string holding the information we want
+ wrapped in GenBank method.
+ - indent - The indentation on the lines we are writing.
+ - wrap_space - Whether or not to wrap only on spaces in the
+ information.
+ - split_char - A specific character to split the lines on. By default
+ spaces are used.
+
+ """
+ info_length = Record.GB_LINE_LENGTH - indent
+
+ if not information:
+ # GenBank files use "." for missing data
+ return ".\n"
+
+ if wrap_space:
+ info_parts = information.split(split_char)
+ else:
+ cur_pos = 0
+ info_parts = []
+ while cur_pos < len(information):
+ info_parts.append(information[cur_pos : cur_pos + info_length])
+ cur_pos += info_length
+
+ # first get the information string split up by line
+ output_parts = []
+ cur_part = ""
+ for info_part in info_parts:
+ if len(cur_part) + 1 + len(info_part) > info_length:
+ if cur_part:
+ if split_char != " ":
+ cur_part += split_char
+ output_parts.append(cur_part)
+ cur_part = info_part
+ else:
+ if cur_part == "":
+ cur_part = info_part
+ else:
+ cur_part += split_char + info_part
+
+ # add the last bit of information to the output
+ if cur_part:
+ output_parts.append(cur_part)
+
+ # now format the information string for return
+ output_info = output_parts[0] + "\n"
+ for output_part in output_parts[1:]:
+ output_info += " " * indent + output_part + "\n"
+
+ return output_info
+
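+# Illustrative sketch (not a doctest): text longer than
+# GB_LINE_LENGTH - indent is split on the given character, and every
+# continuation line is prefixed with `indent` spaces, e.g.
+#
+#     _wrapped_genbank("Homo sapiens; Hominidae; Primates", 12)
+#     # -> "Homo sapiens; Hominidae; Primates\n" (short enough, no wrap)
+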
+
+def _indent_genbank(information, indent):
+ """Write out information with the specified indent (PRIVATE).
+
+ Unlike _wrapped_genbank, this function makes no attempt to wrap
+ lines -- it assumes that the information already has newlines in the
+ appropriate places, and will add the specified indent to the start of
+ each line.
+ """
+ # split the info into lines based on line breaks
+ info_parts = information.split("\n")
+
+ # the first line will have no indent
+ output_info = info_parts[0] + "\n"
+ for info_part in info_parts[1:]:
+ output_info += " " * indent + info_part + "\n"
+
+ return output_info
+
+
+class Record:
+ """Hold GenBank information in a format similar to the original record.
+
+ The Record class is meant to make data easy to get to when you are
+ just interested in looking at GenBank data.
+
+ Attributes:
+ - locus - The name specified after the LOCUS keyword in the GenBank
+ record. This may be the accession number, or a clone id or something else.
+ - size - The size of the record.
+ - residue_type - The type of residues making up the sequence in this
+ record. Normally something like RNA, DNA or PROTEIN, but may be as
+ esoteric as 'ss-RNA circular'.
+ - data_file_division - The division this record is stored under in
+ GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
+ - date - The date of submission of the record, in a form like '28-JUL-1998'
+ - accession - list of all accession numbers for the sequence.
+ - nid - Nucleotide identifier number.
+    - pid - Protein identifier number.
+ - version - The accession number + version (ie. AB01234.2)
+ - db_source - Information about the database the record came from
+ - gi - The NCBI gi identifier for the record.
+ - keywords - A list of keywords related to the record.
+ - segment - If the record is one of a series, this is info about which
+ segment this record is (something like '1 of 6').
+ - source - The source of material where the sequence came from.
+ - organism - The genus and species of the organism (ie. 'Homo sapiens')
+ - taxonomy - A listing of the taxonomic classification of the organism,
+ starting general and getting more specific.
+ - references - A list of Reference objects.
+ - comment - Text with any kind of comment about the record.
+ - features - A listing of Features making up the feature table.
+ - base_counts - A string with the counts of bases for the sequence.
+ - origin - A string specifying info about the origin of the sequence.
+ - sequence - A string with the sequence itself.
+ - contig - A string of location information for a CONTIG in a RefSeq file
+ - project - The genome sequencing project numbers
+ (will be replaced by the dblink cross-references in 2009).
+ - dblinks - The genome sequencing project number(s) and other links.
+ (will replace the project information in 2009).
+
+ """
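+
+    # Example (not run): Record attributes are typically filled in by the
+    # Bio.GenBank parser rather than by hand, e.g.
+    #
+    #     from Bio import GenBank
+    #     parser = GenBank.RecordParser()
+    #     record = parser.parse(open("example.gb"))  # "example.gb" is illustrative
+    #     print(record.locus, record.size, record.organism)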
+
+ # constants for outputting GenBank information
+ GB_LINE_LENGTH = 79
+ GB_BASE_INDENT = 12
+ GB_FEATURE_INDENT = 21
+ GB_INTERNAL_INDENT = 2
+ GB_OTHER_INTERNAL_INDENT = 3
+ GB_FEATURE_INTERNAL_INDENT = 5
+ GB_SEQUENCE_INDENT = 9
+
+ BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
+ INTERNAL_FORMAT = (
+ " " * GB_INTERNAL_INDENT + "%-" + str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
+ )
+ OTHER_INTERNAL_FORMAT = (
+ " " * GB_OTHER_INTERNAL_INDENT
+ + "%-"
+ + str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT)
+ + "s"
+ )
+
+ BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
+ INTERNAL_FEATURE_FORMAT = (
+ " " * GB_FEATURE_INTERNAL_INDENT
+ + "%-"
+ + str(GB_FEATURE_INDENT - GB_FEATURE_INTERNAL_INDENT)
+ + "s"
+ )
+ SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"
+
+ def __init__(self):
+ """Initialize the class."""
+ self.accession = []
+ self.base_counts = ""
+ self.comment = ""
+ self.contig = ""
+ self.data_file_division = ""
+ self.date = ""
+ self.db_source = ""
+ self.dblinks = []
+ self.definition = ""
+ self.features = []
+ self.gi = ""
+ self.keywords = []
+ self.locus = ""
+ self.molecule_type = ""
+ self.nid = ""
+ self.organism = ""
+ self.origin = ""
+ self.pid = ""
+ self.primary = []
+ self.projects = []
+ self.references = []
+ self.residue_type = ""
+ self.segment = ""
+ self.sequence = ""
+ self.size = ""
+ self.source = ""
+ self.taxonomy = []
+ self.topology = ""
+ self.version = ""
+ self.wgs = ""
+ self.wgs_scafld = []
+
+ def __str__(self):
+ """Provide a GenBank formatted output option for a Record.
+
+ The objective of this is to provide an easy way to read in a GenBank
+ record, modify it somehow, and then output it in 'GenBank format.'
+ We are striving to make this work so that a parsed Record that is
+ output using this function will look exactly like the original
+ record.
+
+ Much of the output is based on format description info at:
+
+ ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
+ """
+ output = self._locus_line()
+ output += self._definition_line()
+ output += self._accession_line()
+ output += self._version_line()
+ output += self._project_line()
+ output += self._dblink_line()
+ output += self._nid_line()
+ output += self._pid_line()
+ output += self._keywords_line()
+ output += self._db_source_line()
+ output += self._segment_line()
+ output += self._source_line()
+ output += self._organism_line()
+ for reference in self.references:
+ output += str(reference)
+ output += self._comment_line()
+ output += self._features_line()
+ for feature in self.features:
+ output += str(feature)
+ output += self._base_count_line()
+ output += self._origin_line()
+ output += self._sequence_line()
+ output += self._wgs_line()
+ output += self._wgs_scafld_line()
+ output += self._contig_line()
+ output += "//"
+ return output
+
+ def _locus_line(self):
+ """Provide the output string for the LOCUS line (PRIVATE)."""
+ output = "LOCUS"
+ output += " " * 7 # 6-12 spaces
+ output += "%-9s" % self.locus
+ output += " " # 22 space
+ output += "%7s" % self.size
+ if "PROTEIN" in self.residue_type:
+ output += " aa"
+ else:
+ output += " bp "
+
+ # treat circular types differently, since they'll have long residue
+ # types
+ if "circular" in self.residue_type:
+ output += "%17s" % self.residue_type
+ # second case: ss-DNA types of records
+ elif "-" in self.residue_type:
+ output += "%7s" % self.residue_type
+ output += " " * 10 # spaces for circular
+ else:
+ output += " " * 3 # spaces for stuff like ss-
+ output += "%-4s" % self.residue_type
+ output += " " * 10 # spaces for circular
+
+ output += " " * 2
+ output += "%3s" % self.data_file_division
+ output += " " * 7 # spaces for 56-63
+ output += "%11s" % self.date
+ output += "\n"
+ return output
+
+ def _definition_line(self):
+ """Provide output for the DEFINITION line (PRIVATE)."""
+ output = Record.BASE_FORMAT % "DEFINITION"
+ output += _wrapped_genbank(self.definition + ".", Record.GB_BASE_INDENT)
+ return output
+
+ def _accession_line(self):
+ """Output for the ACCESSION line (PRIVATE)."""
+ if self.accession:
+ output = Record.BASE_FORMAT % "ACCESSION"
+
+ acc_info = ""
+ for accession in self.accession:
+ acc_info += "%s " % accession
+ # strip off an extra space at the end
+ acc_info = acc_info.rstrip()
+ output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
+ else:
+ output = ""
+
+ return output
+
+ def _version_line(self):
+ """Output for the VERSION line (PRIVATE)."""
+ if self.version:
+ output = Record.BASE_FORMAT % "VERSION"
+ output += self.version
+ output += " GI:"
+ output += "%s\n" % self.gi
+ else:
+ output = ""
+ return output
+
+ def _project_line(self):
+ output = ""
+ if len(self.projects) > 0:
+ output = Record.BASE_FORMAT % "PROJECT"
+ output += "%s\n" % " ".join(self.projects)
+ return output
+
+ def _dblink_line(self):
+ output = ""
+ if len(self.dblinks) > 0:
+ output = Record.BASE_FORMAT % "DBLINK"
+ dblink_info = "\n".join(self.dblinks)
+ output += _wrapped_genbank(dblink_info, Record.GB_BASE_INDENT)
+ return output
+
+ def _nid_line(self):
+ """Output for the NID line. Use of NID is obsolete in GenBank files (PRIVATE)."""
+ if self.nid:
+ output = Record.BASE_FORMAT % "NID"
+ output += "%s\n" % self.nid
+ else:
+ output = ""
+ return output
+
+ def _pid_line(self):
+        """Output for PID line. Presumably, PID usage is also obsolete (PRIVATE)."""
+ if self.pid:
+ output = Record.BASE_FORMAT % "PID"
+ output += "%s\n" % self.pid
+ else:
+ output = ""
+ return output
+
+ def _keywords_line(self):
+ """Output for the KEYWORDS line (PRIVATE)."""
+ output = ""
+ if self.keywords:
+ output += Record.BASE_FORMAT % "KEYWORDS"
+ keyword_info = ""
+ for keyword in self.keywords:
+ keyword_info += "%s; " % keyword
+ # replace the ; at the end with a period
+ keyword_info = keyword_info[:-2]
+ keyword_info += "."
+
+ output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
+
+ return output
+
+ def _db_source_line(self):
+ """Output for DBSOURCE line (PRIVATE)."""
+ if self.db_source:
+ output = Record.BASE_FORMAT % "DBSOURCE"
+ output += "%s\n" % self.db_source
+ else:
+ output = ""
+ return output
+
+ def _segment_line(self):
+ """Output for the SEGMENT line (PRIVATE)."""
+ output = ""
+ if self.segment:
+ output += Record.BASE_FORMAT % "SEGMENT"
+ output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
+ return output
+
+ def _source_line(self):
+ """Output for SOURCE line on where the sample came from (PRIVATE)."""
+ output = Record.BASE_FORMAT % "SOURCE"
+ output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
+ return output
+
+ def _organism_line(self):
+ """Output for ORGANISM line with taxonomy info (PRIVATE)."""
+ output = Record.INTERNAL_FORMAT % "ORGANISM"
+ # Now that species names can be too long, this line can wrap (Bug 2591)
+ output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
+ output += " " * Record.GB_BASE_INDENT
+ taxonomy_info = ""
+ for tax in self.taxonomy:
+ taxonomy_info += "%s; " % tax
+ # replace the ; at the end with a period
+ taxonomy_info = taxonomy_info[:-2]
+ taxonomy_info += "."
+ output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
+
+ return output
+
+ def _comment_line(self):
+ """Output for the COMMENT lines (PRIVATE)."""
+ output = ""
+ if self.comment:
+ output += Record.BASE_FORMAT % "COMMENT"
+ output += _indent_genbank(self.comment, Record.GB_BASE_INDENT)
+ return output
+
+ def _features_line(self):
+ """Output for the FEATURES line (PRIVATE)."""
+ output = ""
+ if len(self.features) > 0:
+ output += Record.BASE_FEATURE_FORMAT % "FEATURES"
+ output += "Location/Qualifiers\n"
+ return output
+
+ def _base_count_line(self):
+ """Output for the BASE COUNT line with base information (PRIVATE)."""
+ output = ""
+ if self.base_counts:
+ output += Record.BASE_FORMAT % "BASE COUNT "
+ # split up the base counts into their individual parts
+ count_parts = self.base_counts.split(" ")
+ while "" in count_parts:
+ count_parts.remove("")
+ # deal with the standard case, with a normal origin line
+ # like: 474 a 356 c 428 g 364 t
+ if len(count_parts) % 2 == 0:
+ while len(count_parts) > 0:
+ count_info = count_parts.pop(0)
+ count_type = count_parts.pop(0)
+
+ output += "%7s %s" % (count_info, count_type)
+ # deal with ugly ORIGIN lines like:
+ # 1311257 a2224835 c2190093 g1309889 t
+ # by just outputting the raw information
+ else:
+ output += self.base_counts
+ output += "\n"
+ return output
+
+ def _origin_line(self):
+ """Output for the ORIGIN line (PRIVATE)."""
+ output = ""
+ # only output the ORIGIN line if we have a sequence
+ if self.sequence:
+ output += Record.BASE_FORMAT % "ORIGIN"
+ if self.origin:
+ output += _wrapped_genbank(self.origin, Record.GB_BASE_INDENT)
+ else:
+ output += "\n"
+ return output
+
+ def _sequence_line(self):
+ """Output for all of the sequence (PRIVATE)."""
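+        # Illustrative output layout: a right-aligned position column, then
+        # up to six blocks of ten lower-case bases per line, e.g.
+        #         1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac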
+ output = ""
+ if self.sequence:
+ cur_seq_pos = 0
+ while cur_seq_pos < len(self.sequence):
+ output += Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1)
+
+ for section in range(6):
+ start_pos = cur_seq_pos + section * 10
+ end_pos = start_pos + 10
+ seq_section = self.sequence[start_pos:end_pos]
+ output += " %s" % seq_section.lower()
+
+ # stop looping if we are out of sequence
+ if end_pos > len(self.sequence):
+ break
+
+ output += "\n"
+ cur_seq_pos += 60
+ return output
+
+ def _wgs_line(self):
+ output = ""
+ if self.wgs:
+ output += Record.BASE_FORMAT % "WGS"
+ output += self.wgs
+ return output
+
+ def _wgs_scafld_line(self):
+ output = ""
+ if self.wgs_scafld:
+ output += Record.BASE_FORMAT % "WGS_SCAFLD"
+ output += self.wgs_scafld
+ return output
+
+ def _contig_line(self):
+ """Output for CONTIG location information from RefSeq (PRIVATE)."""
+ output = ""
+ if self.contig:
+ output += Record.BASE_FORMAT % "CONTIG"
+ output += _wrapped_genbank(
+ self.contig, Record.GB_BASE_INDENT, split_char=","
+ )
+ return output
+
+
+class Reference:
+ """Hold information from a GenBank reference.
+
+ Attributes:
+ - number - The number of the reference in the listing of references.
+ - bases - The bases in the sequence the reference refers to.
+ - authors - String with all of the authors.
+ - consrtm - Consortium the authors belong to.
+ - title - The title of the reference.
+ - journal - Information about the journal where the reference appeared.
+ - medline_id - The medline id for the reference.
+ - pubmed_id - The pubmed_id for the reference.
+ - remark - Free-form remarks about the reference.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.number = ""
+ self.bases = ""
+ self.authors = ""
+ self.consrtm = ""
+ self.title = ""
+ self.journal = ""
+ self.medline_id = ""
+ self.pubmed_id = ""
+ self.remark = ""
+
+ def __str__(self):
+ """Convert the reference to a GenBank format string."""
+ output = self._reference_line()
+ output += self._authors_line()
+ output += self._consrtm_line()
+ output += self._title_line()
+ output += self._journal_line()
+ output += self._medline_line()
+ output += self._pubmed_line()
+ output += self._remark_line()
+
+ return output
+
+ def _reference_line(self):
+ """Output for REFERENCE lines (PRIVATE)."""
+ output = Record.BASE_FORMAT % "REFERENCE"
+ if self.number:
+ if self.bases:
+ output += "%-3s" % self.number
+ output += "%s" % self.bases
+ else:
+ output += "%s" % self.number
+
+ output += "\n"
+ return output
+
+ def _authors_line(self):
+ """Output for AUTHORS information (PRIVATE)."""
+ output = ""
+ if self.authors:
+ output += Record.INTERNAL_FORMAT % "AUTHORS"
+ output += _wrapped_genbank(self.authors, Record.GB_BASE_INDENT)
+ return output
+
+ def _consrtm_line(self):
+ """Output for CONSRTM information (PRIVATE)."""
+ output = ""
+ if self.consrtm:
+ output += Record.INTERNAL_FORMAT % "CONSRTM"
+ output += _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT)
+ return output
+
+ def _title_line(self):
+ """Output for TITLE information (PRIVATE)."""
+ output = ""
+ if self.title:
+ output += Record.INTERNAL_FORMAT % "TITLE"
+ output += _wrapped_genbank(self.title, Record.GB_BASE_INDENT)
+ return output
+
+ def _journal_line(self):
+ """Output for JOURNAL information (PRIVATE)."""
+ output = ""
+ if self.journal:
+ output += Record.INTERNAL_FORMAT % "JOURNAL"
+ output += _wrapped_genbank(self.journal, Record.GB_BASE_INDENT)
+ return output
+
+ def _medline_line(self):
+ """Output for MEDLINE information (PRIVATE)."""
+ output = ""
+ if self.medline_id:
+ output += Record.INTERNAL_FORMAT % "MEDLINE"
+ output += self.medline_id + "\n"
+ return output
+
+ def _pubmed_line(self):
+ """Output for PUBMED information (PRIVATE)."""
+ output = ""
+ if self.pubmed_id:
+ output += Record.OTHER_INTERNAL_FORMAT % "PUBMED"
+ output += self.pubmed_id + "\n"
+ return output
+
+ def _remark_line(self):
+ """Output for REMARK information (PRIVATE)."""
+ output = ""
+ if self.remark:
+ output += Record.INTERNAL_FORMAT % "REMARK"
+ output += _wrapped_genbank(self.remark, Record.GB_BASE_INDENT)
+ return output
+
+
+class Feature:
+ """Hold information about a Feature in the Feature Table of GenBank record.
+
+ Attributes:
+    - key - The key name of the feature (ie. source)
+ - location - The string specifying the location of the feature.
+    - qualifiers - A list of Qualifier objects in the feature.
+
+ """
+
+ def __init__(self, key="", location=""):
+ """Initialize the class."""
+ self.key = key
+ self.location = location
+ self.qualifiers = []
+
+ def __repr__(self):
+ """Representation of the object for debugging or logging."""
+ return "Feature(key=%r, location=%r)" % (self.key, self.location)
+
+ def __str__(self):
+ """Return feature as a GenBank format string."""
+ output = Record.INTERNAL_FEATURE_FORMAT % self.key
+ output += _wrapped_genbank(
+ self.location, Record.GB_FEATURE_INDENT, split_char=","
+ )
+ for qualifier in self.qualifiers:
+ output += str(qualifier)
+ return output
+
+
+class Qualifier:
+ """Hold information about a qualifier in a GenBank feature.
+
+ Attributes:
+ - key - The key name of the qualifier (ie. /organism=)
+ - value - The value of the qualifier ("Dictyostelium discoideum").
+
+ """
+
+ def __init__(self, key="", value=""):
+ """Initialize the class."""
+ self.key = key
+ self.value = value
+
+ def __repr__(self):
+ """Representation of the object for debugging or logging."""
+ return "Qualifier(key=%r, value=%r)" % (self.key, self.value)
+
+ def __str__(self):
+ """Return feature qualifier as a GenBank format string."""
+ output = " " * Record.GB_FEATURE_INDENT
+ # determine whether we can wrap on spaces
+ space_wrap = 1
+ for no_space_key in Bio.GenBank._BaseGenBankConsumer.remove_space_keys:
+ if no_space_key in self.key:
+ space_wrap = 0
+ # return double quotes as-is, leave it to the user to escape them
+ return output + _wrapped_genbank(
+ self.key + self.value, Record.GB_FEATURE_INDENT, space_wrap
+ )
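+
+
+# Example (not run): assembling a feature by hand and rendering it in
+# GenBank layout with the classes above:
+#
+#     feature = Feature(key="source", location="1..100")
+#     feature.qualifiers.append(Qualifier(key="/organism=", value='"Homo sapiens"'))
+#     print(str(feature))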
diff --git a/code/lib/Bio/GenBank/Scanner.py b/code/lib/Bio/GenBank/Scanner.py
new file mode 100644
index 0000000..2d94b4c
--- /dev/null
+++ b/code/lib/Bio/GenBank/Scanner.py
@@ -0,0 +1,1904 @@
+# Copyright 2007-2017 by Peter Cock. All rights reserved.
+# Revisions copyright 2010 by Uri Laserson. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Internal code for parsing GenBank and EMBL files (PRIVATE).
+
+This code is NOT intended for direct use. It provides a basic scanner
+(for use with an event consumer such as Bio.GenBank._FeatureConsumer)
+to parse a GenBank or EMBL file (with their shared INSDC feature table).
+
+It is used by Bio.GenBank to parse GenBank files.
+It is also used by Bio.SeqIO to parse GenBank and EMBL files.
+
+Feature Table Documentation:
+
+- http://www.insdc.org/files/feature_table.html
+- http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
+- ftp://ftp.ncbi.nih.gov/genbank/docs/
+"""
+# 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
+# These are GenBank files that summarize the content of a project, and provide lists of
+# scaffold and contig files in the project. These will be in annotations['wgs'] and
+# annotations['wgs_scafld']. These GenBank files do not have sequences. See
+# http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
+# http://is.gd/nNgk
+# for more details of this format, and an example.
+# Added by Ying Huang & Iddo Friedberg
+
+
+import warnings
+import re
+import sys
+from collections import OrderedDict
+
+from Bio.File import as_handle
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio import BiopythonParserWarning
+
+
+class InsdcScanner:
+ """Basic functions for breaking up a GenBank/EMBL file into sub sections.
+
+ The International Nucleotide Sequence Database Collaboration (INSDC)
+ is a collaboration between the DDBJ, EMBL, and GenBank. These
+ organisations all use the same "Feature Table" layout in their plain
+ text flat file formats.
+
+ However, the header and sequence sections of an EMBL file are very
+ different in layout to those produced by GenBank/DDBJ.
+ """
+
+ # These constants get redefined with sensible values in the sub classes:
+ RECORD_START = "XXX" # "LOCUS " or "ID "
+ HEADER_WIDTH = 3 # 12 or 5
+ FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
+ FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
+ FEATURE_QUALIFIER_INDENT = 0
+ FEATURE_QUALIFIER_SPACER = ""
+ SEQUENCE_HEADERS = ["XXX"] # with right hand side spaces removed
+
+ def __init__(self, debug=0):
+ """Initialize the class."""
+ assert len(self.RECORD_START) == self.HEADER_WIDTH
+ for marker in self.SEQUENCE_HEADERS:
+ assert marker == marker.rstrip()
+ assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT
+ self.debug = debug
+ self.handle = None
+ self.line = None
+
+ def set_handle(self, handle):
+ """Set the handle attribute."""
+ self.handle = handle
+ self.line = ""
+
+ def find_start(self):
+ """Read in lines until find the ID/LOCUS line, which is returned.
+
+ Any preamble (such as the header used by the NCBI on ``*.seq.gz`` archives)
+ will we ignored.
+ """
+ while True:
+ if self.line:
+ line = self.line
+ self.line = ""
+ else:
+ line = self.handle.readline()
+ if not line:
+ if self.debug:
+ print("End of file")
+ return None
+ if isinstance(line[0], int):
+ # Same exception as for FASTQ files
+ raise ValueError("Is this handle in binary mode not text mode?")
+ if line[: self.HEADER_WIDTH] == self.RECORD_START:
+ if self.debug > 1:
+ print("Found the start of a record:\n" + line)
+ break
+ line = line.rstrip()
+ if line == "//":
+ if self.debug > 1:
+ print("Skipping // marking end of last record")
+ elif line == "":
+ if self.debug > 1:
+ print("Skipping blank line before record")
+ else:
+ # Ignore any header before the first ID/LOCUS line.
+ if self.debug > 1:
+ print("Skipping header line before record:\n" + line)
+ self.line = line
+ return line
+
+ def parse_header(self):
+ """Return list of strings making up the header.
+
+ New line characters are removed.
+
+ Assumes you have just read in the ID/LOCUS line.
+ """
+ if self.line[: self.HEADER_WIDTH] != self.RECORD_START:
+ raise ValueError("Not at start of record")
+
+ header_lines = []
+ while True:
+ line = self.handle.readline()
+ if not line:
+ raise ValueError("Premature end of line during sequence data")
+ line = line.rstrip()
+ if line in self.FEATURE_START_MARKERS:
+ if self.debug:
+ print("Found feature table")
+ break
+ # if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]:
+ # if self.debug : print("Found header table (?)")
+ # break
+ if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
+ if self.debug:
+ print("Found start of sequence")
+ break
+ if line == "//":
+ raise ValueError("Premature end of sequence data marker '//' found")
+ header_lines.append(line)
+ self.line = line
+ return header_lines
+
+ def parse_features(self, skip=False):
+ """Return list of tuples for the features (if present).
+
+ Each feature is returned as a tuple (key, location, qualifiers)
+ where key and location are strings (e.g. "CDS" and
+ "complement(join(490883..490885,1..879))") while qualifiers
+ is a list of two string tuples (feature qualifier keys and values).
+
+ Assumes you have already read to the start of the features table.
+ """
+ if self.line.rstrip() not in self.FEATURE_START_MARKERS:
+ if self.debug:
+ print("Didn't find any feature table")
+ return []
+
+ while self.line.rstrip() in self.FEATURE_START_MARKERS:
+ self.line = self.handle.readline()
+
+ features = []
+ line = self.line
+ while True:
+ if not line:
+ raise ValueError("Premature end of line during features table")
+ if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
+ if self.debug:
+ print("Found start of sequence")
+ break
+ line = line.rstrip()
+ if line == "//":
+ raise ValueError("Premature end of features table, marker '//' found")
+ if line in self.FEATURE_END_MARKERS:
+ if self.debug:
+ print("Found end of features")
+ line = self.handle.readline()
+ break
+ if line[2 : self.FEATURE_QUALIFIER_INDENT].strip() == "":
+ # This is an empty feature line between qualifiers. Empty
+ # feature lines within qualifiers are handled below (ignored).
+ line = self.handle.readline()
+ continue
+ if len(line) < self.FEATURE_QUALIFIER_INDENT:
+ warnings.warn(
+ "line too short to contain a feature: %r" % line,
+ BiopythonParserWarning,
+ )
+ line = self.handle.readline()
+ continue
+
+ if skip:
+ line = self.handle.readline()
+ while (
+ line[: self.FEATURE_QUALIFIER_INDENT]
+ == self.FEATURE_QUALIFIER_SPACER
+ ):
+ line = self.handle.readline()
+ else:
+ # Build up a list of the lines making up this feature:
+ if (
+ line[self.FEATURE_QUALIFIER_INDENT] != " "
+ and " " in line[self.FEATURE_QUALIFIER_INDENT :]
+ ):
+ # The feature table design enforces a length limit on the feature keys.
+ # Some third party files (e.g. IMGT's EMBL like files) solve this by
+ # over indenting the location and qualifiers.
+ feature_key, line = line[2:].strip().split(None, 1)
+ feature_lines = [line]
+ warnings.warn(
+ "Over indented %s feature?" % feature_key,
+ BiopythonParserWarning,
+ )
+ else:
+ feature_key = line[2 : self.FEATURE_QUALIFIER_INDENT].strip()
+ feature_lines = [line[self.FEATURE_QUALIFIER_INDENT :]]
+ line = self.handle.readline()
+ while line[
+ : self.FEATURE_QUALIFIER_INDENT
+ ] == self.FEATURE_QUALIFIER_SPACER or (
+ line != "" and line.rstrip() == ""
+ ): # cope with blank lines in the midst of a feature
+ # Use strip to remove any harmless trailing white space AND
+ # leading white space (e.g. out of spec files with too much indentation)
+ feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT :].strip())
+ line = self.handle.readline()
+ features.append(self.parse_feature(feature_key, feature_lines))
+ self.line = line
+ return features
+
+ def parse_feature(self, feature_key, lines):
+ r"""Parse a feature given as a list of strings into a tuple.
+
+ Expects a feature as a list of strings, returns a tuple (key, location,
+ qualifiers)
+
+ For example given this GenBank feature::
+
+ CDS complement(join(490883..490885,1..879))
+ /locus_tag="NEQ001"
+ /note="conserved hypothetical [Methanococcus jannaschii];
+ COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
+ localization signal; IPR002743: Protein of unknown
+ function DUF57"
+ /codon_start=1
+ /transl_table=11
+ /product="hypothetical protein"
+ /protein_id="NP_963295.1"
+ /db_xref="GI:41614797"
+ /db_xref="GeneID:2732620"
+ /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
+ EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
+ KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
+ IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
+ EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
+ LNSMGFGFVNTKKNSAR"
+
+ Then should give input key="CDS" and the rest of the data as a list of strings
+ lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
+ where the leading spaces and trailing newlines have been removed.
+
+ Returns tuple containing: (key as string, location string, qualifiers as list)
+ as follows for this example:
+
+ key = "CDS", string
+ location = "complement(join(490883..490885,1..879))", string
+ qualifiers = list of string tuples:
+
+ [('locus_tag', '"NEQ001"'),
+ ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
+ ('codon_start', '1'),
+ ('transl_table', '11'),
+ ('product', '"hypothetical protein"'),
+ ('protein_id', '"NP_963295.1"'),
+ ('db_xref', '"GI:41614797"'),
+ ('db_xref', '"GeneID:2732620"'),
+ ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]
+
+ In the above example, the "note" and "translation" were edited for compactness,
+ and they would contain multiple new line characters (displayed above as \n)
+
+ If a qualifier is quoted (in this case, everything except codon_start and
+ transl_table) then the quotes are NOT removed.
+
+ Note that no whitespace is removed.
+ """
+ # Skip any blank lines
+ iterator = (x for x in lines if x)
+ try:
+ line = next(iterator)
+
+ feature_location = line.strip()
+ while feature_location[-1:] == ",":
+ # Multiline location, still more to come!
+ line = next(iterator)
+ feature_location += line.strip()
+ if feature_location.count("(") > feature_location.count(")"):
+ # Including the prev line in the warning would be more explicit,
+ # but this way we get one-and-only-one warning shown by default:
+ warnings.warn(
+ "Non-standard feature line wrapping (didn't break on comma)?",
+ BiopythonParserWarning,
+ )
+ while feature_location[-1:] == "," or feature_location.count(
+ "("
+ ) > feature_location.count(")"):
+ line = next(iterator)
+ feature_location += line.strip()
+
+ qualifiers = []
+
+ for line_number, line in enumerate(iterator):
+ # check for extra wrapping of the location closing parentheses
+ if line_number == 0 and line.startswith(")"):
+ feature_location += line.strip()
+ elif line[0] == "/":
+ # New qualifier
+ i = line.find("=")
+ key = line[1:i] # does not work if i==-1
+ value = line[i + 1 :] # we ignore 'value' if i==-1
+ if i and value.startswith(" ") and value.lstrip().startswith('"'):
+ warnings.warn(
+ "White space after equals in qualifier",
+ BiopythonParserWarning,
+ )
+ value = value.lstrip()
+ if i == -1:
+ # Qualifier with no key, e.g. /pseudo
+ key = line[1:]
+ qualifiers.append((key, None))
+ elif not value:
+ # ApE can output /note=
+ qualifiers.append((key, ""))
+ elif value == '"':
+ # One single quote
+ if self.debug:
+ print("Single quote %s:%s" % (key, value))
+ # DO NOT remove the quote...
+ qualifiers.append((key, value))
+ elif value[0] == '"':
+ # Quoted...
+ value_list = [value]
+ while value_list[-1][-1] != '"':
+ value_list.append(next(iterator))
+ value = "\n".join(value_list)
+ # DO NOT remove the quotes...
+ qualifiers.append((key, value))
+ else:
+ # Unquoted
+ # if debug : print("Unquoted line %s:%s" % (key,value))
+ qualifiers.append((key, value))
+ else:
+ # Unquoted continuation
+ assert len(qualifiers) > 0
+ assert key == qualifiers[-1][0]
+ # if debug : print("Unquoted Cont %s:%s" % (key, line))
+ if qualifiers[-1][1] is None:
+ raise StopIteration
+ qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
+ return feature_key, feature_location, qualifiers
+ except StopIteration:
+ # Bummer
+ raise ValueError(
+ "Problem with '%s' feature:\n%s" % (feature_key, "\n".join(lines))
+ ) from None
+
+ def parse_footer(self):
+ """Return a tuple containing a list of any misc strings, and the sequence."""
+ # This is a basic bit of code to scan and discard the sequence,
+ # which was useful when developing the sub classes.
+ if self.line in self.FEATURE_END_MARKERS:
+ while self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
+ self.line = self.handle.readline()
+ if not self.line:
+ raise ValueError("Premature end of file")
+ self.line = self.line.rstrip()
+
+ if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
+ raise ValueError("Not at start of sequence")
+ while True:
+ line = self.handle.readline()
+ if not line:
+ raise ValueError("Premature end of line during sequence data")
+ line = line.rstrip()
+ if line == "//":
+ break
+ self.line = line
+ return [], "" # Dummy values!
+
+ def _feed_first_line(self, consumer, line):
+ """Handle the LOCUS/ID line, passing data to the comsumer (PRIVATE).
+
+ This should be implemented by the EMBL / GenBank specific subclass
+
+ Used by the parse_records() and parse() methods.
+ """
+ pass
+
+ def _feed_header_lines(self, consumer, lines):
+ """Handle the header lines (list of strings), passing data to the comsumer (PRIVATE).
+
+ This should be implemented by the EMBL / GenBank specific subclass
+
+ Used by the parse_records() and parse() methods.
+ """
+ pass
+
+ @staticmethod
+ def _feed_feature_table(consumer, feature_tuples):
+ """Handle the feature table (list of tuples), passing data to the comsumer (PRIVATE).
+
+ Used by the parse_records() and parse() methods.
+ """
+ consumer.start_feature_table()
+ for feature_key, location_string, qualifiers in feature_tuples:
+ consumer.feature_key(feature_key)
+ consumer.location(location_string)
+ for q_key, q_value in qualifiers:
+ if q_value is None:
+ consumer.feature_qualifier(q_key, q_value)
+ else:
+ consumer.feature_qualifier(q_key, q_value.replace("\n", " "))
+
+ def _feed_misc_lines(self, consumer, lines):
+ """Handle any lines between features and sequence (list of strings), passing data to the consumer (PRIVATE).
+
+ This should be implemented by the EMBL / GenBank specific subclass
+
+ Used by the parse_records() and parse() methods.
+ """
+ pass
+
+ def feed(self, handle, consumer, do_features=True):
+ """Feed a set of data into the consumer.
+
+ This method is intended for use with the "old" code in Bio.GenBank
+
+ Arguments:
+ - handle - A handle with the information to parse.
+ - consumer - The consumer that should be informed of events.
+ - do_features - Boolean, should the features be parsed?
+ Skipping the features can be much faster.
+
+ Return values:
+ - true - Passed a record
+ - false - Did not find a record
+
+ """
+ # Should work with both EMBL and GenBank files provided the
+ # equivalent Bio.GenBank._FeatureConsumer methods are called...
+ self.set_handle(handle)
+ if not self.find_start():
+ # Could not find (another) record
+ consumer.data = None
+ return False
+
+ # We use the above class methods to parse the file into a simplified format.
+ # The first line, header lines and any misc lines after the features will be
+ # dealt with by GenBank / EMBL specific derived classes.
+
+ # First line and header:
+ self._feed_first_line(consumer, self.line)
+ self._feed_header_lines(consumer, self.parse_header())
+
+ # Features (common to both EMBL and GenBank):
+ if do_features:
+ self._feed_feature_table(consumer, self.parse_features(skip=False))
+ else:
+ self.parse_features(skip=True) # ignore the data
+
+ # Footer and sequence
+ misc_lines, sequence_string = self.parse_footer()
+ self._feed_misc_lines(consumer, misc_lines)
+
+ consumer.sequence(sequence_string)
+ # Calls to consumer.base_number() do nothing anyway
+ consumer.record_end("//")
+
+ assert self.line == "//"
+
+ # And we are done
+ return True
+
+ def parse(self, handle, do_features=True):
+ """Return a SeqRecord (with SeqFeatures if do_features=True).
+
+ See also the method parse_records() for use on multi-record files.
+ """
+ from Bio.GenBank import _FeatureConsumer
+ from Bio.GenBank.utils import FeatureValueCleaner
+
+ consumer = _FeatureConsumer(
+ use_fuzziness=1, feature_cleaner=FeatureValueCleaner()
+ )
+
+ if self.feed(handle, consumer, do_features):
+ return consumer.data
+ else:
+ return None
+
+ def parse_records(self, handle, do_features=True):
+ """Parse records, return a SeqRecord object iterator.
+
+ Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
+
+ The SeqRecord objects include SeqFeatures if do_features=True
+
+ This method is intended for use in Bio.SeqIO
+ """
+ # This is a generator function
+ with as_handle(handle) as handle:
+ while True:
+ record = self.parse(handle, do_features)
+ if record is None:
+ break
+ if record.id is None:
+ raise ValueError(
+ "Failed to parse the record's ID. Invalid ID line?"
+ )
+ if record.name == "":
+ raise ValueError(
+ "Failed to parse the record's name. Invalid ID line?"
+ )
+ if record.description == "":
+ raise ValueError("Failed to parse the record's description")
+ yield record
+
+ def parse_cds_features(
+ self, handle, alphabet=None, tags2id=("protein_id", "locus_tag", "product"),
+ ):
+ """Parse CDS features, return SeqRecord object iterator.
+
+ Each CDS feature becomes a SeqRecord.
+
+ Arguments:
+ - alphabet - Obsolete, should be left as None.
+ - tags2id - Tuple of three strings, the feature keys to use
+ for the record id, name and description.
+
+ This method is intended for use in Bio.SeqIO
+
+ """
+ if alphabet is not None:
+ raise ValueError("The alphabet argument is no longer supported")
+ with as_handle(handle) as handle:
+ self.set_handle(handle)
+ while self.find_start():
+ # Got an EMBL or GenBank record...
+ self.parse_header() # ignore header lines!
+ feature_tuples = self.parse_features()
+ # self.parse_footer() # ignore footer lines!
+ while True:
+ line = self.handle.readline()
+ if not line:
+ break
+ if line[:2] == "//":
+ break
+ self.line = line.rstrip()
+
+ # Now go through those features...
+ for key, location_string, qualifiers in feature_tuples:
+ if key == "CDS":
+ # Create SeqRecord
+ # ================
+ # SeqRecord objects cannot be created with annotations, they
+ # must be added afterwards. So create an empty record and
+ # then populate it:
+ record = SeqRecord(seq=None)
+ annotations = record.annotations
+ annotations["molecule_type"] = "protein"
+ # Should we add a location object to the annotations?
+ # I *think* that only makes sense for SeqFeatures with their
+ # sub features...
+ annotations["raw_location"] = location_string.replace(" ", "")
+
+ for (qualifier_name, qualifier_data) in qualifiers:
+ if (
+ qualifier_data is not None
+ and qualifier_data[0] == '"'
+ and qualifier_data[-1] == '"'
+ ):
+ # Remove quotes
+ qualifier_data = qualifier_data[1:-1]
+ # Append the data to the annotation qualifier...
+ if qualifier_name == "translation":
+ assert record.seq is None, "Multiple translations!"
+ record.seq = Seq(qualifier_data.replace("\n", ""))
+ elif qualifier_name == "db_xref":
+ # it's a list, possibly empty. It's safe to extend
+ record.dbxrefs.append(qualifier_data)
+ else:
+ if qualifier_data is not None:
+ qualifier_data = qualifier_data.replace(
+ "\n", " "
+ ).replace(" ", " ")
+ try:
+ annotations[qualifier_name] += " " + qualifier_data
+ except KeyError:
+ # Not an addition to existing data, it's the first bit
+ annotations[qualifier_name] = qualifier_data
+
+ # Fill in the ID, Name, Description
+ # =================================
+ try:
+ record.id = annotations[tags2id[0]]
+ except KeyError:
+ pass
+ try:
+ record.name = annotations[tags2id[1]]
+ except KeyError:
+ pass
+ try:
+ record.description = annotations[tags2id[2]]
+ except KeyError:
+ pass
+
+ yield record
+
+
+class EmblScanner(InsdcScanner):
+ """For extracting chunks of information in EMBL files."""
+
+ RECORD_START = "ID "
+ HEADER_WIDTH = 5
+ FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", "FH"]
+ FEATURE_END_MARKERS = ["XX"] # XX can also mark the end of many things!
+ FEATURE_QUALIFIER_INDENT = 21
+ FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
+ SEQUENCE_HEADERS = ["SQ", "CO"] # Remove trailing spaces
+
+ EMBL_INDENT = HEADER_WIDTH
+ EMBL_SPACER = " " * EMBL_INDENT
+
+ def parse_footer(self):
+ """Return a tuple containing a list of any misc strings, and the sequence."""
+ if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
+ raise ValueError("Footer format unexpected: '%s'" % self.line)
+
+ # Note that the SQ line can be split into several lines...
+ misc_lines = []
+ while self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
+ misc_lines.append(self.line)
+ self.line = self.handle.readline()
+ if not self.line:
+ raise ValueError("Premature end of file")
+ self.line = self.line.rstrip()
+
+ if not (
+ self.line[: self.HEADER_WIDTH] == " " * self.HEADER_WIDTH
+ or self.line.strip() == "//"
+ ):
+ raise ValueError("Unexpected content after SQ or CO line: %r" % self.line)
+
+ seq_lines = []
+ line = self.line
+ while True:
+ if not line:
+ raise ValueError("Premature end of file in sequence data")
+ line = line.strip()
+ if not line:
+ raise ValueError("Blank line in sequence data")
+ if line == "//":
+ break
+ if self.line[: self.HEADER_WIDTH] != (" " * self.HEADER_WIDTH):
+ raise ValueError(
+ "Problem with characters in header line, "
+ " or incorrect header width: " + self.line
+ )
+ # Remove trailing number now, remove spaces later
+ linersplit = line.rsplit(None, 1)
+ if len(linersplit) == 2 and linersplit[1].isdigit():
+ seq_lines.append(linersplit[0])
+ elif line.isdigit():
+ # Special case of final blank line with no bases
+ # just the sequence coordinate
+ pass
+ else:
+ warnings.warn(
+ "EMBL sequence line missing coordinates", BiopythonParserWarning
+ )
+ seq_lines.append(line)
+ line = self.handle.readline()
+ self.line = line
+ return misc_lines, "".join(seq_lines).replace(" ", "")
+
+ def _feed_first_line(self, consumer, line):
+ assert line[: self.HEADER_WIDTH].rstrip() == "ID"
+ if line[self.HEADER_WIDTH :].count(";") == 6:
+ # Looks like the semi colon separated style introduced in 2006
+ self._feed_first_line_new(consumer, line)
+ elif line[self.HEADER_WIDTH :].count(";") == 3:
+ if line.rstrip().endswith(" SQ"):
+ # EMBL-bank patent data
+ self._feed_first_line_patents(consumer, line)
+ else:
+ # Looks like the pre 2006 style
+ self._feed_first_line_old(consumer, line)
+ elif line[self.HEADER_WIDTH :].count(";") == 2:
+ # Looks like KIPO patent data
+ self._feed_first_line_patents_kipo(consumer, line)
+ else:
+ raise ValueError("Did not recognise the ID line layout:\n" + line)
+
+ def _feed_first_line_patents(self, consumer, line):
+ # Old style EMBL patent records where ID line ended SQ
+ # Not 100% sure that PRT here is really molecule type and
+ # not the data file division...
+ #
+ # Either Non-Redundant Level 1 database records,
+ # ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size>
+ # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ
+ #
+ # Or, Non-Redundant Level 2 database records:
+ # ID <accession>; <molecule type>; <non-redundant level 2>; <cluster size>
+ # e.g. ID NRP0000016E; PRT; NR2; 5 SQ
+ # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ
+ fields = [
+ data.strip() for data in line[self.HEADER_WIDTH :].strip()[:-3].split(";")
+ ]
+ assert len(fields) == 4
+ consumer.locus(fields[0])
+ consumer.residue_type(fields[1]) # semi-redundant
+ consumer.data_file_division(fields[2])
+ # TODO - Record cluster size?
+
+ def _feed_first_line_patents_kipo(self, consumer, line):
+ # EMBL format patent sequence from KIPO, e.g.
+ # ftp://ftp.ebi.ac.uk/pub/databases/patentdata/kipo_prt.dat.gz
+ #
+ # e.g. ID DI500001 STANDARD; PRT; 111 AA.
+ #
+ # This follows the style of _feed_first_line_old
+ assert line[: self.HEADER_WIDTH].rstrip() == "ID"
+ fields = [line[self.HEADER_WIDTH :].split(None, 1)[0]]
+ fields.extend(line[self.HEADER_WIDTH :].split(None, 1)[1].split(";"))
+ fields = [entry.strip() for entry in fields]
+ """
+ The tokens represent:
+
+ 0. Primary accession number
+ (space sep)
+ 1. ??? (e.g. standard)
+ (semi-colon)
+ 2. Molecule type (protein)? Division? Always 'PRT'
+ 3. Sequence length (e.g. '111 AA.')
+ """
+ consumer.locus(fields[0]) # Should we also call the accession consumer?
+ # consumer.molecule_type(fields[2])
+ self._feed_seq_length(consumer, fields[3])
+
+ def _feed_first_line_old(self, consumer, line):
+ # Expects an ID line in the style before 2006, e.g.
+ # ID SC10H5 standard; DNA; PRO; 4870 BP.
+ # ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.
+ assert line[: self.HEADER_WIDTH].rstrip() == "ID"
+ fields = [line[self.HEADER_WIDTH :].split(None, 1)[0]]
+ fields.extend(line[self.HEADER_WIDTH :].split(None, 1)[1].split(";"))
+ fields = [entry.strip() for entry in fields]
+ """
+ The tokens represent:
+
+ 0. Primary accession number
+ (space sep)
+ 1. ??? (e.g. standard)
+ (semi-colon)
+ 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
+ 3. Taxonomic division (e.g. 'PRO')
+ 4. Sequence length (e.g. '4639675 BP.')
+
+ """
+ consumer.locus(fields[0]) # Should we also call the accession consumer?
+ consumer.residue_type(fields[2])
+ if "circular" in fields[2]:
+ consumer.topology("circular")
+ consumer.molecule_type(fields[2].replace("circular", "").strip())
+ elif "linear" in fields[2]:
+ consumer.topology("linear")
+ consumer.molecule_type(fields[2].replace("linear", "").strip())
+ else:
+ consumer.molecule_type(fields[2].strip())
+ consumer.data_file_division(fields[3])
+ self._feed_seq_length(consumer, fields[4])
+
+ def _feed_first_line_new(self, consumer, line):
+ # Expects an ID line in the style introduced in 2006, e.g.
+ # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
+ # ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
+ assert line[: self.HEADER_WIDTH].rstrip() == "ID"
+ fields = [data.strip() for data in line[self.HEADER_WIDTH :].strip().split(";")]
+ assert len(fields) == 7
+ """
+ The tokens represent:
+
+ 0. Primary accession number
+ 1. Sequence version number
+ 2. Topology: 'circular' or 'linear'
+ 3. Molecule type (e.g. 'genomic DNA')
+ 4. Data class (e.g. 'STD')
+ 5. Taxonomic division (e.g. 'PRO')
+ 6. Sequence length (e.g. '4639675 BP.')
+
+ """
+
+ consumer.locus(fields[0])
+
+ # Call the accession consumer now, to make sure we record
+ # something as the record.id, in case there is no AC line
+ consumer.accession(fields[0])
+
+ # TODO - How to deal with the version field? At the moment the consumer
+ # will try and use this for the ID which isn't ideal for EMBL files.
+ version_parts = fields[1].split()
+ if (
+ len(version_parts) == 2
+ and version_parts[0] == "SV"
+ and version_parts[1].isdigit()
+ ):
+ consumer.version_suffix(version_parts[1])
+
+ # Based on how the old GenBank parser worked, merge these two:
+ consumer.residue_type(" ".join(fields[2:4])) # Semi-obsolete
+
+ consumer.topology(fields[2])
+ consumer.molecule_type(fields[3])
+
+ # consumer.xxx(fields[4]) # TODO - What should we do with the data class?
+
+ consumer.data_file_division(fields[5])
+
+ self._feed_seq_length(consumer, fields[6])
+
+ @staticmethod
+ def _feed_seq_length(consumer, text):
+ length_parts = text.split()
+ assert len(length_parts) == 2, "Invalid sequence length string %r" % text
+ assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."]
+ consumer.size(length_parts[0])
+
+ def _feed_header_lines(self, consumer, lines):
+ consumer_dict = {
+ "AC": "accession",
+ "SV": "version", # SV line removed in June 2006, now part of ID line
+ "DE": "definition",
+ # 'RN' : 'reference_num',
+ # 'RC' : reference comment... TODO
+ # 'RP' : 'reference_bases',
+ # 'RX' : reference cross reference... DOI or Pubmed
+ "RG": "consrtm", # optional consortium
+ # 'RA' : 'authors',
+ # 'RT' : 'title',
+ "RL": "journal",
+ "OS": "organism",
+ "OC": "taxonomy",
+ # 'DR' : data reference
+ "CC": "comment",
+ # 'XX' : splitter
+ }
+ # We have to handle the following specially:
+ # RX (depending on reference type...)
+ for line in lines:
+ line_type = line[: self.EMBL_INDENT].strip()
+ data = line[self.EMBL_INDENT :].strip()
+ if line_type == "XX":
+ pass
+ elif line_type == "RN":
+ # Reformat reference numbers for the GenBank based consumer
+ # e.g. '[1]' becomes '1'
+ if data[0] == "[" and data[-1] == "]":
+ data = data[1:-1]
+ consumer.reference_num(data)
+ elif line_type == "RP":
+ if data.strip() == "[-]":
+ # Patent EMBL files from KIPO just use: RN [-]
+ pass
+ else:
+ # Reformat reference numbers for the GenBank based consumer
+ # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
+ # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
+ # Note could be multi-line, and end with a comma
+ parts = [
+ bases.replace("-", " to ").strip()
+ for bases in data.split(",")
+ if bases.strip()
+ ]
+ consumer.reference_bases("(bases %s)" % "; ".join(parts))
+ elif line_type == "RT":
+ # Remove the enclosing quotes and trailing semi colon.
+ # Note the title can be split over multiple lines.
+ if data.startswith('"'):
+ data = data[1:]
+ if data.endswith('";'):
+ data = data[:-2]
+ consumer.title(data)
+ elif line_type == "RX":
+ # EMBL supports three reference types at the moment:
+ # - PUBMED PUBMED bibliographic database (NLM)
+ # - DOI Digital Object Identifier (International DOI Foundation)
+ # - AGRICOLA US National Agriculture Library (NAL) of the US Department
+ # of Agriculture (USDA)
+ #
+ # Format:
+ # RX resource_identifier; identifier.
+ #
+ # e.g.
+ # RX DOI; 10.1016/0024-3205(83)90010-3.
+ # RX PUBMED; 264242.
+ #
+ # Currently our reference object only supports PUBMED and MEDLINE
+ # (as these were in GenBank files?).
+ key, value = data.split(";", 1)
+ if value.endswith("."):
+ value = value[:-1]
+ value = value.strip()
+ if key == "PUBMED":
+ consumer.pubmed_id(value)
+ # TODO - Handle other reference types (here and in BioSQL bindings)
+ elif line_type == "CC":
+ # Have to pass a list of strings for this one (not just a string)
+ consumer.comment([data])
+ elif line_type == "DR":
+ # Database Cross-reference, format:
+ # DR database_identifier; primary_identifier; secondary_identifier.
+ #
+ # e.g.
+ # DR MGI; 98599; Tcrb-V4.
+ #
+ # TODO - How should we store any secondary identifier?
+ parts = data.rstrip(".").split(";")
+ # Turn it into "database_identifier:primary_identifier" to
+ # mimic the GenBank parser. e.g. "MGI:98599"
+ if len(parts) == 1:
+ warnings.warn(
+ "Malformed DR line in EMBL file.", BiopythonParserWarning
+ )
+ else:
+ consumer.dblink("%s:%s" % (parts[0].strip(), parts[1].strip()))
+ elif line_type == "RA":
+ # Remove trailing ; at end of authors list
+ consumer.authors(data.rstrip(";"))
+ elif line_type == "PR":
+ # In the EMBL patent files, this is a PR (PRiority) line which
+ # provides the earliest active priority within the family.
+ # The priority number comes first, followed by the priority date.
+ #
+ # e.g.
+ # PR JP19990377484 16-DEC-1999
+ #
+ # However, in most EMBL files this is a PR (PRoject) line which
+ # gives the BioProject reference number.
+ #
+ # e.g.
+ # PR Project:PRJNA60715;
+ #
+ # In GenBank files this corresponds to the old PROJECT line
+ # which was later replaced with the DBLINK line.
+ if data.startswith("Project:"):
+ # Remove trailing ; at end of the project reference
+ consumer.project(data.rstrip(";"))
+ elif line_type == "KW":
+ consumer.keywords(data.rstrip(";"))
+ elif line_type in consumer_dict:
+ # It's a semi-automatic entry!
+ getattr(consumer, consumer_dict[line_type])(data)
+ else:
+ if self.debug:
+ print("Ignoring EMBL header line:\n%s" % line)
+
+ def _feed_misc_lines(self, consumer, lines):
+ # TODO - Should we do something with the information on the SQ line(s)?
+ lines.append("")
+ line_iter = iter(lines)
+ try:
+ for line in line_iter:
+ if line.startswith("CO "):
+ line = line[5:].strip()
+ contig_location = line
+ while True:
+ line = next(line_iter)
+ if not line:
+ break
+ elif line.startswith("CO "):
+ # Don't need to preserve the whitespace here.
+ contig_location += line[5:].strip()
+ else:
+ raise ValueError(
+ "Expected CO (contig) continuation line, got:\n" + line
+ )
+ consumer.contig_location(contig_location)
+ if line.startswith("SQ Sequence "):
+ # e.g.
+ # SQ Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other;
+ #
+ # Or, EMBL-bank patent, e.g.
+ # SQ Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5;
+ self._feed_seq_length(
+ consumer, line[14:].rstrip().rstrip(";").split(";", 1)[0]
+ )
+ # TODO - Record the checksum etc?
+ return
+ except StopIteration:
+ raise ValueError("Problem in misc lines before sequence") from None
+
+
+class _ImgtScanner(EmblScanner):
+ """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).
+
+ IMGT files are like EMBL files but in order to allow longer feature types
+ the features should be indented by 25 characters not 21 characters. In
+ practice the IMGT flat files tend to use either 21 or 25 characters, so we
+ must cope with both.
+
+ This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
+ """
+
+ FEATURE_START_MARKERS = [
+ "FH Key Location/Qualifiers",
+ "FH Key Location/Qualifiers (from EMBL)",
+ "FH Key Location/Qualifiers",
+ "FH",
+ ]
+
+ def _feed_first_line(self, consumer, line):
+ assert line[: self.HEADER_WIDTH].rstrip() == "ID"
+ if line[self.HEADER_WIDTH :].count(";") != 5:
+ # Assume it's an older EMBL-like line,
+ return EmblScanner._feed_first_line(self, consumer, line)
+ # Otherwise assume it's the new (circa 2016) IMGT style
+ # as used in the IPD-IMGT/HLA Database
+ #
+ # https://github.com/ANHIG/IMGTHLA/
+ #
+ # The key changes post 3.16 are the addition of an SV value
+ # to the ID line, these additions should make the format more
+ # similar to the ENA style.
+ #
+ # ID HLA00001 standard; DNA; HUM; 3503 BP.
+ #
+ # becomes
+ #
+ # ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.
+ fields = [data.strip() for data in line[self.HEADER_WIDTH :].strip().split(";")]
+ assert len(fields) == 6
+ """
+ The tokens represent:
+
+ 0. Primary accession number (eg 'HLA00001')
+ 1. Sequence version number (eg 'SV 1')
+ 2. ??? eg 'standard'
+ 3. Molecule type (e.g. 'DNA')
+ 4. Taxonomic division (e.g. 'HUM')
+ 5. Sequence length (e.g. '3503 BP.')
+ """
+ consumer.locus(fields[0])
+
+ # See TODO on the EMBL _feed_first_line_new about version field
+ version_parts = fields[1].split()
+ if (
+ len(version_parts) == 2
+ and version_parts[0] == "SV"
+ and version_parts[1].isdigit()
+ ):
+ consumer.version_suffix(version_parts[1])
+
+ consumer.residue_type(fields[3])
+ if "circular" in fields[3]:
+ consumer.topology("circular")
+ consumer.molecule_type(fields[3].replace("circular", "").strip())
+ elif "linear" in fields[3]:
+ consumer.topology("linear")
+ consumer.molecule_type(fields[3].replace("linear", "").strip())
+ else:
+ consumer.molecule_type(fields[3].strip())
+ consumer.data_file_division(fields[4])
+ self._feed_seq_length(consumer, fields[5])
+
+ def parse_features(self, skip=False):
+ """Return list of tuples for the features (if present).
+
+ Each feature is returned as a tuple (key, location, qualifiers)
+ where key and location are strings (e.g. "CDS" and
+ "complement(join(490883..490885,1..879))") while qualifiers
+ is a list of two string tuples (feature qualifier keys and values).
+
+ Assumes you have already read to the start of the features table.
+ """
+ if self.line.rstrip() not in self.FEATURE_START_MARKERS:
+ if self.debug:
+ print("Didn't find any feature table")
+ return []
+
+ while self.line.rstrip() in self.FEATURE_START_MARKERS:
+ self.line = self.handle.readline()
+
+ bad_position_re = re.compile(r"([0-9]+)>")
+
+ features = []
+ line = self.line
+ while True:
+ if not line:
+ raise ValueError("Premature end of line during features table")
+ if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
+ if self.debug:
+ print("Found start of sequence")
+ break
+ line = line.rstrip()
+ if line == "//":
+ raise ValueError("Premature end of features table, marker '//' found")
+ if line in self.FEATURE_END_MARKERS:
+ if self.debug:
+ print("Found end of features")
+ line = self.handle.readline()
+ break
+ if line[2 : self.FEATURE_QUALIFIER_INDENT].strip() == "":
+ # This is an empty feature line between qualifiers. Empty
+ # feature lines within qualifiers are handled below (ignored).
+ line = self.handle.readline()
+ continue
+
+ if skip:
+ line = self.handle.readline()
+ while (
+ line[: self.FEATURE_QUALIFIER_INDENT]
+ == self.FEATURE_QUALIFIER_SPACER
+ ):
+ line = self.handle.readline()
+ else:
+ assert line[:2] == "FT"
+ try:
+ feature_key, location_start = line[2:].strip().split()
+ except ValueError:
+ # e.g. "FT TRANSMEMBRANE-REGION2163..2240\n"
+ # Assume indent of 25 as per IMGT spec, with the location
+ # start in column 26 (one-based).
+ feature_key = line[2:25].strip()
+ location_start = line[25:].strip()
+ feature_lines = [location_start]
+ line = self.handle.readline()
+ while (
+ line[: self.FEATURE_QUALIFIER_INDENT]
+ == self.FEATURE_QUALIFIER_SPACER
+ or line.rstrip() == ""
+ ): # cope with blank lines in the midst of a feature
+ # Use strip to remove any harmless trailing white space AND
+ # leading white space (copes with 21 or 26 indents and other variants)
+ assert line[:2] == "FT"
+ feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT :].strip())
+ line = self.handle.readline()
+ feature_key, location, qualifiers = self.parse_feature(
+ feature_key, feature_lines
+ )
+ # Try to handle known problems with IMGT locations here:
+ if ">" in location:
+ # Nasty hack for common IMGT bug, should be >123 not 123>
+ # in a location string. At least here the meaning is clear,
+ # and since it is so common I don't want to issue a warning
+ # warnings.warn("Feature location %s is invalid, "
+ # "moving greater than sign before position"
+ # % location, BiopythonParserWarning)
+ location = bad_position_re.sub(r">\1", location)
+ features.append((feature_key, location, qualifiers))
+ self.line = line
+ return features
+
+
+class GenBankScanner(InsdcScanner):
+ """For extracting chunks of information in GenBank files."""
+
+ RECORD_START = "LOCUS "
+ HEADER_WIDTH = 12
+ FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"]
+ FEATURE_END_MARKERS = []
+ FEATURE_QUALIFIER_INDENT = 21
+ FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
+ SEQUENCE_HEADERS = [
+ "CONTIG",
+ "ORIGIN",
+ "BASE COUNT",
+ "WGS",
+ "TSA",
+ "TLS",
+ ] # trailing spaces removed
+
+ GENBANK_INDENT = HEADER_WIDTH
+ GENBANK_SPACER = " " * GENBANK_INDENT
+
+ STRUCTURED_COMMENT_START = "-START##"
+ STRUCTURED_COMMENT_END = "-END##"
+ STRUCTURED_COMMENT_DELIM = " :: "
+
+ def parse_footer(self):
+ """Return a tuple containing a list of any misc strings, and the sequence."""
+ if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
+ raise ValueError("Footer format unexpected: '%s'" % self.line)
+
+ misc_lines = []
+ while (
+ self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS
+ or self.line[: self.HEADER_WIDTH] == " " * self.HEADER_WIDTH
+ or "WGS" == self.line[:3]
+ ):
+ misc_lines.append(self.line.rstrip())
+ self.line = self.handle.readline()
+ if not self.line:
+ raise ValueError("Premature end of file")
+
+ if self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
+ raise ValueError("Eh? '%s'" % self.line)
+
+ # Now just consume the sequence lines until reach the // marker
+ # or a CONTIG line
+ seq_lines = []
+ line = self.line
+ while True:
+ if not line:
+ warnings.warn(
+ "Premature end of file in sequence data", BiopythonParserWarning
+ )
+ line = "//"
+ break
+ line = line.rstrip()
+ if not line:
+ warnings.warn("Blank line in sequence data", BiopythonParserWarning)
+ line = self.handle.readline()
+ continue
+ if line == "//":
+ break
+ if line.startswith("CONTIG"):
+ break
+ if len(line) > 9 and line[9:10] != " ":
+ # Some broken programs indent the sequence by one space too many
+ # so try to get rid of that and test again.
+ warnings.warn(
+ "Invalid indentation for sequence line", BiopythonParserWarning
+ )
+ line = line[1:]
+ if len(line) > 9 and line[9:10] != " ":
+ raise ValueError("Sequence line mal-formed, '%s'" % line)
+ seq_lines.append(line[10:]) # remove spaces later
+ line = self.handle.readline()
+
+ self.line = line
+ return misc_lines, "".join(seq_lines).replace(" ", "")
+
+ def _feed_first_line(self, consumer, line):
+ """Scan over and parse GenBank LOCUS line (PRIVATE).
+
+ This must cope with several variants, primarily the old and new column
+ based standards from GenBank. Additionally EnsEMBL produces GenBank
+ files where the LOCUS line is space separated rather that following
+ the column based layout.
+
+ We also try to cope with GenBank like files with partial LOCUS lines.
+
+ As of release 229.0, the columns are no longer strictly in a given
+ position. See GenBank format release notes:
+
+ "Historically, the LOCUS line has had a fixed length and its
+ elements have been presented at specific column positions...
+ But with the anticipated increases in the lengths of accession
+ numbers, and the advent of sequences that are gigabases long,
+ maintaining the column positions will not always be possible and
+ the overall length of the LOCUS line could exceed 79 characters."
+
+ """
+ #####################################
+ # LOCUS line #
+ #####################################
+ if line[0 : self.GENBANK_INDENT] != "LOCUS ":
+ raise ValueError("LOCUS line does not start correctly:\n" + line)
+
+ # Have to break up the locus line, and handle the different bits of it.
+ # There are at least two different versions of the locus line...
+ if line[29:33] in [" bp ", " aa ", " rc "] and line[55:62] == " ":
+ # Old... note we insist on the 55:62 being empty to avoid trying
+ # to parse space separated LOCUS lines from Ensembl etc, see below.
+ #
+ # Positions Contents
+ # --------- --------
+ # 00:06 LOCUS
+ # 06:12 spaces
+ # 12:?? Locus name
+ # ??:?? space
+ # ??:29 Length of sequence, right-justified
+ # 29:33 space, bp, space
+ # 33:41 strand type / molecule type, e.g. DNA
+ # 41:42 space
+ # 42:51 Blank (implies linear), linear or circular
+ # 51:52 space
+ # 52:55 The division code (e.g. BCT, VRL, INV)
+ # 55:62 space
+ # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
+ #
+ # assert line[29:33] in [' bp ', ' aa ',' rc '] , \
+ # 'LOCUS line does not contain size units at expected position:\n' + line
+ if line[41:42] != " ":
+ raise ValueError(
+ "LOCUS line does not contain space at position 42:\n" + line
+ )
+ if line[42:51].strip() not in ["", "linear", "circular"]:
+ raise ValueError(
+ "LOCUS line does not contain valid entry "
+ "(linear, circular, ...):\n" + line
+ )
+ if line[51:52] != " ":
+ raise ValueError(
+ "LOCUS line does not contain space at position 52:\n" + line
+ )
+ # if line[55:62] != ' ':
+ # raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line)
+ if line[62:73].strip():
+ if line[64:65] != "-":
+ raise ValueError(
+ "LOCUS line does not contain - at "
+ "position 65 in date:\n" + line
+ )
+ if line[68:69] != "-":
+ raise ValueError(
+ "LOCUS line does not contain - at "
+ "position 69 in date:\n" + line
+ )
+
+ name_and_length_str = line[self.GENBANK_INDENT : 29]
+ while " " in name_and_length_str:
+ name_and_length_str = name_and_length_str.replace(" ", " ")
+ name_and_length = name_and_length_str.split(" ")
+ if len(name_and_length) > 2:
+ raise ValueError(
+ "Cannot parse the name and length in the LOCUS line:\n" + line
+ )
+ if len(name_and_length) == 1:
+ raise ValueError("Name and length collide in the LOCUS line:\n" + line)
+ # Should be possible to split them based on position, if
+ # a clear definition of the standard exists THAT AGREES with
+ # existing files.
+ name, length = name_and_length
+ if len(name) > 16:
+ # As long as the sequence is short, can steal its leading spaces
+ # to extend the name over the current 16 character limit.
+ # However, that deserves a warning as it is out of spec.
+ warnings.warn(
+ "GenBank LOCUS line identifier over 16 characters",
+ BiopythonParserWarning,
+ )
+ consumer.locus(name)
+ consumer.size(length)
+ # consumer.residue_type(line[33:41].strip())
+
+ if line[33:51].strip() == "" and line[29:33] == " aa ":
+ # Amino acids -> protein (even if there is no residue type given)
+ consumer.residue_type("PROTEIN")
+ else:
+ consumer.residue_type(line[33:51].strip())
+
+ consumer.molecule_type(line[33:41].strip())
+ consumer.topology(line[42:51].strip())
+ consumer.data_file_division(line[52:55])
+ if line[62:73].strip():
+ consumer.date(line[62:73])
+ elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [
+ "",
+ "linear",
+ "circular",
+ ]:
+ # New... linear/circular/big blank test should avoid EnsEMBL style
+ # LOCUS line being treated like a proper column based LOCUS line.
+ #
+ # Positions Contents
+ # --------- --------
+ # 00:06 LOCUS
+ # 06:12 spaces
+ # 12:?? Locus name
+ # ??:?? space
+ # ??:40 Length of sequence, right-justified
+ # 40:44 space, bp, space
+ # 44:47 Blank, ss-, ds-, ms-
+ # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA
+ # 54:55 space
+ # 55:63 Blank (implies linear), linear or circular
+ # 63:64 space
+ # 64:67 The division code (e.g. BCT, VRL, INV)
+ # 67:68 space
+ # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
+ #
+ if len(line) < 79:
+ # JBEI genbank files seem to miss a division code and date
+ # See issue #1656 e.g.
+ # LOCUS pEH010 5743 bp DNA circular
+ warnings.warn(
+ "Truncated LOCUS line found - is this correct?\n:%r" % line,
+ BiopythonParserWarning,
+ )
+ padding_len = 79 - len(line)
+ padding = " " * padding_len
+ line += padding
+
+ if line[40:44] not in [" bp ", " aa ", " rc "]:
+ raise ValueError(
+ "LOCUS line does not contain size units at "
+ "expected position:\n" + line
+ )
+ if line[44:47] not in [" ", "ss-", "ds-", "ms-"]:
+ raise ValueError(
+ "LOCUS line does not have valid strand "
+ "type (Single stranded, ...):\n" + line
+ )
+
+ if not (
+ line[47:54].strip() == ""
+ or "DNA" in line[47:54].strip().upper()
+ or "RNA" in line[47:54].strip().upper()
+ ):
+ raise ValueError(
+ "LOCUS line does not contain valid "
+ "sequence type (DNA, RNA, ...):\n" + line
+ )
+ if line[54:55] != " ":
+ raise ValueError(
+ "LOCUS line does not contain space at position 55:\n" + line
+ )
+ if line[55:63].strip() not in ["", "linear", "circular"]:
+ raise ValueError(
+ "LOCUS line does not contain valid "
+ "entry (linear, circular, ...):\n" + line
+ )
+ if line[63:64] != " ":
+ raise ValueError(
+ "LOCUS line does not contain space at position 64:\n" + line
+ )
+ if line[67:68] != " ":
+ raise ValueError(
+ "LOCUS line does not contain space at position 68:\n" + line
+ )
+ if line[68:79].strip():
+ if line[70:71] != "-":
+ raise ValueError(
+ "LOCUS line does not contain - at "
+ "position 71 in date:\n" + line
+ )
+ if line[74:75] != "-":
+ raise ValueError(
+ "LOCUS line does not contain - at "
+ "position 75 in date:\n" + line
+ )
+
+ name_and_length_str = line[self.GENBANK_INDENT : 40]
+ while " " in name_and_length_str:
+ name_and_length_str = name_and_length_str.replace(" ", " ")
+ name_and_length = name_and_length_str.split(" ")
+ if len(name_and_length) > 2:
+ raise ValueError(
+ "Cannot parse the name and length in the LOCUS line:\n" + line
+ )
+ if len(name_and_length) == 1:
+ raise ValueError("Name and length collide in the LOCUS line:\n" + line)
+ # Should be possible to split them based on position, if
+ # a clear definition of the standard exists THAT AGREES with
+ # existing files.
+ consumer.locus(name_and_length[0])
+ consumer.size(name_and_length[1])
+
+ if line[44:54].strip() == "" and line[40:44] == " aa ":
+ # Amino acids -> protein (even if there is no residue type given)
+ consumer.residue_type(("PROTEIN " + line[54:63]).strip())
+ else:
+ consumer.residue_type(line[44:63].strip())
+
+ consumer.molecule_type(line[44:54].strip())
+ consumer.topology(line[55:63].strip())
+ if line[64:76].strip():
+ consumer.data_file_division(line[64:67])
+ if line[68:79].strip():
+ consumer.date(line[68:79])
+ elif line[self.GENBANK_INDENT :].strip().count(" ") == 0:
+ # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
+ #
+ # e.g.
+ #
+ # "LOCUS U00096"
+ #
+ # rather than:
+ #
+ # "LOCUS U00096 4639675 bp DNA circular BCT"
+ #
+ # Positions Contents
+ # --------- --------
+ # 00:06 LOCUS
+ # 06:12 spaces
+ # 12:?? Locus name
+ if line[self.GENBANK_INDENT :].strip() != "":
+ consumer.locus(line[self.GENBANK_INDENT :].strip())
+ else:
+ # Must just have just "LOCUS ", is this even legitimate?
+ # We should be able to continue parsing... we need real world testcases!
+ warnings.warn(
+ "Minimal LOCUS line found - is this correct?\n:%r" % line,
+ BiopythonParserWarning,
+ )
+ elif (
+ len(line.split()) == 8
+ and line.split()[3] in ("aa", "bp")
+ and line.split()[5] in ("linear", "circular")
+ ):
+ # Cope with invalidly spaced GenBank LOCUS lines like
+ # LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001
+ # This will also cope with extra long accession numbers and
+ # sequence lengths
+ splitline = line.split()
+ consumer.locus(splitline[1])
+ # Provide descriptive error message if the sequence is too long
+ # for python to handle
+
+ if int(splitline[2]) > sys.maxsize:
+ raise ValueError(
+ "Tried to load a sequence with a length %s, "
+ "your installation of python can only load "
+ "sesquences of length %s" % (splitline[2], sys.maxsize)
+ )
+ else:
+ consumer.size(splitline[2])
+
+ consumer.residue_type(splitline[4])
+ consumer.topology(splitline[5])
+ consumer.data_file_division(splitline[6])
+ consumer.date(splitline[7])
+ if len(line) < 80:
+ warnings.warn(
+ "Attempting to parse malformed locus line:\n%r\n"
+ "Found locus %r size %r residue_type %r\n"
+ "Some fields may be wrong."
+ % (line, splitline[1], splitline[2], splitline[4]),
+ BiopythonParserWarning,
+ )
+ elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]:
+ # Cope with EnsEMBL genbank files which use space separation rather
+ # than the expected column based layout. e.g.
+ # LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011
+ # LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011
+ # LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011
+ # LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011
+ # Notice that the 'bp' can occur in the position expected by either
+ # the old or the new fixed column standards (parsed above).
+ splitline = line.split()
+ consumer.locus(splitline[1])
+ consumer.size(splitline[2])
+ consumer.residue_type(splitline[4])
+ consumer.data_file_division(splitline[5])
+ consumer.date(splitline[6])
+ elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
+ # Cope with EMBOSS seqret output where it seems the locus id can cause
+ # the other fields to overflow. We just IGNORE the other fields!
+ warnings.warn(
+ "Malformed LOCUS line found - is this correct?\n:%r" % line,
+ BiopythonParserWarning,
+ )
+ consumer.locus(line.split()[1])
+ consumer.size(line.split()[2])
+ elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]:
+ # Cope with pseudo-GenBank files like this:
+ # "LOCUS RNA5 complete 1718 bp"
+ # Treat everything between LOCUS and the size as the identifier.
+ warnings.warn(
+ "Malformed LOCUS line found - is this correct?\n:%r" % line,
+ BiopythonParserWarning,
+ )
+ consumer.locus(line[5:].rsplit(None, 2)[0].strip())
+ consumer.size(line.split()[-2])
+ else:
+ raise ValueError("Did not recognise the LOCUS line layout:\n" + line)
+
+ def _feed_header_lines(self, consumer, lines):
+ # Following dictionary maps GenBank lines to the associated
+ # consumer methods - the special cases like LOCUS where one
+ # genbank line triggers several consumer calls have to be
+ # handled individually.
+ consumer_dict = {
+ "DEFINITION": "definition",
+ "ACCESSION": "accession",
+ "NID": "nid",
+ "PID": "pid",
+ "DBSOURCE": "db_source",
+ "KEYWORDS": "keywords",
+ "SEGMENT": "segment",
+ "SOURCE": "source",
+ "AUTHORS": "authors",
+ "CONSRTM": "consrtm",
+ "PROJECT": "project",
+ "TITLE": "title",
+ "JOURNAL": "journal",
+ "MEDLINE": "medline_id",
+ "PUBMED": "pubmed_id",
+ "REMARK": "remark",
+ }
+ # We have to handle the following specially:
+ # ORIGIN (locus, size, residue_type, data_file_division and date)
+ # COMMENT (comment)
+ # VERSION (version and gi)
+ # DBLINK (database links like projects, newlines important)
+ # REFERENCE (reference_num and reference_bases)
+ # ORGANISM (organism and taxonomy)
+ lines = [_f for _f in lines if _f]
+ lines.append("") # helps avoid getting StopIteration all the time
+ line_iter = iter(lines)
+ try:
+ line = next(line_iter)
+ while True:
+ if not line:
+ break
+ line_type = line[: self.GENBANK_INDENT].strip()
+ data = line[self.GENBANK_INDENT :].strip()
+
+ if line_type == "VERSION":
+ # Need to call consumer.version(), and maybe consumer.gi() as well.
+ # e.g.
+ # VERSION AC007323.5 GI:6587720
+ while " " in data:
+ data = data.replace(" ", " ")
+ if " GI:" not in data:
+ consumer.version(data)
+ else:
+ if self.debug:
+ print(
+ "Version ["
+ + data.split(" GI:")[0]
+ + "], gi ["
+ + data.split(" GI:")[1]
+ + "]"
+ )
+ consumer.version(data.split(" GI:")[0])
+ consumer.gi(data.split(" GI:")[1])
+ # Read in the next line!
+ line = next(line_iter)
+ elif line_type == "DBLINK":
+ # Need to call consumer.dblink() for each line, e.g.
+ # DBLINK Project: 57779
+ # BioProject: PRJNA57779
+ consumer.dblink(data.strip())
+ # Read in the next line, and see if it's more of the DBLINK section:
+ while True:
+ line = next(line_iter)
+ if line[: self.GENBANK_INDENT] == self.GENBANK_SPACER:
+ # Add this continuation to the data string
+ consumer.dblink(line[self.GENBANK_INDENT :].strip())
+ else:
+ # End of the DBLINK, leave this text in the variable "line"
+ break
+ elif line_type == "REFERENCE":
+ if self.debug > 1:
+ print("Found reference [" + data + "]")
+ # Need to call consumer.reference_num() and consumer.reference_bases()
+ # e.g.
+ # REFERENCE 1 (bases 1 to 86436)
+ #
+ # Note that this can be multiline, see Bug 1968, e.g.
+ #
+ # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to
+ # 28259)
+ #
+ # For such cases we will call the consumer once only.
+ data = data.strip()
+
+ # Read in the next line, and see if it's more of the reference:
+ while True:
+ line = next(line_iter)
+ if line[: self.GENBANK_INDENT] == self.GENBANK_SPACER:
+ # Add this continuation to the data string
+ data += " " + line[self.GENBANK_INDENT :]
+ if self.debug > 1:
+ print("Extended reference text [" + data + "]")
+ else:
+ # End of the reference, leave this text in the variable "line"
+ break
+
+ # We now have all the reference line(s) stored in a string, data,
+ # which we pass to the consumer
+ while " " in data:
+ data = data.replace(" ", " ")
+ if " " not in data:
+ if self.debug > 2:
+ print('Reference number "' + data + '"')
+ consumer.reference_num(data)
+ else:
+ if self.debug > 2:
+ print(
+ 'Reference number "'
+ + data[: data.find(" ")]
+ + '", "'
+ + data[data.find(" ") + 1 :]
+ + '"'
+ )
+ consumer.reference_num(data[: data.find(" ")])
+ consumer.reference_bases(data[data.find(" ") + 1 :])
+ elif line_type == "ORGANISM":
+ # Typically the first line is the organism, and subsequent lines
+ # are the taxonomy lineage. However, given longer and longer
+ # species names (as more and more strains and sub strains get
+ # sequenced) the organism name can now get wrapped onto multiple
+ # lines. The NCBI say we have to recognise the lineage line by
+ # the presence of semi-colon delimited entries. In the long term,
+ # they are considering adding a new keyword (e.g. LINEAGE).
+ # See Bug 2591 for details.
+ organism_data = data
+ lineage_data = ""
+ while True:
+ line = next(line_iter)
+ if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER:
+ if lineage_data or ";" in line:
+ lineage_data += " " + line[self.GENBANK_INDENT :]
+ elif line[self.GENBANK_INDENT :].strip() == ".":
+ # No lineage data, just . place holder
+ pass
+ else:
+ organism_data += (
+ " " + line[self.GENBANK_INDENT :].strip()
+ )
+ else:
+ # End of organism and taxonomy
+ break
+ consumer.organism(organism_data)
+ if lineage_data.strip() == "" and self.debug > 1:
+ print("Taxonomy line(s) missing or blank")
+ consumer.taxonomy(lineage_data.strip())
+ del organism_data, lineage_data
+ elif line_type == "COMMENT":
+ # A COMMENT can either be plain text or tabular (Structured Comment),
+ # or contain both. Multi-line comments are common. The code calls
+ # consumer.comment() once with a list where each entry
+ # is a line. If there's a structured comment consumer.structured_comment()
+ # is called with a dict of dicts where the secondary key/value pairs are
+ # the same as those in the structured comment table. The primary key is
+ # the title or header of the table (e.g. Assembly-Data, FluData). See
+ # http://www.ncbi.nlm.nih.gov/genbank/structuredcomment
+ # for more information on Structured Comments.
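+                # For example (hypothetical record, illustrative only), a
+                # structured comment table like:
+                #
+                #     ##Assembly-Data-START##
+                #     Assembly Method :: SOAPdenovo v. 1.05
+                #     ##Assembly-Data-END##
+                #
+                # would be passed to consumer.structured_comment() as
+                # {"Assembly-Data": {"Assembly Method": "SOAPdenovo v. 1.05"}}.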
+ data = line[self.GENBANK_INDENT :]
+ if self.debug > 1:
+ print("Found comment")
+ comment_list = []
+ structured_comment_dict = OrderedDict()
+ regex = fr"([^#]+){self.STRUCTURED_COMMENT_START}$"
+ structured_comment_key = re.search(regex, data)
+ if structured_comment_key is not None:
+ structured_comment_key = structured_comment_key.group(1)
+ if self.debug > 1:
+ print("Found Structured Comment")
+ else:
+ comment_list.append(data)
+
+ while True:
+ line = next(line_iter)
+ data = line[self.GENBANK_INDENT :]
+ if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER:
+ if self.STRUCTURED_COMMENT_START in data:
+ regex = r"([^#]+){}$".format(
+ self.STRUCTURED_COMMENT_START
+ )
+ structured_comment_key = re.search(regex, data)
+ if structured_comment_key is not None:
+ structured_comment_key = structured_comment_key.group(
+ 1
+ )
+ else:
+ comment_list.append(data)
+ elif (
+ structured_comment_key is not None
+ and self.STRUCTURED_COMMENT_DELIM in data
+ ):
+ match = re.search(
+ r"(.+?)\s*{}\s*(.+)".format(
+ self.STRUCTURED_COMMENT_DELIM
+ ),
+ data,
+ )
+ structured_comment_dict.setdefault(
+ structured_comment_key, OrderedDict()
+ )
+ structured_comment_dict[structured_comment_key][
+ match.group(1)
+ ] = match.group(2)
+ if self.debug > 2:
+ print(
+ "Structured Comment continuation [" + data + "]"
+ )
+ elif (
+ structured_comment_key is not None
+ and self.STRUCTURED_COMMENT_END not in data
+ ):
+ # Don't die on a malformed comment, just warn and carry on
+ if (
+ structured_comment_key
+ not in structured_comment_dict
+ ):
+ warnings.warn(
+ "Structured comment not parsed for %s. Is it malformed?"
+ % consumer.data.name,
+ BiopythonParserWarning,
+ )
+ continue
+
+ # The current structured comment has a multiline value
+ previous_value_line = structured_comment_dict[
+ structured_comment_key
+ ][match.group(1)]
+ structured_comment_dict[structured_comment_key][
+ match.group(1)
+ ] = (previous_value_line + " " + line.strip())
+ elif self.STRUCTURED_COMMENT_END in data:
+ # End of structured comment
+ structured_comment_key = None
+ else:
+ comment_list.append(data)
+ if self.debug > 2:
+ print("Comment continuation [" + data + "]")
+ else:
+ # End of the comment
+ break
+ if comment_list:
+ consumer.comment(comment_list)
+ if structured_comment_dict:
+ consumer.structured_comment(structured_comment_dict)
+ del comment_list, structured_comment_key, structured_comment_dict
+ elif line_type in consumer_dict:
+ # It's a semi-automatic entry!
+ # Now, this may be a multi line entry...
+ while True:
+ line = next(line_iter)
+ if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER:
+ data += " " + line[self.GENBANK_INDENT :]
+ else:
+ # We now have all the data for this entry:
+
+                            # The DEFINITION field must end with a period
+                            # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5]
+                            # and discussion https://github.com/biopython/biopython/pull/616
+                            # We consider this period to belong to the syntax, not the data,
+                            # so we remove it if present
+ if line_type == "DEFINITION" and data.endswith("."):
+ data = data[:-1]
+ getattr(consumer, consumer_dict[line_type])(data)
+ # End of continuation - return to top of loop!
+ break
+ else:
+ if self.debug:
+ print("Ignoring GenBank header line:\n" % line)
+ # Read in next line
+ line = next(line_iter)
+ except StopIteration:
+ raise ValueError("Problem in header") from None
+
+ def _feed_misc_lines(self, consumer, lines):
+ # Deals with a few misc lines between the features and the sequence
+ lines.append("")
+ line_iter = iter(lines)
+ try:
+ for line in line_iter:
+ if line.startswith("BASE COUNT"):
+ line = line[10:].strip()
+ if line:
+ if self.debug:
+ print("base_count = " + line)
+ consumer.base_count(line)
+ if line.startswith("ORIGIN"):
+ line = line[6:].strip()
+ if line:
+ if self.debug:
+ print("origin_name = " + line)
+ consumer.origin_name(line)
+ if line.startswith("TLS "):
+ line = line[3:].strip()
+ consumer.tls(line)
+ if line.startswith("TSA "):
+ line = line[3:].strip()
+ consumer.tsa(line)
+ if line.startswith("WGS "):
+ line = line[3:].strip()
+ consumer.wgs(line)
+ if line.startswith("WGS_SCAFLD"):
+ line = line[10:].strip()
+ consumer.add_wgs_scafld(line)
+ if line.startswith("CONTIG"):
+ line = line[6:].strip()
+ contig_location = line
+ while True:
+ line = next(line_iter)
+ if not line:
+ break
+ elif line[: self.GENBANK_INDENT] == self.GENBANK_SPACER:
+                            # Don't need to preserve the whitespace here.
+ contig_location += line[self.GENBANK_INDENT :].rstrip()
+ elif line.startswith("ORIGIN"):
+ # Strange, seen this in GenPept files via Entrez gbwithparts
+ line = line[6:].strip()
+ if line:
+ consumer.origin_name(line)
+ break
+ else:
+ raise ValueError(
+ "Expected CONTIG continuation line, got:\n" + line
+ )
+ consumer.contig_location(contig_location)
+ return
+ except StopIteration:
+ raise ValueError("Problem in misc lines before sequence") from None
diff --git a/code/lib/Bio/GenBank/__init__.py b/code/lib/Bio/GenBank/__init__.py
new file mode 100644
index 0000000..1875116
--- /dev/null
+++ b/code/lib/Bio/GenBank/__init__.py
@@ -0,0 +1,1746 @@
+# Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved.
+# Copyright 2006-2017 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code to work with GenBank formatted files.
+
+Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
+the "genbank" or "embl" format names to parse GenBank or EMBL files into
+SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
+
+Using Bio.GenBank directly to parse GenBank files is only useful if you want
+to obtain GenBank-specific Record objects, which is a much closer
+representation to the raw file contents than the SeqRecord alternative from
+the FeatureParser (used in Bio.SeqIO).
+
+To use the Bio.GenBank parser, there are two helper functions:
+
+ - read Parse a handle containing a single GenBank record
+ as Bio.GenBank specific Record objects.
+ - parse Iterate over a handle containing multiple GenBank
+ records as Bio.GenBank specific Record objects.
+
+The following internal classes are not intended for direct use and may
+be deprecated in a future release.
+
+Classes:
+ - Iterator Iterate through a file of GenBank entries
+ - ErrorFeatureParser Catch errors caused during parsing.
+ - FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
+ - RecordParser Parse GenBank data into a Record object.
+
+Exceptions:
+ - ParserFailureError Exception indicating a failure in the parser (ie.
+ scanner or consumer)
+ - LocationParserError Exception indicating a problem with the spark based
+ location parser.
+
+"""
+
+import re
+import warnings
+
+from Bio import BiopythonParserWarning
+from Bio.Seq import Seq
+from Bio import SeqFeature
+
+# other Bio.GenBank stuff
+from .utils import FeatureValueCleaner
+from .Scanner import GenBankScanner
+
+
+# Constants used to parse GenBank header lines
+GENBANK_INDENT = 12
+GENBANK_SPACER = " " * GENBANK_INDENT
+
+# Constants for parsing GenBank feature lines
+FEATURE_KEY_INDENT = 5
+FEATURE_QUALIFIER_INDENT = 21
+FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
+FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
+
+# Regular expressions for location parsing
+_solo_location = r"[<>]?\d+"
+_pair_location = r"[<>]?\d+\.\.[<>]?\d+"
+_between_location = r"\d+\^\d+"
+
+_within_position = r"\(\d+\.\d+\)"
+_re_within_position = re.compile(_within_position)
+_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
+ _within_position,
+ _within_position,
+)
+assert _re_within_position.match("(3.9)")
+assert re.compile(_within_location).match("(3.9)..10")
+assert re.compile(_within_location).match("26..(30.33)")
+assert re.compile(_within_location).match("(13.19)..(20.28)")
+
+_oneof_position = r"one\-of\(\d+(,\d+)+\)"
+_re_oneof_position = re.compile(_oneof_position)
+_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
+assert _re_oneof_position.match("one-of(6,9)")
+assert re.compile(_oneof_location).match("one-of(6,9)..101")
+assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
+assert re.compile(_oneof_location).match("6..one-of(101,104)")
+
+assert not _re_oneof_position.match("one-of(3)")
+assert _re_oneof_position.match("one-of(3,6)")
+assert _re_oneof_position.match("one-of(3,6,9)")
+
+
+_simple_location = r"\d+\.\.\d+"
+_re_simple_location = re.compile(r"^%s$" % _simple_location)
+_re_simple_compound = re.compile(
+ r"^(join|order|bond)\(%s(,%s)*\)$" % (_simple_location, _simple_location)
+)
+_complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" % (
+ _pair_location,
+ _solo_location,
+ _between_location,
+ _within_location,
+ _oneof_location,
+)
+_re_complex_location = re.compile(r"^%s$" % _complex_location)
+_possibly_complemented_complex_location = r"(%s|complement\(%s\))" % (
+ _complex_location,
+ _complex_location,
+)
+_re_complex_compound = re.compile(
+ r"^(join|order|bond)\(%s(,%s)*\)$"
+ % (_possibly_complemented_complex_location, _possibly_complemented_complex_location)
+)
+
+
+assert _re_simple_location.match("104..160")
+assert not _re_simple_location.match("68451760..68452073^68452074")
+assert not _re_simple_location.match("<104..>160")
+assert not _re_simple_location.match("104")
+assert not _re_simple_location.match("<1")
+assert not _re_simple_location.match(">99999")
+assert not _re_simple_location.match("join(104..160,320..390,504..579)")
+assert not _re_simple_compound.match("bond(12,63)")
+assert _re_simple_compound.match("join(104..160,320..390,504..579)")
+assert _re_simple_compound.match("order(1..69,1308..1465)")
+assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
+assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
+assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
+assert not _re_simple_compound.match(
+ "join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)"
+)
+assert not _re_simple_compound.match("test(1..69,1308..1465)")
+assert not _re_simple_compound.match("complement(1..69)")
+assert not _re_simple_compound.match("(1..69)")
+assert _re_complex_location.match("(3.9)..10")
+assert _re_complex_location.match("26..(30.33)")
+assert _re_complex_location.match("(13.19)..(20.28)")
+assert _re_complex_location.match("41^42") # between
+assert _re_complex_location.match("AL121804:41^42")
+assert _re_complex_location.match("AL121804:41..610")
+assert _re_complex_location.match("AL121804.2:41..610")
+assert _re_complex_location.match(
+ "AL358792.24.1.166931:3274..3461"
+) # lots of dots in external reference
+assert _re_complex_location.match("one-of(3,6)..101")
+assert _re_complex_compound.match(
+ "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)"
+)
+assert not _re_simple_compound.match(
+ "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)"
+)
+assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")
+assert _re_complex_compound.match(
+ "join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))"
+)
+
+# Trans-spliced example from NC_016406, note underscore in reference name:
+assert _re_complex_location.match("NC_016402.1:6618..6676")
+assert _re_complex_location.match("181647..181905")
+assert _re_complex_compound.match(
+ "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
+)
+assert not _re_complex_location.match(
+ "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
+)
+assert not _re_simple_compound.match(
+ "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
+)
+assert not _re_complex_location.match(
+ "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
+)
+assert not _re_simple_location.match(
+ "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
+)
+
+_solo_bond = re.compile(r"bond\(%s\)" % _solo_location)
+assert _solo_bond.match("bond(196)")
+assert _solo_bond.search("bond(196)")
+assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))")
+
+
+def _pos(pos_str, offset=0):
+ """Build a Position object (PRIVATE).
+
+ For an end position, leave offset as zero (default):
+
+ >>> _pos("5")
+ ExactPosition(5)
+
+ For a start position, set offset to minus one (for Python counting):
+
+ >>> _pos("5", -1)
+ ExactPosition(4)
+
+ This also covers fuzzy positions:
+
+ >>> p = _pos("<5")
+ >>> p
+ BeforePosition(5)
+ >>> print(p)
+ <5
+ >>> int(p)
+ 5
+
+ >>> _pos(">5")
+ AfterPosition(5)
+
+ By default assumes an end position, so note the integer behaviour:
+
+ >>> p = _pos("one-of(5,8,11)")
+ >>> p
+ OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
+ >>> print(p)
+ one-of(5,8,11)
+ >>> int(p)
+ 11
+
+ >>> _pos("(8.10)")
+ WithinPosition(10, left=8, right=10)
+
+ Fuzzy start positions:
+
+ >>> p = _pos("<5", -1)
+ >>> p
+ BeforePosition(4)
+ >>> print(p)
+ <4
+ >>> int(p)
+ 4
+
+ Notice how the integer behaviour changes too!
+
+ >>> p = _pos("one-of(5,8,11)", -1)
+ >>> p
+ OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
+ >>> print(p)
+ one-of(4,7,10)
+ >>> int(p)
+ 4
+
+ """
+ if pos_str.startswith("<"):
+ return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
+ elif pos_str.startswith(">"):
+ return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
+ elif _re_within_position.match(pos_str):
+ s, e = pos_str[1:-1].split(".")
+ s = int(s) + offset
+ e = int(e) + offset
+ if offset == -1:
+ default = s
+ else:
+ default = e
+ return SeqFeature.WithinPosition(default, left=s, right=e)
+ elif _re_oneof_position.match(pos_str):
+ assert pos_str.startswith("one-of(")
+ assert pos_str[-1] == ")"
+ parts = [
+ SeqFeature.ExactPosition(int(pos) + offset)
+ for pos in pos_str[7:-1].split(",")
+ ]
+ if offset == -1:
+ default = min(int(pos) for pos in parts)
+ else:
+ default = max(int(pos) for pos in parts)
+ return SeqFeature.OneOfPosition(default, choices=parts)
+ else:
+ return SeqFeature.ExactPosition(int(pos_str) + offset)
+
+
+def _loc(loc_str, expected_seq_length, strand, seq_type=None):
+ """Make FeatureLocation from non-compound non-complement location (PRIVATE).
+
+ This is also invoked to 'automatically' fix ambiguous formatting of features
+ that span the origin of a circular sequence.
+
+ Simple examples,
+
+ >>> _loc("123..456", 1000, +1)
+ FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
+ >>> _loc("<123..>456", 1000, strand = -1)
+ FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)
+
+ A more complex location using within positions,
+
+ >>> _loc("(9.10)..(20.25)", 1000, 1)
+ FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)
+
+ Notice how that will act as though it has overall start 8 and end 25.
+
+ Zero length between feature,
+
+ >>> _loc("123^124", 1000, 0)
+ FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)
+
+ The expected sequence length is needed for a special case, a between
+ position at the start/end of a circular genome:
+
+ >>> _loc("1000^1", 1000, 1)
+ FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)
+
+ Apart from this special case, between positions P^Q must have P+1==Q,
+
+ >>> _loc("123^456", 1000, 1)
+ Traceback (most recent call last):
+ ...
+ ValueError: Invalid between location '123^456'
+
+ You can optionally provide a reference name:
+
+ >>> _loc("AL391218.9:105173..108462", 2000000, 1)
+ FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')
+
+ >>> _loc("<2644..159", 2868, 1, "circular")
+ CompoundLocation([FeatureLocation(BeforePosition(2643), ExactPosition(2868), strand=1), FeatureLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
+ """
+ if ":" in loc_str:
+ ref, loc_str = loc_str.split(":")
+ else:
+ ref = None
+ try:
+ s, e = loc_str.split("..")
+ except ValueError:
+ assert ".." not in loc_str
+ if "^" in loc_str:
+ # A between location like "67^68" (one based counting) is a
+ # special case (note it has zero length). In python slice
+ # notation this is 67:67, a zero length slice. See Bug 2622
+ # Further more, on a circular genome of length N you can have
+ # a location N^1 meaning the junction at the origin. See Bug 3098.
+ # NOTE - We can imagine between locations like "2^4", but this
+ # is just "3". Similarly, "2^5" is just "3..4"
+ s, e = loc_str.split("^")
+ if int(s) + 1 == int(e):
+ pos = _pos(s)
+ elif int(s) == expected_seq_length and e == "1":
+ pos = _pos(s)
+ else:
+ raise ValueError("Invalid between location %r" % loc_str) from None
+ return SeqFeature.FeatureLocation(pos, pos, strand, ref=ref)
+ else:
+ # e.g. "123"
+ s = loc_str
+ e = loc_str
+
+ # Attempt to fix features that span the origin
+ s_pos = _pos(s, -1)
+ e_pos = _pos(e)
+ if int(s_pos) > int(e_pos):
+ if seq_type is None or "circular" not in seq_type.lower():
+ warnings.warn(
+ "It appears that %r is a feature that spans "
+ "the origin, but the sequence topology is "
+ "undefined. Skipping feature." % loc_str,
+ BiopythonParserWarning,
+ )
+ return None
+ warnings.warn(
+ "Attempting to fix invalid location %r as "
+ "it looks like incorrect origin wrapping. "
+ "Please fix input file, this could have "
+ "unintended behavior." % loc_str,
+ BiopythonParserWarning,
+ )
+
+ f1 = SeqFeature.FeatureLocation(s_pos, expected_seq_length, strand)
+ f2 = SeqFeature.FeatureLocation(0, int(e_pos), strand)
+
+ if strand == -1:
+ # For complementary features spanning the origin
+ return f2 + f1
+ else:
+ return f1 + f2
+
+ return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand, ref=ref)
+
+
+def _split_compound_loc(compound_loc):
+ """Split a tricky compound location string (PRIVATE).
+
+ >>> list(_split_compound_loc("123..145"))
+ ['123..145']
+ >>> list(_split_compound_loc("123..145,200..209"))
+ ['123..145', '200..209']
+ >>> list(_split_compound_loc("one-of(200,203)..300"))
+ ['one-of(200,203)..300']
+ >>> list(_split_compound_loc("complement(123..145),200..209"))
+ ['complement(123..145)', '200..209']
+ >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
+ ['123..145', 'one-of(200,203)..209']
+ >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
+ ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
+ >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
+ ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
+ >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
+ ['123..145', '200..one-of(209,211)', '300']
+ >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
+ ['123..145', '200..one-of(209,211)']
+ >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"))
+ ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905']
+ """
+ if "one-of(" in compound_loc:
+ # Hard case
+ while "," in compound_loc:
+ assert compound_loc[0] != ","
+ assert compound_loc[0:2] != ".."
+ i = compound_loc.find(",")
+ part = compound_loc[:i]
+ compound_loc = compound_loc[i:] # includes the comma
+ while part.count("(") > part.count(")"):
+ assert "one-of(" in part, (part, compound_loc)
+ i = compound_loc.find(")")
+ part += compound_loc[: i + 1]
+ compound_loc = compound_loc[i + 1 :]
+ if compound_loc.startswith(".."):
+ i = compound_loc.find(",")
+ if i == -1:
+ part += compound_loc
+ compound_loc = ""
+ else:
+ part += compound_loc[:i]
+ compound_loc = compound_loc[i:] # includes the comma
+ while part.count("(") > part.count(")"):
+ assert part.count("one-of(") == 2
+ i = compound_loc.find(")")
+ part += compound_loc[: i + 1]
+ compound_loc = compound_loc[i + 1 :]
+ if compound_loc.startswith(","):
+ compound_loc = compound_loc[1:]
+ assert part
+ yield part
+ if compound_loc:
+ yield compound_loc
+ else:
+ # Easy case
+ yield from compound_loc.split(",")
+
+
+class Iterator:
+ """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).
+
+ This class is likely to be deprecated in a future release of Biopython.
+ Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
+ for SeqRecord and GenBank specific Record objects respectively instead.
+ """
+
+ def __init__(self, handle, parser=None):
+ """Initialize the iterator.
+
+ Arguments:
+ - handle - A handle with GenBank entries to iterate through.
+ - parser - An optional parser to pass the entries through before
+ returning them. If None, then the raw entry will be returned.
+
+ """
+ self.handle = handle
+ self._parser = parser
+
+ def __next__(self):
+ """Return the next GenBank record from the handle.
+
+        Will return None when we run out of records.
+ """
+ if self._parser is None:
+ lines = []
+ while True:
+ line = self.handle.readline()
+ if not line:
+ return None # Premature end of file?
+ lines.append(line)
+ if line.rstrip() == "//":
+ break
+ return "".join(lines)
+ try:
+ return self._parser.parse(self.handle)
+ except StopIteration:
+ return None
+
+ def __iter__(self):
+ """Iterate over the records."""
+ return iter(self.__next__, None)
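+
+    # Illustrative sketch (assumes "multi.gb" holds several entries): with
+    # the default parser=None, each iteration yields one raw record as a
+    # string ending at the "//" terminator:
+    #
+    #     with open("multi.gb") as handle:
+    #         for raw in Iterator(handle):
+    #             print(raw.splitlines()[0])  # the LOCUS line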
+
+
+class ParserFailureError(Exception):
+ """Failure caused by some kind of problem in the parser."""
+
+ pass
+
+
+class LocationParserError(Exception):
+ """Could not Properly parse out a location from a GenBank file."""
+
+ pass
+
+
+_cleaner = FeatureValueCleaner()
+
+
+class FeatureParser:
+ """Parse GenBank files into Seq + Feature objects (OBSOLETE).
+
+ Direct use of this class is discouraged, and may be deprecated in
+ a future release of Biopython.
+
+ Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
+ """
+
+ def __init__(self, debug_level=0, use_fuzziness=1, feature_cleaner=None):
+ """Initialize a GenBank parser and Feature consumer.
+
+ Arguments:
+          - debug_level - An optional argument that specifies the amount of
+ debugging information the parser should spit out. By default we have
+ no debugging info (the fastest way to do things), but if you want
+ you can set this as high as two and see exactly where a parse fails.
+ - use_fuzziness - Specify whether or not to use fuzzy representations.
+ The default is 1 (use fuzziness).
+ - feature_cleaner - A class which will be used to clean out the
+ values of features. This class must implement the function
+ clean_value. GenBank.utils has a "standard" cleaner class, which
+ is used by default.
+
+ """
+ self._scanner = GenBankScanner(debug_level)
+ self.use_fuzziness = use_fuzziness
+ if feature_cleaner:
+ self._cleaner = feature_cleaner
+ else:
+ self._cleaner = _cleaner # default
+
+ def parse(self, handle):
+ """Parse the specified handle."""
+ _consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
+ self._scanner.feed(handle, _consumer)
+ return _consumer.data
+
+
+class RecordParser:
+ """Parse GenBank files into Record objects (OBSOLETE).
+
+ Direct use of this class is discouraged, and may be deprecated in
+ a future release of Biopython.
+
+ Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
+ instead.
+ """
+
+ def __init__(self, debug_level=0):
+ """Initialize the parser.
+
+ Arguments:
+          - debug_level - An optional argument that specifies the amount of
+ debugging information the parser should spit out. By default we have
+ no debugging info (the fastest way to do things), but if you want
+ you can set this as high as two and see exactly where a parse fails.
+
+ """
+ self._scanner = GenBankScanner(debug_level)
+
+ def parse(self, handle):
+ """Parse the specified handle into a GenBank record."""
+ _consumer = _RecordConsumer()
+
+ self._scanner.feed(handle, _consumer)
+ return _consumer.data
+
+
+class _BaseGenBankConsumer:
+ """Abstract GenBank consumer providing useful general functions (PRIVATE).
+
+ This just helps to eliminate some duplication in things that most
+ GenBank consumers want to do.
+ """
+
+ # Special keys in GenBank records that we should remove spaces from
+    # For instance, /translation keys have values which are proteins and
+ # should have spaces and newlines removed from them. This class
+ # attribute gives us more control over specific formatting problems.
+ remove_space_keys = ["translation"]
+
+ def __init__(self):
+ pass
+
+ @staticmethod
+ def _split_keywords(keyword_string):
+ """Split a string of keywords into a nice clean list (PRIVATE)."""
+ # process the keywords into a python list
+ if keyword_string == "" or keyword_string == ".":
+ keywords = ""
+ elif keyword_string[-1] == ".":
+ keywords = keyword_string[:-1]
+ else:
+ keywords = keyword_string
+ keyword_list = keywords.split(";")
+ return [x.strip() for x in keyword_list]
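+
+    # For example, _split_keywords("RNA; mRNA.") -> ["RNA", "mRNA"]: the
+    # trailing period is dropped before splitting on semicolons.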
+
+ @staticmethod
+ def _split_accessions(accession_string):
+ """Split a string of accession numbers into a list (PRIVATE)."""
+ # first replace all line feeds with spaces
+ # Also, EMBL style accessions are split with ';'
+ accession = accession_string.replace("\n", " ").replace(";", " ")
+
+ return [x.strip() for x in accession.split() if x.strip()]
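+
+    # For example, _split_accessions("U49845;\nU49846") -> ["U49845", "U49846"];
+    # newlines and EMBL-style ";" separators are both treated as whitespace.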
+
+ @staticmethod
+ def _split_taxonomy(taxonomy_string):
+ """Split a string with taxonomy info into a list (PRIVATE)."""
+ if not taxonomy_string or taxonomy_string == ".":
+ # Missing data, no taxonomy
+ return []
+
+ if taxonomy_string[-1] == ".":
+ tax_info = taxonomy_string[:-1]
+ else:
+ tax_info = taxonomy_string
+ tax_list = tax_info.split(";")
+ new_tax_list = []
+ for tax_item in tax_list:
+ new_items = tax_item.split("\n")
+ new_tax_list.extend(new_items)
+ while "" in new_tax_list:
+ new_tax_list.remove("")
+ return [x.strip() for x in new_tax_list]
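+
+    # For example, _split_taxonomy("Eukaryota; Viridiplantae.") ->
+    # ["Eukaryota", "Viridiplantae"], while a bare "." placeholder -> [].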
+
+ @staticmethod
+ def _clean_location(location_string):
+ """Clean whitespace out of a location string (PRIVATE).
+
+ The location parser isn't a fan of whitespace, so we clean it out
+ before feeding it into the parser.
+ """
+ # Originally this imported string.whitespace and did a replace
+ # via a loop. It's simpler to just split on whitespace and rejoin
+ # the string - and this avoids importing string too. See Bug 2684.
+ return "".join(location_string.split())
+
+ @staticmethod
+ def _remove_newlines(text):
+ """Remove any newlines in the passed text, returning the new string (PRIVATE)."""
+ # get rid of newlines in the qualifier value
+ newlines = ["\n", "\r"]
+ for ws in newlines:
+ text = text.replace(ws, "")
+
+ return text
+
+ @staticmethod
+ def _normalize_spaces(text):
+ """Replace multiple spaces in the passed text with single spaces (PRIVATE)."""
+ # get rid of excessive spaces
+ return " ".join(x for x in text.split(" ") if x)
+
+ @staticmethod
+ def _remove_spaces(text):
+ """Remove all spaces from the passed text (PRIVATE)."""
+ return text.replace(" ", "")
+
+ @staticmethod
+ def _convert_to_python_numbers(start, end):
+ """Convert a start and end range to python notation (PRIVATE).
+
+ In GenBank, starts and ends are defined in "biological" coordinates,
+ where 1 is the first base and [i, j] means to include both i and j.
+
+ In python, 0 is the first base and [i, j] means to include i, but
+ not j.
+
+ So, to convert "biological" to python coordinates, we need to
+        subtract 1 from the start and leave the end unchanged.
+ """
+ new_start = start - 1
+ new_end = end
+
+ return new_start, new_end
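+
+    # For example, biological "bases 1 to 10" becomes the Python pair (0, 10),
+    # so sequence[0:10] covers exactly those ten bases.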
+
+
+class _FeatureConsumer(_BaseGenBankConsumer):
+ """Create a SeqRecord object with Features to return (PRIVATE).
+
+ Attributes:
+ - use_fuzziness - specify whether or not to parse with fuzziness in
+ feature locations.
+ - feature_cleaner - a class that will be used to provide specialized
+ cleaning-up of feature values.
+
+ """
+
+ def __init__(self, use_fuzziness, feature_cleaner=None):
+ from Bio.SeqRecord import SeqRecord
+
+ _BaseGenBankConsumer.__init__(self)
+ self.data = SeqRecord(None, id=None)
+ self.data.id = None
+ self.data.description = ""
+
+ self._use_fuzziness = use_fuzziness
+ self._feature_cleaner = feature_cleaner
+
+ self._seq_type = ""
+ self._seq_data = []
+ self._cur_reference = None
+ self._cur_feature = None
+ self._expected_size = None
+
+ def locus(self, locus_name):
+ """Set the locus name is set as the name of the Sequence."""
+ self.data.name = locus_name
+
+ def size(self, content):
+ """Record the sequence length."""
+ self._expected_size = int(content)
+
+ def residue_type(self, type):
+ """Record the sequence type (SEMI-OBSOLETE).
+
+ This reflects the fact that the topology (linear/circular) and
+ molecule type (e.g. DNA vs RNA) were a single field in early
+ files. Current GenBank/EMBL files have two fields.
+ """
+ self._seq_type = type.strip()
+
+ def topology(self, topology):
+ """Validate and record sequence topology.
+
+ The topology argument should be "linear" or "circular" (string).
+ """
+ if topology:
+ if topology not in ["linear", "circular"]:
+ raise ParserFailureError(
+ "Unexpected topology %r should be linear or circular" % topology
+ )
+ self.data.annotations["topology"] = topology
+
+ def molecule_type(self, mol_type):
+ """Validate and record the molecule type (for round-trip etc)."""
+ if mol_type:
+ if "circular" in mol_type or "linear" in mol_type:
+ raise ParserFailureError(
+ "Molecule type %r should not include topology" % mol_type
+ )
+
+ # Writing out records will fail if we have a lower case DNA
+ # or RNA string in here, so upper case it.
+ # This is a bit ugly, but we don't want to upper case e.g.
+ # the m in mRNA, but thanks to the strip we lost the spaces
+ # so we need to index from the back
+ if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper():
+ warnings.warn(
+ "Non-upper case molecule type in LOCUS line: %s" % mol_type,
+ BiopythonParserWarning,
+ )
+
+ self.data.annotations["molecule_type"] = mol_type
+
+ def data_file_division(self, division):
+ self.data.annotations["data_file_division"] = division
+
+ def date(self, submit_date):
+ self.data.annotations["date"] = submit_date
+
+ def definition(self, definition):
+ """Set the definition as the description of the sequence."""
+ if self.data.description:
+ # Append to any existing description
+ # e.g. EMBL files with two DE lines.
+ self.data.description += " " + definition
+ else:
+ self.data.description = definition
+
+ def accession(self, acc_num):
+ """Set the accession number as the id of the sequence.
+
+ If we have multiple accession numbers, the first one passed is
+ used.
+ """
+ new_acc_nums = self._split_accessions(acc_num)
+
+ # Also record them ALL in the annotations
+ try:
+ # On the off chance there was more than one accession line:
+ for acc in new_acc_nums:
+ # Prevent repeat entries
+ if acc not in self.data.annotations["accessions"]:
+ self.data.annotations["accessions"].append(acc)
+ except KeyError:
+ self.data.annotations["accessions"] = new_acc_nums
+
+ # if we haven't set the id information yet, add the first acc num
+ if not self.data.id:
+ if len(new_acc_nums) > 0:
+ # self.data.id = new_acc_nums[0]
+ # Use the FIRST accession as the ID, not the first on this line!
+ self.data.id = self.data.annotations["accessions"][0]
+
+ def tls(self, content):
+ self.data.annotations["tls"] = content.split("-")
+
+ def tsa(self, content):
+ self.data.annotations["tsa"] = content.split("-")
+
+ def wgs(self, content):
+ self.data.annotations["wgs"] = content.split("-")
+
+ def add_wgs_scafld(self, content):
+ self.data.annotations.setdefault("wgs_scafld", []).append(content.split("-"))
+
+ def nid(self, content):
+ self.data.annotations["nid"] = content
+
+ def pid(self, content):
+ self.data.annotations["pid"] = content
+
+ def version(self, version_id):
+ # Want to use the versioned accession as the record.id
+ # This comes from the VERSION line in GenBank files, or the
+ # obsolete SV line in EMBL. For the new EMBL files we need
+ # both the version suffix from the ID line and the accession
+ # from the AC line.
+ if version_id.count(".") == 1 and version_id.split(".")[1].isdigit():
+ self.accession(version_id.split(".")[0])
+ self.version_suffix(version_id.split(".")[1])
+ elif version_id:
+ # For backwards compatibility...
+ self.data.id = version_id
+
+ def project(self, content):
+ """Handle the information from the PROJECT line as a list of projects.
+
+ e.g.::
+
+ PROJECT GenomeProject:28471
+
+ or::
+
+ PROJECT GenomeProject:13543 GenomeProject:99999
+
+ This is stored as dbxrefs in the SeqRecord to be consistent with the
+ projected switch of this line to DBLINK in future GenBank versions.
+ Note the NCBI plan to replace "GenomeProject:28471" with the shorter
+ "Project:28471" as part of this transition.
+ """
+ content = content.replace("GenomeProject:", "Project:")
+ self.data.dbxrefs.extend(p for p in content.split() if p)
+
+ def dblink(self, content):
+ """Store DBLINK cross references as dbxrefs in our record object.
+
+ This line type is expected to replace the PROJECT line in 2009. e.g.
+
+ During transition::
+
+ PROJECT GenomeProject:28471
+ DBLINK Project:28471
+ Trace Assembly Archive:123456
+
+ Once the project line is dropped::
+
+ DBLINK Project:28471
+ Trace Assembly Archive:123456
+
+ Note GenomeProject -> Project.
+
+ We'll have to see some real examples to be sure, but based on the
+ above example we can expect one reference per line.
+
+ Note that at some point the NCBI have included an extra space, e.g.::
+
+ DBLINK Project: 28471
+
+ """
+ # During the transition period with both PROJECT and DBLINK lines,
+ # we don't want to add the same cross reference twice.
+ while ": " in content:
+ content = content.replace(": ", ":")
+ if content.strip() not in self.data.dbxrefs:
+ self.data.dbxrefs.append(content.strip())
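+
+    # For example, dblink("Project: 28471") normalises the stray space and
+    # appends "Project:28471" to self.data.dbxrefs (skipping duplicates).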
+
+ def version_suffix(self, version):
+ """Set the version to overwrite the id.
+
+ Since the version provides the same information as the accession
+ number, plus some extra info, we set this as the id if we have
+ a version.
+ """
+ # e.g. GenBank line:
+ # VERSION U49845.1 GI:1293613
+ # or the obsolete EMBL line:
+ # SV U49845.1
+ # Scanner calls consumer.version("U49845.1")
+ # which then calls consumer.version_suffix(1)
+ #
+ # e.g. EMBL new line:
+ # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
+ # Scanner calls consumer.version_suffix(1)
+ assert version.isdigit()
+ self.data.annotations["sequence_version"] = int(version)
+
+ def db_source(self, content):
+ self.data.annotations["db_source"] = content.rstrip()
+
+ def gi(self, content):
+ self.data.annotations["gi"] = content
+
+ def keywords(self, content):
+ if "keywords" in self.data.annotations:
+ # Multi-line keywords, append to list
+ # Note EMBL states "A keyword is never split between lines."
+ self.data.annotations["keywords"].extend(self._split_keywords(content))
+ else:
+ self.data.annotations["keywords"] = self._split_keywords(content)
+
+ def segment(self, content):
+ self.data.annotations["segment"] = content
+
+ def source(self, content):
+ # Note that some software (e.g. VectorNTI) may produce an empty
+ # source (rather than using a dot/period as might be expected).
+ if content == "":
+ source_info = ""
+ elif content[-1] == ".":
+ source_info = content[:-1]
+ else:
+ source_info = content
+ self.data.annotations["source"] = source_info
+
+ def organism(self, content):
+ self.data.annotations["organism"] = content
+
+ def taxonomy(self, content):
+ """Record (another line of) the taxonomy lineage."""
+ lineage = self._split_taxonomy(content)
+ try:
+ self.data.annotations["taxonomy"].extend(lineage)
+ except KeyError:
+ self.data.annotations["taxonomy"] = lineage
+
+ def reference_num(self, content):
+ """Signal the beginning of a new reference object."""
+ # if we have a current reference that hasn't been added to
+ # the list of references, add it.
+ if self._cur_reference is not None:
+ self.data.annotations["references"].append(self._cur_reference)
+ else:
+ self.data.annotations["references"] = []
+
+ self._cur_reference = SeqFeature.Reference()
+
+ def reference_bases(self, content):
+ """Attempt to determine the sequence region the reference entails.
+
+ Possible types of information we may have to deal with:
+
+ (bases 1 to 86436)
+ (sites)
+ (bases 1 to 105654; 110423 to 111122)
+ 1 (residues 1 to 182)
+ """
+ # first remove the parentheses
+ assert content.endswith(")"), content
+ ref_base_info = content[1:-1]
+
+ all_locations = []
+ # parse if we've got 'bases' and 'to'
+ if "bases" in ref_base_info and "to" in ref_base_info:
+ # get rid of the beginning 'bases'
+ ref_base_info = ref_base_info[5:]
+ locations = self._split_reference_locations(ref_base_info)
+ all_locations.extend(locations)
+ elif "residues" in ref_base_info and "to" in ref_base_info:
+ residues_start = ref_base_info.find("residues")
+ # get only the information after "residues"
+ ref_base_info = ref_base_info[(residues_start + len("residues ")) :]
+ locations = self._split_reference_locations(ref_base_info)
+ all_locations.extend(locations)
+
+ # make sure if we are not finding information then we have
+ # the string 'sites' or the string 'bases'
+ elif ref_base_info == "sites" or ref_base_info.strip() == "bases":
+ pass
+ # otherwise raise an error
+ else:
+ raise ValueError(
+ "Could not parse base info %s in record %s"
+ % (ref_base_info, self.data.id)
+ )
+
+ self._cur_reference.location = all_locations
+
+ def _split_reference_locations(self, location_string):
+ """Get reference locations out of a string of reference information (PRIVATE).
+
+ The passed string should be of the form::
+
+ 1 to 20; 20 to 100
+
+ This splits the information out and returns a list of location objects
+ based on the reference locations.
+ """
+ # split possibly multiple locations using the ';'
+ all_base_info = location_string.split(";")
+
+ new_locations = []
+ for base_info in all_base_info:
+ start, end = base_info.split("to")
+ new_start, new_end = self._convert_to_python_numbers(
+ int(start.strip()), int(end.strip())
+ )
+ this_location = SeqFeature.FeatureLocation(new_start, new_end)
+ new_locations.append(this_location)
+ return new_locations
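+
+    # For example, "1 to 20; 20 to 100" yields two FeatureLocation objects
+    # covering the Python ranges [0:20] and [19:100].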
+
+ def authors(self, content):
+ if self._cur_reference.authors:
+ self._cur_reference.authors += " " + content
+ else:
+ self._cur_reference.authors = content
+
+ def consrtm(self, content):
+ if self._cur_reference.consrtm:
+ self._cur_reference.consrtm += " " + content
+ else:
+ self._cur_reference.consrtm = content
+
+ def title(self, content):
+ if self._cur_reference is None:
+ warnings.warn(
+ "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning
+ )
+ elif self._cur_reference.title:
+ self._cur_reference.title += " " + content
+ else:
+ self._cur_reference.title = content
+
+ def journal(self, content):
+ if self._cur_reference.journal:
+ self._cur_reference.journal += " " + content
+ else:
+ self._cur_reference.journal = content
+
+ def medline_id(self, content):
+ self._cur_reference.medline_id = content
+
+ def pubmed_id(self, content):
+ self._cur_reference.pubmed_id = content
+
+ def remark(self, content):
+ """Deal with a reference comment."""
+ if self._cur_reference.comment:
+ self._cur_reference.comment += " " + content
+ else:
+ self._cur_reference.comment = content
+
+ def comment(self, content):
+ try:
+ self.data.annotations["comment"] += "\n" + "\n".join(content)
+ except KeyError:
+ self.data.annotations["comment"] = "\n".join(content)
+
+ def structured_comment(self, content):
+ self.data.annotations["structured_comment"] = content
+
+ def features_line(self, content):
+ """Get ready for the feature table when we reach the FEATURE line."""
+ self.start_feature_table()
+
+ def start_feature_table(self):
+ """Indicate we've got to the start of the feature table."""
+ # make sure we've added on our last reference object
+ if self._cur_reference is not None:
+ self.data.annotations["references"].append(self._cur_reference)
+ self._cur_reference = None
+
+ def feature_key(self, content):
+ # start a new feature
+ self._cur_feature = SeqFeature.SeqFeature()
+ self._cur_feature.type = content
+ self.data.features.append(self._cur_feature)
+
+ def location(self, content):
+ """Parse out location information from the location string.
+
+ This uses simple Python code with some regular expressions to do the
+ parsing, and then translates the results into appropriate objects.
+ """
+ # clean up newlines and other whitespace inside the location before
+ # parsing - locations should have no whitespace whatsoever
+ location_line = self._clean_location(content)
+
+ # Older records have junk like replace(266,"c") in the
+ # location line. Newer records just replace this with
+ # the number 266 and have the information in a more reasonable
+ # place. So we'll just grab out the number and feed this to the
+ # parser. We shouldn't really be losing any info this way.
+ if "replace" in location_line:
+ comma_pos = location_line.find(",")
+ location_line = location_line[8:comma_pos]
+
+ cur_feature = self._cur_feature
+
+ # Handle top level complement here for speed
+ if location_line.startswith("complement("):
+ assert location_line.endswith(")")
+ location_line = location_line[11:-1]
+ strand = -1
+ elif "PROTEIN" in self._seq_type.upper():
+ strand = None
+ else:
+            # Assume nucleotide; otherwise features in GenBank files
+            # with bad LOCUS lines would end up with strand None
+ strand = 1
+
+ # Special case handling of the most common cases for speed
+ if _re_simple_location.match(location_line):
+ # e.g. "123..456"
+ s, e = location_line.split("..")
+ try:
+ cur_feature.location = SeqFeature.FeatureLocation(
+ int(s) - 1, int(e), strand
+ )
+ except ValueError:
+ # Could be non-integers, more likely bad origin wrapping
+ cur_feature.location = _loc(
+ location_line,
+ self._expected_size,
+ strand,
+ seq_type=self._seq_type.lower(),
+ )
+ return
+
+ if ",)" in location_line:
+ warnings.warn(
+ "Dropping trailing comma in malformed feature location",
+ BiopythonParserWarning,
+ )
+ location_line = location_line.replace(",)", ")")
+
+ if _solo_bond.search(location_line):
+ # e.g. bond(196)
+ # e.g. join(bond(284),bond(305),bond(309),bond(305))
+ warnings.warn(
+ "Dropping bond qualifier in feature location", BiopythonParserWarning
+ )
+ # There ought to be a better way to do this...
+ for x in _solo_bond.finditer(location_line):
+ x = x.group()
+ location_line = location_line.replace(x, x[5:-1])
+
+ if _re_simple_compound.match(location_line):
+ # e.g. join(<123..456,480..>500)
+ i = location_line.find("(")
+ # cur_feature.location_operator = location_line[:i]
+ # we can split on the comma because these are simple locations
+ locs = []
+ for part in location_line[i + 1 : -1].split(","):
+ s, e = part.split("..")
+
+ try:
+ locs.append(SeqFeature.FeatureLocation(int(s) - 1, int(e), strand))
+ except ValueError:
+ # Could be non-integers, more likely bad origin wrapping
+
+ # In the case of bad origin wrapping, _loc will return
+ # a CompoundLocation. CompoundLocation.parts returns a
+ # list of the FeatureLocation objects inside the
+ # CompoundLocation.
+ locs.extend(
+ _loc(
+ part, self._expected_size, strand, self._seq_type.lower()
+ ).parts
+ )
+
+ if len(locs) < 2:
+ # The CompoundLocation will raise a ValueError here!
+ warnings.warn(
+ "Should have at least 2 parts for compound location",
+ BiopythonParserWarning,
+ )
+ cur_feature.location = None
+ return
+ if strand == -1:
+ cur_feature.location = SeqFeature.CompoundLocation(
+ locs[::-1], operator=location_line[:i]
+ )
+ else:
+ cur_feature.location = SeqFeature.CompoundLocation(
+ locs, operator=location_line[:i]
+ )
+ return
+
+ # Handle the general case with more complex regular expressions
+ if _re_complex_location.match(location_line):
+ # e.g. "AL121804.2:41..610"
+ cur_feature.location = _loc(
+ location_line,
+ self._expected_size,
+ strand,
+ seq_type=self._seq_type.lower(),
+ )
+ return
+
+ if _re_complex_compound.match(location_line):
+ i = location_line.find("(")
+ # cur_feature.location_operator = location_line[:i]
+ # Can't split on the comma because of positions like one-of(1,2,3)
+ locs = []
+ for part in _split_compound_loc(location_line[i + 1 : -1]):
+ if part.startswith("complement("):
+ assert part[-1] == ")"
+ part = part[11:-1]
+ assert strand != -1, "Double complement?"
+ part_strand = -1
+ else:
+ part_strand = strand
+ try:
+                    # If this part wraps the origin, _loc returns a
+                    # CompoundLocation; its .parts attribute gives the
+                    # underlying FeatureLocation objects (one or two),
+                    # which we use to extend the list of feature locations.
+ loc = _loc(
+ part,
+ self._expected_size,
+ part_strand,
+ seq_type=self._seq_type.lower(),
+ ).parts
+
+ except ValueError:
+ print(location_line)
+ print(part)
+ raise
+ # loc will be a list of one or two FeatureLocation items.
+ locs.extend(loc)
+ # Historically a join on the reverse strand has been represented
+ # in Biopython with both the parent SeqFeature and its children
+ # (the exons for a CDS) all given a strand of -1. Likewise, for
+ # a join feature on the forward strand they all have strand +1.
+ # However, we must also consider evil mixed strand examples like
+ # this, join(complement(69611..69724),139856..140087,140625..140650)
+ if strand == -1:
+ # Whole thing was wrapped in complement(...)
+ for l in locs:
+ assert l.strand == -1
+ # Reverse the backwards order used in GenBank files
+ # with complement(join(...))
+ cur_feature.location = SeqFeature.CompoundLocation(
+ locs[::-1], operator=location_line[:i]
+ )
+ else:
+ cur_feature.location = SeqFeature.CompoundLocation(
+ locs, operator=location_line[:i]
+ )
+ return
+ # Not recognised
+ if "order" in location_line and "join" in location_line:
+ # See Bug 3197
+ msg = (
+ 'Combinations of "join" and "order" within the same '
+ "location (nested operators) are illegal:\n" + location_line
+ )
+ raise LocationParserError(msg)
+ # This used to be an error....
+ cur_feature.location = None
+ warnings.warn(
+ BiopythonParserWarning(
+ "Couldn't parse feature location: %r" % location_line
+ )
+ )
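+
+    # Illustrative examples of location strings this method handles (for a
+    # nucleotide record, where the default strand is 1):
+    #   "123..456"              -> FeatureLocation(122, 456, strand=1)
+    #   "complement(123..456)"  -> FeatureLocation(122, 456, strand=-1)
+    #   "join(12..78,134..202)" -> CompoundLocation of two FeatureLocations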
+
+ def feature_qualifier(self, key, value):
+ """When we get a qualifier key and its value.
+
+ Can receive None, since you can have valueless keys such as /pseudo
+ """
+ # Hack to try to preserve historical behaviour of /pseudo etc
+ if value is None:
+            # if the key doesn't exist yet, add an empty string;
+            # otherwise just skip this valueless key
+            if key not in self._cur_feature.qualifiers:
+                self._cur_feature.qualifiers[key] = [""]
+            return
+
+ # Remove enclosing quotation marks
+ value = re.sub('^"|"$', "", value)
+
+ # Handle NCBI escaping
+ # Warn if escaping is not according to standard
+ if re.search(r'[^"]"[^"]|^"[^"]|[^"]"$', value):
+ warnings.warn(
+ 'The NCBI states double-quote characters like " should be escaped as "" '
+ "(two double - quotes), but here it was not: %r" % value,
+ BiopythonParserWarning,
+ )
+ # Undo escaping, repeated double quotes -> one double quote
+ value = value.replace('""', '"')
+
+ if self._feature_cleaner is not None:
+ value = self._feature_cleaner.clean_value(key, value)
+
+ # if the qualifier name exists, append the value
+ if key in self._cur_feature.qualifiers:
+ self._cur_feature.qualifiers[key].append(value)
+ # otherwise start a new list of the key with its values
+ else:
+ self._cur_feature.qualifiers[key] = [value]
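+
+    # For example, feature_qualifier("db_xref", '"taxon:562"') strips the
+    # enclosing quotes and stores qualifiers["db_xref"] = ["taxon:562"],
+    # while a valueless /pseudo ends up as qualifiers["pseudo"] = [""].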
+
+ def feature_qualifier_name(self, content_list):
+ """Use feature_qualifier instead (OBSOLETE)."""
+ raise NotImplementedError("Use the feature_qualifier method instead.")
+
+ def feature_qualifier_description(self, content):
+ """Use feature_qualifier instead (OBSOLETE)."""
+ raise NotImplementedError("Use the feature_qualifier method instead.")
+
+ def contig_location(self, content):
+ """Deal with CONTIG information."""
+ # Historically this was stored as a SeqFeature object, but it was
+ # stored under record.annotations["contig"] and not under
+ # record.features with the other SeqFeature objects.
+ #
+ # The CONTIG location line can include additional tokens like
+ # Gap(), Gap(100) or Gap(unk100) which are not used in the feature
+ # location lines, so storing it using SeqFeature based location
+ # objects is difficult.
+ #
+        # We now store this as a string, which means for BioSQL we are now in
+ # much better agreement with how BioPerl records the CONTIG line
+ # in the database.
+ #
+ # NOTE - This code assumes the scanner will return all the CONTIG
+ # lines already combined into one long string!
+ self.data.annotations["contig"] = content
+
+ def origin_name(self, content):
+ pass
+
+ def base_count(self, content):
+ pass
+
+ def base_number(self, content):
+ pass
+
+ def sequence(self, content):
+ """Add up sequence information as we get it.
+
+ To try and make things speedier, this puts all of the strings
+ into a list of strings, and then uses string.join later to put
+        them together. Supposedly, this is a big time saving.
+ """
+ assert " " not in content
+ self._seq_data.append(content.upper())
+
+ def record_end(self, content):
+ """Clean up when we've finished the record."""
+ # Try and append the version number to the accession for the full id
+ if not self.data.id:
+ if "accessions" in self.data.annotations:
+ raise ValueError(
+ "Problem adding version number to accession: "
+ + str(self.data.annotations["accessions"])
+ )
+ self.data.id = self.data.name # Good fall back?
+ elif self.data.id.count(".") == 0:
+ try:
+ self.data.id += ".%i" % self.data.annotations["sequence_version"]
+ except KeyError:
+ pass
+
+ # add the sequence information
+
+ sequence = "".join(self._seq_data)
+
+ if (
+ self._expected_size is not None
+ and len(sequence) != 0
+ and self._expected_size != len(sequence)
+ ):
+ warnings.warn(
+ "Expected sequence length %i, found %i (%s)."
+ % (self._expected_size, len(sequence), self.data.id),
+ BiopythonParserWarning,
+ )
+
+ molecule_type = None
+ if self._seq_type:
+ # mRNA is really also DNA, since it is actually cDNA
+ if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
+ molecule_type = "DNA"
+ # are there ever really RNA sequences in GenBank?
+ elif "RNA" in self._seq_type.upper():
+ # Even for data which was from RNA, the sequence string
+ # is usually given as DNA (T not U). Bug 3010
+ molecule_type = "RNA"
+ elif (
+ "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
+ ): # PRT is used in EMBL-bank for patents
+ molecule_type = "protein"
+ # work around ugly GenBank records which have circular or
+ # linear but no indication of sequence type
+ elif self._seq_type in ["circular", "linear", "unspecified"]:
+ pass
+ # we have a bug if we get here
+ else:
+ raise ValueError(
+ "Could not determine molecule_type for seq_type %s" % self._seq_type
+ )
+ # Don't overwrite molecule_type
+ if molecule_type is not None:
+ self.data.annotations["molecule_type"] = self.data.annotations.get(
+ "molecule_type", molecule_type
+ )
+ if not sequence and self._expected_size:
+ self.data.seq = Seq(None, length=self._expected_size)
+ else:
+ self.data.seq = Seq(sequence)
+
+
+class _RecordConsumer(_BaseGenBankConsumer):
+ """Create a GenBank Record object from scanner generated information (PRIVATE)."""
+
+ def __init__(self):
+ _BaseGenBankConsumer.__init__(self)
+ from . import Record
+
+ self.data = Record.Record()
+
+ self._seq_data = []
+ self._cur_reference = None
+ self._cur_feature = None
+ self._cur_qualifier = None
+
+ def tls(self, content):
+ self.data.tls = content.split("-")
+
+ def tsa(self, content):
+ self.data.tsa = content.split("-")
+
+ def wgs(self, content):
+ self.data.wgs = content.split("-")
+
+ def add_wgs_scafld(self, content):
+ self.data.wgs_scafld.append(content.split("-"))
+
+ def locus(self, content):
+ self.data.locus = content
+
+ def size(self, content):
+ self.data.size = content
+
+ def residue_type(self, content):
+ # Be lenient about parsing, but technically lowercase residue types are malformed.
+ if "dna" in content or "rna" in content:
+ warnings.warn(
+ "Invalid seq_type (%s): DNA/RNA should be uppercase." % content,
+ BiopythonParserWarning,
+ )
+ self.data.residue_type = content
+
+ def data_file_division(self, content):
+ self.data.data_file_division = content
+
+ def date(self, content):
+ self.data.date = content
+
+ def definition(self, content):
+ self.data.definition = content
+
+ def accession(self, content):
+ for acc in self._split_accessions(content):
+ if acc not in self.data.accession:
+ self.data.accession.append(acc)
+
+ def molecule_type(self, mol_type):
+ """Validate and record the molecule type (for round-trip etc)."""
+ if mol_type:
+ if "circular" in mol_type or "linear" in mol_type:
+ raise ParserFailureError(
+ "Molecule type %r should not include topology" % mol_type
+ )
+
+ # Writing out records will fail if we have a lower case DNA
+ # or RNA string in here, so upper case it.
+ # This is a bit ugly, but we don't want to upper case e.g.
+ # the m in mRNA, but thanks to the strip we lost the spaces
+ # so we need to index from the back
+ if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper():
+ warnings.warn(
+ "Non-upper case molecule type in LOCUS line: %s" % mol_type,
+ BiopythonParserWarning,
+ )
+
+ self.data.molecule_type = mol_type
+
+ def topology(self, topology):
+ """Validate and record sequence topology.
+
+ The topology argument should be "linear" or "circular" (string).
+ """
+ if topology:
+ if topology not in ["linear", "circular"]:
+ raise ParserFailureError(
+ "Unexpected topology %r should be linear or circular" % topology
+ )
+ self.data.topology = topology
+
+ def nid(self, content):
+ self.data.nid = content
+
+ def pid(self, content):
+ self.data.pid = content
+
+ def version(self, content):
+ self.data.version = content
+
+ def db_source(self, content):
+ self.data.db_source = content.rstrip()
+
+ def gi(self, content):
+ self.data.gi = content
+
+ def keywords(self, content):
+ self.data.keywords = self._split_keywords(content)
+
+ def project(self, content):
+ self.data.projects.extend(p for p in content.split() if p)
+
+ def dblink(self, content):
+ self.data.dblinks.append(content)
+
+ def segment(self, content):
+ self.data.segment = content
+
+ def source(self, content):
+ self.data.source = content
+
+ def organism(self, content):
+ self.data.organism = content
+
+ def taxonomy(self, content):
+ self.data.taxonomy = self._split_taxonomy(content)
+
+ def reference_num(self, content):
+ """Grab the reference number and signal the start of a new reference."""
+ # check if we have a reference to add
+ if self._cur_reference is not None:
+ self.data.references.append(self._cur_reference)
+
+ from . import Record
+
+ self._cur_reference = Record.Reference()
+ self._cur_reference.number = content
+
+ def reference_bases(self, content):
+ self._cur_reference.bases = content
+
+ def authors(self, content):
+ self._cur_reference.authors = content
+
+ def consrtm(self, content):
+ self._cur_reference.consrtm = content
+
+ def title(self, content):
+ if self._cur_reference is None:
+ warnings.warn(
+ "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning
+ )
+ return
+ self._cur_reference.title = content
+
+ def journal(self, content):
+ self._cur_reference.journal = content
+
+ def medline_id(self, content):
+ self._cur_reference.medline_id = content
+
+ def pubmed_id(self, content):
+ self._cur_reference.pubmed_id = content
+
+ def remark(self, content):
+ self._cur_reference.remark = content
+
+ def comment(self, content):
+ self.data.comment += "\n".join(content)
+
+ def structured_comment(self, content):
+ self.data.structured_comment = content
+
+ def primary_ref_line(self, content):
+ """Save reference data for the PRIMARY line."""
+ self.data.primary.append(content)
+
+ def primary(self, content):
+ pass
+
+ def features_line(self, content):
+ """Get ready for the feature table when we reach the FEATURE line."""
+ self.start_feature_table()
+
+ def start_feature_table(self):
+ """Signal the start of the feature table."""
+ # we need to add on the last reference
+ if self._cur_reference is not None:
+ self.data.references.append(self._cur_reference)
+
+ def feature_key(self, content):
+ """Grab the key of the feature and signal the start of a new feature."""
+ # first add on feature information if we've got any
+ self._add_feature()
+
+ from . import Record
+
+ self._cur_feature = Record.Feature()
+ self._cur_feature.key = content
+
+ def _add_feature(self):
+ """Add a feature to the record, with relevant checks (PRIVATE).
+
+ This does all of the appropriate checking to make sure we haven't
+ left any info behind, and that we are only adding info if it
+ exists.
+ """
+ if self._cur_feature is not None:
+ # if we have a left over qualifier, add it to the qualifiers
+ # on the current feature
+ if self._cur_qualifier is not None:
+ self._cur_feature.qualifiers.append(self._cur_qualifier)
+
+ self._cur_qualifier = None
+ self.data.features.append(self._cur_feature)
+
+ def location(self, content):
+ self._cur_feature.location = self._clean_location(content)
+
+ def feature_qualifier(self, key, value):
+ self.feature_qualifier_name([key])
+ if value is not None:
+ self.feature_qualifier_description(value)
+
+ def feature_qualifier_name(self, content_list):
+ """Deal with qualifier names.
+
+ We receive a list of keys, since you can have valueless keys such as
+ /pseudo which would be passed in with the next key (since no other
+ tags separate them in the file)
+ """
+ from . import Record
+
+ for content in content_list:
+ # the record parser keeps the /s -- add them if we don't have 'em
+ if not content.startswith("/"):
+ content = "/%s" % content
+ # add on a qualifier if we've got one
+ if self._cur_qualifier is not None:
+ self._cur_feature.qualifiers.append(self._cur_qualifier)
+
+ self._cur_qualifier = Record.Qualifier()
+ self._cur_qualifier.key = content
+
+ def feature_qualifier_description(self, content):
+        # if we have info then the qualifier key should include an "="
+ if "=" not in self._cur_qualifier.key:
+ self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
+ cur_content = self._remove_newlines(content)
+ # remove all spaces from the value if it is a type where spaces
+ # are not important
+ for remove_space_key in self.__class__.remove_space_keys:
+ if remove_space_key in self._cur_qualifier.key:
+ cur_content = self._remove_spaces(cur_content)
+ self._cur_qualifier.value = self._normalize_spaces(cur_content)
+
+ def base_count(self, content):
+ self.data.base_counts = content
+
+ def origin_name(self, content):
+ self.data.origin = content
+
+ def contig_location(self, content):
+ """Signal that we have contig information to add to the record."""
+ self.data.contig = self._clean_location(content)
+
+ def sequence(self, content):
+ """Add sequence information to a list of sequence strings.
+
+ This removes spaces in the data and uppercases the sequence, and
+ then adds it to a list of sequences. Later on we'll join this
+ list together to make the final sequence. This is faster than
+ adding on the new string every time.
+ """
+ assert " " not in content
+ self._seq_data.append(content.upper())
+
+ def record_end(self, content):
+ """Signal the end of the record and do any necessary clean-up."""
+ # add together all of the sequence parts to create the
+ # final sequence string
+ self.data.sequence = "".join(self._seq_data)
+ # add on the last feature
+ self._add_feature()
+
+
+def parse(handle):
+ """Iterate over GenBank formatted entries as Record objects.
+
+ >>> from Bio import GenBank
+ >>> with open("GenBank/NC_000932.gb") as handle:
+ ... for record in GenBank.parse(handle):
+ ... print(record.accession)
+ ['NC_000932']
+
+ To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
+ instead.
+ """
+ return iter(Iterator(handle, RecordParser()))
+
+
+def read(handle):
+ """Read a handle containing a single GenBank entry as a Record object.
+
+ >>> from Bio import GenBank
+ >>> with open("GenBank/NC_000932.gb") as handle:
+ ... record = GenBank.read(handle)
+ ... print(record.accession)
+ ['NC_000932']
+
+ To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
+ instead.
+ """
+ iterator = parse(handle)
+ try:
+ record = next(iterator)
+ except StopIteration:
+ raise ValueError("No records found in handle") from None
+ try:
+ next(iterator)
+ raise ValueError("More than one record found in handle")
+ except StopIteration:
+ pass
+ return record
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
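
Below is a minimal usage sketch for the record-level parser defined in this file; `example.gb` is a hypothetical input file, and the attributes read (`accession`, `molecule_type`, `features`, `qualifiers`) are those populated by the consumer methods above.

```python
from Bio import GenBank

with open("example.gb") as handle:  # hypothetical GenBank file
    for record in GenBank.parse(handle):
        print(record.accession, record.molecule_type, record.topology)
        for feature in record.features:
            # qualifiers keep their leading "/key=" form, as noted in
            # feature_qualifier_name() above
            print(" ", feature.key, feature.location)
            for qualifier in feature.qualifiers:
                print("   ", qualifier.key, qualifier.value)
```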
diff --git a/code/lib/Bio/GenBank/__pycache__/Record.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/Record.cpython-37.pyc
new file mode 100644
index 0000000..862e2a0
Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/Record.cpython-37.pyc differ
diff --git a/code/lib/Bio/GenBank/__pycache__/Scanner.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/Scanner.cpython-37.pyc
new file mode 100644
index 0000000..24b0a53
Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/Scanner.cpython-37.pyc differ
diff --git a/code/lib/Bio/GenBank/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..9d0e9c2
Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/GenBank/__pycache__/utils.cpython-37.pyc b/code/lib/Bio/GenBank/__pycache__/utils.cpython-37.pyc
new file mode 100644
index 0000000..74c8727
Binary files /dev/null and b/code/lib/Bio/GenBank/__pycache__/utils.cpython-37.pyc differ
diff --git a/code/lib/Bio/GenBank/utils.py b/code/lib/Bio/GenBank/utils.py
new file mode 100644
index 0000000..6f0eb28
--- /dev/null
+++ b/code/lib/Bio/GenBank/utils.py
@@ -0,0 +1,68 @@
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+
+"""Useful utilities for helping in parsing GenBank files."""
+
+
+class FeatureValueCleaner:
+ r"""Provide specialized capabilities for cleaning up values in features.
+
+ This class is designed to provide a mechanism to clean up and process
+ values in the key/value pairs of GenBank features. This is useful
+ because in cases like::
+
+ /translation="MED
+ YDPWNLRFQSKYKSRDA"
+
+ you'll otherwise end up with white space in it.
+
+ This cleaning needs to be done on a case by case basis since it is
+ impossible to interpret whether you should be concatenating everything
+ (as in translations), or combining things with spaces (as might be
+ the case with /notes).
+
+ >>> cleaner = FeatureValueCleaner(["translation"])
+ >>> cleaner
+ FeatureValueCleaner(['translation'])
+ >>> cleaner.clean_value("translation", "MED\nYDPWNLRFQSKYKSRDA")
+ 'MEDYDPWNLRFQSKYKSRDA'
+ """
+
+ keys_to_process = ["translation"]
+
+ def __init__(self, to_process=keys_to_process):
+ """Initialize with the keys we should deal with."""
+ self._to_process = to_process
+
+ def __repr__(self):
+ """Return a string representation of the class."""
+ return f"{self.__class__.__name__}({self._to_process!r})"
+
+ def clean_value(self, key_name, value):
+ """Clean the specified value and return it.
+
+ If the value is not specified to be dealt with, the original value
+ will be returned.
+ """
+ if key_name in self._to_process:
+ try:
+ cleaner = getattr(self, "_clean_%s" % key_name)
+ except AttributeError:
+ raise AssertionError(
+ "No function to clean key: %s" % key_name
+ ) from None
+ value = cleaner(value)
+ return value
+
+ def _clean_translation(self, value):
+ """Concatenate a translation value to one long protein string (PRIVATE)."""
+ translation_parts = value.split()
+ return "".join(translation_parts)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
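
Since `clean_value` dispatches to a method named `_clean_<key>`, the cleaner can be extended by subclassing. A sketch under that assumption follows; the `/note` handling is illustrative, not shipped behaviour.

```python
from Bio.GenBank.utils import FeatureValueCleaner

class NoteCleaner(FeatureValueCleaner):
    """Also collapse wrapped /note values, joining lines with spaces."""

    def __init__(self):
        super().__init__(to_process=["translation", "note"])

    def _clean_note(self, value):
        # combine with spaces rather than concatenating, per the class docstring
        return " ".join(value.split())

cleaner = NoteCleaner()
print(cleaner.clean_value("note", "spans\nseveral   lines"))  # spans several lines
print(cleaner.clean_value("translation", "MED\nYDP"))  # MEDYDP
```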
diff --git a/code/lib/Bio/Geo/Record.py b/code/lib/Bio/Geo/Record.py
new file mode 100644
index 0000000..5e38c78
--- /dev/null
+++ b/code/lib/Bio/Geo/Record.py
@@ -0,0 +1,92 @@
+# Copyright 2001 by Katharine Lindner. All rights reserved.
+# Copyright 2006 by PeterC. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Hold GEO data in a straightforward format.
+
+classes:
+o Record - All of the information in a GEO record.
+
+See http://www.ncbi.nlm.nih.gov/geo/
+"""
+
+
+class Record:
+ """Hold GEO information in a format similar to the original record.
+
+ The Record class is meant to make data easy to get to when you are
+ just interested in looking at GEO data.
+
+ Attributes:
+ entity_type
+ entity_id
+ entity_attributes
+ col_defs
+ table_rows
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self.entity_type = ""
+ self.entity_id = ""
+ self.entity_attributes = {}
+ self.col_defs = {}
+ self.table_rows = []
+
+ def __str__(self):
+ """Return the GEO record as a string."""
+ output = ""
+ output += "GEO Type: %s\n" % self.entity_type
+ output += "GEO Id: %s\n" % self.entity_id
+ att_keys = sorted(self.entity_attributes)
+ for key in att_keys:
+ contents = self.entity_attributes[key]
+ if isinstance(contents, list):
+ for item in contents:
+ try:
+ output += "%s: %s\n" % (key, item[:40])
+ output += out_block(item[40:])
+ except Exception: # TODO: IndexError?
+ pass
+ elif isinstance(contents, str):
+ output += "%s: %s\n" % (key, contents[:40])
+ output += out_block(contents[40:])
+ else:
+ print(contents)
+ output += "%s: %s\n" % (key, contents[:40])
+ output += out_block(contents[40:])
+ col_keys = sorted(self.col_defs)
+ output += "Column Header Definitions\n"
+ for key in col_keys:
+ val = self.col_defs[key]
+ output += " %s: %s\n" % (key, val[:40])
+ output += out_block(val[40:], " ")
+ # May have to display VERY large tables,
+ # so only show the first 20 lines of data
+ MAX_ROWS = 20 + 1 # include header in count
+ for row in self.table_rows[0:MAX_ROWS]:
+ output += "%s: " % self.table_rows.index(row)
+ for col in row:
+ output += "%s\t" % col
+ output += "\n"
+ if len(self.table_rows) > MAX_ROWS:
+ output += "...\n"
+ row = self.table_rows[-1]
+ output += "%s: " % self.table_rows.index(row)
+ for col in row:
+ output += "%s\t" % col
+ output += "\n"
+
+ return output
+
+
+def out_block(text, prefix=""):
+ """Format text in blocks of 80 chars with an additional optional prefix."""
+ output = ""
+ for j in range(0, len(text), 80):
+ output += "%s%s\n" % (prefix, text[j : j + 80])
+ output += "\n"
+ return output
diff --git a/code/lib/Bio/Geo/__init__.py b/code/lib/Bio/Geo/__init__.py
new file mode 100644
index 0000000..6735e9a
--- /dev/null
+++ b/code/lib/Bio/Geo/__init__.py
@@ -0,0 +1,67 @@
+# Copyright 2001 by Katharine Lindner. All rights reserved.
+# Copyright 2006 by PeterC. All rights reserved.
+# Copyright 2007 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Parser for files from NCBI's Gene Expression Omnibus (GEO).
+
+http://www.ncbi.nlm.nih.gov/geo/
+"""
+
+from . import Record
+
+
+def _read_key_value(line):
+ words = line[1:].split("=", 1)
+ try:
+ key, value = words
+ value = value.strip()
+ except ValueError:
+ key = words[0]
+ value = ""
+ key = key.strip()
+ return key, value
+
+
+def parse(handle):
+ """Read Gene Expression Omnibus records from file handle.
+
+ Returns a generator object which yields Bio.Geo.Record() objects.
+ """
+ record = None
+ for line in handle:
+ line = line.strip("\n").strip("\r")
+ if not line:
+ continue # Ignore empty lines
+ c = line[0]
+ if c == "^":
+ if record:
+ yield record
+ record = Record.Record()
+ record.entity_type, record.entity_id = _read_key_value(line)
+ elif c == "!":
+ if line in (
+ "!Sample_table_begin",
+ "!Sample_table_end",
+ "!Platform_table_begin",
+ "!Platform_table_end",
+ ):
+ continue
+ key, value = _read_key_value(line)
+ if key in record.entity_attributes:
+ if isinstance(record.entity_attributes[key], list):
+ record.entity_attributes[key].append(value)
+ else:
+ existing = record.entity_attributes[key]
+ record.entity_attributes[key] = [existing, value]
+ else:
+ record.entity_attributes[key] = value
+ elif c == "#":
+ key, value = _read_key_value(line)
+ assert key not in record.col_defs
+ record.col_defs[key] = value
+ else:
+ row = line.split("\t")
+ record.table_rows.append(row)
+ yield record
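
A short sketch of driving this parser; `GSE16.txt` stands in for any GEO SOFT-format file. Repeated `!` attribute keys are collected into lists, as `parse()` implements above.

```python
from Bio import Geo

with open("GSE16.txt") as handle:  # hypothetical SOFT-format file
    for record in Geo.parse(handle):
        print(record.entity_type, record.entity_id)
        print(len(record.entity_attributes), "attributes,",
              len(record.table_rows), "table rows")
```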
diff --git a/code/lib/Bio/Geo/__pycache__/Record.cpython-37.pyc b/code/lib/Bio/Geo/__pycache__/Record.cpython-37.pyc
new file mode 100644
index 0000000..8861450
Binary files /dev/null and b/code/lib/Bio/Geo/__pycache__/Record.cpython-37.pyc differ
diff --git a/code/lib/Bio/Geo/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Geo/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..1c5efce
Binary files /dev/null and b/code/lib/Bio/Geo/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/BasicChromosome.py b/code/lib/Bio/Graphics/BasicChromosome.py
new file mode 100644
index 0000000..91e6445
--- /dev/null
+++ b/code/lib/Bio/Graphics/BasicChromosome.py
@@ -0,0 +1,823 @@
+# Copyright 2001, 2003 by Brad Chapman. All rights reserved.
+# Revisions copyright 2011 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Draw representations of organism chromosomes with added information.
+
+These classes are meant to model the drawing of pictures of chromosomes.
+This can be useful for lots of things, including displaying markers on
+a chromosome (i.e. for genetic mapping) and showing synteny between two
+chromosomes.
+
+The structure of these classes is intended to be a Composite, so that
+it will be easy to plug in and switch different parts without
+breaking the general drawing capabilities of the system. The
+relationship between classes is that everything derives from
+_ChromosomeComponent, which specifies the overall interface. The parts
+then are related so that an Organism contains Chromosomes, and these
+Chromosomes contain ChromosomeSegments. This representation differs
+from the canonical composite structure in that we don't really have
+'leaf' nodes here -- all components can potentially hold sub-components.
+
+Most of the time the ChromosomeSegment class is what you'll want to
+customize for specific drawing tasks.
+
+For providing drawing capabilities, these classes use reportlab:
+
+http://www.reportlab.com
+
+This provides nice output in PDF, SVG and postscript. If you have
+reportlab's renderPM module installed you can also use PNG etc.
+"""
+
+# reportlab
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.units import inch
+from reportlab.lib import colors
+from reportlab.pdfbase.pdfmetrics import stringWidth
+
+from reportlab.graphics.shapes import Drawing, String, Line, Rect, Wedge, ArcPath
+from reportlab.graphics.widgetbase import Widget
+
+from Bio.Graphics import _write
+from Bio.Graphics.GenomeDiagram import _Colors
+
+
+_color_trans = _Colors.ColorTranslator()
+
+
+class _ChromosomeComponent(Widget):
+ """Base class specifying the interface for a component of the system.
+
+ This class should not be instantiated directly, but should be used
+ from derived classes.
+ """
+
+ def __init__(self):
+ """Initialize a chromosome component.
+
+ Attributes:
+ - _sub_components -- Any components which are contained under
+ this parent component. This attribute should be accessed through
+ the add() and remove() functions.
+
+ """
+ self._sub_components = []
+
+ def add(self, component):
+ """Add a sub_component to the list of components under this item."""
+ if not isinstance(component, _ChromosomeComponent):
+ raise TypeError(
+ "Expected a _ChromosomeComponent object, got %s" % component
+ )
+
+ self._sub_components.append(component)
+
+ def remove(self, component):
+ """Remove the specified component from the subcomponents.
+
+ Raises a ValueError if the component is not registered as a
+ sub_component.
+ """
+ try:
+ self._sub_components.remove(component)
+ except ValueError:
+ raise ValueError(
+ "Component %s not found in sub_components." % component
+ ) from None
+
+ def draw(self):
+ """Draw the specified component."""
+ raise AssertionError("Subclasses must implement.")
+
+
+class Organism(_ChromosomeComponent):
+ """Top level class for drawing chromosomes.
+
+ This class holds information about an organism and all of its
+ chromosomes, and provides the top level object which could be used
+ for drawing a chromosome representation of an organism.
+
+ Chromosomes should be added and removed from the Organism via the
+ add and remove functions.
+ """
+
+ def __init__(self, output_format="pdf"):
+ """Initialize the class."""
+ _ChromosomeComponent.__init__(self)
+
+ # customizable attributes
+ self.page_size = letter
+ self.title_size = 20
+
+ # Do we need this given we don't draw a legend?
+ # If so, should be a public API...
+ self._legend_height = 0 # 2 * inch
+
+ self.output_format = output_format
+
+ def draw(self, output_file, title):
+ """Draw out the information for the Organism.
+
+ Arguments:
+ - output_file -- The name of a file specifying where the
+ document should be saved, or a handle to be written to.
+ The output format is set when creating the Organism object.
+ Alternatively, output_file=None will return the drawing using
+ the low-level ReportLab objects (for further processing, such
+ as adding additional graphics, before writing).
+ - title -- The output title of the produced document.
+
+ """
+ width, height = self.page_size
+ cur_drawing = Drawing(width, height)
+
+ self._draw_title(cur_drawing, title, width, height)
+
+ cur_x_pos = inch * 0.5
+ if len(self._sub_components) > 0:
+ x_pos_change = (width - inch) / len(self._sub_components)
+ # no sub_components
+ else:
+ pass
+
+ for sub_component in self._sub_components:
+ # set the drawing location of the chromosome
+ sub_component.start_x_position = cur_x_pos + 0.05 * x_pos_change
+ sub_component.end_x_position = cur_x_pos + 0.95 * x_pos_change
+ sub_component.start_y_position = height - 1.5 * inch
+ sub_component.end_y_position = self._legend_height + 1 * inch
+
+ # do the drawing
+ sub_component.draw(cur_drawing)
+
+ # update the locations for the next chromosome
+ cur_x_pos += x_pos_change
+
+ self._draw_legend(cur_drawing, self._legend_height + 0.5 * inch, width)
+
+ if output_file is None:
+ # Let the user take care of writing to the file...
+ return cur_drawing
+
+ return _write(cur_drawing, output_file, self.output_format)
+
+ def _draw_title(self, cur_drawing, title, width, height):
+ """Write out the title of the organism figure (PRIVATE)."""
+ title_string = String(width / 2, height - inch, title)
+ title_string.fontName = "Helvetica-Bold"
+ title_string.fontSize = self.title_size
+ title_string.textAnchor = "middle"
+
+ cur_drawing.add(title_string)
+
+ def _draw_legend(self, cur_drawing, start_y, width):
+ """Draw a legend for the figure (PRIVATE).
+
+ Subclasses should implement this (see also self._legend_height) to
+ provide specialized legends.
+ """
+ pass
+
+
+class Chromosome(_ChromosomeComponent):
+ """Class for drawing a chromosome of an organism.
+
+    This organizes the drawing of a single organism's chromosome. This
+ class can be instantiated directly, but the draw method makes the
+ most sense to be called in the context of an organism.
+ """
+
+ def __init__(self, chromosome_name):
+ """Initialize a Chromosome for drawing.
+
+ Arguments:
+ - chromosome_name - The label for the chromosome.
+
+ Attributes:
+ - start_x_position, end_x_position - The x positions on the page
+ where the chromosome should be drawn. This allows multiple
+ chromosomes to be drawn on a single page.
+ - start_y_position, end_y_position - The y positions on the page
+ where the chromosome should be contained.
+
+ Configuration Attributes:
+ - title_size - The size of the chromosome title.
+        - scale_num - A number to scale the drawing by. This is useful if
+          you want to draw multiple chromosomes of different sizes at the
+          same scale. If this is not set, then the chromosome drawing will
+          be scaled by the number of segments in the chromosome (so each
+          chromosome will be the exact same final size).
+
+ """
+ _ChromosomeComponent.__init__(self)
+
+ self._name = chromosome_name
+
+ self.start_x_position = -1
+ self.end_x_position = -1
+ self.start_y_position = -1
+ self.end_y_position = -1
+
+ self.title_size = 20
+ self.scale_num = None
+
+ self.label_size = 6
+ self.chr_percent = 0.25
+ self.label_sep_percent = self.chr_percent * 0.5
+ self._color_labels = False
+
+ def subcomponent_size(self):
+ """Return the scaled size of all subcomponents of this component."""
+ total_sub = 0
+ for sub_component in self._sub_components:
+ total_sub += sub_component.scale
+
+ return total_sub
+
+ def draw(self, cur_drawing):
+ """Draw a chromosome on the specified template.
+
+ Ideally, the x_position and y_*_position attributes should be
+ set prior to drawing -- otherwise we're going to have some problems.
+ """
+ for position in (
+ self.start_x_position,
+ self.end_x_position,
+ self.start_y_position,
+ self.end_y_position,
+ ):
+ assert position != -1, "Need to set drawing coordinates."
+
+ # first draw all of the sub-sections of the chromosome -- this
+ # will actually be the picture of the chromosome
+ cur_y_pos = self.start_y_position
+ if self.scale_num:
+ y_pos_change = (
+ self.start_y_position * 0.95 - self.end_y_position
+ ) / self.scale_num
+ elif len(self._sub_components) > 0:
+ y_pos_change = (
+ self.start_y_position * 0.95 - self.end_y_position
+ ) / self.subcomponent_size()
+ # no sub_components to draw
+ else:
+ pass
+
+ left_labels = []
+ right_labels = []
+ for sub_component in self._sub_components:
+ this_y_pos_change = sub_component.scale * y_pos_change
+
+ # set the location of the component to draw
+ sub_component.start_x_position = self.start_x_position
+ sub_component.end_x_position = self.end_x_position
+ sub_component.start_y_position = cur_y_pos
+ sub_component.end_y_position = cur_y_pos - this_y_pos_change
+
+ # draw the sub component
+ sub_component._left_labels = []
+ sub_component._right_labels = []
+ sub_component.draw(cur_drawing)
+ left_labels += sub_component._left_labels
+ right_labels += sub_component._right_labels
+
+ # update the position for the next component
+ cur_y_pos -= this_y_pos_change
+
+ self._draw_labels(cur_drawing, left_labels, right_labels)
+ self._draw_label(cur_drawing, self._name)
+
+ def _draw_label(self, cur_drawing, label_name):
+ """Draw a label for the chromosome (PRIVATE)."""
+ x_position = 0.5 * (self.start_x_position + self.end_x_position)
+ y_position = self.end_y_position
+
+ label_string = String(x_position, y_position, label_name)
+ label_string.fontName = "Times-BoldItalic"
+ label_string.fontSize = self.title_size
+ label_string.textAnchor = "middle"
+
+ cur_drawing.add(label_string)
+
+ def _draw_labels(self, cur_drawing, left_labels, right_labels):
+ """Layout and draw sub-feature labels for the chromosome (PRIVATE).
+
+ Tries to place each label at the same vertical position as the
+ feature it applies to, but will adjust the positions to avoid or
+ at least reduce label overlap.
+
+ Draws the label text and a coloured line linking it to the
+ location (i.e. feature) it applies to.
+ """
+ if not self._sub_components:
+ return
+ color_label = self._color_labels
+
+ segment_width = (self.end_x_position - self.start_x_position) * self.chr_percent
+ label_sep = (
+ self.end_x_position - self.start_x_position
+ ) * self.label_sep_percent
+ segment_x = self.start_x_position + 0.5 * (
+ self.end_x_position - self.start_x_position - segment_width
+ )
+
+ y_limits = []
+ for sub_component in self._sub_components:
+ y_limits.extend(
+ (sub_component.start_y_position, sub_component.end_y_position)
+ )
+ y_min = min(y_limits)
+ y_max = max(y_limits)
+ del y_limits
+ # Now do some label placement magic...
+ # from reportlab.pdfbase import pdfmetrics
+ # font = pdfmetrics.getFont('Helvetica')
+ # h = (font.face.ascent + font.face.descent) * 0.90
+ h = self.label_size
+ for x1, x2, labels, anchor in [
+ (
+ segment_x,
+ segment_x - label_sep,
+ _place_labels(left_labels, y_min, y_max, h),
+ "end",
+ ),
+ (
+ segment_x + segment_width,
+ segment_x + segment_width + label_sep,
+ _place_labels(right_labels, y_min, y_max, h),
+ "start",
+ ),
+ ]:
+ for (y1, y2, color, back_color, name) in labels:
+ cur_drawing.add(
+ Line(x1, y1, x2, y2, strokeColor=color, strokeWidth=0.25)
+ )
+ label_string = String(x2, y2, name, textAnchor=anchor)
+ label_string.fontName = "Helvetica"
+ label_string.fontSize = h
+ if color_label:
+ label_string.fillColor = color
+ if back_color:
+ w = stringWidth(name, label_string.fontName, label_string.fontSize)
+ if x1 > x2:
+ w = w * -1.0
+ cur_drawing.add(
+ Rect(
+ x2,
+ y2 - 0.1 * h,
+ w,
+ h,
+ strokeColor=back_color,
+ fillColor=back_color,
+ )
+ )
+ cur_drawing.add(label_string)
+
+
+class ChromosomeSegment(_ChromosomeComponent):
+ """Draw a segment of a chromosome.
+
+ This class provides the important configurable functionality of drawing
+ a Chromosome. Each segment has some customization available here, or can
+ be subclassed to define additional functionality. Most of the interesting
+ drawing stuff is likely to happen at the ChromosomeSegment level.
+ """
+
+ def __init__(self):
+ """Initialize a ChromosomeSegment.
+
+ Attributes:
+ - start_x_position, end_x_position - Defines the x range we have
+ to draw things in.
+ - start_y_position, end_y_position - Defines the y range we have
+ to draw things in.
+
+ Configuration Attributes:
+ - scale - A scaling value for the component. By default this is
+ set at 1 (ie -- has the same scale as everything else). Higher
+ values give more size to the component, smaller values give less.
+ - fill_color - A color to fill in the segment with. Colors are
+ available in reportlab.lib.colors
+ - label - A label to place on the chromosome segment. This should
+ be a text string specifying what is to be included in the label.
+ - label_size - The size of the label.
+ - chr_percent - The percentage of area that the chromosome
+ segment takes up.
+
+ """
+ _ChromosomeComponent.__init__(self)
+
+ self.start_x_position = -1
+ self.end_x_position = -1
+ self.start_y_position = -1
+ self.end_y_position = -1
+
+ # --- attributes for configuration
+ self.scale = 1
+ self.fill_color = None
+ self.label = None
+ self.label_size = 6
+ self.chr_percent = 0.25
+
+ def draw(self, cur_drawing):
+ """Draw a chromosome segment.
+
+ Before drawing, the range we are drawing in needs to be set.
+ """
+ for position in (
+ self.start_x_position,
+ self.end_x_position,
+ self.start_y_position,
+ self.end_y_position,
+ ):
+ assert position != -1, "Need to set drawing coordinates."
+
+ self._draw_subcomponents(cur_drawing) # Anything behind
+ self._draw_segment(cur_drawing)
+ self._overdraw_subcomponents(cur_drawing) # Anything on top
+ self._draw_label(cur_drawing)
+
+ def _draw_subcomponents(self, cur_drawing):
+ """Draw any subcomponents of the chromosome segment (PRIVATE).
+
+ This should be overridden in derived classes if there are
+ subcomponents to be drawn.
+ """
+ pass
+
+ def _draw_segment(self, cur_drawing):
+ """Draw the current chromosome segment (PRIVATE)."""
+ # set the coordinates of the segment -- it'll take up the MIDDLE part
+ # of the space we have.
+ segment_y = self.end_y_position
+ segment_width = (self.end_x_position - self.start_x_position) * self.chr_percent
+ segment_height = self.start_y_position - self.end_y_position
+ segment_x = self.start_x_position + 0.5 * (
+ self.end_x_position - self.start_x_position - segment_width
+ )
+
+ # first draw the sides of the segment
+ right_line = Line(segment_x, segment_y, segment_x, segment_y + segment_height)
+ left_line = Line(
+ segment_x + segment_width,
+ segment_y,
+ segment_x + segment_width,
+ segment_y + segment_height,
+ )
+
+ cur_drawing.add(right_line)
+ cur_drawing.add(left_line)
+
+ # now draw the box, if it is filled in
+ if self.fill_color is not None:
+ fill_rectangle = Rect(segment_x, segment_y, segment_width, segment_height)
+ fill_rectangle.fillColor = self.fill_color
+ fill_rectangle.strokeColor = None
+
+ cur_drawing.add(fill_rectangle)
+
+ def _overdraw_subcomponents(self, cur_drawing):
+ """Draw any subcomponents of the chromosome segment over the main part (PRIVATE).
+
+ This should be overridden in derived classes if there are
+ subcomponents to be drawn.
+ """
+ pass
+
+ def _draw_label(self, cur_drawing):
+ """Add a label to the chromosome segment (PRIVATE).
+
+ The label will be applied to the right of the segment.
+
+ This may be overlapped by any sub-feature labels on other segments!
+ """
+ if self.label is not None:
+
+ label_x = 0.5 * (self.start_x_position + self.end_x_position) + (
+ self.chr_percent + 0.05
+ ) * (self.end_x_position - self.start_x_position)
+ label_y = (
+ self.start_y_position - self.end_y_position
+ ) / 2 + self.end_y_position
+
+ label_string = String(label_x, label_y, self.label)
+ label_string.fontName = "Helvetica"
+ label_string.fontSize = self.label_size
+
+ cur_drawing.add(label_string)
+
+
+def _spring_layout(desired, minimum, maximum, gap=0):
+ """Try to layout label co-ordinates or other floats (PRIVATE).
+
+ Originally written for the y-axis vertical positioning of labels on a
+ chromosome diagram (where the minimum gap between y-axis co-ordinates is
+ the label height), it could also potentially be used for x-axis placement,
+ or indeed radial placement for circular chromosomes within GenomeDiagram.
+
+ In essence this is an optimisation problem, balancing the desire to have
+ each label as close as possible to its data point, but also to spread out
+ the labels to avoid overlaps. This could be described with a cost function
+ (modelling the label distance from the desired placement, and the inter-
+ label separations as springs) and solved as a multi-variable minimization
+ problem - perhaps with NumPy or SciPy.
+
+ For now however, the implementation is a somewhat crude ad hoc algorithm.
+
+ NOTE - This expects the input data to have been sorted!
+ """
+ count = len(desired)
+ if count <= 1:
+ return desired # Easy!
+ if minimum >= maximum:
+ raise ValueError("Bad min/max %f and %f" % (minimum, maximum))
+ if min(desired) < minimum or max(desired) > maximum:
+ raise ValueError(
+ "Data %f to %f out of bounds (%f to %f)"
+ % (min(desired), max(desired), minimum, maximum)
+ )
+ equal_step = float(maximum - minimum) / (count - 1)
+
+ if equal_step < gap:
+ import warnings
+ from Bio import BiopythonWarning
+
+ warnings.warn("Too many labels to avoid overlap", BiopythonWarning)
+ # Crudest solution
+ return [minimum + i * equal_step for i in range(count)]
+
+ good = True
+ if gap:
+        prev = desired[0]
+        for next in desired[1:]:
+            if next - prev < gap:
+                good = False
+                break
+            prev = next
+ if good:
+ return desired
+
+ span = maximum - minimum
+ for split in [0.5 * span, span / 3.0, 2 * span / 3.0, 0.25 * span, 0.75 * span]:
+ midpoint = minimum + split
+ low = [x for x in desired if x <= midpoint - 0.5 * gap]
+ high = [x for x in desired if x > midpoint + 0.5 * gap]
+ if len(low) + len(high) < count:
+ # Bad split point, points right on boundary
+ continue
+ elif not low and len(high) * gap <= (span - split) + 0.5 * gap:
+ # Give a little of the unused low space to the high points
+ return _spring_layout(high, midpoint + 0.5 * gap, maximum, gap)
+ elif not high and len(low) * gap <= split + 0.5 * gap:
+ # Give a little of the unused highspace to the low points
+ return _spring_layout(low, minimum, midpoint - 0.5 * gap, gap)
+ elif (
+ len(low) * gap <= split - 0.5 * gap
+ and len(high) * gap <= (span - split) - 0.5 * gap
+ ):
+ return _spring_layout(
+ low, minimum, midpoint - 0.5 * gap, gap
+ ) + _spring_layout(high, midpoint + 0.5 * gap, maximum, gap)
+
+    # This can be counter-productive now we can split out into the telomere or
+ # spacer-segment's vertical space...
+ # Try not to spread out as far as the min/max unless needed
+ low = min(desired)
+ high = max(desired)
+ if (high - low) / (count - 1) >= gap:
+ # Good, we don't need the full range, and can position the
+ # min and max exactly as well :)
+ equal_step = (high - low) / (count - 1)
+ return [low + i * equal_step for i in range(count)]
+
+ low = 0.5 * (minimum + min(desired))
+ high = 0.5 * (max(desired) + maximum)
+ if (high - low) / (count - 1) >= gap:
+ # Good, we don't need the full range
+ equal_step = (high - low) / (count - 1)
+ return [low + i * equal_step for i in range(count)]
+
+ # Crudest solution
+ return [minimum + i * equal_step for i in range(count)]
+
+
+# assert False, _spring_layout([0.10,0.12,0.13,0.14,0.5,0.75, 1.0], 0, 1, 0.1)
+# assert _spring_layout([0.10,0.12,0.13,0.14,0.5,0.75, 1.0], 0, 1, 0.1) == \
+# [0.0, 0.125, 0.25, 0.375, 0.5, 0.75, 1.0]
+# assert _spring_layout([0.10,0.12,0.13,0.14,0.5,0.75, 1.0], 0, 1, 0.1) == \
+# [0.0, 0.16666666666666666, 0.33333333333333331, 0.5,
+# 0.66666666666666663, 0.83333333333333326, 1.0]
+
+
+def _place_labels(desired_etc, minimum, maximum, gap=0):
+ # Want a list of lists/tuples for desired_etc
+ desired_etc.sort()
+ placed = _spring_layout([row[0] for row in desired_etc], minimum, maximum, gap)
+ for old, y2 in zip(desired_etc, placed):
+ # (y1, a, b, c, ..., z) --> (y1, y2, a, b, c, ..., z)
+ yield (old[0], y2) + tuple(old[1:])
+
+
+class AnnotatedChromosomeSegment(ChromosomeSegment):
+ """Annotated chromosome segment.
+
+ This is like the ChromosomeSegment, but accepts a list of features.
+ """
+
+ def __init__(
+ self,
+ bp_length,
+ features,
+ default_feature_color=colors.blue,
+ name_qualifiers=("gene", "label", "name", "locus_tag", "product"),
+ ):
+ """Initialize.
+
+ The features can either be SeqFeature objects, or tuples of values:
+ start (int), end (int), strand (+1, -1, O or None), label (string),
+ ReportLab color (string or object), and optional ReportLab fill color.
+
+ Note we require 0 <= start <= end <= bp_length, and within the vertical
+        space allocated to this segment, lines will be placed according to the
+        start/end coordinates (starting from the top).
+
+        Positive strand features are drawn on the right, negative on the left,
+        otherwise all the way across.
+
+        We recommend using consistent units for all the segment's scale values
+ (e.g. their length in base pairs).
+
+ When providing features as SeqFeature objects, the default color
+ is used, unless the feature's qualifiers include an Artemis colour
+ string (functionality also in GenomeDiagram). The caption also follows
+ the GenomeDiagram approach and takes the first qualifier from the list
+ or tuple specified in name_qualifiers.
+
+        Note the additional attribute label_sep_percent controls the gap
+        between the segment and its feature labels, by default half of the
+        chr_percent attribute (half of 25%, thus 12.5%).
+
+ """
+ ChromosomeSegment.__init__(self)
+ self.bp_length = bp_length
+ self.features = features
+ self.default_feature_color = default_feature_color
+ self.name_qualifiers = name_qualifiers
+ self.label_sep_percent = self.chr_percent * 0.5
+
+ def _overdraw_subcomponents(self, cur_drawing):
+ """Draw any annotated features on the chromosome segment (PRIVATE).
+
+        Assumes _draw_segment was already called to fill out the basic shape,
+        and assumes that it uses the same boundaries.
+ """
+ # set the coordinates of the segment -- it'll take up the MIDDLE part
+ # of the space we have.
+ segment_y = self.end_y_position
+ segment_width = (self.end_x_position - self.start_x_position) * self.chr_percent
+ label_sep = (
+ self.end_x_position - self.start_x_position
+ ) * self.label_sep_percent
+ segment_height = self.start_y_position - self.end_y_position
+ segment_x = self.start_x_position + 0.5 * (
+ self.end_x_position - self.start_x_position - segment_width
+ )
+
+ left_labels = []
+ right_labels = []
+ for f in self.features:
+ try:
+ # Assume SeqFeature objects
+ start = f.location.start
+ end = f.location.end
+ strand = f.strand
+ try:
+ # Handles Artemis colour integers, HTML colors, etc
+ color = _color_trans.translate(f.qualifiers["color"][0])
+ except Exception: # TODO: ValueError?
+ color = self.default_feature_color
+ fill_color = color
+ name = ""
+ for qualifier in self.name_qualifiers:
+ if qualifier in f.qualifiers:
+ name = f.qualifiers[qualifier][0]
+ break
+ except AttributeError:
+ # Assume tuple of ints, string, and color
+ start, end, strand, name, color = f[:5]
+ color = _color_trans.translate(color)
+ if len(f) > 5:
+ fill_color = _color_trans.translate(f[5])
+ else:
+ fill_color = color
+ assert 0 <= start <= end <= self.bp_length
+ if strand == +1:
+ # Right side only
+ x = segment_x + segment_width * 0.6
+ w = segment_width * 0.4
+ elif strand == -1:
+ # Left side only
+ x = segment_x
+ w = segment_width * 0.4
+ else:
+ # Both or neither - full width
+ x = segment_x
+ w = segment_width
+ local_scale = segment_height / self.bp_length
+ fill_rectangle = Rect(
+ x,
+ segment_y + segment_height - local_scale * start,
+ w,
+ local_scale * (start - end),
+ )
+ fill_rectangle.fillColor = fill_color
+ fill_rectangle.strokeColor = color
+ cur_drawing.add(fill_rectangle)
+ if name:
+ if fill_color == color:
+ back_color = None
+ else:
+ back_color = fill_color
+ value = (
+ segment_y + segment_height - local_scale * start,
+ color,
+ back_color,
+ name,
+ )
+ if strand == -1:
+ self._left_labels.append(value)
+ else:
+ self._right_labels.append(value)
+
+
+class TelomereSegment(ChromosomeSegment):
+ """A segment that is located at the end of a linear chromosome.
+
+ This is just like a regular segment, but it draws the end of a chromosome
+ which is represented by a half circle. This just overrides the
+ _draw_segment class of ChromosomeSegment to provide that specialized
+ drawing.
+ """
+
+ def __init__(self, inverted=0):
+ """Initialize a segment at the end of a chromosome.
+
+ See ChromosomeSegment for all of the attributes that can be
+        customized in a TelomereSegment.
+
+ Arguments:
+ - inverted -- Whether or not the telomere should be inverted
+ (ie. drawn on the bottom of a chromosome)
+
+ """
+ ChromosomeSegment.__init__(self)
+
+ self._inverted = inverted
+
+ def _draw_segment(self, cur_drawing):
+ """Draw a half circle representing the end of a linear chromosome (PRIVATE)."""
+ # set the coordinates of the segment -- it'll take up the MIDDLE part
+ # of the space we have.
+ width = (self.end_x_position - self.start_x_position) * self.chr_percent
+ height = self.start_y_position - self.end_y_position
+ center_x = 0.5 * (self.end_x_position + self.start_x_position)
+ start_x = center_x - 0.5 * width
+ if self._inverted:
+ center_y = self.start_y_position
+ start_angle = 180
+ end_angle = 360
+ else:
+ center_y = self.end_y_position
+ start_angle = 0
+ end_angle = 180
+
+ cap_wedge = Wedge(center_x, center_y, width / 2, start_angle, end_angle, height)
+ cap_wedge.strokeColor = None
+ cap_wedge.fillColor = self.fill_color
+ cur_drawing.add(cap_wedge)
+
+ # Now draw an arc for the curved edge of the wedge,
+ # omitting the flat end.
+ cap_arc = ArcPath()
+ cap_arc.addArc(center_x, center_y, width / 2, start_angle, end_angle, height)
+ cur_drawing.add(cap_arc)
+
+
+class SpacerSegment(ChromosomeSegment):
+ """A segment that is located at the end of a linear chromosome.
+
+ Doesn't draw anything, just empty space which can be helpful
+ for layout purposes (e.g. making room for feature labels).
+ """
+
+ def draw(self, cur_diagram):
+ """Draw nothing to the current diagram (dummy method).
+
+ The segment spacer has no actual image in the diagram,
+ so this method therefore does nothing, but is defined
+ to match the expected API of the other segment objects.
+ """
+ pass
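
To illustrate the composite structure described in the module docstring (an Organism holds Chromosomes, which hold segments), here is a hedged sketch; the lengths, feature tuples, and output file name are invented for illustration.

```python
from reportlab.lib import colors
from Bio.Graphics.BasicChromosome import (
    Organism,
    Chromosome,
    AnnotatedChromosomeSegment,
    TelomereSegment,
)

# Invented (start, end, strand, label, ReportLab color) feature tuples
features = [
    (100, 500, +1, "geneA", colors.blue),
    (1200, 1800, -1, "geneB", colors.red),
]

chr_diagram = Organism(output_format="pdf")
cur_chromosome = Chromosome("I")
# Scale by the total length so telomeres and body share one coordinate system
cur_chromosome.scale_num = 2500

start = TelomereSegment()
start.scale = 100
cur_chromosome.add(start)

body = AnnotatedChromosomeSegment(2300, features)
body.scale = 2300
cur_chromosome.add(body)

end = TelomereSegment(inverted=1)
end.scale = 100
cur_chromosome.add(end)

chr_diagram.add(cur_chromosome)
chr_diagram.draw("chromosome_I.pdf", "Example organism")
```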
diff --git a/code/lib/Bio/Graphics/ColorSpiral.py b/code/lib/Bio/Graphics/ColorSpiral.py
new file mode 100644
index 0000000..c113b7a
--- /dev/null
+++ b/code/lib/Bio/Graphics/ColorSpiral.py
@@ -0,0 +1,206 @@
+# Copyright 2012 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Generate RGB colours suitable for distinguishing categorical data.
+
+This module provides a class that implements a spiral 'path' through HSV
+colour space, permitting the selection of a number of points along that path,
+and returning the output in RGB colour space, suitable for use with ReportLab
+and other graphics packages.
+
+This approach to colour choice was inspired by Bang Wong's Points of View
+article: Color Coding, in Nature Methods _7_ 573 (https://doi.org/10.1038/nmeth0810-573).
+
+The module also provides helper functions that return a list for colours, or
+a dictionary of colours (if passed an iterable containing the names of
+categories to be coloured).
+"""
+
+# standard library
+import colorsys # colour format conversions
+from math import log, exp, floor, pi
+import random # for jitter values
+
+
+class ColorSpiral:
+ """Implement a spiral path through HSV colour space.
+
+ This class provides functions for sampling points along a logarithmic
+ spiral path through HSV colour space.
+
+ The spiral is described by r = a * exp(b * t) where r is the distance
+ from the axis of the HSV cylinder to the current point in the spiral,
+ and t is the angle through which the spiral has turned to reach the
+ current point. a and b are (positive, real) parameters that control the
+ shape of the spiral.
+
+ - a: the starting direction of the spiral
+ - b: the number of revolutions about the axis made by the spiral
+
+ We permit the spiral to move along the cylinder ('in V-space') between
+ v_init and v_final, to give a gradation in V (essentially, brightness),
+ along the path, where v_init, v_final are in [0,1].
+
+ A brightness 'jitter' may also be provided as an absolute value in
+ V-space, to aid in distinguishing consecutive colour points on the
+ path.
+ """
+
+ def __init__(self, a=1, b=0.33, v_init=0.85, v_final=0.5, jitter=0.05):
+ """Initialize a logarithmic spiral path through HSV colour space.
+
+ Arguments:
+ - a - Parameter a for the spiral, controls the initial spiral
+ direction. a > 0
+ - b - parameter b for the spiral, controls the rate at which the
+ spiral revolves around the axis. b > 0
+ - v_init - initial value of V (brightness) for the spiral.
+ v_init in [0,1]
+ - v_final - final value of V (brightness) for the spiral
+ v_final in [0,1]
+ - jitter - the degree of V (brightness) jitter to add to each
+ selected colour. The amount of jitter will be selected
+ from a uniform random distribution [-jitter, jitter],
+ and V will be maintained in [0,1].
+
+ """
+ # Initialize attributes
+ self.a = a
+ self.b = b
+ self.v_init = v_init
+ self.v_final = v_final
+ self.jitter = jitter
+
+ def get_colors(self, k, offset=0.1):
+        """Generate k different RGB colours evenly spaced on the spiral.
+
+ A generator returning the RGB colour space values for k
+ evenly-spaced points along the defined spiral in HSV space.
+
+ Arguments:
+ - k - the number of points to return
+ - offset - how far along the spiral path to start.
+
+ """
+ # We use the offset to skip a number of similar colours near to HSV axis
+ assert offset > 0 and offset < 1, "offset must be in (0,1)"
+ v_rate = (self._v_final - self._v_init) / float(k)
+ # Generator for colours: we have divided the arc length into sections
+ # of equal length, and step along them
+ for n in range(1, k + 1):
+ # For each value of n, t indicates the angle through which the
+ # spiral has turned, to this point
+ t = (1.0 / self._b) * (
+ log(n + (k * offset)) - log((1 + offset) * k * self._a)
+ )
+ # Put 0 <= h <= 2*pi, where h is the angular part of the polar
+ # co-ordinates for this point on the spiral
+ h = t
+ while h < 0:
+ h += 2 * pi
+            h = h - (floor(h / (2 * pi)) * 2 * pi)
+ # Now put h in [0, 1] for colorsys conversion
+ h = h / (2 * pi)
+ # r is the radial distance of this point from the centre
+ r = self._a * exp(self._b * t)
+ # v is the brightness of this point, linearly interpolated
+ # from self._v_init to self._v_final. Jitter size is sampled from
+ # a uniform distribution
+ if self._jitter:
+ jitter = random.random() * 2 * self._jitter - self._jitter
+ else:
+ jitter = 0
+ v = self._v_init + (n * v_rate + jitter)
+ # We have arranged the arithmetic such that 0 <= r <= 1, so
+ # we can use this value directly as s in HSV
+ yield colorsys.hsv_to_rgb(h, r, max(0, min(v, 1)))
+
+ def _get_a(self):
+ return self._a
+
+ def _set_a(self, value):
+ self._a = max(0, value)
+
+ def _get_b(self):
+ return self._b
+
+ def _set_b(self, value):
+ self._b = max(0, value)
+
+ def _get_v_init(self):
+ return self._v_init
+
+ def _set_v_init(self, value):
+ self._v_init = max(0, min(1, value))
+
+ def _get_v_final(self):
+ return self._v_final
+
+ def _set_v_final(self, value):
+ self._v_final = max(0, min(1, value))
+
+ def _get_jitter(self):
+ return self._jitter
+
+ def _set_jitter(self, value):
+ self._jitter = max(0, min(1, value))
+
+ a = property(
+ _get_a, _set_a, doc="Parameter controlling initial spiral direction (a > 0)"
+ )
+ b = property(
+ _get_b,
+ _set_b,
+ doc="Parameter controlling rate spiral revolves around axis (b > 0)",
+ )
+ v_init = property(
+ _get_v_init,
+ _set_v_init,
+ doc="Initial value of V (brightness) for the spiral (range 0 to 1)",
+ )
+ v_final = property(
+ _get_v_final,
+ _set_v_final,
+ doc="Final value of V (brightness) for the spiral (range 0 to 1)",
+ )
+ jitter = property(
+ _get_jitter,
+ _set_jitter,
+ doc="Degree of V (brightness) jitter to add to each color (range 0 to 1)",
+ )
+
+
+# Convenience functions for those who don't want to bother with a
+# ColorSpiral object
+def get_colors(k, **kwargs):
+ """Return k colours selected by the ColorSpiral object, as a generator.
+
+ Arguments:
+ - k - the number of colours to return
+ - kwargs - pass-through arguments to the ColorSpiral object
+
+ """
+ cs = ColorSpiral(**kwargs)
+ return cs.get_colors(k)
+
+
+def get_color_dict(l, **kwargs):
+ """Return a dictionary of colours using the provided values as keys.
+
+ Returns a dictionary, keyed by the members of iterable l, with a
+ colour assigned to each member.
+
+ Arguments:
+ - l - an iterable representing classes to be coloured
+ - kwargs - pass-through arguments to the ColorSpiral object
+
+ """
+ cs = ColorSpiral(**kwargs)
+ colors = cs.get_colors(len(l))
+    color_dict = {}  # avoid shadowing the built-in dict
+    for item in l:
+        color_dict[item] = next(colors)
+    return color_dict
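
A minimal sketch of the two convenience functions; passing `jitter=0` makes the colours deterministic, and the category names are invented.

```python
from Bio.Graphics.ColorSpiral import get_colors, get_color_dict

# Five evenly spaced RGB tuples along the default spiral
palette = list(get_colors(5, jitter=0))

# One colour per category name
color_map = get_color_dict(["exon", "intron", "promoter"], jitter=0)
for name, (r, g, b) in color_map.items():
    print(name, round(r, 3), round(g, 3), round(b, 3))
```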
diff --git a/code/lib/Bio/Graphics/Comparative.py b/code/lib/Bio/Graphics/Comparative.py
new file mode 100644
index 0000000..35bc192
--- /dev/null
+++ b/code/lib/Bio/Graphics/Comparative.py
@@ -0,0 +1,178 @@
+# Copyright 2001 by Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Plots to compare information between different sources.
+
+This file contains high level plots which are designed to be used to
+compare different types of information. The most basic example is comparing
+two variables in a traditional scatter plot.
+"""
+# reportlab
+from reportlab.lib import colors
+from reportlab.graphics.charts.lineplots import LinePlot
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.units import inch
+
+from reportlab.graphics.shapes import Drawing, String
+from reportlab.graphics.charts.markers import makeEmptySquare, makeFilledSquare
+from reportlab.graphics.charts.markers import makeFilledDiamond, makeSmiley
+from reportlab.graphics.charts.markers import makeFilledCircle, makeEmptyCircle
+
+from Bio.Graphics import _write
+
+
+class ComparativeScatterPlot:
+ """Display a scatter-type plot comparing two different kinds of info.
+
+    Attributes:
+ - display_info - a 2D list of the information we'll be outputting. Each
+ top level list is a different data type, and each data point is a
+ two-tuple of the coordinates of a point.
+
+ So if you had two distributions of points, it should look like::
+
+ display_info = [[(1, 2), (3, 4)],
+ [(5, 6), (7, 8)]]
+
+ If everything is just one set of points, display_info can look like::
+
+ display_info = [[(1, 2), (3, 4), (5, 6)]]
+
+ """
+
+ def __init__(self, output_format="pdf"):
+ """Initialize the class."""
+ # customizable attributes
+ self.number_of_columns = 1
+ self.page_size = letter
+ self.title_size = 20
+
+ self.output_format = output_format
+
+ # the information we'll be writing
+ self.display_info = []
+
+ # initial colors and shapes used for drawing points
+ self.color_choices = [
+ colors.red,
+ colors.green,
+ colors.blue,
+ colors.yellow,
+ colors.orange,
+ colors.black,
+ ]
+ self.shape_choices = [
+ makeFilledCircle,
+ makeEmptySquare,
+ makeFilledDiamond,
+ makeFilledSquare,
+ makeEmptyCircle,
+ makeSmiley,
+ ]
+
+ def draw_to_file(self, output_file, title):
+ """Write the comparative plot to a file.
+
+ Arguments:
+ - output_file - The name of the file to output the information to,
+ or a handle to write to.
+ - title - A title to display on the graphic.
+
+ """
+ width, height = self.page_size
+ cur_drawing = Drawing(width, height)
+
+ self._draw_title(cur_drawing, title, width, height)
+
+ start_x = inch * 0.5
+ end_x = width - inch * 0.5
+ end_y = height - 1.5 * inch
+ start_y = 0.5 * inch
+ self._draw_scatter_plot(cur_drawing, start_x, start_y, end_x, end_y)
+
+ return _write(cur_drawing, output_file, self.output_format)
+
+ def _draw_title(self, cur_drawing, title, width, height):
+ """Add a title to the page we are outputting (PRIVATE)."""
+ title_string = String(width / 2, height - inch, title)
+ title_string.fontName = "Helvetica-Bold"
+ title_string.fontSize = self.title_size
+ title_string.textAnchor = "middle"
+
+ cur_drawing.add(title_string)
+
+ def _draw_scatter_plot(self, cur_drawing, x_start, y_start, x_end, y_end):
+ """Draw a scatter plot on the drawing with the given coordinates (PRIVATE)."""
+ scatter_plot = LinePlot()
+
+ # set the dimensions of the scatter plot
+ scatter_plot.x = x_start
+ scatter_plot.y = y_start
+ scatter_plot.width = abs(x_start - x_end)
+ scatter_plot.height = abs(y_start - y_end)
+
+ scatter_plot.data = self.display_info
+
+ scatter_plot.joinedLines = 0
+
+ # set the axes of the plot
+ x_min, x_max, y_min, y_max = self._find_min_max(self.display_info)
+ scatter_plot.xValueAxis.valueMin = x_min
+ scatter_plot.xValueAxis.valueMax = x_max
+ scatter_plot.xValueAxis.valueStep = (x_max - x_min) / 10.0
+
+ scatter_plot.yValueAxis.valueMin = y_min
+ scatter_plot.yValueAxis.valueMax = y_max
+ scatter_plot.yValueAxis.valueStep = (y_max - y_min) / 10.0
+
+ self._set_colors_and_shapes(scatter_plot, self.display_info)
+
+ cur_drawing.add(scatter_plot)
+
+ def _set_colors_and_shapes(self, scatter_plot, display_info):
+ """Set the colors and shapes of the points displayed (PRIVATE).
+
+ By default this just sets all of the points according to the order
+ of colors and shapes defined in self.color_choices and
+ self.shape_choices. The first 5 shapes and colors are unique, the
+ rest of them are just set to the same color and shape (since I
+ ran out of shapes!).
+
+ You can change how this function works by either changing the
+ values of the color_choices and shape_choices attributes, or
+ by inheriting from this class and overriding this function.
+ """
+ for value_num in range(len(display_info)):
+ # if we have unique colors, add them
+ if (value_num + 1) < len(self.color_choices):
+ scatter_plot.lines[value_num].strokeColor = self.color_choices[
+ value_num
+ ]
+ scatter_plot.lines[value_num].symbol = self.shape_choices[value_num]
+ # otherwise just use the last number
+ else:
+ scatter_plot.lines[value_num].strokeColor = self.color_choices[-1]
+ scatter_plot.lines[value_num].symbol = self.shape_choices[-1]
+
+ def _find_min_max(self, info):
+ """Find min and max for x and y coordinates in the given data (PRIVATE)."""
+ x_min = info[0][0][0]
+ x_max = info[0][0][0]
+ y_min = info[0][0][1]
+ y_max = info[0][0][1]
+
+ for two_d_list in info:
+ for x, y in two_d_list:
+ if x > x_max:
+ x_max = x
+ if x < x_min:
+ x_min = x
+ if y > y_max:
+ y_max = y
+ if y < y_min:
+ y_min = y
+
+ return x_min, x_max, y_min, y_max
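
A usage sketch with two invented point series; each series is assigned its own colour and marker from the defaults set in `__init__`.

```python
from Bio.Graphics.Comparative import ComparativeScatterPlot

plot = ComparativeScatterPlot(output_format="pdf")
plot.display_info = [
    [(1, 2), (3, 4), (5, 6)],  # first series
    [(2, 1), (4, 3), (6, 5)],  # second series
]
plot.draw_to_file("comparison.pdf", "Series A vs series B")
```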
diff --git a/code/lib/Bio/Graphics/DisplayRepresentation.py b/code/lib/Bio/Graphics/DisplayRepresentation.py
new file mode 100644
index 0000000..df75283
--- /dev/null
+++ b/code/lib/Bio/Graphics/DisplayRepresentation.py
@@ -0,0 +1,187 @@
+# Copyright 2001 by Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Represent information for graphical display.
+
+Classes in this module are designed to hold information in a way that
+makes it easy to draw graphical figures.
+"""
+# reportlab
+from reportlab.lib import colors
+
+# local stuff
+from Bio.Graphics.BasicChromosome import ChromosomeSegment
+from Bio.Graphics.BasicChromosome import TelomereSegment
+
+
+# --- constants
+# This is a default color scheme based on the light spectrum.
+# Based on my vague recollections from biology, this is our friend ROY G. BIV
+RAINBOW_COLORS = {
+ (1, 1): colors.violet,
+ (2, 2): colors.indigo,
+ (3, 3): colors.blue,
+ (4, 4): colors.green,
+ (5, 5): colors.yellow,
+ (6, 6): colors.orange,
+ (7, 20): colors.red,
+}
+
+
+class ChromosomeCounts:
+ """Represent a chromosome with count information.
+
+ This is used to display information about counts along a chromosome.
+ The segments are expected to have different count information, which
+ will be displayed using a color scheme.
+
+ I envision using this class when you think that certain regions of
+ the chromosome will be especially abundant in the counts, and you
+ want to pick those out.
+ """
+
+ def __init__(self, segment_names, color_scheme=RAINBOW_COLORS):
+ """Initialize a representation of chromosome counts.
+
+ Arguments:
+ - segment_names - An ordered list of all segment names along
+ the chromosome. The count and other information will be added
+ to these.
+ - color_scheme - A coloring scheme to use in the counts. This
+ should be a dictionary mapping count ranges to colors (specified
+ in reportlab.lib.colors).
+
+ """
+ self._names = segment_names
+ self._count_info = {}
+ self._label_info = {}
+ self._scale_info = {}
+ for name in self._names:
+ self._count_info[name] = 0
+ self._label_info[name] = None
+ self._scale_info[name] = 1
+
+ self._color_scheme = color_scheme
+
+ def add_count(self, segment_name, count=1):
+ """Add counts to the given segment name.
+
+ Arguments:
+ - segment_name - The name of the segment we should add counts to.
+ If the name is not present, a KeyError will be raised.
+ - count - The counts to add the current segment. This defaults to
+ a single count.
+
+ """
+ try:
+ self._count_info[segment_name] += count
+ except KeyError:
+ raise KeyError("Segment name %s not found." % segment_name) from None
+
+ def scale_segment_value(self, segment_name, scale_value=None):
+ """Divide the counts for a segment by some kind of scale value.
+
+ This is useful if segments aren't represented by raw counts, but
+ are instead counts divided by some number.
+ """
+ try:
+ self._count_info[segment_name] = float(
+ self._count_info[segment_name]
+ ) / float(scale_value)
+ except KeyError:
+ raise KeyError("Segment name %s not found." % segment_name) from None
+
+ def add_label(self, segment_name, label):
+ """Add a label to a specific segment.
+
+        Raises a KeyError if the specified segment name is not found.
+ """
+ if segment_name in self._label_info:
+ self._label_info[segment_name] = label
+ else:
+ raise KeyError("Segment name %s not found." % segment_name)
+
+ def set_scale(self, segment_name, scale):
+ """Set the scale for a specific chromosome segment.
+
+ By default all segments have the same scale -- this allows scaling
+ by the size of the segment.
+
+        Raises a KeyError if the specified segment name is not found.
+ """
+ if segment_name in self._label_info:
+ self._scale_info[segment_name] = scale
+ else:
+ raise KeyError("Segment name %s not found." % segment_name)
+
+ def get_segment_info(self):
+ """Retrieve the color and label info about the segments.
+
+        Returns a list consisting of two-tuples specifying the counts and
+ label name for each segment. The list is ordered according to the
+ original listing of names. Labels are set as None if no label
+ was specified.
+ """
+ order_info = []
+
+ for seg_name in self._names:
+ order_info.append((self._count_info[seg_name], self._label_info[seg_name]))
+
+ return order_info
+
+ def fill_chromosome(self, chromosome):
+ """Add the collected segment information to a chromosome for drawing.
+
+ Arguments:
+ - chromosome - A Chromosome graphics object that we can add
+ chromosome segments to.
+
+ This creates ChromosomeSegment (and TelomereSegment) objects to
+ fill in the chromosome. The information is derived from the
+ label and count information, with counts transformed to the
+ specified color map.
+
+ Returns the chromosome with all of the segments added.
+ """
+ for seg_num in range(len(self._names)):
+ is_end_segment = 0
+ # make the top and bottom telomeres
+ if seg_num == 0:
+ cur_segment = TelomereSegment()
+ is_end_segment = 1
+ elif seg_num == len(self._names) - 1:
+ cur_segment = TelomereSegment(1)
+ is_end_segment = 1
+ # otherwise, they are just regular segments
+ else:
+ cur_segment = ChromosomeSegment()
+
+ seg_name = self._names[seg_num]
+ if self._count_info[seg_name] > 0:
+ color = self._color_from_count(self._count_info[seg_name])
+ cur_segment.fill_color = color
+
+ if self._label_info[seg_name] is not None:
+ cur_segment.label = self._label_info[seg_name]
+
+ # give end segments extra size so they look right
+ if is_end_segment:
+ cur_segment.scale = 3
+ else:
+ cur_segment.scale = self._scale_info[seg_name]
+
+ chromosome.add(cur_segment)
+
+ return chromosome
+
+ def _color_from_count(self, count):
+ """Translate the given count into a color using the color scheme (PRIVATE)."""
+ for count_start, count_end in self._color_scheme:
+ if count >= count_start and count <= count_end:
+ return self._color_scheme[(count_start, count_end)]
+
+ # if we got here we didn't find a color for the count
+ raise ValueError("Count value %s was not found in the color scheme." % count)
diff --git a/code/lib/Bio/Graphics/Distribution.py b/code/lib/Bio/Graphics/Distribution.py
new file mode 100644
index 0000000..3bfb065
--- /dev/null
+++ b/code/lib/Bio/Graphics/Distribution.py
@@ -0,0 +1,258 @@
+# Copyright 2001 by Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Display information distributed across a Chromosome-like object.
+
+These classes are meant to show the distribution of some kind of information
+as it changes across any kind of segment. It was designed with chromosome
+distributions in mind, but could also work for chromosome regions, BAC clones
+or anything similar.
+
+Reportlab is used for producing the graphical output.
+"""
+# standard library
+import math
+
+# reportlab
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.units import inch
+from reportlab.lib import colors
+
+from reportlab.graphics.shapes import Drawing, String
+from reportlab.graphics.charts.barcharts import VerticalBarChart
+from reportlab.graphics.charts.barcharts import BarChartProperties
+from reportlab.graphics.widgetbase import TypedPropertyCollection
+
+from Bio.Graphics import _write
+
+
+class DistributionPage:
+ """Display a grouping of distributions on a page.
+
+ This organizes Distributions, and will display them nicely
+ on a single page.
+ """
+
+ def __init__(self, output_format="pdf"):
+ """Initialize the class."""
+ self.distributions = []
+
+ # customizable attributes
+ self.number_of_columns = 1
+ self.page_size = letter
+ self.title_size = 20
+
+ self.output_format = output_format
+
+ def draw(self, output_file, title):
+ """Draw out the distribution information.
+
+ Arguments:
+ - output_file - The name of the file to output the information to,
+ or a handle to write to.
+ - title - A title to display on the graphic.
+
+ """
+ width, height = self.page_size
+ cur_drawing = Drawing(width, height)
+
+ self._draw_title(cur_drawing, title, width, height)
+
+ # calculate the x and y position changes for each distribution
+ cur_x_pos = inch * 0.5
+ end_x_pos = width - inch * 0.5
+ cur_y_pos = height - 1.5 * inch
+ end_y_pos = 0.5 * inch
+ x_pos_change = (end_x_pos - cur_x_pos) / float(self.number_of_columns)
+ num_y_rows = math.ceil(
+ float(len(self.distributions)) / float(self.number_of_columns)
+ )
+ y_pos_change = (cur_y_pos - end_y_pos) / num_y_rows
+
+ self._draw_distributions(
+ cur_drawing, cur_x_pos, x_pos_change, cur_y_pos, y_pos_change, num_y_rows
+ )
+ self._draw_legend(cur_drawing, 2.5 * inch, width)
+
+ return _write(cur_drawing, output_file, self.output_format)
+
+ def _draw_title(self, cur_drawing, title, width, height):
+ """Add the title of the figure to the drawing (PRIVATE)."""
+ title_string = String(width / 2, height - inch, title)
+ title_string.fontName = "Helvetica-Bold"
+ title_string.fontSize = self.title_size
+ title_string.textAnchor = "middle"
+
+ cur_drawing.add(title_string)
+
+ def _draw_distributions(
+ self,
+ cur_drawing,
+ start_x_pos,
+ x_pos_change,
+ start_y_pos,
+ y_pos_change,
+ num_y_drawings,
+ ):
+ """Draw all of the distributions on the page (PRIVATE).
+
+ Arguments:
+ - cur_drawing - The drawing we are working with.
+ - start_x_pos - The x position on the page to start drawing at.
+ - x_pos_change - The change in x position between each figure.
+ - start_y_pos - The y position on the page to start drawing at.
+ - y_pos_change - The change in y position between each figure.
+ - num_y_drawings - The number of drawings we'll have in the y
+ (up/down) direction.
+
+ """
+ for y_drawing in range(int(num_y_drawings)):
+ # if we are on the last y position, we may not be able
+ # to fill all of the x columns
+ if (y_drawing + 1) * self.number_of_columns > len(self.distributions):
+ num_x_drawings = (
+ len(self.distributions) - y_drawing * self.number_of_columns
+ )
+ else:
+ num_x_drawings = self.number_of_columns
+ for x_drawing in range(num_x_drawings):
+ dist_num = y_drawing * self.number_of_columns + x_drawing
+ cur_distribution = self.distributions[dist_num]
+
+ # find the x and y boundaries of the distribution
+ x_pos = start_x_pos + x_drawing * x_pos_change
+ end_x_pos = x_pos + x_pos_change
+ end_y_pos = start_y_pos - y_drawing * y_pos_change
+ y_pos = end_y_pos - y_pos_change
+
+ # draw the distribution
+ cur_distribution.draw(cur_drawing, x_pos, y_pos, end_x_pos, end_y_pos)
+
+ def _draw_legend(self, cur_drawing, start_y, width):
+ """Add a legend to the figure (PRIVATE).
+
+ Subclasses can implement to provide a specialized legend.
+ """
+ pass
+
+
+class BarChartDistribution:
+ """Display the distribution of values as a bunch of bars."""
+
+ def __init__(self, display_info=None):
+ """Initialize a Bar Chart display of distribution info.
+
+ Attributes:
+ - display_info - the information to be displayed in the distribution.
+ This should be ordered as a list of lists, where each internal list
+ is a data set to display in the bar chart.
+
+ """
+ if display_info is None:
+ display_info = []
+ self.display_info = display_info
+
+ self.x_axis_title = ""
+ self.y_axis_title = ""
+ self.chart_title = ""
+ self.chart_title_size = 10
+
+ self.padding_percent = 0.15
+
+ def draw(self, cur_drawing, start_x, start_y, end_x, end_y):
+ """Draw a bar chart with the info in the specified range."""
+ bar_chart = VerticalBarChart()
+ if self.chart_title:
+ self._draw_title(
+ cur_drawing, self.chart_title, start_x, start_y, end_x, end_y
+ )
+ # set the position of the bar chart
+ x_start, x_end, y_start, y_end = self._determine_position(
+ start_x, start_y, end_x, end_y
+ )
+
+ bar_chart.x = x_start
+ bar_chart.y = y_start
+ bar_chart.width = abs(x_start - x_end)
+ bar_chart.height = abs(y_start - y_end)
+
+ # set the information in the bar chart
+ bar_chart.data = self.display_info
+ bar_chart.valueAxis.valueMin = min(self.display_info[0])
+ bar_chart.valueAxis.valueMax = max(self.display_info[0])
+ for data_set in self.display_info[1:]:
+ if min(data_set) < bar_chart.valueAxis.valueMin:
+ bar_chart.valueAxis.valueMin = min(data_set)
+ if max(data_set) > bar_chart.valueAxis.valueMax:
+ bar_chart.valueAxis.valueMax = max(data_set)
+
+ # set other formatting options
+ if len(self.display_info) == 1:
+ bar_chart.groupSpacing = 0
+ style = TypedPropertyCollection(BarChartProperties)
+ style.strokeWidth = 0
+ style.strokeColor = colors.green
+ style[0].fillColor = colors.green
+
+ bar_chart.bars = style
+
+ # set the labels
+ # XXX labels don't work yet
+ # bar_chart.valueAxis.title = self.x_axis_title
+ # bar_chart.categoryAxis.title = self.y_axis_title
+
+ cur_drawing.add(bar_chart)
+
+ def _draw_title(self, cur_drawing, title, start_x, start_y, end_x, end_y):
+ """Add the title of the figure to the drawing (PRIVATE)."""
+ x_center = start_x + (end_x - start_x) / 2
+ y_pos = end_y + (self.padding_percent * (start_y - end_y)) / 2
+ title_string = String(x_center, y_pos, title)
+ title_string.fontName = "Helvetica-Bold"
+ title_string.fontSize = self.chart_title_size
+ title_string.textAnchor = "middle"
+
+ cur_drawing.add(title_string)
+
+ def _determine_position(self, start_x, start_y, end_x, end_y):
+ """Calculate the position of the chart with blank space (PRIVATE).
+
+ This uses some padding around the chart, and takes into account
+ whether the chart has a title. It returns 4 values, which are,
+ in order, the x_start, x_end, y_start and y_end of the chart
+ itself.
+ """
+ x_padding = self.padding_percent * (end_x - start_x)
+ y_padding = self.padding_percent * (start_y - end_y)
+
+ new_x_start = start_x + x_padding
+ new_x_end = end_x - x_padding
+
+ if self.chart_title:
+ new_y_start = start_y - y_padding - self.chart_title_size
+ else:
+ new_y_start = start_y - y_padding
+
+ new_y_end = end_y + y_padding
+
+ return new_x_start, new_x_end, new_y_start, new_y_end
+
+
+class LineDistribution:
+ """Display the distribution of values as connected lines.
+
+ This distribution displays the change in values across the object as
+ lines. This also allows multiple distributions to be displayed on a
+ single graph.
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ pass
+
+ def draw(self, cur_drawing, start_x, start_y, end_x, end_y):
+ """Draw a line distribution into the current drawing."""
+ pass
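+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only; not part of the upstream
+    # Biopython module). Draws a single bar chart on a page; the output
+    # file name is an arbitrary choice for this demo.
+    bar = BarChartDistribution([[1, 2, 3, 4], [2, 3, 4, 5]])
+    bar.chart_title = "Example distribution"
+    page = DistributionPage()
+    page.distributions.append(bar)
+    page.draw("example_distribution.pdf", "Distribution demo")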
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_AbstractDrawer.py b/code/lib/Bio/Graphics/GenomeDiagram/_AbstractDrawer.py
new file mode 100644
index 0000000..4e97e36
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_AbstractDrawer.py
@@ -0,0 +1,565 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+# Revisions copyright 2008-2017 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""AbstractDrawer module (considered to be a private module, the API may change!).
+
+Provides:
+ - AbstractDrawer - Superclass for methods common to the Drawer objects
+ - page_sizes - Method that returns a ReportLab pagesize when passed
+ a valid ISO size
+ - draw_box - Method that returns a closed path object when passed
+ the proper co-ordinates. For HORIZONTAL boxes only.
+ - angle2trig - Method that returns a tuple of values that are the
+ vector for rotating a point through a passed angle,
+ about an origin
+ - intermediate_points - Method that returns a list of values intermediate
+ between the points in a passed dataset
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+
+For dealing with biological information, the package expects Biopython objects
+like SeqFeatures.
+"""
+
+# ReportLab imports
+
+from reportlab.lib import pagesizes
+from reportlab.lib import colors
+from reportlab.graphics.shapes import Polygon
+
+from math import pi, sin, cos
+from itertools import islice
+
+################################################################################
+# METHODS
+################################################################################
+
+
+# Utility method to translate strings to ISO page sizes
+def page_sizes(size):
+ """Convert size string into a Reportlab pagesize.
+
+ Arguments:
+ - size - A string representing a standard page size, eg 'A4' or 'LETTER'
+
+ """
+ sizes = { # ReportLab pagesizes, keyed by ISO string
+ "A0": pagesizes.A0,
+ "A1": pagesizes.A1,
+ "A2": pagesizes.A2,
+ "A3": pagesizes.A3,
+ "A4": pagesizes.A4,
+ "A5": pagesizes.A5,
+ "A6": pagesizes.A6,
+ "B0": pagesizes.B0,
+ "B1": pagesizes.B1,
+ "B2": pagesizes.B2,
+ "B3": pagesizes.B3,
+ "B4": pagesizes.B4,
+ "B5": pagesizes.B5,
+ "B6": pagesizes.B6,
+ "ELEVENSEVENTEEN": pagesizes.ELEVENSEVENTEEN,
+ "LEGAL": pagesizes.LEGAL,
+ "LETTER": pagesizes.LETTER,
+ }
+ try:
+ return sizes[size]
+ except KeyError:
+ raise ValueError("%s not in list of page sizes" % size) from None
+
+
+def _stroke_and_fill_colors(color, border):
+ """Deal with border and fill colors (PRIVATE)."""
+ if not isinstance(color, colors.Color):
+ raise ValueError("Invalid color %r" % color)
+
+ if color == colors.white and border is None:
+ # Force black border on white boxes with undefined border
+ strokecolor = colors.black
+ elif border is None:
+ strokecolor = color # use fill color
+ elif border:
+ if not isinstance(border, colors.Color):
+ raise ValueError("Invalid border color %r" % border)
+ strokecolor = border
+ else:
+ # e.g. False
+ strokecolor = None
+
+ return strokecolor, color
+
+
+def draw_box(
+ point1, point2, color=colors.lightgreen, border=None, colour=None, **kwargs
+):
+ """Draw a box.
+
+ Arguments:
+ - point1, point2 - coordinates for opposite corners of the box
+ (x,y tuples)
+      - color / colour - The color for the box (colour takes priority
+ over color)
+ - border - Border color for the box
+
+ Returns a closed path object, beginning at (x1,y1) going round
+ the four points in order, and filling with the passed color.
+ """
+ x1, y1 = point1
+ x2, y2 = point2
+
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+ del colour
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ x1, y1, x2, y2 = min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)
+ return Polygon(
+ [x1, y1, x2, y1, x2, y2, x1, y2],
+ strokeColor=strokecolor,
+ fillColor=color,
+ strokewidth=0,
+ **kwargs
+ )
+
+
+def draw_cut_corner_box(
+ point1, point2, corner=0.5, color=colors.lightgreen, border=None, **kwargs
+):
+ """Draw a box with the corners cut off."""
+ x1, y1 = point1
+ x2, y2 = point2
+
+ if not corner:
+ return draw_box(point1, point2, color, border)
+ elif corner < 0:
+ raise ValueError("Arrow head length ratio should be positive")
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ boxheight = y2 - y1
+ boxwidth = x2 - x1
+ x_corner = min(boxheight * 0.5 * corner, boxwidth * 0.5)
+ y_corner = min(boxheight * 0.5 * corner, boxheight * 0.5)
+
+ points = [
+ x1,
+ y1 + y_corner,
+ x1,
+ y2 - y_corner,
+ x1 + x_corner,
+ y2,
+ x2 - x_corner,
+ y2,
+ x2,
+ y2 - y_corner,
+ x2,
+ y1 + y_corner,
+ x2 - x_corner,
+ y1,
+ x1 + x_corner,
+ y1,
+ ]
+ return Polygon(
+ deduplicate(points),
+ strokeColor=strokecolor,
+ strokeWidth=1,
+ strokeLineJoin=1, # 1=round
+ fillColor=color,
+ **kwargs
+ )
+
+
+def draw_polygon(
+ list_of_points, color=colors.lightgreen, border=None, colour=None, **kwargs
+):
+ """Draw polygon.
+
+ Arguments:
+      - list_of_points - list of (x,y) tuples for the corner coordinates
+ - color / colour - The color for the box
+
+    Returns a closed path object, beginning at the first point, going
+    round the remaining points in order, and filling with the passed
+    colour.
+
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+ del colour
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ xy_list = []
+ for (x, y) in list_of_points:
+ xy_list.append(x)
+ xy_list.append(y)
+
+ return Polygon(
+ deduplicate(xy_list),
+ strokeColor=strokecolor,
+ fillColor=color,
+ strokewidth=0,
+ **kwargs
+ )
+
+
+def draw_arrow(
+ point1,
+ point2,
+ color=colors.lightgreen,
+ border=None,
+ shaft_height_ratio=0.4,
+ head_length_ratio=0.5,
+ orientation="right",
+ colour=None,
+ **kwargs
+):
+ """Draw an arrow.
+
+ Returns a closed path object representing an arrow enclosed by the
+ box with corners at {point1=(x1,y1), point2=(x2,y2)}, a shaft height
+ given by shaft_height_ratio (relative to box height), a head length
+ given by head_length_ratio (also relative to box height), and
+ an orientation that may be 'left' or 'right'.
+ """
+ x1, y1 = point1
+ x2, y2 = point2
+
+ if shaft_height_ratio < 0 or 1 < shaft_height_ratio:
+ raise ValueError("Arrow shaft height ratio should be in range 0 to 1")
+ if head_length_ratio < 0:
+ raise ValueError("Arrow head length ratio should be positive")
+
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+ del colour
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ # Depending on the orientation, we define the bottom left (x1, y1) and
+ # top right (x2, y2) coordinates differently, but still draw the box
+ # using the same relative co-ordinates:
+ xmin, ymin = min(x1, x2), min(y1, y2)
+ xmax, ymax = max(x1, x2), max(y1, y2)
+ if orientation == "right":
+ x1, x2, y1, y2 = xmin, xmax, ymin, ymax
+ elif orientation == "left":
+ x1, x2, y1, y2 = xmax, xmin, ymin, ymax
+ else:
+ raise ValueError(
+ "Invalid orientation %r, should be 'left' or 'right'" % orientation
+ )
+
+ # We define boxheight and boxwidth accordingly, and calculate the shaft
+ # height from these. We also ensure that the maximum head length is
+ # the width of the box enclosure
+ boxheight = y2 - y1
+ boxwidth = x2 - x1
+ shaftheight = boxheight * shaft_height_ratio
+ headlength = min(abs(boxheight) * head_length_ratio, abs(boxwidth))
+ if boxwidth < 0:
+ headlength *= -1 # reverse it
+
+ shafttop = 0.5 * (boxheight + shaftheight)
+ shaftbase = boxheight - shafttop
+ headbase = boxwidth - headlength
+ midheight = 0.5 * boxheight
+
+ points = [
+ x1,
+ y1 + shafttop,
+ x1 + headbase,
+ y1 + shafttop,
+ x1 + headbase,
+ y2,
+ x2,
+ y1 + midheight,
+ x1 + headbase,
+ y1,
+ x1 + headbase,
+ y1 + shaftbase,
+ x1,
+ y1 + shaftbase,
+ ]
+
+ return Polygon(
+ deduplicate(points),
+ strokeColor=strokecolor,
+ # strokeWidth=max(1, int(boxheight/40.)),
+ strokeWidth=1,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+ fillColor=color,
+ **kwargs
+ )
+
+
+def deduplicate(points):
+ """Remove adjacent duplicate points.
+
+ This is important for use with the Polygon class since reportlab has a
+ bug with duplicate points.
+
+ Arguments:
+ - points - list of points [x1, y1, x2, y2,...]
+
+ Returns a list in the same format with consecutive duplicates removed
+ """
+ assert len(points) % 2 == 0
+ if len(points) < 2:
+ return points
+ newpoints = points[0:2]
+ for x, y in zip(islice(points, 2, None, 2), islice(points, 3, None, 2)):
+ if x != newpoints[-2] or y != newpoints[-1]:
+ newpoints.append(x)
+ newpoints.append(y)
+ return newpoints
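+# Worked example (illustrative comment only):
+# deduplicate([10, 20, 10, 20, 30, 40]) returns [10, 20, 30, 40],
+# dropping the consecutive duplicate of the point (10, 20).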
+
+
+def angle2trig(theta):
+ """Convert angle to a reportlab ready tuple.
+
+ Arguments:
+ - theta - Angle in degrees, counter clockwise from horizontal
+
+ Returns a representation of the passed angle in a format suitable
+ for ReportLab rotations (i.e. cos(theta), sin(theta), -sin(theta),
+ cos(theta) tuple)
+ """
+ c = cos(theta * pi / 180)
+ s = sin(theta * pi / 180)
+ return (c, s, -s, c) # Vector for rotating point around an origin
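+# For example, angle2trig(90) returns approximately (0.0, 1.0, -1.0, 0.0),
+# the ReportLab transform entries for a 90 degree counter-clockwise rotation.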
+
+
+def intermediate_points(start, end, graph_data):
+ """Generate intermediate points describing provided graph data..
+
+ Returns a list of (start, end, value) tuples describing the passed
+ graph data as 'bins' between position midpoints.
+ """
+ newdata = [] # data in form (X0, X1, val)
+ # add first block
+ newdata.append(
+ (
+ start,
+ graph_data[0][0] + (graph_data[1][0] - graph_data[0][0]) / 2.0,
+ graph_data[0][1],
+ )
+ )
+ # add middle set
+ for index in range(1, len(graph_data) - 1):
+ lastxval, lastyval = graph_data[index - 1]
+ xval, yval = graph_data[index]
+ nextxval, nextyval = graph_data[index + 1]
+ newdata.append(
+ (lastxval + (xval - lastxval) / 2.0, xval + (nextxval - xval) / 2.0, yval)
+ )
+ # add last block
+ newdata.append((xval + (nextxval - xval) / 2.0, end, graph_data[-1][1]))
+ return newdata
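+# Worked example (illustrative comment only): intermediate_points(0, 100,
+# [(10, 1.0), (50, 2.0), (90, 3.0)]) returns
+# [(0, 30, 1.0), (30, 70, 2.0), (70, 100, 3.0)], i.e. each value covers a
+# bin reaching halfway to its neighbouring data points.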
+
+
+################################################################################
+# CLASSES
+################################################################################
+
+
+class AbstractDrawer:
+ """Abstract Drawer.
+
+ Attributes:
+ - tracklines Boolean for whether to draw lines delineating tracks
+ - pagesize Tuple describing the size of the page in pixels
+ - x0 Float X co-ord for leftmost point of drawable area
+ - xlim Float X co-ord for rightmost point of drawable area
+ - y0 Float Y co-ord for lowest point of drawable area
+ - ylim Float Y co-ord for topmost point of drawable area
+ - pagewidth Float pixel width of drawable area
+ - pageheight Float pixel height of drawable area
+ - xcenter Float X co-ord of center of drawable area
+ - ycenter Float Y co-ord of center of drawable area
+ - start Int, base to start drawing from
+ - end Int, base to stop drawing at
+ - length Size of sequence to be drawn
+ - cross_track_links List of tuples each with four entries (track A,
+ feature A, track B, feature B) to be linked.
+
+ """
+
+ def __init__(
+ self,
+ parent,
+ pagesize="A3",
+ orientation="landscape",
+ x=0.05,
+ y=0.05,
+ xl=None,
+ xr=None,
+ yt=None,
+ yb=None,
+ start=None,
+ end=None,
+ tracklines=0,
+ cross_track_links=None,
+ ):
+ """Create the object.
+
+ Arguments:
+ - parent Diagram object containing the data that the drawer draws
+ - pagesize String describing the ISO size of the image, or a tuple
+ of pixels
+ - orientation String describing the required orientation of the
+ final drawing ('landscape' or 'portrait')
+ - x Float (0->1) describing the relative size of the X
+ margins to the page
+ - y Float (0->1) describing the relative size of the Y
+ margins to the page
+ - xl Float (0->1) describing the relative size of the left X
+ margin to the page (overrides x)
+ - xr Float (0->1) describing the relative size of the right X
+ margin to the page (overrides x)
+ - yt Float (0->1) describing the relative size of the top Y
+ margin to the page (overrides y)
+ - yb Float (0->1) describing the relative size of the lower Y
+ margin to the page (overrides y)
+ - start Int, the position to begin drawing the diagram at
+ - end Int, the position to stop drawing the diagram at
+ - tracklines Boolean flag to show (or not) lines delineating tracks
+ on the diagram
+ - cross_track_links List of tuples each with four entries (track A,
+ feature A, track B, feature B) to be linked.
+
+ """
+ self._parent = parent # The calling Diagram object
+
+ # Perform 'administrative' tasks of setting up the page
+ self.set_page_size(pagesize, orientation) # Set drawing size
+ self.set_margins(x, y, xl, xr, yt, yb) # Set page margins
+ self.set_bounds(start, end) # Set limits on what will be drawn
+ self.tracklines = tracklines # Set flags
+        if cross_track_links is None:
+            cross_track_links = []
+        self.cross_track_links = cross_track_links
+
+ def set_page_size(self, pagesize, orientation):
+ """Set page size of the drawing..
+
+ Arguments:
+ - pagesize Size of the output image, a tuple of pixels (width,
+           height), or a string from the reportlab.lib.pagesizes
+ set of ISO sizes.
+ - orientation String: 'landscape' or 'portrait'
+
+ """
+ if isinstance(pagesize, str): # A string, so translate
+ pagesize = page_sizes(pagesize)
+ elif isinstance(pagesize, tuple): # A tuple, so don't translate
+ pass
+ else:
+ raise ValueError("Page size %s not recognised" % pagesize)
+ shortside, longside = min(pagesize), max(pagesize)
+
+ orientation = orientation.lower()
+ if orientation not in ("landscape", "portrait"):
+ raise ValueError("Orientation %s not recognised" % orientation)
+ if orientation == "landscape":
+ self.pagesize = (longside, shortside)
+ else:
+ self.pagesize = (shortside, longside)
+
+ def set_margins(self, x, y, xl, xr, yt, yb):
+ """Set page margins.
+
+ Arguments:
+ - x Float(0->1), Absolute X margin as % of page
+ - y Float(0->1), Absolute Y margin as % of page
+ - xl Float(0->1), Left X margin as % of page
+ - xr Float(0->1), Right X margin as % of page
+ - yt Float(0->1), Top Y margin as % of page
+ - yb Float(0->1), Bottom Y margin as % of page
+
+ Set the page margins as proportions of the page 0->1, and also
+        set the page limits x0, y0 and xlim, ylim, the page center
+        xcenter, ycenter, and the overall page width and height.
+ """
+ # Set left, right, top and bottom margins
+ xmargin_l = xl or x
+ xmargin_r = xr or x
+ ymargin_top = yt or y
+ ymargin_btm = yb or y
+
+ # Set page limits, center and height/width
+ self.x0, self.y0 = self.pagesize[0] * xmargin_l, self.pagesize[1] * ymargin_btm
+ self.xlim, self.ylim = (
+ self.pagesize[0] * (1 - xmargin_r),
+ self.pagesize[1] * (1 - ymargin_top),
+ )
+ self.pagewidth = self.xlim - self.x0
+ self.pageheight = self.ylim - self.y0
+ self.xcenter, self.ycenter = (
+ self.x0 + self.pagewidth / 2.0,
+ self.y0 + self.pageheight / 2.0,
+ )
+
+ def set_bounds(self, start, end):
+ """Set start and end points for the drawing as a whole.
+
+ Arguments:
+ - start - The first base (or feature mark) to draw from
+ - end - The last base (or feature mark) to draw to
+
+ """
+ low, high = self._parent.range() # Extent of tracks
+
+ if start is not None and end is not None and start > end:
+ start, end = end, start
+
+ if start is None or start < 0: # Check validity of passed args and
+ start = 0 # default to 0
+ if end is None or end < 0:
+ end = high + 1 # default to track range top limit
+
+ self.start, self.end = int(start), int(end)
+ self.length = self.end - self.start + 1
+
+ def is_in_bounds(self, value):
+ """Check if given value is within the region selected for drawing.
+
+ Arguments:
+ - value - A base position
+
+ """
+ if value >= self.start and value <= self.end:
+ return 1
+ return 0
+
+ def __len__(self):
+ """Return the length of the region to be drawn."""
+ return self.length
+
+ def _current_track_start_end(self):
+ track = self._parent[self.current_track_level]
+ if track.start is None:
+ start = self.start
+ else:
+ start = max(self.start, track.start)
+ if track.end is None:
+ end = self.end
+ else:
+ end = min(self.end, track.end)
+ return start, end
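+
+
+if __name__ == "__main__":
+    # Minimal sketch of the page/bounds bookkeeping AbstractDrawer performs
+    # (illustrative only; _StubDiagram is a stand-in for a real
+    # GenomeDiagram Diagram object, which supplies range()).
+    class _StubDiagram:
+        def range(self):
+            return 0, 1000
+
+    drawer = AbstractDrawer(_StubDiagram(), pagesize="A4", orientation="portrait")
+    print(drawer.pagesize)  # portrait A4 in points, (595.27..., 841.88...)
+    print(drawer.start, drawer.end)  # defaults: 0 and high + 1 = 1001
+    print(len(drawer))  # end - start + 1 = 1002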
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_CircularDrawer.py b/code/lib/Bio/Graphics/GenomeDiagram/_CircularDrawer.py
new file mode 100644
index 0000000..b090fd9
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_CircularDrawer.py
@@ -0,0 +1,1725 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+# Revisions copyright 2008-2017 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""CircularDrawer module for GenomeDiagram."""
+
+# ReportLab imports
+
+from reportlab.graphics.shapes import Drawing, String, Group, Line, Circle, Polygon
+from reportlab.lib import colors
+from reportlab.graphics.shapes import ArcPath
+
+# GenomeDiagram imports
+from ._AbstractDrawer import AbstractDrawer, draw_polygon, intermediate_points
+from ._AbstractDrawer import _stroke_and_fill_colors
+from ._FeatureSet import FeatureSet
+from ._GraphSet import GraphSet
+
+from math import pi, cos, sin
+
+
+class CircularDrawer(AbstractDrawer):
+ """Object for drawing circular diagrams.
+
+ Attributes:
+     - tracklines Boolean for whether to draw lines delineating tracks
+ - pagesize Tuple describing the size of the page in pixels
+ - x0 Float X co-ord for leftmost point of drawable area
+ - xlim Float X co-ord for rightmost point of drawable area
+ - y0 Float Y co-ord for lowest point of drawable area
+ - ylim Float Y co-ord for topmost point of drawable area
+ - pagewidth Float pixel width of drawable area
+ - pageheight Float pixel height of drawable area
+ - xcenter Float X co-ord of center of drawable area
+ - ycenter Float Y co-ord of center of drawable area
+ - start Int, base to start drawing from
+ - end Int, base to stop drawing at
+ - length Size of sequence to be drawn
+ - track_size Float (0->1) the proportion of the track height to draw in
+ - drawing Drawing canvas
+ - drawn_tracks List of ints denoting which tracks are to be drawn
+ - current_track_level Int denoting which track is currently being drawn
+ - track_offsets Dictionary of number of pixels that each track top,
+ center and bottom is offset from the base of a fragment, keyed by track
+ - sweep Float (0->1) the proportion of the circle circumference to
+ use for the diagram
+ - cross_track_links List of tuples each with four entries (track A,
+ feature A, track B, feature B) to be linked.
+
+ """
+
+ def __init__(
+ self,
+ parent=None,
+ pagesize="A3",
+ orientation="landscape",
+ x=0.05,
+ y=0.05,
+ xl=None,
+ xr=None,
+ yt=None,
+ yb=None,
+ start=None,
+ end=None,
+ tracklines=0,
+ track_size=0.75,
+ circular=1,
+ circle_core=0.0,
+ cross_track_links=None,
+ ):
+ """Create CircularDrawer object.
+
+ Arguments:
+ - parent Diagram object containing the data that the drawer
+ draws
+ - pagesize String describing the ISO size of the image, or a tuple
+ of pixels
+ - orientation String describing the required orientation of the
+ final drawing ('landscape' or 'portrait')
+ - x Float (0->1) describing the relative size of the X
+ margins to the page
+ - y Float (0->1) describing the relative size of the Y
+ margins to the page
+ - xl Float (0->1) describing the relative size of the left X
+ margin to the page (overrides x)
+ - xl Float (0->1) describing the relative size of the left X
+ margin to the page (overrides x)
+ - xr Float (0->1) describing the relative size of the right X
+ margin to the page (overrides x)
+ - yt Float (0->1) describing the relative size of the top Y
+ margin to the page (overrides y)
+ - yb Float (0->1) describing the relative size of the lower Y
+ margin to the page (overrides y)
+ - start Int, the position to begin drawing the diagram at
+ - end Int, the position to stop drawing the diagram at
+ - tracklines Boolean flag to show (or not) lines delineating tracks
+ on the diagram
+ - track_size The proportion of the available track height that
+ should be taken up in drawing
+     - circular Boolean flag indicating whether the passed sequence is
+ circular or not
+ - circle_core The proportion of the available radius to leave
+ empty at the center of a circular diagram (0 to 1).
+ - cross_track_links List of tuples each with four entries (track A,
+ feature A, track B, feature B) to be linked.
+
+ """
+ # Use the superclass' instantiation method
+ AbstractDrawer.__init__(
+ self,
+ parent,
+ pagesize,
+ orientation,
+ x,
+ y,
+ xl,
+ xr,
+ yt,
+ yb,
+ start,
+ end,
+ tracklines,
+ cross_track_links,
+ )
+
+ # Useful measurements on the page
+ self.track_size = track_size
+ self.circle_core = circle_core
+ # Determine proportion of circumference around which information will be drawn
+ if not circular:
+ self.sweep = 0.9
+ else:
+ self.sweep = 1.0
+
+ def set_track_heights(self):
+ """Initialize track heights.
+
+ Since tracks may not be of identical heights, the bottom and top
+        radii for each track are stored in a dictionary, self.track_radii,
+        keyed by track number.
+ """
+ bot_track = min(min(self.drawn_tracks), 1)
+ top_track = max(self.drawn_tracks) # The 'highest' track to draw
+
+ trackunit_sum = 0 # Total number of 'units' taken up by all tracks
+        trackunits = {}  # Start and end units for each track, keyed by track number
+ heightholder = 0 # placeholder variable
+ for track in range(bot_track, top_track + 1): # track numbers to 'draw'
+ try:
+ trackheight = self._parent[track].height # Get track height
+ except Exception: # TODO: ValueError? IndexError?
+ trackheight = 1
+ trackunit_sum += trackheight # increment total track unit height
+ trackunits[track] = (heightholder, heightholder + trackheight)
+ heightholder += trackheight # move to next height
+
+ max_radius = 0.5 * min(self.pagewidth, self.pageheight)
+ trackunit_height = max_radius * (1 - self.circle_core) / trackunit_sum
+ track_core = max_radius * self.circle_core
+
+ # Calculate top and bottom radii for each track
+ self.track_radii = {} # The inner, outer and center radii for each track
+ track_crop = (
+ trackunit_height * (1 - self.track_size) / 2.0
+ ) # 'step back' in pixels
+ for track in trackunits:
+ top = trackunits[track][1] * trackunit_height - track_crop + track_core
+ btm = trackunits[track][0] * trackunit_height + track_crop + track_core
+ ctr = btm + (top - btm) / 2.0
+ self.track_radii[track] = (btm, ctr, top)
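+
+        # Worked example (illustrative comment only): with an 800 x 800
+        # drawable area, circle_core=0, track_size=0.75, and two tracks of
+        # height 1, max_radius is 400, each track unit spans 200 px,
+        # track_crop is 25 px, and track_radii comes out as
+        # {1: (25, 100, 175), 2: (225, 300, 375)}.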
+
+ def draw(self):
+ """Draw a circular diagram of the stored data."""
+ # Instantiate the drawing canvas
+ self.drawing = Drawing(self.pagesize[0], self.pagesize[1])
+
+ feature_elements = [] # holds feature elements
+ feature_labels = [] # holds feature labels
+ greytrack_bgs = [] # holds track background
+ greytrack_labels = [] # holds track foreground labels
+ scale_axes = [] # holds scale axes
+ scale_labels = [] # holds scale axis labels
+
+ # Get tracks to be drawn and set track sizes
+ self.drawn_tracks = self._parent.get_drawn_levels()
+ self.set_track_heights()
+
+ # Go through each track in the parent (if it is to be drawn) one by
+ # one and collate the data as drawing elements
+ for track_level in self._parent.get_drawn_levels():
+ self.current_track_level = track_level
+ track = self._parent[track_level]
+ gbgs, glabels = self.draw_greytrack(track) # Greytracks
+ greytrack_bgs.append(gbgs)
+ greytrack_labels.append(glabels)
+ features, flabels = self.draw_track(track) # Features and graphs
+ feature_elements.append(features)
+ feature_labels.append(flabels)
+ if track.scale:
+ axes, slabels = self.draw_scale(track) # Scale axes
+ scale_axes.append(axes)
+ scale_labels.append(slabels)
+
+ feature_cross_links = []
+ for cross_link_obj in self.cross_track_links:
+ cross_link_elements = self.draw_cross_link(cross_link_obj)
+ if cross_link_elements:
+ feature_cross_links.append(cross_link_elements)
+
+ # Groups listed in order of addition to page (from back to front)
+ # Draw track backgrounds
+ # Draw feature cross track links
+ # Draw features and graphs
+ # Draw scale axes
+ # Draw scale labels
+ # Draw feature labels
+ # Draw track labels
+ element_groups = [
+ greytrack_bgs,
+ feature_cross_links,
+ feature_elements,
+ scale_axes,
+ scale_labels,
+ feature_labels,
+ greytrack_labels,
+ ]
+ for element_group in element_groups:
+ for element_list in element_group:
+                for element in element_list:
+                    self.drawing.add(element)
+
+ if self.tracklines:
+ # Draw test tracks over top of diagram
+ self.draw_test_tracks()
+
+ def draw_track(self, track):
+ """Return list of track elements and list of track labels."""
+ track_elements = [] # Holds elements for features and graphs
+ track_labels = [] # Holds labels for features and graphs
+
+ # Distribution dictionary for dealing with different set types
+ set_methods = {FeatureSet: self.draw_feature_set, GraphSet: self.draw_graph_set}
+
+ for set in track.get_sets(): # Draw the feature or graph sets
+ elements, labels = set_methods[set.__class__](set)
+ track_elements += elements
+ track_labels += labels
+ return track_elements, track_labels
+
+ def draw_feature_set(self, set):
+ """Return list of feature elements and list of labels for them."""
+ # print('draw feature set')
+ feature_elements = [] # Holds diagram elements belonging to the features
+ label_elements = [] # Holds diagram elements belonging to feature labels
+
+ # Collect all the elements for the feature set
+ for feature in set.get_features():
+ if self.is_in_bounds(feature.start) or self.is_in_bounds(feature.end):
+ features, labels = self.draw_feature(feature)
+ feature_elements += features
+ label_elements += labels
+
+ return feature_elements, label_elements
+
+ def draw_feature(self, feature):
+ """Return list of feature elements and list of labels for them."""
+ feature_elements = [] # Holds drawable elements for a single feature
+ label_elements = [] # Holds labels for a single feature
+
+ if feature.hide: # Don't show feature: return early
+ return feature_elements, label_elements
+
+ start, end = self._current_track_start_end()
+ # A single feature may be split into subfeatures, so loop over them
+ for locstart, locend in feature.locations:
+ if locend < start:
+ continue
+ locstart = max(locstart, start)
+ if end < locstart:
+ continue
+ locend = min(locend, end)
+ # Get sigil for the feature/ each subfeature
+ feature_sigil, label = self.get_feature_sigil(feature, locstart, locend)
+ feature_elements.append(feature_sigil)
+ if label is not None: # If there's a label
+ label_elements.append(label)
+
+ return feature_elements, label_elements
+
+ def get_feature_sigil(self, feature, locstart, locend, **kwargs):
+ """Return graphics for feature, and any required label for it.
+
+ Arguments:
+ - feature Feature object
+ - locstart The start position of the feature
+ - locend The end position of the feature
+
+ """
+ # Establish the co-ordinates for the sigil
+ btm, ctr, top = self.track_radii[self.current_track_level]
+
+ startangle, startcos, startsin = self.canvas_angle(locstart)
+ endangle, endcos, endsin = self.canvas_angle(locend)
+ midangle, midcos, midsin = self.canvas_angle(float(locend + locstart) / 2)
+
+ # Distribution dictionary for various ways of drawing the feature
+ # Each method takes the inner and outer radii, the start and end angle
+ # subtended at the diagram center, and the color as arguments
+ draw_methods = {
+ "BOX": self._draw_sigil_box,
+ "OCTO": self._draw_sigil_cut_corner_box,
+ "JAGGY": self._draw_sigil_jaggy,
+ "ARROW": self._draw_sigil_arrow,
+ "BIGARROW": self._draw_sigil_big_arrow,
+ }
+
+ # Get sigil for the feature, location dependent on the feature strand
+ method = draw_methods[feature.sigil]
+ kwargs["head_length_ratio"] = feature.arrowhead_length
+ kwargs["shaft_height_ratio"] = feature.arrowshaft_height
+
+ # Support for clickable links... needs ReportLab 2.4 or later
+ # which added support for links in SVG output.
+ if hasattr(feature, "url"):
+ kwargs["hrefURL"] = feature.url
+ kwargs["hrefTitle"] = feature.name
+
+ sigil = method(
+ btm,
+ ctr,
+ top,
+ startangle,
+ endangle,
+ feature.strand,
+ color=feature.color,
+ border=feature.border,
+ **kwargs
+ )
+
+ if feature.label: # Feature needs a label
+ # The spaces are a hack to force a little space between the label
+ # and the edge of the feature
+ label = String(
+ 0,
+ 0,
+ " %s " % feature.name.strip(),
+ fontName=feature.label_font,
+ fontSize=feature.label_size,
+ fillColor=feature.label_color,
+ )
+ labelgroup = Group(label)
+ if feature.label_strand:
+ strand = feature.label_strand
+ else:
+ strand = feature.strand
+ if feature.label_position in ("start", "5'", "left"):
+ # Position the label at the feature's start
+ if strand != -1:
+ label_angle = startangle + 0.5 * pi # Make text radial
+ sinval, cosval = startsin, startcos
+ else:
+ label_angle = endangle + 0.5 * pi # Make text radial
+ sinval, cosval = endsin, endcos
+ elif feature.label_position in ("middle", "center", "centre"):
+ # Position the label at the feature's midpoint
+ label_angle = midangle + 0.5 * pi # Make text radial
+ sinval, cosval = midsin, midcos
+ elif feature.label_position in ("end", "3'", "right"):
+ # Position the label at the feature's end
+ if strand != -1:
+ label_angle = endangle + 0.5 * pi # Make text radial
+ sinval, cosval = endsin, endcos
+ else:
+ label_angle = startangle + 0.5 * pi # Make text radial
+ sinval, cosval = startsin, startcos
+ elif startangle < pi:
+                # Default to placing the label at the bottom of the feature
+                # as drawn on the page, meaning the feature end on the left half
+ label_angle = endangle + 0.5 * pi # Make text radial
+ sinval, cosval = endsin, endcos
+ else:
+                # Default to placing the label at the bottom of the feature,
+                # which means the feature end when on the right-hand half
+ label_angle = startangle + 0.5 * pi # Make text radial
+ sinval, cosval = startsin, startcos
+ if strand != -1:
+ # Feature label on top
+ radius = top
+ if startangle < pi: # Turn text round
+ label_angle -= pi
+ else:
+ labelgroup.contents[0].textAnchor = "end"
+ else:
+ # Feature label on bottom
+ radius = btm
+ if startangle < pi: # Turn text round and anchor end
+ label_angle -= pi
+ labelgroup.contents[0].textAnchor = "end"
+ x_pos = self.xcenter + radius * sinval
+ y_pos = self.ycenter + radius * cosval
+ coslabel = cos(label_angle)
+ sinlabel = sin(label_angle)
+ labelgroup.transform = (
+ coslabel,
+ -sinlabel,
+ sinlabel,
+ coslabel,
+ x_pos,
+ y_pos,
+ )
+ else:
+ # No label required
+ labelgroup = None
+ # if locstart > locend:
+ # print(locstart, locend, feature.strand, sigil, feature.name)
+ # print(locstart, locend, feature.name)
+ return sigil, labelgroup
+
+ def draw_cross_link(self, cross_link):
+ """Draw a cross-link between features."""
+ startA = cross_link.startA
+ startB = cross_link.startB
+ endA = cross_link.endA
+ endB = cross_link.endB
+
+ if not self.is_in_bounds(startA) and not self.is_in_bounds(endA):
+ return None
+ if not self.is_in_bounds(startB) and not self.is_in_bounds(endB):
+ return None
+
+ if startA < self.start:
+ startA = self.start
+ if startB < self.start:
+ startB = self.start
+ if self.end < endA:
+ endA = self.end
+ if self.end < endB:
+ endB = self.end
+
+ trackobjA = cross_link._trackA(list(self._parent.tracks.values()))
+ trackobjB = cross_link._trackB(list(self._parent.tracks.values()))
+ assert trackobjA is not None
+ assert trackobjB is not None
+ if trackobjA == trackobjB:
+ raise NotImplementedError()
+
+ if trackobjA.start is not None:
+ if endA < trackobjA.start:
+ return
+ startA = max(startA, trackobjA.start)
+ if trackobjA.end is not None:
+ if trackobjA.end < startA:
+ return
+ endA = min(endA, trackobjA.end)
+ if trackobjB.start is not None:
+ if endB < trackobjB.start:
+ return
+ startB = max(startB, trackobjB.start)
+ if trackobjB.end is not None:
+ if trackobjB.end < startB:
+ return
+ endB = min(endB, trackobjB.end)
+
+ for track_level in self._parent.get_drawn_levels():
+ track = self._parent[track_level]
+ if track == trackobjA:
+ trackA = track_level
+ if track == trackobjB:
+ trackB = track_level
+ if trackA == trackB:
+ raise NotImplementedError()
+
+ startangleA, startcosA, startsinA = self.canvas_angle(startA)
+ startangleB, startcosB, startsinB = self.canvas_angle(startB)
+ endangleA, endcosA, endsinA = self.canvas_angle(endA)
+ endangleB, endcosB, endsinB = self.canvas_angle(endB)
+
+ btmA, ctrA, topA = self.track_radii[trackA]
+ btmB, ctrB, topB = self.track_radii[trackB]
+
+ if ctrA < ctrB:
+ return [
+ self._draw_arc_poly(
+ topA,
+ btmB,
+ startangleA,
+ endangleA,
+ startangleB,
+ endangleB,
+ cross_link.color,
+ cross_link.border,
+ cross_link.flip,
+ )
+ ]
+ else:
+ return [
+ self._draw_arc_poly(
+ btmA,
+ topB,
+ startangleA,
+ endangleA,
+ startangleB,
+ endangleB,
+ cross_link.color,
+ cross_link.border,
+ cross_link.flip,
+ )
+ ]
+
+ def draw_graph_set(self, set):
+ """Return list of graph elements and list of their labels.
+
+ Arguments:
+ - set GraphSet object
+
+ """
+ # print('draw graph set')
+ elements = [] # Holds graph elements
+
+ # Distribution dictionary for how to draw the graph
+ style_methods = {
+ "line": self.draw_line_graph,
+ "heat": self.draw_heat_graph,
+ "bar": self.draw_bar_graph,
+ }
+
+ for graph in set.get_graphs():
+ elements += style_methods[graph.style](graph)
+
+ return elements, []
+
+ def draw_line_graph(self, graph):
+ """Return line graph as list of drawable elements.
+
+ Arguments:
+ - graph GraphData object
+
+ """
+ line_elements = [] # holds drawable elements
+
+ # Get graph data
+ data_quartiles = graph.quartiles()
+ minval, maxval = data_quartiles[0], data_quartiles[4]
+ btm, ctr, top = self.track_radii[self.current_track_level]
+ trackheight = 0.5 * (top - btm)
+ datarange = maxval - minval
+ if datarange == 0:
+ datarange = trackheight
+
+ start, end = self._current_track_start_end()
+ data = graph[start:end]
+
+ if not data:
+ return []
+
+ # midval is the value at which the x-axis is plotted, and is the
+ # central ring in the track
+ if graph.center is None:
+ midval = (maxval + minval) / 2.0
+ else:
+ midval = graph.center
+ # Whichever is the greatest difference: max-midval or min-midval, is
+ # taken to specify the number of pixel units resolved along the
+ # y-axis
+ resolution = max((midval - minval), (maxval - midval))
+
+ # Start from first data point
+ pos, val = data[0]
+ lastangle, lastcos, lastsin = self.canvas_angle(pos)
+ # We calculate the track height
+ posheight = trackheight * (val - midval) / resolution + ctr
+ lastx = self.xcenter + posheight * lastsin # start xy coords
+ lasty = self.ycenter + posheight * lastcos
+ for pos, val in data:
+ posangle, poscos, possin = self.canvas_angle(pos)
+ posheight = trackheight * (val - midval) / resolution + ctr
+ x = self.xcenter + posheight * possin # next xy coords
+ y = self.ycenter + posheight * poscos
+ line_elements.append(
+ Line(
+ lastx,
+ lasty,
+ x,
+ y,
+ strokeColor=graph.poscolor,
+ strokeWidth=graph.linewidth,
+ )
+ )
+            lastx, lasty = x, y
+ return line_elements
+
+ def draw_bar_graph(self, graph):
+ """Return list of drawable elements for a bar graph.
+
+ Arguments:
+ - graph Graph object
+
+ """
+ # At each point contained in the graph data, we draw a vertical bar
+ # from the track center to the height of the datapoint value (positive
+ # values go up in one color, negative go down in the alternative
+ # color).
+ bar_elements = []
+
+ # Set the number of pixels per unit for the data
+ data_quartiles = graph.quartiles()
+ minval, maxval = data_quartiles[0], data_quartiles[4]
+ btm, ctr, top = self.track_radii[self.current_track_level]
+ trackheight = 0.5 * (top - btm)
+ datarange = maxval - minval
+ if datarange == 0:
+ datarange = trackheight
+ data = graph[self.start : self.end]
+ # midval is the value at which the x-axis is plotted, and is the
+ # central ring in the track
+ if graph.center is None:
+ midval = (maxval + minval) / 2.0
+ else:
+ midval = graph.center
+
+ # Convert data into 'binned' blocks, covering half the distance to the
+ # next data point on either side, accounting for the ends of fragments
+ # and tracks
+ start, end = self._current_track_start_end()
+ data = intermediate_points(start, end, graph[start:end])
+
+ if not data:
+ return []
+
+ # Whichever is the greatest difference: max-midval or min-midval, is
+ # taken to specify the number of pixel units resolved along the
+ # y-axis
+ resolution = max((midval - minval), (maxval - midval))
+ if resolution == 0:
+ resolution = trackheight
+
+ # Create elements for the bar graph based on newdata
+ for pos0, pos1, val in data:
+ pos0angle, pos0cos, pos0sin = self.canvas_angle(pos0)
+ pos1angle, pos1cos, pos1sin = self.canvas_angle(pos1)
+
+ barval = trackheight * (val - midval) / resolution
+ if barval >= 0:
+ barcolor = graph.poscolor
+ else:
+ barcolor = graph.negcolor
+
+ # Draw bar
+ bar_elements.append(
+ self._draw_arc(ctr, ctr + barval, pos0angle, pos1angle, barcolor)
+ )
+ return bar_elements
+
+ def draw_heat_graph(self, graph):
+ """Return list of drawable elements for the heat graph.
+
+ Arguments:
+ - graph Graph object
+
+ """
+ # At each point contained in the graph data, we draw a box that is the
+ # full height of the track, extending from the midpoint between the
+ # previous and current data points to the midpoint between the current
+ # and next data points
+ heat_elements = [] # holds drawable elements
+
+ # Get graph data
+ data_quartiles = graph.quartiles()
+ minval, maxval = data_quartiles[0], data_quartiles[4]
+ midval = (maxval + minval) / 2.0 # mid is the value at the X-axis
+ btm, ctr, top = self.track_radii[self.current_track_level]
+ trackheight = top - btm
+
+ start, end = self._current_track_start_end()
+ data = intermediate_points(start, end, graph[start:end])
+
+ # Create elements on the graph, indicating a large positive value by
+ # the graph's poscolor, and a large negative value by the graph's
+ # negcolor attributes
+ for pos0, pos1, val in data:
+ pos0angle, pos0cos, pos0sin = self.canvas_angle(pos0)
+ pos1angle, pos1cos, pos1sin = self.canvas_angle(pos1)
+
+ # Calculate the heat color, based on the differential between
+ # the value and the median value
+ heat = colors.linearlyInterpolatedColor(
+ graph.poscolor, graph.negcolor, maxval, minval, val
+ )
+
+ # Draw heat box
+ heat_elements.append(
+ self._draw_arc(btm, top, pos0angle, pos1angle, heat, border=heat)
+ )
+ return heat_elements
+
+ def draw_scale(self, track):
+ """Return list of elements in the scale and list of their labels.
+
+ Arguments:
+ - track Track object
+
+ """
+ scale_elements = [] # holds axes and ticks
+ scale_labels = [] # holds labels
+
+ if not track.scale:
+ # no scale required, exit early
+ return [], []
+
+ # Get track locations
+ btm, ctr, top = self.track_radii[self.current_track_level]
+ trackheight = top - ctr
+
+ # X-axis
+ start, end = self._current_track_start_end()
+ if track.start is not None or track.end is not None:
+ # Draw an arc, leaving out the wedge
+ p = ArcPath(strokeColor=track.scale_color, fillColor=None)
+ startangle, startcos, startsin = self.canvas_angle(start)
+ endangle, endcos, endsin = self.canvas_angle(end)
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ ctr,
+ 90 - (endangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ )
+ scale_elements.append(p)
+ del p
+ # Y-axis start marker
+ x0, y0 = self.xcenter + btm * startsin, self.ycenter + btm * startcos
+ x1, y1 = self.xcenter + top * startsin, self.ycenter + top * startcos
+ scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color))
+ # Y-axis end marker
+ x0, y0 = self.xcenter + btm * endsin, self.ycenter + btm * endcos
+ x1, y1 = self.xcenter + top * endsin, self.ycenter + top * endcos
+ scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color))
+ elif self.sweep < 1:
+ # Draw an arc, leaving out the wedge
+ p = ArcPath(strokeColor=track.scale_color, fillColor=None)
+ # Note reportlab counts angles anti-clockwise from the horizontal
+ # (as in mathematics, e.g. complex numbers and polar coordinates)
+ # in degrees.
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ ctr,
+ startangledegrees=90 - 360 * self.sweep,
+ endangledegrees=90,
+ )
+ scale_elements.append(p)
+ del p
+ # Y-axis start marker
+ x0, y0 = self.xcenter, self.ycenter + btm
+ x1, y1 = self.xcenter, self.ycenter + top
+ scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color))
+ # Y-axis end marker
+ alpha = 2 * pi * self.sweep
+ x0, y0 = self.xcenter + btm * sin(alpha), self.ycenter + btm * cos(alpha)
+ x1, y1 = self.xcenter + top * sin(alpha), self.ycenter + top * cos(alpha)
+ scale_elements.append(Line(x0, y0, x1, y1, strokeColor=track.scale_color))
+ else:
+ # Draw a full circle
+ scale_elements.append(
+ Circle(
+ self.xcenter,
+ self.ycenter,
+ ctr,
+ strokeColor=track.scale_color,
+ fillColor=None,
+ )
+ )
+
+ start, end = self._current_track_start_end()
+ if track.scale_ticks: # Ticks are required on the scale
+ # Draw large ticks
+ # I want the ticks to be consistently positioned relative to
+ # the start of the sequence (position 0), not relative to the
+ # current viewpoint (self.start and self.end)
+
+ ticklen = track.scale_largeticks * trackheight
+ tickiterval = int(track.scale_largetick_interval)
+            # Note that we could just build the list of ticks using
+            # range(0, self.end, tickiterval) and then filter out the
+            # ones before self.start - but this seems wasteful.
+            # Using tickiterval * (self.start // tickiterval) is a shortcut.
+ for tickpos in range(
+ tickiterval * (self.start // tickiterval), int(self.end), tickiterval
+ ):
+ if tickpos <= start or end <= tickpos:
+ continue
+ tick, label = self.draw_tick(
+ tickpos, ctr, ticklen, track, track.scale_largetick_labels
+ )
+ scale_elements.append(tick)
+ if label is not None: # If there's a label, add it
+ scale_labels.append(label)
+ # Draw small ticks
+ ticklen = track.scale_smallticks * trackheight
+ tickiterval = int(track.scale_smalltick_interval)
+ for tickpos in range(
+ tickiterval * (self.start // tickiterval), int(self.end), tickiterval
+ ):
+ if tickpos <= start or end <= tickpos:
+ continue
+ tick, label = self.draw_tick(
+ tickpos, ctr, ticklen, track, track.scale_smalltick_labels
+ )
+ scale_elements.append(tick)
+ if label is not None: # If there's a label, add it
+ scale_labels.append(label)
+
+ # Check to see if the track contains a graph - if it does, get the
+ # minimum and maximum values, and put them on the scale Y-axis
+ # at 60 degree intervals, ordering the labels by graph_id
+ startangle, startcos, startsin = self.canvas_angle(start)
+ endangle, endcos, endsin = self.canvas_angle(end)
+ if track.axis_labels:
+ for set in track.get_sets():
+ if set.__class__ is GraphSet:
+ # Y-axis
+ for n in range(7):
+                        angle = n * 1.0471975511965976  # n * 60 degrees, in radians
+ if angle < startangle or endangle < angle:
+ continue
+ ticksin, tickcos = sin(angle), cos(angle)
+ x0, y0 = (
+ self.xcenter + btm * ticksin,
+ self.ycenter + btm * tickcos,
+ )
+ x1, y1 = (
+ self.xcenter + top * ticksin,
+ self.ycenter + top * tickcos,
+ )
+ scale_elements.append(
+ Line(x0, y0, x1, y1, strokeColor=track.scale_color)
+ )
+
+ graph_label_min = []
+ graph_label_max = []
+ graph_label_mid = []
+ for graph in set.get_graphs():
+ quartiles = graph.quartiles()
+ minval, maxval = quartiles[0], quartiles[4]
+ if graph.center is None:
+ midval = (maxval + minval) / 2.0
+ graph_label_min.append("%.3f" % minval)
+ graph_label_max.append("%.3f" % maxval)
+ graph_label_mid.append("%.3f" % midval)
+ else:
+ diff = max(
+ (graph.center - minval), (maxval - graph.center)
+ )
+ minval = graph.center - diff
+ maxval = graph.center + diff
+ midval = graph.center
+ graph_label_mid.append("%.3f" % midval)
+ graph_label_min.append("%.3f" % minval)
+ graph_label_max.append("%.3f" % maxval)
+ xmid, ymid = (x0 + x1) / 2.0, (y0 + y1) / 2.0
+ for limit, x, y in [
+ (graph_label_min, x0, y0),
+ (graph_label_max, x1, y1),
+ (graph_label_mid, xmid, ymid),
+ ]:
+ label = String(
+ 0,
+ 0,
+ ";".join(limit),
+ fontName=track.scale_font,
+ fontSize=track.scale_fontsize,
+ fillColor=track.scale_color,
+ )
+ label.textAnchor = "middle"
+ labelgroup = Group(label)
+ labelgroup.transform = (
+ tickcos,
+ -ticksin,
+ ticksin,
+ tickcos,
+ x,
+ y,
+ )
+ scale_labels.append(labelgroup)
+
+ return scale_elements, scale_labels
+
+ def draw_tick(self, tickpos, ctr, ticklen, track, draw_label):
+ """Return drawing element for a tick on the scale.
+
+ Arguments:
+ - tickpos Int, position of the tick on the sequence
+ - ctr Float, Y co-ord of the center of the track
+ - ticklen How long to draw the tick
+ - track Track, the track the tick is drawn on
+ - draw_label Boolean, write the tick label?
+
+ """
+ # Calculate tick co-ordinates
+ tickangle, tickcos, ticksin = self.canvas_angle(tickpos)
+ x0, y0 = self.xcenter + ctr * ticksin, self.ycenter + ctr * tickcos
+ x1, y1 = (
+ self.xcenter + (ctr + ticklen) * ticksin,
+ self.ycenter + (ctr + ticklen) * tickcos,
+ )
+ # Calculate height of text label so it can be offset on lower half
+ # of diagram
+ # LP: not used, as not all fonts have ascent_descent data in reportlab.pdfbase._fontdata
+ # label_offset = _fontdata.ascent_descent[track.scale_font][0]*\
+ # track.scale_fontsize/1000.
+ tick = Line(x0, y0, x1, y1, strokeColor=track.scale_color)
+ if draw_label:
+ # Put tick position on as label
+ if track.scale_format == "SInt":
+ if tickpos >= 1000000:
+ tickstring = str(tickpos // 1000000) + " Mbp"
+ elif tickpos >= 1000:
+ tickstring = str(tickpos // 1000) + " Kbp"
+ else:
+ tickstring = str(tickpos)
+ else:
+ tickstring = str(tickpos)
+ label = String(
+ 0,
+ 0,
+ tickstring, # Make label string
+ fontName=track.scale_font,
+ fontSize=track.scale_fontsize,
+ fillColor=track.scale_color,
+ )
+ if tickangle > pi:
+ label.textAnchor = "end"
+ # LP: This label_offset depends on ascent_descent data, which is not available for all
+ # fonts, so has been deprecated.
+ # if 0.5*pi < tickangle < 1.5*pi:
+ # y1 -= label_offset
+ labelgroup = Group(label)
+ labelgroup.transform = (1, 0, 0, 1, x1, y1)
+ else:
+ labelgroup = None
+ return tick, labelgroup
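
In isolation, the `scale_format == "SInt"` branch above reduces to a small standalone helper. A sketch (the name `si_tick_label` is hypothetical, not part of this module):

```python
def si_tick_label(tickpos):
    """Format a tick position the way scale_format="SInt" does above."""
    if tickpos >= 1000000:
        return str(tickpos // 1000000) + " Mbp"  # e.g. 2500000 -> "2 Mbp"
    elif tickpos >= 1000:
        return str(tickpos // 1000) + " Kbp"  # e.g. 7500 -> "7 Kbp"
    return str(tickpos)  # e.g. 640 -> "640"
```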
+
+ def draw_test_tracks(self):
+ """Draw blue test tracks with grene line down their center."""
+ # Add lines only for drawn tracks
+ for track in self.drawn_tracks:
+ btm, ctr, top = self.track_radii[track]
+ self.drawing.add(
+ Circle(
+ self.xcenter,
+ self.ycenter,
+ top,
+ strokeColor=colors.blue,
+ fillColor=None,
+ )
+ ) # top line
+ self.drawing.add(
+ Circle(
+ self.xcenter,
+ self.ycenter,
+ ctr,
+ strokeColor=colors.green,
+ fillColor=None,
+ )
+ ) # middle line
+ self.drawing.add(
+ Circle(
+ self.xcenter,
+ self.ycenter,
+ btm,
+ strokeColor=colors.blue,
+ fillColor=None,
+ )
+ ) # bottom line
+
+ def draw_greytrack(self, track):
+ """Drawing element for grey background to passed Track object."""
+ greytrack_bgs = [] # Holds track backgrounds
+ greytrack_labels = [] # Holds track foreground labels
+
+ if not track.greytrack: # No greytrack required, return early
+ return [], []
+
+ # Get track location
+ btm, ctr, top = self.track_radii[self.current_track_level]
+
+ start, end = self._current_track_start_end()
+ startangle, startcos, startsin = self.canvas_angle(start)
+ endangle, endcos, endsin = self.canvas_angle(end)
+
+ # Make background
+ if track.start is not None or track.end is not None:
+ # Draw an arc, leaving out the wedge
+ greytrack_bgs.append(
+ self._draw_arc(
+ btm, top, startangle, endangle, colors.Color(0.96, 0.96, 0.96)
+ )
+ )
+ elif self.sweep < 1:
+ # Make a partial circle, a large arc box
+ # This method assumes the correct center for us.
+ greytrack_bgs.append(
+ self._draw_arc(
+ btm, top, 0, 2 * pi * self.sweep, colors.Color(0.96, 0.96, 0.96)
+ )
+ )
+ else:
+ # Make a full circle (using a VERY thick linewidth)
+ greytrack_bgs.append(
+ Circle(
+ self.xcenter,
+ self.ycenter,
+ ctr,
+ strokeColor=colors.Color(0.96, 0.96, 0.96),
+ fillColor=None,
+ strokeWidth=top - btm,
+ )
+ )
+
+ if track.greytrack_labels:
+ # Labels are required for this track
+ labelstep = self.length // track.greytrack_labels # label interval
+ for pos in range(self.start, self.end, labelstep):
+ label = String(
+ 0,
+ 0,
+ track.name, # Add a new label at
+ fontName=track.greytrack_font, # each interval
+ fontSize=track.greytrack_fontsize,
+ fillColor=track.greytrack_fontcolor,
+ )
+ theta, costheta, sintheta = self.canvas_angle(pos)
+ if theta < startangle or endangle < theta:
+ continue
+ x, y = (
+ self.xcenter + btm * sintheta,
+ self.ycenter + btm * costheta,
+ ) # start text halfway up marker
+ labelgroup = Group(label)
+ labelangle = (
+ self.sweep * 2 * pi * (pos - self.start) / self.length - pi / 2
+ )
+ if theta > pi:
+ label.textAnchor = "end" # Anchor end of text to inner radius
+ labelangle += pi # and reorient it
+ cosA, sinA = cos(labelangle), sin(labelangle)
+ labelgroup.transform = (cosA, -sinA, sinA, cosA, x, y)
+ if not self.length - x <= labelstep: # Don't overrun the circle
+ greytrack_labels.append(labelgroup)
+
+ return greytrack_bgs, greytrack_labels
+
+ def canvas_angle(self, base):
+ """Given base-pair position, return (angle, cosine, sin) (PRIVATE)."""
+ angle = self.sweep * 2 * pi * (base - self.start) / self.length
+ return (angle, cos(angle), sin(angle))
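
As a worked example of this mapping (made-up numbers): with `sweep=1.0`, `start=0` and `length=1000`, base 250 lies a quarter of the way round the circle, so the angle is pi/2, i.e. 90 degrees clockwise from 12 o'clock:

```python
from math import cos, pi, sin

sweep, start, length = 1.0, 0, 1000  # assumed diagram settings
base = 250
angle = sweep * 2 * pi * (base - start) / length
assert abs(angle - pi / 2) < 1e-9
print(angle, cos(angle), sin(angle))  # ~1.5708, ~0.0, 1.0
```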
+
+ def _draw_sigil_box(
+ self, bottom, center, top, startangle, endangle, strand, **kwargs
+ ):
+ """Draw BOX sigil (PRIVATE)."""
+ if strand == 1:
+ inner_radius = center
+ outer_radius = top
+ elif strand == -1:
+ inner_radius = bottom
+ outer_radius = center
+ else:
+ inner_radius = bottom
+ outer_radius = top
+ return self._draw_arc(
+ inner_radius, outer_radius, startangle, endangle, **kwargs
+ )
+
+ def _draw_arc(
+ self,
+ inner_radius,
+ outer_radius,
+ startangle,
+ endangle,
+ color,
+ border=None,
+ colour=None,
+ **kwargs
+ ):
+ """Return closed path describing an arc box (PRIVATE).
+
+ Arguments:
+ - inner_radius Float distance of inside of arc from drawing center
+ - outer_radius Float distance of outside of arc from drawing center
+ - startangle Float angle subtended by start of arc at drawing center
+ (in radians)
+ - endangle Float angle subtended by end of arc at drawing center
+ (in radians)
+ - color colors.Color object for arc (overridden by backwards
+ compatible argument with UK spelling, colour).
+
+ Returns a closed path object describing an arced box corresponding to
+ the passed values. For very small angles, a simple four sided
+ polygon is used.
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ if abs(float(endangle - startangle)) > 0.01:
+ # Wide arc, must use full curves
+ p = ArcPath(strokeColor=strokecolor, fillColor=color, strokewidth=0)
+ # Note reportlab counts angles anti-clockwise from the horizontal
+ # (as in mathematics, e.g. complex numbers and polar coordinates)
+ # but we use clockwise from the vertical. Also reportlab uses
+ # degrees, but we use radians.
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ inner_radius,
+ 90 - (endangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ moveTo=True,
+ )
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ outer_radius,
+ 90 - (endangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ reverse=True,
+ )
+ p.closePath()
+ return p
+ else:
+ # Cheat and just use a four sided polygon.
+ # Calculate trig values for angle and coordinates
+ startcos, startsin = cos(startangle), sin(startangle)
+ endcos, endsin = cos(endangle), sin(endangle)
+ x0, y0 = self.xcenter, self.ycenter # origin of the circle
+ x1, y1 = (x0 + inner_radius * startsin, y0 + inner_radius * startcos)
+ x2, y2 = (x0 + inner_radius * endsin, y0 + inner_radius * endcos)
+ x3, y3 = (x0 + outer_radius * endsin, y0 + outer_radius * endcos)
+ x4, y4 = (x0 + outer_radius * startsin, y0 + outer_radius * startcos)
+ return draw_polygon([(x1, y1), (x2, y2), (x3, y3), (x4, y4)], color, border)
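
The recurring `90 - (angle * 180 / pi)` expression is the bridge between the two conventions described in the comment above. A minimal sketch (hypothetical helper) making the conversion explicit:

```python
from math import pi

def to_reportlab_degrees(angle):
    """Convert a drawer angle (radians, clockwise from vertical) to
    reportlab's convention (degrees, anti-clockwise from horizontal)."""
    return 90 - (angle * 180 / pi)

print(to_reportlab_degrees(0.0))     # 90.0  (12 o'clock)
print(to_reportlab_degrees(pi / 2))  # 0.0   (3 o'clock)
print(to_reportlab_degrees(pi))      # -90.0 (6 o'clock)
```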
+
+ def _draw_arc_line(
+ self, path, start_radius, end_radius, start_angle, end_angle, move=False
+ ):
+ """Add a list of points to a path object (PRIVATE).
+
+ Assumes angles given are in degrees!
+
+ Represents what would be a straight line on a linear diagram.
+ """
+ x0, y0 = self.xcenter, self.ycenter # origin of the circle
+ radius_diff = end_radius - start_radius
+ angle_diff = end_angle - start_angle
+ dx = 0.01 # heuristic
+ a = start_angle * pi / 180
+ if move:
+ path.moveTo(x0 + start_radius * cos(a), y0 + start_radius * sin(a))
+ else:
+ path.lineTo(x0 + start_radius * cos(a), y0 + start_radius * sin(a))
+ x = dx
+ if 0.01 <= abs(dx):
+ while x < 1:
+ r = start_radius + x * radius_diff
+ a = (
+ (start_angle + x * (angle_diff)) * pi / 180
+ ) # to radians for sin/cos
+ # print(x0+r*cos(a), y0+r*sin(a))
+ path.lineTo(x0 + r * cos(a), y0 + r * sin(a))
+ x += dx
+ a = end_angle * pi / 180
+ path.lineTo(x0 + end_radius * cos(a), y0 + end_radius * sin(a))
+
+ def _draw_arc_poly(
+ self,
+ inner_radius,
+ outer_radius,
+ inner_startangle,
+ inner_endangle,
+ outer_startangle,
+ outer_endangle,
+ color,
+ border=None,
+ flip=False,
+ **kwargs
+ ):
+ """Return polygon path describing an arc."""
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ x0, y0 = self.xcenter, self.ycenter # origin of the circle
+ if (
+ abs(inner_endangle - outer_startangle) > 0.01
+ or abs(outer_endangle - inner_startangle) > 0.01
+ or abs(inner_startangle - outer_startangle) > 0.01
+            or abs(inner_endangle - outer_endangle) > 0.01
+ ):
+ # Wide arc, must use full curves
+ p = ArcPath(
+ strokeColor=strokecolor,
+ fillColor=color,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+ strokewidth=0,
+ )
+ # Note reportlab counts angles anti-clockwise from the horizontal
+ # (as in mathematics, e.g. complex numbers and polar coordinates)
+ # but we use clockwise from the vertical. Also reportlab uses
+ # degrees, but we use radians.
+ i_start = 90 - (inner_startangle * 180 / pi)
+ i_end = 90 - (inner_endangle * 180 / pi)
+ o_start = 90 - (outer_startangle * 180 / pi)
+ o_end = 90 - (outer_endangle * 180 / pi)
+ p.addArc(x0, y0, inner_radius, i_end, i_start, moveTo=True, reverse=True)
+ if flip:
+ # Flipped, join end to start,
+ self._draw_arc_line(p, inner_radius, outer_radius, i_end, o_start)
+ p.addArc(x0, y0, outer_radius, o_end, o_start, reverse=True)
+ self._draw_arc_line(p, outer_radius, inner_radius, o_end, i_start)
+ else:
+ # Not flipped, join start to start, end to end
+ self._draw_arc_line(p, inner_radius, outer_radius, i_end, o_end)
+ p.addArc(x0, y0, outer_radius, o_end, o_start, reverse=False)
+ self._draw_arc_line(p, outer_radius, inner_radius, o_start, i_start)
+ p.closePath()
+ return p
+ else:
+ # Cheat and just use a four sided polygon.
+ # Calculate trig values for angle and coordinates
+ inner_startcos, inner_startsin = (
+ cos(inner_startangle),
+ sin(inner_startangle),
+ )
+ inner_endcos, inner_endsin = cos(inner_endangle), sin(inner_endangle)
+ outer_startcos, outer_startsin = (
+ cos(outer_startangle),
+ sin(outer_startangle),
+ )
+ outer_endcos, outer_endsin = cos(outer_endangle), sin(outer_endangle)
+ x1, y1 = (
+ x0 + inner_radius * inner_startsin,
+ y0 + inner_radius * inner_startcos,
+ )
+ x2, y2 = (
+ x0 + inner_radius * inner_endsin,
+ y0 + inner_radius * inner_endcos,
+ )
+ x3, y3 = (
+ x0 + outer_radius * outer_endsin,
+ y0 + outer_radius * outer_endcos,
+ )
+ x4, y4 = (
+ x0 + outer_radius * outer_startsin,
+ y0 + outer_radius * outer_startcos,
+ )
+ return draw_polygon(
+ [(x1, y1), (x2, y2), (x3, y3), (x4, y4)],
+ color,
+ border,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+ )
+
+ def _draw_sigil_cut_corner_box(
+ self,
+ bottom,
+ center,
+ top,
+ startangle,
+ endangle,
+ strand,
+ color,
+ border=None,
+ corner=0.5,
+ **kwargs
+ ):
+ """Draw OCTO sigil, box with corners cut off (PRIVATE)."""
+ if strand == 1:
+ inner_radius = center
+ outer_radius = top
+ elif strand == -1:
+ inner_radius = bottom
+ outer_radius = center
+ else:
+ inner_radius = bottom
+ outer_radius = top
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ startangle, endangle = min(startangle, endangle), max(startangle, endangle)
+ angle = float(endangle - startangle)
+
+ middle_radius = 0.5 * (inner_radius + outer_radius)
+ boxheight = outer_radius - inner_radius
+
+ corner_len = min(0.5 * boxheight, 0.5 * boxheight * corner)
+ shaft_inner_radius = inner_radius + corner_len
+ shaft_outer_radius = outer_radius - corner_len
+
+ cornerangle_delta = max(
+ 0.0, min(abs(boxheight) * 0.5 * corner / middle_radius, abs(angle * 0.5))
+ )
+ if angle < 0:
+ cornerangle_delta *= -1 # reverse it
+
+ # Calculate trig values for angle and coordinates
+ startcos, startsin = cos(startangle), sin(startangle)
+ endcos, endsin = cos(endangle), sin(endangle)
+ x0, y0 = self.xcenter, self.ycenter # origin of the circle
+ p = ArcPath(
+ strokeColor=strokecolor,
+ fillColor=color,
+ strokeLineJoin=1, # 1=round
+ strokewidth=0,
+ **kwargs
+ )
+ # Inner curved edge
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ inner_radius,
+ 90 - ((endangle - cornerangle_delta) * 180 / pi),
+ 90 - ((startangle + cornerangle_delta) * 180 / pi),
+ moveTo=True,
+ )
+        # Corner edge - straight lines assume a small angle!
+ # TODO - Use self._draw_arc_line(p, ...) here if we expose corner setting
+ p.lineTo(x0 + shaft_inner_radius * startsin, y0 + shaft_inner_radius * startcos)
+ p.lineTo(x0 + shaft_outer_radius * startsin, y0 + shaft_outer_radius * startcos)
+ # Outer curved edge
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ outer_radius,
+ 90 - ((endangle - cornerangle_delta) * 180 / pi),
+ 90 - ((startangle + cornerangle_delta) * 180 / pi),
+ reverse=True,
+ )
+ # Corner edges
+ p.lineTo(x0 + shaft_outer_radius * endsin, y0 + shaft_outer_radius * endcos)
+ p.lineTo(x0 + shaft_inner_radius * endsin, y0 + shaft_inner_radius * endcos)
+ p.closePath()
+ return p
+
+ def _draw_sigil_arrow(
+ self, bottom, center, top, startangle, endangle, strand, **kwargs
+ ):
+ """Draw ARROW sigil (PRIVATE)."""
+ if strand == 1:
+ inner_radius = center
+ outer_radius = top
+ orientation = "right"
+ elif strand == -1:
+ inner_radius = bottom
+ outer_radius = center
+ orientation = "left"
+ else:
+ inner_radius = bottom
+ outer_radius = top
+ orientation = "right" # backwards compatibility
+ return self._draw_arc_arrow(
+ inner_radius,
+ outer_radius,
+ startangle,
+ endangle,
+ orientation=orientation,
+ **kwargs
+ )
+
+ def _draw_sigil_big_arrow(
+ self, bottom, center, top, startangle, endangle, strand, **kwargs
+ ):
+ """Draw BIGARROW sigil, like ARROW but straddles the axis (PRIVATE)."""
+ if strand == -1:
+ orientation = "left"
+ else:
+ orientation = "right"
+ return self._draw_arc_arrow(
+ bottom, top, startangle, endangle, orientation=orientation, **kwargs
+ )
+
+ def _draw_arc_arrow(
+ self,
+ inner_radius,
+ outer_radius,
+ startangle,
+ endangle,
+ color,
+ border=None,
+ shaft_height_ratio=0.4,
+ head_length_ratio=0.5,
+ orientation="right",
+ colour=None,
+ **kwargs
+ ):
+ """Draw an arrow along an arc (PRIVATE)."""
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ # if orientation == 'right':
+ # startangle, endangle = min(startangle, endangle), max(startangle, endangle)
+ # elif orientation == 'left':
+ # startangle, endangle = max(startangle, endangle), min(startangle, endangle)
+ # else:
+ startangle, endangle = min(startangle, endangle), max(startangle, endangle)
+ if orientation != "left" and orientation != "right":
+ raise ValueError(
+ "Invalid orientation %r, should be 'left' or 'right'" % orientation
+ )
+
+ angle = float(endangle - startangle) # angle subtended by arc
+ middle_radius = 0.5 * (inner_radius + outer_radius)
+ boxheight = outer_radius - inner_radius
+ shaft_height = boxheight * shaft_height_ratio
+ shaft_inner_radius = middle_radius - 0.5 * shaft_height
+ shaft_outer_radius = middle_radius + 0.5 * shaft_height
+ headangle_delta = max(
+ 0.0, min(abs(boxheight) * head_length_ratio / middle_radius, abs(angle))
+ )
+ if angle < 0:
+ headangle_delta *= -1 # reverse it
+ if orientation == "right":
+ headangle = endangle - headangle_delta
+ else:
+ headangle = startangle + headangle_delta
+ if startangle <= endangle:
+ headangle = max(min(headangle, endangle), startangle)
+ else:
+ headangle = max(min(headangle, startangle), endangle)
+ if not (
+ startangle <= headangle <= endangle or endangle <= headangle <= startangle
+ ):
+ raise RuntimeError(
+ "Problem drawing arrow, invalid positions. "
+ "Start angle: %s, Head angle: %s, "
+ "End angle: %s, Angle: %s" % (startangle, headangle, endangle, angle)
+ )
+
+ # Calculate trig values for angle and coordinates
+ startcos, startsin = cos(startangle), sin(startangle)
+ headcos, headsin = cos(headangle), sin(headangle)
+ endcos, endsin = cos(endangle), sin(endangle)
+ x0, y0 = self.xcenter, self.ycenter # origin of the circle
+ if 0.5 >= abs(angle) and abs(headangle_delta) >= abs(angle):
+ # If the angle is small, and the arrow is all head,
+ # cheat and just use a triangle.
+ if orientation == "right":
+ x1, y1 = (x0 + inner_radius * startsin, y0 + inner_radius * startcos)
+ x2, y2 = (x0 + outer_radius * startsin, y0 + outer_radius * startcos)
+ x3, y3 = (x0 + middle_radius * endsin, y0 + middle_radius * endcos)
+ else:
+ x1, y1 = (x0 + inner_radius * endsin, y0 + inner_radius * endcos)
+ x2, y2 = (x0 + outer_radius * endsin, y0 + outer_radius * endcos)
+ x3, y3 = (x0 + middle_radius * startsin, y0 + middle_radius * startcos)
+ # return draw_polygon([(x1,y1),(x2,y2),(x3,y3)], color, border,
+ # stroke_line_join=1)
+ return Polygon(
+ [x1, y1, x2, y2, x3, y3],
+ strokeColor=border or color,
+ fillColor=color,
+ strokeLineJoin=1, # 1=round, not mitre!
+ strokewidth=0,
+ )
+ elif orientation == "right":
+ p = ArcPath(
+ strokeColor=strokecolor,
+ fillColor=color,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+ strokewidth=0,
+ **kwargs
+ )
+ # Note reportlab counts angles anti-clockwise from the horizontal
+ # (as in mathematics, e.g. complex numbers and polar coordinates)
+ # but we use clockwise from the vertical. Also reportlab uses
+ # degrees, but we use radians.
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ shaft_inner_radius,
+ 90 - (headangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ moveTo=True,
+ )
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ shaft_outer_radius,
+ 90 - (headangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ reverse=True,
+ )
+ if abs(angle) < 0.5:
+ p.lineTo(x0 + outer_radius * headsin, y0 + outer_radius * headcos)
+ p.lineTo(x0 + middle_radius * endsin, y0 + middle_radius * endcos)
+ p.lineTo(x0 + inner_radius * headsin, y0 + inner_radius * headcos)
+ else:
+ self._draw_arc_line(
+ p,
+ outer_radius,
+ middle_radius,
+ 90 - (headangle * 180 / pi),
+ 90 - (endangle * 180 / pi),
+ )
+ self._draw_arc_line(
+ p,
+ middle_radius,
+ inner_radius,
+ 90 - (endangle * 180 / pi),
+ 90 - (headangle * 180 / pi),
+ )
+ p.closePath()
+ return p
+ else:
+ p = ArcPath(
+ strokeColor=strokecolor,
+ fillColor=color,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+ strokewidth=0,
+ **kwargs
+ )
+ # Note reportlab counts angles anti-clockwise from the horizontal
+ # (as in mathematics, e.g. complex numbers and polar coordinates)
+ # but we use clockwise from the vertical. Also reportlab uses
+ # degrees, but we use radians.
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ shaft_inner_radius,
+ 90 - (endangle * 180 / pi),
+ 90 - (headangle * 180 / pi),
+ moveTo=True,
+ reverse=True,
+ )
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ shaft_outer_radius,
+ 90 - (endangle * 180 / pi),
+ 90 - (headangle * 180 / pi),
+ reverse=False,
+ )
+            # Note - two straight lines are only a good approximation for a
+            # small head angle; in general curved lines would be needed here:
+ if abs(angle) < 0.5:
+ p.lineTo(x0 + outer_radius * headsin, y0 + outer_radius * headcos)
+ p.lineTo(x0 + middle_radius * startsin, y0 + middle_radius * startcos)
+ p.lineTo(x0 + inner_radius * headsin, y0 + inner_radius * headcos)
+ else:
+ self._draw_arc_line(
+ p,
+ outer_radius,
+ middle_radius,
+ 90 - (headangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ )
+ self._draw_arc_line(
+ p,
+ middle_radius,
+ inner_radius,
+ 90 - (startangle * 180 / pi),
+ 90 - (headangle * 180 / pi),
+ )
+ p.closePath()
+ return p
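
The head sizing above turns a length along the arc into an angle via arc length / radius = angle (in radians). A sketch with made-up numbers:

```python
def head_angle_delta(boxheight, middle_radius, angle, head_length_ratio=0.5):
    """Angle (radians) allotted to the arrow head, as computed above."""
    delta = min(abs(boxheight) * head_length_ratio / middle_radius, abs(angle))
    return -delta if angle < 0 else delta

# A 20 unit tall feature drawn at radius 100 gets a 0.1 radian head,
# unless the whole feature subtends less than that:
print(head_angle_delta(20, 100, 0.5))   # 0.1
print(head_angle_delta(20, 100, 0.05))  # 0.05 (capped at the feature's angle)
```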
+
+ def _draw_sigil_jaggy(
+ self,
+ bottom,
+ center,
+ top,
+ startangle,
+ endangle,
+ strand,
+ color,
+ border=None,
+ **kwargs
+ ):
+ """Draw JAGGY sigil (PRIVATE).
+
+ Although we may in future expose the head/tail jaggy lengths, for now
+ both the left and right edges are drawn jagged.
+ """
+ if strand == 1:
+ inner_radius = center
+ outer_radius = top
+ teeth = 2
+ elif strand == -1:
+ inner_radius = bottom
+ outer_radius = center
+ teeth = 2
+ else:
+ inner_radius = bottom
+ outer_radius = top
+ teeth = 4
+
+ # TODO, expose these settings?
+ tail_length_ratio = 1.0
+ head_length_ratio = 1.0
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ startangle, endangle = min(startangle, endangle), max(startangle, endangle)
+ angle = float(endangle - startangle) # angle subtended by arc
+ height = outer_radius - inner_radius
+
+ assert startangle <= endangle and angle >= 0
+ if head_length_ratio and tail_length_ratio:
+ headangle = max(
+ endangle
+ - min(height * head_length_ratio / (center * teeth), angle * 0.5),
+ startangle,
+ )
+ tailangle = min(
+ startangle
+ + min(height * tail_length_ratio / (center * teeth), angle * 0.5),
+ endangle,
+ )
+            # With very small features, floating point calculations can
+            # violate the assertion below that start <= tail <= head <= end
+ tailangle = min(tailangle, headangle)
+ elif head_length_ratio:
+ headangle = max(
+ endangle - min(height * head_length_ratio / (center * teeth), angle),
+ startangle,
+ )
+ tailangle = startangle
+ else:
+ headangle = endangle
+ tailangle = min(
+ startangle + min(height * tail_length_ratio / (center * teeth), angle),
+ endangle,
+ )
+
+ if not startangle <= tailangle <= headangle <= endangle:
+ raise RuntimeError(
+ "Problem drawing jaggy sigil, invalid "
+ "positions. Start angle: %s, "
+ "Tail angle: %s, Head angle: %s, End angle %s, "
+ "Angle: %s" % (startangle, tailangle, headangle, endangle, angle)
+ )
+
+ # Calculate trig values for angle and coordinates
+ startcos, startsin = cos(startangle), sin(startangle)
+ headcos, headsin = cos(headangle), sin(headangle)
+ endcos, endsin = cos(endangle), sin(endangle)
+ x0, y0 = self.xcenter, self.ycenter # origin of the circle
+
+ p = ArcPath(
+ strokeColor=strokecolor,
+ fillColor=color,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+ strokewidth=0,
+ **kwargs
+ )
+ # Note reportlab counts angles anti-clockwise from the horizontal
+ # (as in mathematics, e.g. complex numbers and polar coordinates)
+ # but we use clockwise from the vertical. Also reportlab uses
+ # degrees, but we use radians.
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ inner_radius,
+ 90 - (headangle * 180 / pi),
+ 90 - (tailangle * 180 / pi),
+ moveTo=True,
+ )
+ for i in range(0, teeth):
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ inner_radius + i * height / teeth,
+ 90 - (tailangle * 180 / pi),
+ 90 - (startangle * 180 / pi),
+ )
+ # Curved line needed when drawing long jaggies
+ self._draw_arc_line(
+ p,
+ inner_radius + i * height / teeth,
+ inner_radius + (i + 1) * height / teeth,
+ 90 - (startangle * 180 / pi),
+ 90 - (tailangle * 180 / pi),
+ )
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ outer_radius,
+ 90 - (headangle * 180 / pi),
+ 90 - (tailangle * 180 / pi),
+ reverse=True,
+ )
+ for i in range(0, teeth):
+ p.addArc(
+ self.xcenter,
+ self.ycenter,
+ outer_radius - i * height / teeth,
+ 90 - (endangle * 180 / pi),
+ 90 - (headangle * 180 / pi),
+ reverse=True,
+ )
+ # Curved line needed when drawing long jaggies
+ self._draw_arc_line(
+ p,
+ outer_radius - i * height / teeth,
+ outer_radius - (i + 1) * height / teeth,
+ 90 - (endangle * 180 / pi),
+ 90 - (headangle * 180 / pi),
+ )
+ p.closePath()
+ return p
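
For context, a minimal end-to-end sketch (made-up coordinates; assumes Biopython and ReportLab are importable) showing how this drawer is normally driven through the Diagram class added later in this diff:

```python
from reportlab.lib import colors

from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import FeatureLocation, SeqFeature

gdd = GenomeDiagram.Diagram("Sketch")
features = gdd.new_track(1, name="CDS", greytrack=True).new_set()
features.add_feature(
    SeqFeature(FeatureLocation(50, 450, strand=+1), type="CDS"),
    sigil="ARROW",  # rendered by _draw_sigil_arrow above
    color=colors.lightblue,
    label=True,
)
gdd.draw(format="circular", circular=True, start=0, end=1000)
gdd.write("sketch_circular.pdf", "PDF")
```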
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Colors.py b/code/lib/Bio/Graphics/GenomeDiagram/_Colors.py
new file mode 100644
index 0000000..a37e107
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_Colors.py
@@ -0,0 +1,234 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""Colors module.
+
+Provides:
+
+- ColorTranslator - class to convert tuples of integers and floats into
+ colors.Color objects
+
+For drawing capabilities, this module uses reportlab to define colors:
+http://www.reportlab.com
+"""
+
+# ReportLab imports
+from reportlab.lib import colors
+
+
+class ColorTranslator:
+ """Class providing methods for translating representations of color into.
+
+ Examples
+ --------
+ >>> from Bio.Graphics import GenomeDiagram
+ >>> gdct=GenomeDiagram._Colors.ColorTranslator()
+ >>> print(gdct.float1_color((0.5, 0.5, 0.5)))
+ Color(.5,.5,.5,1)
+ >>> print(gdct.int255_color((1, 75, 240)))
+ Color(.003922,.294118,.941176,1)
+ >>> print(gdct.artemis_color(7))
+ Color(1,1,0,1)
+ >>> print(gdct.scheme_color(2))
+ Color(1,0,0,1)
+ >>> gdct.get_artemis_colorscheme()
+ {0: (Color(1,1,1,1), 'pathogenicity, adaptation, chaperones'), 1: (Color(.39,.39,.39,1), 'energy metabolism'), 2: (Color(1,0,0,1), 'information transfer'), 3: (Color(0,1,0,1), 'surface'), 4: (Color(0,0,1,1), 'stable RNA'), 5: (Color(0,1,1,1), 'degradation of large molecules'), 6: (Color(1,0,1,1), 'degradation of small molecules'), 7: (Color(1,1,0,1), 'central/intermediary/miscellaneous metabolism'), 8: (Color(.6,.98,.6,1), 'unknown'), 9: (Color(.53,.81,.98,1), 'regulators'), 10: (Color(1,.65,0,1), 'conserved hypotheticals'), 11: (Color(.78,.59,.39,1), 'pseudogenes and partial genes'), 12: (Color(1,.78,.78,1), 'phage/IS elements'), 13: (Color(.7,.7,.7,1), 'some miscellaneous information'), 14: (Color(0,0,0,1), ''), 15: (Color(1,.25,.25,1), 'secondary metabolism'), 16: (Color(1,.5,.5,1), ''), 17: (Color(1,.75,.75,1), '')}
+
+ >>> print(gdct.translate((0.5, 0.5, 0.5)))
+ Color(.5,.5,.5,1)
+ >>> print(gdct.translate((1, 75, 240)))
+ Color(.003922,.294118,.941176,1)
+ >>> print(gdct.translate(7))
+ Color(1,1,0,1)
+ >>> print(gdct.translate(2))
+ Color(1,0,0,1)
+
+ """
+
+ def __init__(self, filename=None):
+ """Initialize.
+
+ Argument filename is the location of a file containing
+ colorscheme information.
+ """
+ self._artemis_colorscheme = {
+ 0: (colors.Color(1, 1, 1), "pathogenicity, adaptation, chaperones"),
+ 1: (colors.Color(0.39, 0.39, 0.39), "energy metabolism"),
+ 2: (colors.Color(1, 0, 0), "information transfer"),
+ 3: (colors.Color(0, 1, 0), "surface"),
+ 4: (colors.Color(0, 0, 1), "stable RNA"),
+ 5: (colors.Color(0, 1, 1), "degradation of large molecules"),
+ 6: (colors.Color(1, 0, 1), "degradation of small molecules"),
+ 7: (colors.Color(1, 1, 0), "central/intermediary/miscellaneous metabolism"),
+ 8: (colors.Color(0.60, 0.98, 0.60), "unknown"),
+ 9: (colors.Color(0.53, 0.81, 0.98), "regulators"),
+ 10: (colors.Color(1, 0.65, 0), "conserved hypotheticals"),
+ 11: (colors.Color(0.78, 0.59, 0.39), "pseudogenes and partial genes"),
+ 12: (colors.Color(1, 0.78, 0.78), "phage/IS elements"),
+ 13: (colors.Color(0.70, 0.70, 0.70), "some miscellaneous information"),
+ 14: (colors.Color(0, 0, 0), ""),
+ 15: (colors.Color(1, 0.25, 0.25), "secondary metabolism"),
+ 16: (colors.Color(1, 0.5, 0.5), ""),
+ 17: (colors.Color(1, 0.75, 0.75), ""),
+ } # Hardwired Artemis color scheme
+ self._colorscheme = {}
+ if filename is not None:
+ self.read_colorscheme(filename) # Imported color scheme
+ else:
+ self._colorscheme = self._artemis_colorscheme
+
+ def translate(self, color=None, colour=None):
+ """Translate a color into a ReportLab Color object.
+
+ Arguments:
+ - color - Color defined as an int, a tuple of three ints 0->255
+ or a tuple of three floats 0 -> 1, or a string giving
+ one of the named colors defined by ReportLab, or a
+ ReportLab color object (returned as is).
+ - colour - Backwards compatible alias using UK spelling (which
+ will over-ride any color argument).
+
+ Returns a colors.Color object, determined semi-intelligently
+ depending on the input values
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+
+ if color is None:
+ raise ValueError("Passed color (or colour) must be a valid color type")
+ elif isinstance(color, int):
+ color = self.scheme_color(color)
+ elif isinstance(color, colors.Color):
+ return color
+ elif isinstance(color, str):
+            # Assume it's a named reportlab color like "red".
+ color = colors.toColor(color)
+ elif isinstance(color, tuple) and isinstance(color[0], float):
+ color = self.float1_color(color)
+ elif isinstance(color, tuple) and isinstance(color[0], int):
+ color = self.int255_color(color)
+ return color
+
+ def read_colorscheme(self, filename):
+ r"""Load colour scheme from file.
+
+ Reads information from a file containing color information and stores
+ it internally.
+
+ Argument filename is the location of a file defining colors in
+ tab-separated format plaintext as::
+
+ INT \t RED \t GREEN \t BLUE \t Comment
+
+ Where RED, GREEN and BLUE are intensities in the range 0 -> 255, e.g.::
+
+ 2 \t 255 \t 0 \t 0 \t Red: Information transfer
+
+ """
+        with open(filename) as lines:
+            for line in lines:
+ data = line.strip().split("\t")
+ try:
+ label = int(data[0])
+ red, green, blue = int(data[1]), int(data[2]), int(data[3])
+ if len(data) > 4:
+ comment = data[4]
+ else:
+ comment = ""
+ self._colorscheme[label] = (
+ self.int255_color((red, green, blue)),
+ comment,
+ )
+ except ValueError:
+ raise ValueError(
+ "Expected INT \t INT \t INT \t INT \t string input"
+ ) from None
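
A round-trip sketch of the expected file format (hypothetical file name; the private module is imported directly only for illustration):

```python
from Bio.Graphics.GenomeDiagram._Colors import ColorTranslator

with open("my_scheme.txt", "w") as handle:
    handle.write("2\t255\t0\t0\tRed: Information transfer\n")

translator = ColorTranslator()
translator.read_colorscheme("my_scheme.txt")  # overrides entry 2
print(translator.scheme_color(2))  # Color(1,0,0,1)
```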
+
+ def get_artemis_colorscheme(self):
+ """Return the Artemis color scheme as a dictionary."""
+ return self._artemis_colorscheme
+
+ def artemis_color(self, value):
+ """Artemis color (integer) to ReportLab Color object.
+
+ Arguments:
+ - value: An int representing a functional class in the Artemis
+ color scheme (see www.sanger.ac.uk for a description),
+ or a string from a GenBank feature annotation for the
+ color which may be dot delimited (in which case the
+ first value is used).
+
+ Takes an int representing a functional class in the Artemis color
+ scheme, and returns the appropriate colors.Color object
+ """
+ try:
+ value = int(value)
+ except ValueError:
+ if value.count("."): # dot-delimited
+ value = int(value.split(".", 1)[0]) # Use only first integer
+ else:
+ raise
+ if value in self._artemis_colorscheme:
+ return self._artemis_colorscheme[value][0]
+ else:
+ raise ValueError("Artemis color out of range: %d" % value)
+
+ def get_colorscheme(self):
+ """Return the user-defined color scheme as a dictionary."""
+ return self._colorscheme
+
+ def scheme_color(self, value):
+ """Map a user-defined color integer to a ReportLab Color object.
+
+ - value: An int representing a single color in the user-defined
+ color scheme
+
+ Takes an int representing a user-defined color and returns the
+ appropriate colors.Color object.
+ """
+ if value in self._colorscheme:
+ return self._colorscheme[value][0]
+ else:
+ raise ValueError("Scheme color out of range: %d" % value)
+
+ def int255_color(self, values):
+ """Map integer (red, green, blue) tuple to a ReportLab Color object.
+
+ - values: A tuple of (red, green, blue) intensities as
+ integers in the range 0->255
+
+ Takes a tuple of (red, green, blue) intensity values in the range
+ 0 -> 255 and returns an appropriate colors.Color object.
+ """
+ red, green, blue = values
+ factor = 1 / 255.0
+ red, green, blue = red * factor, green * factor, blue * factor
+ return colors.Color(red, green, blue)
+
+ def float1_color(self, values):
+ """Map float (red, green, blue) tuple to a ReportLab Color object.
+
+ - values: A tuple of (red, green, blue) intensities as floats
+ in the range 0 -> 1
+
+ Takes a tuple of (red, green, blue) intensity values in the range
+ 0 -> 1 and returns an appropriate colors.Color object.
+ """
+ red, green, blue = values
+ return colors.Color(red, green, blue)
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest(verbose=2)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_CrossLink.py b/code/lib/Bio/Graphics/GenomeDiagram/_CrossLink.py
new file mode 100644
index 0000000..7958de4
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_CrossLink.py
@@ -0,0 +1,100 @@
+# Copyright 2011-2017 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Genome Diagram Feature cross-link module."""
+
+from reportlab.lib import colors
+
+
+class CrossLink:
+ """Hold information for drawing a cross link between features."""
+
+ def __init__(
+ self, featureA, featureB, color=colors.lightgreen, border=None, flip=False
+ ):
+ """Create a new cross link.
+
+        Arguments featureA and featureB should be GenomeDiagram feature objects,
+ or 3-tuples (track object, start, end), and currently must be on
+ different tracks.
+
+ The color and border arguments should be ReportLab colour objects, or
+ for border use a boolean False for no border, otherwise it defaults to
+ the same as the main colour.
+
+ The flip argument draws an inverted cross link, useful for showing a
+ mapping where one sequence has been reversed. It is conventional to
+ also use a different colour (e.g. red for simple links, blue for any
+ flipped links).
+ """
+ # Initialize attributes
+ self.featureA = featureA
+ self.featureB = featureB
+ self.color = color # default color to draw the feature
+ self.border = border
+ self.flip = flip
+
+ @property
+ def startA(self):
+ """Start position of Feature A."""
+ try:
+ return self.featureA.start
+ except AttributeError:
+ track, start, end = self.featureA
+ return start
+
+ @property
+ def endA(self):
+ """End position of Feature A."""
+ try:
+ return self.featureA.end
+ except AttributeError:
+ track, start, end = self.featureA
+ return end
+
+ def _trackA(self, tracks):
+ try:
+ track, start, end = self.featureA
+ assert track in tracks
+ return track
+ except TypeError:
+ for track in tracks:
+ for feature_set in track.get_sets():
+ if hasattr(feature_set, "features"):
+ if self.featureA in feature_set.features.values():
+ return track
+ return None
+
+ @property
+ def startB(self):
+ """Start position of Feature B."""
+ try:
+ return self.featureB.start
+ except AttributeError:
+ track, start, end = self.featureB
+ return start
+
+ @property
+ def endB(self):
+ """End position of Feature B."""
+ try:
+ return self.featureB.end
+ except AttributeError:
+ track, start, end = self.featureB
+ return end
+
+ def _trackB(self, tracks):
+ try:
+ track, start, end = self.featureB
+ assert track in tracks
+ return track
+ except TypeError:
+ for track in tracks:
+ for feature_set in track.get_sets():
+ if hasattr(feature_set, "features"):
+ if self.featureB in feature_set.features.values():
+ return track
+ return None
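
A usage sketch (made-up positions): as in the Biopython tutorial, cross links are typically built from (track, start, end) tuples and appended to the diagram's cross_track_links list:

```python
from reportlab.lib import colors

from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink

gdd = GenomeDiagram.Diagram("Comparison")
track_a = gdd.new_track(2, name="A", start=0, end=1000)
track_b = gdd.new_track(1, name="B", start=0, end=1000)
# Flipped links (flip=True) are conventionally drawn in a second colour:
link = CrossLink((track_a, 100, 200), (track_b, 150, 250), colors.lightblue)
gdd.cross_track_links.append(link)
```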
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Diagram.py b/code/lib/Bio/Graphics/GenomeDiagram/_Diagram.py
new file mode 100644
index 0000000..fa44970
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_Diagram.py
@@ -0,0 +1,411 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+"""Provides a container for information concerning the tracks to be drawn in a diagram.
+
+It also provides the interface for defining the diagram (these functions may
+be split out in a later version).
+
+For drawing capabilities, this module uses reportlab to draw and write the
+diagram:
+
+http://www.reportlab.com
+
+For dealing with biological information, the package expects BioPython
+objects - namely SeqRecord objects containing SeqFeature objects.
+"""
+
+try:
+ from reportlab.graphics import renderPM
+except ImportError:
+ # This is an optional part of ReportLab, so may not be installed.
+ renderPM = None
+
+from ._LinearDrawer import LinearDrawer
+from ._CircularDrawer import CircularDrawer
+from ._Track import Track
+
+from Bio.Graphics import _write
+
+
+def _first_defined(*args):
+ """Return the first non-null argument (PRIVATE)."""
+ for arg in args:
+ if arg is not None:
+ return arg
+ return None
+
+
+class Diagram:
+ """Diagram container.
+
+ Arguments:
+ - name - a string, identifier for the diagram.
+ - tracks - a list of Track objects comprising the diagram.
+ - format - a string, format of the diagram 'circular' or
+ 'linear', depending on the sort of diagram required.
+ - pagesize - a string, the pagesize of output describing the ISO
+ size of the image, or a tuple of pixels.
+ - orientation - a string describing the required orientation of the
+ final drawing ('landscape' or 'portrait').
+ - x - a float (0->1), the proportion of the page to take
+          up with even X margins to the page.
+ - y - a float (0->1), the proportion of the page to take
+ up with even Y margins to the page.
+ - xl - a float (0->1), the proportion of the page to take
+ up with the left X margin to the page (overrides x).
+ - xr - a float (0->1), the proportion of the page to take
+ up with the right X margin to the page (overrides x).
+ - yt - a float (0->1), the proportion of the page to take
+ up with the top Y margin to the page (overrides y).
+ - yb - a float (0->1), the proportion of the page to take
+ up with the bottom Y margin to the page (overrides y).
+ - circle_core - a float, the proportion of the available radius to
+ leave empty at the center of a circular diagram (0 to 1).
+ - start - an integer, the base/aa position to start the diagram at.
+ - end - an integer, the base/aa position to end the diagram at.
+ - tracklines - a boolean, True if track guidelines are to be drawn.
+    - fragments - an integer, for a linear diagram, the number of equal
+ divisions into which the sequence is divided.
+ - fragment_size - a float (0->1), the proportion of the space
+ available to each fragment that should be used in drawing.
+ - track_size - a float (0->1), the proportion of the space
+ available to each track that should be used in drawing with sigils.
+ - circular - a boolean, True if the genome/sequence to be drawn
+ is, in reality, circular.
+
+ """
+
+ def __init__(
+ self,
+ name=None,
+ format="circular",
+ pagesize="A3",
+ orientation="landscape",
+ x=0.05,
+ y=0.05,
+ xl=None,
+ xr=None,
+ yt=None,
+ yb=None,
+ start=None,
+ end=None,
+ tracklines=False,
+ fragments=10,
+ fragment_size=None,
+ track_size=0.75,
+ circular=True,
+ circle_core=0.0,
+ ):
+ """Initialize.
+
+ gdd = Diagram(name=None)
+ """
+ self.tracks = {} # Holds all Track objects, keyed by level
+ self.name = name # Description of the diagram
+ # Diagram page setup attributes
+ self.format = format
+ self.pagesize = pagesize
+ self.orientation = orientation
+ self.x = x
+ self.y = y
+ self.xl = xl
+ self.xr = xr
+ self.yt = yt
+ self.yb = yb
+ self.start = start
+ self.end = end
+ self.tracklines = tracklines
+ self.fragments = fragments
+ if fragment_size is not None:
+ self.fragment_size = fragment_size
+ else:
+ if self.fragments == 1:
+ # For single fragments, default to full height
+ self.fragment_size = 1
+ else:
+ # Otherwise keep a 10% gap between fragments
+ self.fragment_size = 0.9
+ self.track_size = track_size
+ self.circular = circular
+ self.circle_core = circle_core
+ self.cross_track_links = []
+ self.drawing = None
+
+ def set_all_tracks(self, attr, value):
+ """Set the passed attribute of all tracks in the set to the passed value.
+
+ Arguments:
+ - attr - An attribute of the Track class.
+ - value - The value to set that attribute.
+
+ set_all_tracks(self, attr, value)
+ """
+ for track in self.tracks.values():
+ if hasattr(track, attr):
+ # If the feature has the attribute set it to the passed value
+ setattr(track, attr, value)
+
+ def draw(
+ self,
+ format=None,
+ pagesize=None,
+ orientation=None,
+ x=None,
+ y=None,
+ xl=None,
+ xr=None,
+ yt=None,
+ yb=None,
+ start=None,
+ end=None,
+ tracklines=None,
+ fragments=None,
+ fragment_size=None,
+ track_size=None,
+ circular=None,
+ circle_core=None,
+ cross_track_links=None,
+ ):
+ """Draw the diagram, with passed parameters overriding existing attributes.
+
+ gdd.draw(format='circular')
+ """
+ # Pass the parameters to the drawer objects that will build the
+        # diagrams. Overrides of the stored attributes are resolved with
+        # _first_defined(), though there may be a neater way to do this.
+        if _first_defined(format, self.format) == "linear":
+ drawer = LinearDrawer(
+ self,
+ _first_defined(pagesize, self.pagesize),
+ _first_defined(orientation, self.orientation),
+ _first_defined(x, self.x),
+ _first_defined(y, self.y),
+ _first_defined(xl, self.xl),
+ _first_defined(xr, self.xr),
+ _first_defined(yt, self.yt),
+ _first_defined(yb, self.yb),
+ _first_defined(start, self.start),
+ _first_defined(end, self.end),
+ _first_defined(tracklines, self.tracklines),
+ _first_defined(fragments, self.fragments),
+ _first_defined(fragment_size, self.fragment_size),
+ _first_defined(track_size, self.track_size),
+ _first_defined(cross_track_links, self.cross_track_links),
+ )
+ else:
+ drawer = CircularDrawer(
+ self,
+ _first_defined(pagesize, self.pagesize),
+ _first_defined(orientation, self.orientation),
+ _first_defined(x, self.x),
+ _first_defined(y, self.y),
+ _first_defined(xl, self.xl),
+ _first_defined(xr, self.xr),
+ _first_defined(yt, self.yt),
+ _first_defined(yb, self.yb),
+ _first_defined(start, self.start),
+ _first_defined(end, self.end),
+ _first_defined(tracklines, self.tracklines),
+ _first_defined(track_size, self.track_size),
+ _first_defined(circular, self.circular),
+ _first_defined(circle_core, self.circle_core),
+ _first_defined(cross_track_links, self.cross_track_links),
+ )
+ drawer.draw() # Tell the drawer to complete the drawing
+ self.drawing = drawer.drawing # Get the completed drawing
+
+ def write(self, filename="test1.ps", output="PS", dpi=72):
+ """Write the drawn diagram to a specified file, in a specified format.
+
+ Arguments:
+ - filename - a string indicating the name of the output file,
+ or a handle to write to.
+ - output - a string indicating output format, one of PS, PDF,
+ SVG, or provided the ReportLab renderPM module is installed, one
+          of the bitmap formats JPG, BMP, GIF, PNG, TIF or TIFF. The
+ format can be given in upper or lower case.
+ - dpi - an integer. Resolution (dots per inch) for bitmap formats.
+
+ Returns:
+ No return value.
+
+ write(self, filename='test1.ps', output='PS', dpi=72)
+
+ """
+ return _write(self.drawing, filename, output, dpi=dpi)
+
+ def write_to_string(self, output="PS", dpi=72):
+ """Return a byte string containing the diagram in the requested format.
+
+ Arguments:
+ - output - a string indicating output format, one of PS, PDF,
+          SVG, JPG, BMP, GIF, PNG, TIF or TIFF (as specified for the write
+ method).
+ - dpi - Resolution (dots per inch) for bitmap formats.
+
+ Returns:
+ Return the completed drawing as a bytes string in a prescribed
+ format.
+
+ """
+ # The ReportLab drawToString method, which this function used to call,
+ # originally just used a StringIO handle with the drawToFile method.
+ #
+ # TODO - Rename this method to include keyword bytes?
+ from io import BytesIO
+
+ handle = BytesIO()
+ self.write(handle, output, dpi)
+ return handle.getvalue()
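
A sketch of the two output paths (bitmap formats need reportlab's optional renderPM module; coordinates are made up):

```python
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import FeatureLocation, SeqFeature

gdd = GenomeDiagram.Diagram("Output demo")
features = gdd.new_track(1, name="demo").new_set()
features.add_feature(SeqFeature(FeatureLocation(100, 400, strand=+1)), label=True)
gdd.draw(format="circular", start=0, end=1000)

gdd.write("diagram.svg", "SVG")  # write straight to a file
png_bytes = gdd.write_to_string("PNG", dpi=300)  # render to bytes in memory
```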
+
+ def add_track(self, track, track_level):
+ """Add a Track object to the diagram.
+
+ It also accepts instructions to place it at a particular level on the
+ diagram.
+
+ Arguments:
+ - track - Track object to draw.
+ - track_level - an integer. The level at which the track will be
+ drawn (above an arbitrary baseline).
+
+ add_track(self, track, track_level)
+ """
+ if track is None:
+ raise ValueError("Must specify track")
+ if track_level not in self.tracks: # No track at that level
+ self.tracks[track_level] = track # so just add it
+ else: # Already a track there, so shunt all higher tracks up one
+ occupied_levels = sorted(
+ self.get_levels()
+ ) # Get list of occupied levels...
+ occupied_levels.reverse() # ...reverse it (highest first)
+ for val in occupied_levels:
+                # If existing track level >= that to be added
+                if val >= track_level:
+ self.tracks[val + 1] = self.tracks[val] # ...increment by 1
+ self.tracks[track_level] = track # And put the new track in
+ self.tracks[track_level].track_level = track_level
+
+ def new_track(self, track_level, **args):
+ """Add a new Track to the diagram at a given level.
+
+ The track is returned for further user manipulation.
+
+ Arguments:
+ - track_level - an integer. The level at which the track will be
+ drawn (above an arbitrary baseline).
+
+ new_track(self, track_level)
+ """
+ newtrack = Track()
+ for key in args:
+ setattr(newtrack, key, args[key])
+ if track_level not in self.tracks: # No track at that level
+ self.tracks[track_level] = newtrack # so just add it
+ else: # Already a track there, so shunt all higher tracks up one
+ occupied_levels = sorted(
+ self.get_levels()
+ ) # Get list of occupied levels...
+ occupied_levels.reverse() # ...reverse (highest first)...
+ for val in occupied_levels:
+ if val >= track_level:
+ # Track value >= that to be added, increment by 1
+ self.tracks[val + 1] = self.tracks[val]
+ self.tracks[track_level] = newtrack # And put the new track in
+ self.tracks[track_level].track_level = track_level
+ return newtrack
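
The shunting behaviour is easiest to see directly (a sketch): adding a track at an occupied level pushes the resident track, and everything above it, up one level:

```python
from Bio.Graphics import GenomeDiagram

gdd = GenomeDiagram.Diagram("Levels")
gdd.new_track(1, name="first")
gdd.new_track(1, name="second")  # level 1 occupied: "first" moves up to 2
print(gdd.get_levels())  # [1, 2]
print(gdd[1].name, gdd[2].name)  # second first
```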
+
+ def del_track(self, track_level):
+ """Remove the track to be drawn at a particular level on the diagram.
+
+ Arguments:
+ - track_level - an integer. The level of the track on the diagram
+ to delete.
+
+ del_track(self, track_level)
+ """
+ del self.tracks[track_level]
+
+ def get_tracks(self):
+ """Return a list of the tracks contained in the diagram."""
+ return list(self.tracks.values())
+
+ def move_track(self, from_level, to_level):
+ """Move a track from one level on the diagram to another.
+
+ Arguments:
+ - from_level - an integer. The level at which the track to be
+ moved is found.
+ - to_level - an integer. The level to move the track to.
+
+ """
+ aux = self.tracks[from_level]
+ del self.tracks[from_level]
+ self.add_track(aux, to_level)
+
+ def renumber_tracks(self, low=1, step=1):
+ """Renumber all tracks consecutively.
+
+ Optionally from a passed lowest number.
+
+ Arguments:
+ - low - an integer. The track number to start from.
+ - step - an integer. The track interval for separation of
+ tracks.
+
+ """
+ track = low # Start numbering from here
+ levels = self.get_levels()
+
+ conversion = {} # Holds new set of levels
+ for level in levels: # Starting at low...
+ conversion[track] = self.tracks[level] # Add old tracks to new set
+ conversion[track].track_level = track
+ track += step # step interval
+ self.tracks = conversion # Replace old set of levels with new set
+
+ def get_levels(self):
+ """Return a sorted list of levels occupied by tracks in the diagram."""
+ return sorted(self.tracks)
+
+ def get_drawn_levels(self):
+ """Return a sorted list of levels occupied by tracks.
+
+ These tracks are not explicitly hidden.
+ """
+ return sorted(key for key in self.tracks if not self.tracks[key].hide)
+
+ def range(self):
+ """Return lowest and highest base numbers from track features.
+
+ Returned type is a tuple.
+ """
+ lows, highs = [], []
+ for track in self.tracks.values(): # Get ranges for each track
+ low, high = track.range()
+ lows.append(low)
+ highs.append(high)
+ return min(lows), max(highs) # Return extremes from all tracks
+
+ def __getitem__(self, key):
+ """Return the track contained at the level of the passed key."""
+ return self.tracks[key]
+
+ def __str__(self):
+ """Return a formatted string describing the diagram."""
+ outstr = ["\n<%s: %s>" % (self.__class__, self.name)]
+ outstr.append("%d tracks" % len(self.tracks))
+ for level in self.get_levels():
+ outstr.append("Track %d: %s\n" % (level, self.tracks[level]))
+ outstr = "\n".join(outstr)
+ return outstr
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Feature.py b/code/lib/Bio/Graphics/GenomeDiagram/_Feature.py
new file mode 100644
index 0000000..87be16e
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_Feature.py
@@ -0,0 +1,198 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""Feature module.
+
+Provides:
+ - Feature - class to wrap Bio.SeqFeature objects with drawing information
+
+For drawing capabilities, this module uses reportlab to define colors:
+http://www.reportlab.com
+"""
+
+# ReportLab imports
+from reportlab.lib import colors
+
+# GenomeDiagram imports
+from ._Colors import ColorTranslator
+
+
+class Feature:
+ """Class to wrap Bio.SeqFeature objects for GenomeDiagram.
+
+ Attributes:
+ - parent FeatureSet, container for the object
+ - id Unique id
+ - color color.Color, color to draw the feature
+ - hide Boolean for whether the feature will be drawn or not
+ - sigil String denoting the type of sigil to use for the feature.
+ Currently either "BOX" or "ARROW" are supported.
+ - arrowhead_length Float denoting length of the arrow head to be drawn,
+ relative to the bounding box height. The arrow shaft
+ takes up the remainder of the bounding box's length.
+ - arrowshaft_height Float denoting length of the representative arrow
+ shaft to be drawn, relative to the bounding box height.
+          The arrow head takes the full height of the bounding box.
+ - name_qualifiers List of Strings, describes the qualifiers that may
+ contain feature names in the wrapped Bio.SeqFeature object
+ - label Boolean, 1 if the label should be shown
+ - label_font String describing the font to use for the feature label
+ - label_size Int describing the feature label font size
+ - label_color color.Color describing the feature label color
+ - label_angle Float describing the angle through which to rotate the
+ feature label in degrees (default = 45, linear only)
+ - label_position String, 'start', 'end' or 'middle' denoting where
+ to place the feature label. Leave as None for the default
+ which is 'start' for linear diagrams, and at the bottom of
+ the feature as drawn on circular diagrams.
+ - label_strand Integer -1 or +1 to explicitly place the label on the
+          forward or reverse strand. Default (None) follows the
+ feature's strand. Use -1 to put labels under (linear) or
+ inside (circular) the track, +1 to put them above (linear)
+ or outside (circular) the track.
+ - locations List of tuples of (start, end) ints describing where the
+ feature and any subfeatures start and end
+ - type String denoting the feature type
+ - name String denoting the feature name
+ - strand Int describing the strand on which the feature is found
+
+ """
+
+ def __init__(
+ self,
+ parent=None,
+ feature_id=None,
+ feature=None,
+ color=colors.lightgreen,
+ label=0,
+ border=None,
+ colour=None,
+ ):
+ """Initialize.
+
+ Arguments:
+ - parent FeatureSet containing the feature
+ - feature_id Unique id for the feature
+ - feature Bio.SeqFeature object to be wrapped
+ - color color.Color Color to draw the feature (overridden
+ by backwards compatible argument with UK spelling, colour).
+ Either argument is overridden if 'color' is found in feature
+ qualifiers
+ - border color.Color Color to draw the feature border, use
+ None for the same as the fill color, False for no border.
+ - label Boolean, 1 if the label should be shown
+
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+
+ self._colortranslator = ColorTranslator()
+
+ # Initialize attributes
+ self.parent = parent
+ self.id = feature_id
+ self.color = color # default color to draw the feature
+ self.border = border
+ self._feature = None # Bio.SeqFeature object to wrap
+ self.hide = 0 # show by default
+ self.sigil = "BOX"
+ self.arrowhead_length = 0.5 # 50% of the box height
+ self.arrowshaft_height = 0.4 # 40% of the box height
+ self.name_qualifiers = ["gene", "label", "name", "locus_tag", "product"]
+ self.label = label
+ self.label_font = "Helvetica"
+ self.label_size = 6
+ self.label_color = colors.black
+ self.label_angle = 45
+ self.label_position = None # Expect 'start', 'middle', or 'end' (plus aliases)
+ self.label_strand = None # Expect +1 or -1 if overriding this
+
+ if feature is not None:
+ self.set_feature(feature)
+
+ def set_feature(self, feature):
+ """Define the Bio.SeqFeature object to be wrapped."""
+ self._feature = feature
+ self.__process_feature()
+
+ def __process_feature(self):
+ """Examine wrapped feature and set some properties accordingly (PRIVATE)."""
+ self.locations = []
+ bounds = []
+ # This will be a list of length one for simple FeatureLocation:
+ for location in self._feature.location.parts:
+ start = location.nofuzzy_start
+ end = location.nofuzzy_end
+ # if start > end and self.strand == -1:
+ # start, end = end, start
+ self.locations.append((start, end))
+ bounds += [start, end]
+ self.type = str(self._feature.type) # Feature type
+ # TODO - Strand can vary with subfeatures (e.g. mixed strand tRNA)
+ if self._feature.strand is None:
+ # This is the SeqFeature default (None), but the drawing code
+ # only expects 0, +1 or -1.
+ self.strand = 0
+ else:
+ self.strand = int(self._feature.strand) # Feature strand
+ if "color" in self._feature.qualifiers: # Artemis color (if present)
+ self.color = self._colortranslator.artemis_color(
+ self._feature.qualifiers["color"][0]
+ )
+ self.name = self.type
+ for qualifier in self.name_qualifiers:
+ if qualifier in self._feature.qualifiers:
+ self.name = self._feature.qualifiers[qualifier][0]
+ break
+ # Note will be 0 to N for origin wrapping feature on genome of length N
+ self.start, self.end = min(bounds), max(bounds)
+
+ def get_feature(self):
+ """Return the unwrapped Bio.SeqFeature object."""
+ return self._feature
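
A sketch of the wrapping in action (made-up feature; the private module is imported directly only for illustration): qualifiers supply the display name, and an Artemis "color" qualifier overrides the default fill:

```python
from Bio.Graphics.GenomeDiagram._Feature import Feature
from Bio.SeqFeature import FeatureLocation, SeqFeature

sf = SeqFeature(
    FeatureLocation(10, 40, strand=-1),
    type="CDS",
    qualifiers={"gene": ["abcA"], "color": ["2"]},  # Artemis colour 2 = red
)
f = Feature(feature=sf)
print(f.name, f.strand, f.locations)  # abcA -1 [(10, 40)]
print(f.color)  # Color(1,0,0,1)
```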
+
+ def set_colour(self, colour):
+ """Backwards compatible variant of set_color(self, color) using UK spelling."""
+ color = self._colortranslator.translate(colour)
+ self.color = color
+
+ def set_color(self, color):
+ """Set the color in which the feature will be drawn.
+
+ Arguments:
+ - color The color to draw the feature - either a colors.Color
+           object, an RGB tuple of floats, or an integer corresponding to
+           a color in colors.txt
+
+ """
+ # TODO - Make this into the set method for a color property?
+ color = self._colortranslator.translate(color)
+ self.color = color
+
+ def __getattr__(self, name):
+ """Get attribute by name.
+
+ If the Feature class doesn't have the attribute called for,
+ check in self._feature for it.
+ """
+ return getattr(self._feature, name) # try to get the attribute from the feature
+
+
+################################################################################
+# RUN AS SCRIPT
+################################################################################
+
+if __name__ == "__main__":
+
+ # Test code
+ gdf = Feature()
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_FeatureSet.py b/code/lib/Bio/Graphics/GenomeDiagram/_FeatureSet.py
new file mode 100644
index 0000000..4168a29
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_FeatureSet.py
@@ -0,0 +1,210 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+#
+# Thanks to Peter Cock for the impetus to write the get_features() code to
+# subselect Features.
+#
+################################################################################
+
+"""FeatureSet module.
+
+Provides:
+ - FeatureSet - container for Feature objects
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+"""
+
+
+# GenomeDiagram
+from ._Feature import Feature
+
+# Builtins
+import re
+
+
+class FeatureSet:
+ """FeatureSet object."""
+
+ def __init__(self, set_id=None, name=None, parent=None):
+ """Create the object.
+
+ Arguments:
+ - set_id: Unique id for the set
+ - name: String identifying the feature set
+
+ """
+ self.parent = parent
+        self.id = set_id  # Unique id for the set
+ self.next_id = 0 # counter for unique feature ids
+ self.features = {} # Holds features, keyed by ID
+ self.name = name # String describing the set
+
+ def add_feature(self, feature, **kwargs):
+ """Add a new feature.
+
+ Arguments:
+ - feature: Bio.SeqFeature object
+ - kwargs: Keyword arguments for Feature. Named attributes
+ of the Feature
+
+ Add a Bio.SeqFeature object to the diagram (will be stored
+ internally in a Feature wrapper).
+ """
+ id = self.next_id # get id number
+ f = Feature(self, id, feature)
+ self.features[id] = f # add feature
+ for key in kwargs:
+ if key == "colour" or key == "color":
+ # Deal with "colour" as a special case by also mapping to color.
+ # If Feature.py used a python property we wouldn't need to call
+ # set_color explicitly. However, this is important to make sure
+ # every color gets mapped to a colors object - for example color
+ # numbers, or strings (may not matter for PDF, but does for PNG).
+ self.features[id].set_color(kwargs[key])
+ continue
+ setattr(self.features[id], key, kwargs[key])
+ self.next_id += 1 # increment next id
+ return f
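+
+    # Usage sketch (illustrative, assuming `seq_feature` is a Bio.SeqFeature):
+    #   fs = FeatureSet(name="CDS features")
+    #   f = fs.add_feature(seq_feature, color=(0.2, 0.4, 1.0), label=True)
+    # Each kwarg becomes an attribute of the wrapping Feature; color/colour
+    # is special-cased through set_color() as described above.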
+
+ def del_feature(self, feature_id):
+ """Delete a feature.
+
+ Arguments:
+ - feature_id: Unique id of the feature to delete
+
+ Remove a feature from the set, indicated by its id.
+ """
+ del self.features[feature_id]
+
+ def set_all_features(self, attr, value):
+ """Set an attribute of all the features.
+
+ Arguments:
+ - attr: An attribute of the Feature class
+ - value: The value to set that attribute to
+
+ Set the passed attribute of all features in the set to the
+ passed value.
+ """
+ for feature in self.features.values():
+ if hasattr(feature, attr):
+ # If the feature has the attribute, set it to the passed value
+ setattr(feature, attr, value)
+
+ # For backwards compatibility, we support both colour and color.
+ # As a quick hack, make "colour" set both "colour" and "color".
+ # if attr=="colour":
+ # self.set_all_feature("color",value)
+
+ def get_features(self, attribute=None, value=None, comparator=None):
+ """Retrieve features.
+
+ Arguments:
+ - attribute: String, attribute of a Feature object
+ - value: The value desired of the attribute
+ - comparator: String, how to compare the Feature attribute to the
+ passed value
+
+ If no attribute or value is given, return a list of all features in the
+        feature set. If both an attribute and value are given, a list of all
+        features in the FeatureSet whose attribute value matches (or, depending
+        on the comparator, does not match) the passed value is returned. Allowed
+        comparators are: 'startswith', 'not', 'like'.
+
+ The user is expected to make a responsible decision about which feature
+ attributes to use with which passed values and comparator settings.
+ """
+ # If no attribute or value specified, return all features
+ if attribute is None or value is None:
+ return list(self.features.values())
+ # If no comparator is specified, return all features where the attribute
+ # value matches that passed
+ if comparator is None:
+ return [
+ feature
+ for feature in self.features.values()
+ if getattr(feature, attribute) == value
+ ]
+ # If the comparator is 'not', return all features where the attribute
+ # value does not match that passed
+ elif comparator == "not":
+ return [
+ feature
+ for feature in self.features.values()
+ if getattr(feature, attribute) != value
+ ]
+ # If the comparator is 'startswith', return all features where the attribute
+        # value starts with the value passed
+ elif comparator == "startswith":
+ return [
+ feature
+ for feature in self.features.values()
+ if getattr(feature, attribute).startswith(value)
+ ]
+ # If the comparator is 'like', use a regular expression search to identify
+ # features
+ elif comparator == "like":
+ return [
+ feature
+ for feature in self.features.values()
+ if re.search(value, getattr(feature, attribute))
+ ]
+ # As a final option, just return an empty list
+ return []
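+
+    # Usage sketch (illustrative, assuming features carry the usual `type`
+    # and `name` attributes):
+    #   fs.get_features()                        # every feature in the set
+    #   fs.get_features("type", "CDS")           # exact match
+    #   fs.get_features("type", "CDS", "not")    # everything except CDS
+    #   fs.get_features("name", "^gyr", "like")  # regular expression search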
+
+ def get_ids(self):
+ """Return a list of all ids for the feature set."""
+ return list(self.features.keys())
+
+ def range(self):
+ """Return the lowest and highest base (or mark) numbers as a tuple."""
+ lows, highs = [], []
+ for feature in self.features.values():
+ for start, end in feature.locations:
+ lows.append(start)
+ highs.append(end)
+        if len(lows) != 0 and len(highs) != 0:
+            return (min(lows), max(highs))
+        return 0, 0  # Default in case there is nothing in the set
+
+ def to_string(self, verbose=0):
+ """Return a formatted string with information about the set.
+
+ Arguments:
+ - verbose: Boolean indicating whether a short (default) or
+ complete account of the set is required
+
+ """
+ if not verbose: # Short account only required
+ return "%s" % self
+ else: # Long account desired
+ outstr = ["\n<%s: %s>" % (self.__class__, self.name)]
+ outstr.append("%d features" % len(self.features))
+ for key in self.features:
+ outstr.append("feature: %s" % self.features[key])
+ return "\n".join(outstr)
+
+ def __len__(self):
+ """Return the number of features in the set."""
+ return len(self.features)
+
+ def __getitem__(self, key):
+ """Return a feature, keyed by id."""
+ return self.features[key]
+
+ def __str__(self):
+ """Return a formatted string with information about the feature set."""
+ outstr = [
+ "\n<%s: %s %d features>" % (self.__class__, self.name, len(self.features))
+ ]
+ return "\n".join(outstr)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Graph.py b/code/lib/Bio/Graphics/GenomeDiagram/_Graph.py
new file mode 100644
index 0000000..7f99ef9
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_Graph.py
@@ -0,0 +1,195 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+# Revisions copyright 2008-2009 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""Graph module.
+
+Provides:
+ - GraphData - Contains data from which a graph will be drawn, and
+ information about its presentation
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+"""
+
+# ReportLab imports
+
+from reportlab.lib import colors
+
+from math import sqrt
+
+
+class GraphData:
+ """Graph Data.
+
+ Attributes:
+ - id Unique identifier for the data
+     - data  Dictionary describing the data, keyed by position
+ - name String describing the data
+ - style String ('bar', 'heat', 'line') describing how to draw the data
+ - poscolor colors.Color for drawing high (some styles) or all
+ values
+ - negcolor colors.Color for drawing low values (some styles)
+ - linewidth Int, thickness to draw the line in 'line' styles
+
+ """
+
+ def __init__(
+ self,
+ id=None,
+ data=None,
+ name=None,
+ style="bar",
+ color=colors.lightgreen,
+ altcolor=colors.darkseagreen,
+ center=None,
+ colour=None,
+ altcolour=None,
+ ):
+ """Initialize.
+
+ Arguments:
+ - id Unique ID for the graph
+ - data List of (position, value) tuples
+ - name String describing the graph
+ - style String describing the presentation style ('bar', 'line',
+ 'heat')
+ - color colors.Color describing the color to draw all or the
+ 'high' (some styles) values (overridden by backwards
+ compatible argument with UK spelling, colour).
+ - altcolor colors.Color describing the color to draw the 'low'
+ values (some styles only) (overridden by backwards
+           compatible argument with UK spelling, altcolour).
+ - center Value at which x-axis crosses y-axis.
+
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+ if altcolour is not None:
+ altcolor = altcolour
+
+ self.id = id # Unique identifier for the graph
+ self.data = {} # holds values, keyed by sequence position
+ if data is not None:
+ self.set_data(data)
+ self.name = name # Descriptive string
+
+ # Attributes describing how the graph will be drawn
+ self.style = style # One of 'bar', 'heat' or 'line'
+ self.poscolor = color # Color to draw all, or 'high' values
+ self.negcolor = altcolor # Color to draw 'low' values
+ self.linewidth = 2 # linewidth to use in line graphs
+ self.center = center # value at which x-axis crosses y-axis
+
+ def set_data(self, data):
+ """Add data as a list of (position, value) tuples."""
+ for (pos, val) in data: # Fill data dictionary
+ self.data[pos] = val
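+
+    # Example (illustrative): set_data([(0, 0.5), (100, 0.8), (200, 0.2)])
+    # stores {0: 0.5, 100: 0.8, 200: 0.2}; if a position occurs twice, the
+    # last value seen wins.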
+
+ def get_data(self):
+ """Return data as a list of sorted (position, value) tuples."""
+ data = []
+ for xval in self.data:
+ yval = self.data[xval]
+ data.append((xval, yval))
+ data.sort()
+ return data
+
+ def add_point(self, point):
+ """Add a single point to the set of data as a (position, value) tuple."""
+ pos, val = point
+ self.data[pos] = val
+
+ def quartiles(self):
+ """Return (minimum, lowerQ, medianQ, upperQ, maximum) values as tuple."""
+ data = sorted(self.data.values())
+ datalen = len(data)
+ return (
+ data[0],
+ data[datalen // 4],
+ data[datalen // 2],
+ data[3 * datalen // 4],
+ data[-1],
+ )
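+
+    # Worked example (illustrative): for values [1, 2, 3, 4, 5, 6, 7, 8],
+    # datalen = 8, so quartiles() picks indices 0, 2, 4, 6 and -1 and
+    # returns (1, 3, 5, 7, 8).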
+
+ def range(self):
+ """Return range of data as (start, end) tuple.
+
+ Returns the range of the data, i.e. its start and end points on
+ the genome as a (start, end) tuple.
+ """
+ positions = sorted(self.data) # i.e. dict keys
+ # Return first and last positions in graph
+ # print(len(self.data))
+ return (positions[0], positions[-1])
+
+ def mean(self):
+ """Return the mean value for the data points (float)."""
+ data = list(self.data.values())
+        total = 0.0
+        for item in data:
+            total += float(item)
+        return total / len(data)
+
+ def stdev(self):
+ """Return the sample standard deviation for the data (float)."""
+ data = list(self.data.values())
+ m = self.mean()
+ runtotal = 0.0
+ for entry in data:
+ runtotal += float((entry - m) ** 2)
+ # This is sample standard deviation; population stdev would involve
+ # division by len(data), rather than len(data)-1
+ return sqrt(runtotal / (len(data) - 1))
+
+ def __len__(self):
+ """Return the number of points in the data set."""
+ return len(self.data)
+
+ def __getitem__(self, index):
+ """Return data value(s) at the given position.
+
+ Given an integer representing position on the sequence
+ returns a float - the data value at the passed position.
+
+        If a slice, returns graph data from the region as a list of
+ (position, value) tuples. Slices with step are not supported.
+ """
+ if isinstance(index, int):
+ return self.data[index]
+ elif isinstance(index, slice):
+ # TODO - Why does it treat the end points both as inclusive?
+ # This doesn't match Python norms does it?
+ low = index.start
+ high = index.stop
+ if index.step is not None and index.step != 1:
+                raise ValueError("Slices with step are not supported")
+ outlist = []
+ for pos in sorted(self.data):
+ if pos >= low and pos <= high:
+ outlist.append((pos, self.data[pos]))
+ return outlist
+ else:
+ raise TypeError("Need an integer or a slice")
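+
+    # Example (illustrative): with data at positions 0..5, graph[2:4]
+    # returns [(2, v2), (3, v3), (4, v4)] - both slice end points are
+    # treated as inclusive, unlike standard Python slicing (see the TODO
+    # above).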
+
+ def __str__(self):
+ """Return a string describing the graph data."""
+ outstr = ["\nGraphData: %s, ID: %s" % (self.name, self.id)]
+ outstr.append("Number of points: %d" % len(self.data))
+ outstr.append("Mean data value: %s" % self.mean())
+ outstr.append("Sample SD: %.3f" % self.stdev())
+ outstr.append(
+ "Minimum: %s\n1Q: %s\n2Q: %s\n3Q: %s\nMaximum: %s" % self.quartiles()
+ )
+ outstr.append("Sequence Range: %s..%s" % self.range())
+ return "\n".join(outstr)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_GraphSet.py b/code/lib/Bio/Graphics/GenomeDiagram/_GraphSet.py
new file mode 100644
index 0000000..d79e6ef
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_GraphSet.py
@@ -0,0 +1,171 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+# Revisions copyright 2008-2010 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+#
+# TODO: Make representation of Ymax and Ymin values at this level, so that
+# calculation of graph/axis drawing is simplified
+
+"""GraphSet module.
+
+Provides:
+ - GraphSet - container for GraphData objects
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+"""
+
+# ReportLab imports
+
+from reportlab.lib import colors
+
+from ._Graph import GraphData
+
+
+class GraphSet:
+ """Graph Set.
+
+ Attributes:
+ - id Unique identifier for the set
+ - name String describing the set
+
+ """
+
+ def __init__(self, name=None):
+ """Initialize.
+
+ Arguments:
+ - name String identifying the graph set sensibly
+
+ """
+        self.id = None  # Unique identifier for the set (no set_id argument)
+ self._next_id = 0 # Holds unique ids for graphs
+ self._graphs = {} # Holds graphs, keyed by unique id
+ self.name = name # Holds description of graph
+
+ def new_graph(
+ self,
+ data,
+ name=None,
+ style="bar",
+ color=colors.lightgreen,
+ altcolor=colors.darkseagreen,
+ linewidth=1,
+ center=None,
+ colour=None,
+ altcolour=None,
+ centre=None,
+ ):
+ """Add a GraphData object to the diagram.
+
+ Arguments:
+ - data List of (position, value) int tuples
+ - name String, description of the graph
+ - style String ('bar', 'heat', 'line') describing how the graph
+ will be drawn
+ - color colors.Color describing the color to draw all or 'high'
+ (some styles) data (overridden by backwards compatible
+ argument with UK spelling, colour).
+ - altcolor colors.Color describing the color to draw 'low' (some
+ styles) data (overridden by backwards compatible argument
+           with UK spelling, altcolour).
+ - linewidth Float describing linewidth for graph
+ - center Float setting the value at which the x-axis
+ crosses the y-axis (overridden by backwards
+ compatible argument with UK spelling, centre)
+
+ Add a GraphData object to the diagram (will be stored internally).
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if colour is not None:
+ color = colour
+ if altcolour is not None:
+ altcolor = altcolour
+ if centre is not None:
+ center = centre
+
+ id = self._next_id # get id number
+ graph = GraphData(id, data, name, style, color, altcolor, center)
+ graph.linewidth = linewidth
+ self._graphs[id] = graph # add graph data
+ self._next_id += 1 # increment next id
+ return graph
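+
+    # Usage sketch (illustrative):
+    #   gs = GraphSet("GC content")
+    #   gd = gs.new_graph([(0, 0.4), (1000, 0.6)], "GC%", style="line",
+    #                     center=0.5)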
+
+ def del_graph(self, graph_id):
+ """Remove a graph from the set, indicated by its id."""
+ del self._graphs[graph_id]
+
+ def get_graphs(self):
+ """Return list of all graphs in the graph set, sorted by id.
+
+ Sorting is to ensure reliable stacking.
+ """
+ return [self._graphs[id] for id in sorted(self._graphs)]
+
+ def get_ids(self):
+ """Return a list of all ids for the graph set."""
+ return list(self._graphs.keys())
+
+ def range(self):
+ """Return the lowest and highest base (or mark) numbers as a tuple."""
+ lows, highs = [], []
+ for graph in self._graphs.values():
+ low, high = graph.range()
+ lows.append(low)
+ highs.append(high)
+ return (min(lows), max(highs))
+
+ def data_quartiles(self):
+ """Return (minimum, lowerQ, medianQ, upperQ, maximum) values as a tuple."""
+ data = []
+ for graph in self._graphs.values():
+ data += list(graph.data.values())
+ data.sort()
+ datalen = len(data)
+ return (
+ data[0],
+            data[datalen // 4],
+            data[datalen // 2],
+            data[3 * datalen // 4],
+ data[-1],
+ )
+
+ def to_string(self, verbose=0):
+ """Return a formatted string with information about the set.
+
+ Arguments:
+ - verbose - Flag indicating whether a short or complete account
+ of the set is required
+
+ """
+ if not verbose:
+ return "%s" % self
+ else:
+ outstr = ["\n<%s: %s>" % (self.__class__, self.name)]
+ outstr.append("%d graphs" % len(self._graphs))
+ for key in self._graphs:
+ outstr.append("%s" % self._graphs[key])
+ return "\n".join(outstr)
+
+ def __len__(self):
+ """Return the number of graphs in the set."""
+ return len(self._graphs)
+
+ def __getitem__(self, key):
+ """Return a graph, keyed by id."""
+ return self._graphs[key]
+
+ def __str__(self):
+ """Return a formatted string with information about the feature set."""
+ outstr = ["\n<%s: %s>" % (self.__class__, self.name)]
+ outstr.append("%d graphs" % len(self._graphs))
+ outstr = "\n".join(outstr)
+ return outstr
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_LinearDrawer.py b/code/lib/Bio/Graphics/GenomeDiagram/_LinearDrawer.py
new file mode 100644
index 0000000..36012ad
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_LinearDrawer.py
@@ -0,0 +1,1580 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+# Revisions copyright 2008-2009 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""Linear Drawer module.
+
+Provides:
+ - LinearDrawer - Drawing object for linear diagrams
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+"""
+
+# ReportLab imports
+
+from reportlab.graphics.shapes import Drawing, Line, String, Group, Polygon
+from reportlab.lib import colors
+
+# GenomeDiagram imports
+from ._AbstractDrawer import AbstractDrawer, draw_box, draw_arrow
+from ._AbstractDrawer import draw_cut_corner_box, _stroke_and_fill_colors
+from ._AbstractDrawer import intermediate_points, angle2trig, deduplicate
+from ._FeatureSet import FeatureSet
+from ._GraphSet import GraphSet
+
+from math import ceil
+
+
+class LinearDrawer(AbstractDrawer):
+ """Linear Drawer.
+
+ Inherits from:
+ - AbstractDrawer
+
+ Attributes:
+ - tracklines Boolean for whether to draw lines delineating tracks
+ - pagesize Tuple describing the size of the page in pixels
+ - x0 Float X co-ord for leftmost point of drawable area
+ - xlim Float X co-ord for rightmost point of drawable area
+ - y0 Float Y co-ord for lowest point of drawable area
+ - ylim Float Y co-ord for topmost point of drawable area
+ - pagewidth Float pixel width of drawable area
+ - pageheight Float pixel height of drawable area
+ - xcenter Float X co-ord of center of drawable area
+ - ycenter Float Y co-ord of center of drawable area
+ - start Int, base to start drawing from
+ - end Int, base to stop drawing at
+ - length Int, size of sequence to be drawn
+ - fragments Int, number of fragments into which to divide the
+ drawn sequence
+ - fragment_size Float (0->1) the proportion of the fragment height to
+ draw in
+ - track_size Float (0->1) the proportion of the track height to
+ draw in
+ - drawing Drawing canvas
+ - drawn_tracks List of ints denoting which tracks are to be drawn
+ - current_track_level Int denoting which track is currently being
+ drawn
+ - fragment_height Float total fragment height in pixels
+ - fragment_bases Int total fragment length in bases
+ - fragment_lines Dictionary of top and bottom y-coords of fragment,
+ keyed by fragment number
+ - fragment_limits Dictionary of start and end bases of each fragment,
+ keyed by fragment number
+ - track_offsets Dictionary of number of pixels that each track top,
+ center and bottom is offset from the base of a fragment, keyed by track
+ - cross_track_links List of tuples each with four entries (track A,
+ feature A, track B, feature B) to be linked.
+
+ """
+
+ def __init__(
+ self,
+ parent=None,
+ pagesize="A3",
+ orientation="landscape",
+ x=0.05,
+ y=0.05,
+ xl=None,
+ xr=None,
+ yt=None,
+ yb=None,
+ start=None,
+ end=None,
+ tracklines=0,
+ fragments=10,
+ fragment_size=None,
+ track_size=0.75,
+ cross_track_links=None,
+ ):
+ """Initialize.
+
+ Arguments:
+ - parent Diagram object containing the data that the drawer draws
+ - pagesize String describing the ISO size of the image, or a tuple
+ of pixels
+ - orientation String describing the required orientation of the
+ final drawing ('landscape' or 'portrait')
+ - x Float (0->1) describing the relative size of the X
+ margins to the page
+ - y Float (0->1) describing the relative size of the Y
+ margins to the page
+ - xl Float (0->1) describing the relative size of the left X
+ margin to the page (overrides x)
+ - xr Float (0->1) describing the relative size of the right X
+ margin to the page (overrides x)
+ - yt Float (0->1) describing the relative size of the top Y
+ margin to the page (overrides y)
+ - yb Float (0->1) describing the relative size of the lower Y
+ margin to the page (overrides y)
+ - start Int, the position to begin drawing the diagram at
+ - end Int, the position to stop drawing the diagram at
+ - tracklines Boolean flag to show (or not) lines delineating tracks
+ on the diagram
+ - fragments Int, the number of equal fragments into which the
+ sequence should be divided for drawing
+ - fragment_size Float(0->1) The proportion of the available height
+ for the fragment that should be taken up in drawing
+ - track_size The proportion of the available track height that
+ should be taken up in drawing
+ - cross_track_links List of tuples each with four entries (track A,
+ feature A, track B, feature B) to be linked.
+ """
+ # Use the superclass' instantiation method
+ AbstractDrawer.__init__(
+ self,
+ parent,
+ pagesize,
+ orientation,
+ x,
+ y,
+ xl,
+ xr,
+ yt,
+ yb,
+ start,
+ end,
+ tracklines,
+ cross_track_links,
+ )
+
+ # Useful measurements on the page
+ self.fragments = fragments
+ if fragment_size is not None:
+ self.fragment_size = fragment_size
+ else:
+ if self.fragments == 1:
+ # For single fragments, default to full height
+ self.fragment_size = 1
+ else:
+ # Otherwise keep a 10% gap between fragments
+ self.fragment_size = 0.9
+ self.track_size = track_size
+
+ def draw(self):
+ """Draw a linear diagram of the data in the parent Diagram object."""
+ # Instantiate the drawing canvas
+ self.drawing = Drawing(self.pagesize[0], self.pagesize[1])
+
+ feature_elements = [] # holds feature elements
+ feature_labels = [] # holds feature labels
+ greytrack_bgs = [] # holds track background
+ greytrack_labels = [] # holds track foreground labels
+ scale_axes = [] # holds scale axes
+ scale_labels = [] # holds scale axis labels
+
+ # Get the tracks to be drawn
+ self.drawn_tracks = self._parent.get_drawn_levels()
+
+ # Set fragment and track sizes
+ self.init_fragments()
+ self.set_track_heights()
+
+ # Go through each track in the parent (if it is to be drawn) one by
+ # one and collate the data as drawing elements
+ for track_level in self.drawn_tracks: # only use tracks to be drawn
+ self.current_track_level = track_level # establish track level
+ track = self._parent[track_level] # get the track at that level
+ gbgs, glabels = self.draw_greytrack(track) # get greytrack elements
+ greytrack_bgs.append(gbgs)
+ greytrack_labels.append(glabels)
+ features, flabels = self.draw_track(track) # get feature and graph elements
+ feature_elements.append(features)
+ feature_labels.append(flabels)
+ if track.scale:
+ axes, slabels = self.draw_scale(track) # get scale elements
+ scale_axes.append(axes)
+ scale_labels.append(slabels)
+
+ feature_cross_links = []
+ for cross_link_obj in self.cross_track_links:
+ cross_link_elements = self.draw_cross_link(cross_link_obj)
+ if cross_link_elements:
+ feature_cross_links.append(cross_link_elements)
+
+ # Groups listed in order of addition to page (from back to front)
+ # Draw track backgrounds
+ # Draw feature cross track links
+ # Draw features and graphs
+ # Draw scale axes
+ # Draw scale labels
+ # Draw feature labels
+ # Draw track labels
+ element_groups = [
+ greytrack_bgs,
+ feature_cross_links,
+ feature_elements,
+ scale_axes,
+ scale_labels,
+ feature_labels,
+ greytrack_labels,
+ ]
+ for element_group in element_groups:
+ for element_list in element_group:
+                for element in element_list:
+                    self.drawing.add(element)
+
+ if self.tracklines: # Draw test tracks over top of diagram
+ self.draw_test_tracks()
+
+ def init_fragments(self):
+ """Initialize useful values for positioning diagram elements."""
+ # Set basic heights, lengths etc
+ self.fragment_height = (
+ 1.0 * self.pageheight / self.fragments
+ ) # total fragment height in pixels
+ self.fragment_bases = ceil(
+ 1.0 * self.length / self.fragments
+ ) # fragment length in bases
+
+ # Key fragment base and top lines by fragment number
+ # Holds bottom and top line locations of fragments, keyed by fragment number
+ self.fragment_lines = {}
+ # Number of pixels to crop the fragment:
+ fragment_crop = (1 - self.fragment_size) / 2
+ fragy = self.ylim # Holder for current absolute fragment base
+ for fragment in range(self.fragments):
+ fragtop = fragy - fragment_crop * self.fragment_height # top - crop
+ fragbtm = (
+ fragy - (1 - fragment_crop) * self.fragment_height
+ ) # bottom + crop
+ self.fragment_lines[fragment] = (fragbtm, fragtop)
+ fragy -= self.fragment_height # next fragment base
+
+ # Key base starts and ends for each fragment by fragment number
+ self.fragment_limits = {} # Holds first and last base positions in a fragment
+ fragment_step = self.fragment_bases # bases per fragment
+ fragment_count = 0
+ # Add start and end positions for each fragment to dictionary
+ for marker in range(int(self.start), int(self.end), int(fragment_step)):
+ self.fragment_limits[fragment_count] = (marker, marker + fragment_step)
+ fragment_count += 1
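+
+    # Worked example (illustrative): drawing 9,500 bases in 10 fragments
+    # gives fragment_bases = ceil(9500 / 10) = 950, so fragment_limits is
+    # {0: (start, start + 950), 1: (start + 950, start + 1900), ...}.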
+
+ def set_track_heights(self):
+ """Set track heights.
+
+ Since tracks may not be of identical heights, the bottom and top
+ offsets of each track relative to the fragment top and bottom is
+ stored in a dictionary - self.track_offsets, keyed by track number.
+ """
+ bot_track = min(min(self.drawn_tracks), 1)
+ top_track = max(self.drawn_tracks) # The 'highest' track number to draw
+
+ trackunit_sum = 0 # Total number of 'units' for the tracks
+ trackunits = {} # The start and end units for each track, keyed by track number
+ heightholder = 0 # placeholder variable
+ for track in range(bot_track, top_track + 1): # for all track numbers to 'draw'
+ try:
+ trackheight = self._parent[track].height # Get track height
+ except Exception: # TODO: IndexError?
+ trackheight = 1 # ...or default to 1
+ trackunit_sum += trackheight # increment total track unit height
+ trackunits[track] = (heightholder, heightholder + trackheight)
+ heightholder += trackheight # move to next height
+ trackunit_height = (
+ 1.0 * self.fragment_height * self.fragment_size / trackunit_sum
+ )
+
+ # Calculate top and bottom offsets for each track, relative to fragment
+ # base
+ track_offsets = {} # The offsets from fragment base for each track
+ track_crop = (
+ trackunit_height * (1 - self.track_size) / 2.0
+ ) # 'step back' in pixels
+ assert track_crop >= 0
+ for track in trackunits:
+ top = trackunits[track][1] * trackunit_height - track_crop # top offset
+ btm = trackunits[track][0] * trackunit_height + track_crop # bottom offset
+ ctr = btm + (top - btm) / 2.0 # center offset
+ track_offsets[track] = (btm, ctr, top)
+ self.track_offsets = track_offsets
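+
+    # Worked example (illustrative): for drawn tracks 1 and 2 with heights
+    # 1 and 2, trackunits = {1: (0, 1), 2: (1, 3)}, each unit spanning
+    # fragment_height * fragment_size / 3 pixels; track_size < 1 then crops
+    # a small margin off the top and bottom of every track.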
+
+ def draw_test_tracks(self):
+ """Draw test tracks.
+
+ Draw red lines indicating the top and bottom of each fragment,
+ and blue ones indicating tracks to be drawn.
+ """
+ # Add lines for each fragment
+ for fbtm, ftop in self.fragment_lines.values():
+ self.drawing.add(
+ Line(self.x0, ftop, self.xlim, ftop, strokeColor=colors.red)
+ ) # top line
+ self.drawing.add(
+ Line(self.x0, fbtm, self.xlim, fbtm, strokeColor=colors.red)
+ ) # bottom line
+
+ # Add track lines for this fragment - but only for drawn tracks
+ for track in self.drawn_tracks:
+ trackbtm = fbtm + self.track_offsets[track][0]
+ trackctr = fbtm + self.track_offsets[track][1]
+ tracktop = fbtm + self.track_offsets[track][2]
+ self.drawing.add(
+ Line(
+ self.x0, tracktop, self.xlim, tracktop, strokeColor=colors.blue
+ )
+ ) # top line
+ self.drawing.add(
+ Line(
+ self.x0, trackctr, self.xlim, trackctr, strokeColor=colors.green
+ )
+ ) # center line
+ self.drawing.add(
+ Line(
+ self.x0, trackbtm, self.xlim, trackbtm, strokeColor=colors.blue
+ )
+ ) # bottom line
+
+ def draw_track(self, track):
+ """Draw track.
+
+ Arguments:
+ - track Track object
+
+ Returns a tuple (list of elements in the track, list of labels in
+ the track).
+ """
+ track_elements = [] # Holds elements from features and graphs
+ track_labels = [] # Holds labels from features and graphs
+
+ # Distribution dictionary for dealing with different set types
+ set_methods = {FeatureSet: self.draw_feature_set, GraphSet: self.draw_graph_set}
+
+ for set in track.get_sets(): # Draw the feature or graph sets
+ elements, labels = set_methods[set.__class__](set)
+ track_elements += elements
+ track_labels += labels
+ return track_elements, track_labels
+
+ def draw_tick(self, tickpos, ctr, ticklen, track, draw_label):
+ """Draw tick.
+
+ Arguments:
+ - tickpos Int, position of the tick on the sequence
+ - ctr Float, Y co-ord of the center of the track
+ - ticklen How long to draw the tick
+ - track Track, the track the tick is drawn on
+ - draw_label Boolean, write the tick label?
+
+ Returns a drawing element that is the tick on the scale
+ """
+        if tickpos < self.start or self.end < tickpos:
+ raise RuntimeError(
+ "Tick at %i, but showing %i to %i" % (tickpos, self.start, self.end)
+ )
+ if not (
+ (track.start is None or track.start <= tickpos)
+ and (track.end is None or tickpos <= track.end)
+ ):
+ raise RuntimeError(
+ "Tick at %i, but showing %r to %r for track"
+ % (tickpos, track.start, track.end)
+ )
+ fragment, tickx = self.canvas_location(tickpos) # Tick co-ordinates
+ assert fragment >= 0, "Fragment %i, tickpos %i" % (fragment, tickpos)
+ tctr = ctr + self.fragment_lines[fragment][0] # Center line of the track
+ tickx += self.x0 # Tick X co-ord
+ ticktop = tctr + ticklen # Y co-ord of tick top
+ tick = Line(tickx, tctr, tickx, ticktop, strokeColor=track.scale_color)
+ if draw_label: # Put tick position on as label
+ if track.scale_format == "SInt":
+ if tickpos >= 1000000:
+ tickstring = str(tickpos // 1000000) + " Mbp"
+ elif tickpos >= 1000:
+ tickstring = str(tickpos // 1000) + " Kbp"
+ else:
+ tickstring = str(tickpos)
+ else:
+ tickstring = str(tickpos)
+ label = String(
+ 0,
+ 0,
+ tickstring, # Make label string
+ fontName=track.scale_font,
+ fontSize=track.scale_fontsize,
+ fillColor=track.scale_color,
+ )
+ labelgroup = Group(label)
+ rotation = angle2trig(track.scale_fontangle)
+ labelgroup.transform = (
+ rotation[0],
+ rotation[1],
+ rotation[2],
+ rotation[3],
+ tickx,
+ ticktop,
+ )
+ else:
+ labelgroup = None
+ return tick, labelgroup
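+
+    # Example (illustrative): with track.scale_format == "SInt", a tick at
+    # 2,500,000 is labelled "2 Mbp" and one at 7,500 is labelled "7 Kbp";
+    # integer division means the values truncate rather than round.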
+
+ def draw_scale(self, track):
+ """Draw scale.
+
+ Argument:
+ - track Track object
+
+ Returns a tuple of (list of elements in the scale, list of labels
+ in the scale).
+ """
+ scale_elements = [] # Holds axes and ticks
+ scale_labels = [] # Holds labels
+
+ if not track.scale: # No scale required, exit early
+ return [], []
+
+ # Get track location
+ btm, ctr, top = self.track_offsets[self.current_track_level]
+ trackheight = top - ctr
+
+ # For each fragment, draw the scale for this track
+ start, end = self._current_track_start_end()
+ start_f, start_x = self.canvas_location(start)
+ end_f, end_x = self.canvas_location(end)
+
+ for fragment in range(start_f, end_f + 1):
+ tbtm = btm + self.fragment_lines[fragment][0]
+ tctr = ctr + self.fragment_lines[fragment][0]
+ ttop = top + self.fragment_lines[fragment][0]
+ # X-axis
+ if fragment == start_f:
+ x_left = start_x
+ else:
+ x_left = 0
+ if fragment == end_f:
+ x_right = end_x
+ # Y-axis end marker
+ scale_elements.append(
+ Line(
+ self.x0 + x_right,
+ tbtm,
+ self.x0 + x_right,
+ ttop,
+ strokeColor=track.scale_color,
+ )
+ )
+ else:
+ x_right = self.xlim - self.x0
+ scale_elements.append(
+ Line(
+ self.x0 + x_left,
+ tctr,
+ self.x0 + x_right,
+ tctr,
+ strokeColor=track.scale_color,
+ )
+ )
+ # Y-axis start marker
+ scale_elements.append(
+ Line(
+ self.x0 + x_left,
+ tbtm,
+ self.x0 + x_left,
+ ttop,
+ strokeColor=track.scale_color,
+ )
+ )
+
+ start, end = self._current_track_start_end()
+ if track.scale_ticks: # Ticks are required on the scale
+ # Draw large ticks
+ # I want the ticks to be consistently positioned relative to
+ # the start of the sequence (position 0), not relative to the
+ # current viewpoint (self.start and self.end)
+
+ ticklen = track.scale_largeticks * trackheight
+ tickiterval = int(track.scale_largetick_interval)
+ # Note that we could just start the list of ticks using
+            # range(0, self.end, tickiterval) and then filter out the
+ # ones before self.start - but this seems wasteful.
+ # Using tickiterval * (self.start//tickiterval) is a shortcut.
+ for tickpos in range(
+ tickiterval * (self.start // tickiterval), int(self.end), tickiterval
+ ):
+ if tickpos <= start or end <= tickpos:
+ continue
+ tick, label = self.draw_tick(
+ tickpos, ctr, ticklen, track, track.scale_largetick_labels
+ )
+ scale_elements.append(tick)
+ if label is not None: # If there's a label, add it
+ scale_labels.append(label)
+ # Draw small ticks
+ ticklen = track.scale_smallticks * trackheight
+ tickiterval = int(track.scale_smalltick_interval)
+ for tickpos in range(
+ tickiterval * (self.start // tickiterval), int(self.end), tickiterval
+ ):
+ if tickpos <= start or end <= tickpos:
+ continue
+ tick, label = self.draw_tick(
+ tickpos, ctr, ticklen, track, track.scale_smalltick_labels
+ )
+ scale_elements.append(tick)
+ if label is not None: # If there's a label, add it
+ scale_labels.append(label)
+
+ # Check to see if the track contains a graph - if it does, get the
+ # minimum and maximum values, and put them on the scale Y-axis
+ if track.axis_labels:
+ for set in track.get_sets(): # Check all sets...
+ if set.__class__ is GraphSet: # ...for a graph set
+ graph_label_min = []
+ graph_label_mid = []
+ graph_label_max = []
+ for graph in set.get_graphs():
+ quartiles = graph.quartiles()
+ minval, maxval = quartiles[0], quartiles[4]
+ if graph.center is None:
+ midval = (maxval + minval) / 2.0
+ graph_label_min.append("%.3f" % minval)
+ graph_label_max.append("%.3f" % maxval)
+ else:
+ diff = max((graph.center - minval), (maxval - graph.center))
+ minval = graph.center - diff
+ maxval = graph.center + diff
+ midval = graph.center
+ graph_label_mid.append("%.3f" % midval)
+ graph_label_min.append("%.3f" % minval)
+ graph_label_max.append("%.3f" % maxval)
+ for fragment in range(
+ start_f, end_f + 1
+ ): # Add to all used fragment axes
+ tbtm = btm + self.fragment_lines[fragment][0]
+ tctr = ctr + self.fragment_lines[fragment][0]
+ ttop = top + self.fragment_lines[fragment][0]
+ if fragment == start_f:
+ x_left = start_x
+ else:
+ x_left = 0
+ for val, pos in [
+ (";".join(graph_label_min), tbtm),
+ (";".join(graph_label_max), ttop),
+ (";".join(graph_label_mid), tctr),
+ ]:
+ label = String(
+ 0,
+ 0,
+ val,
+ fontName=track.scale_font,
+ fontSize=track.scale_fontsize,
+ fillColor=track.scale_color,
+ )
+ labelgroup = Group(label)
+ rotation = angle2trig(track.scale_fontangle)
+ labelgroup.transform = (
+ rotation[0],
+ rotation[1],
+ rotation[2],
+ rotation[3],
+ self.x0 + x_left,
+ pos,
+ )
+ scale_labels.append(labelgroup)
+
+ return scale_elements, scale_labels
+
+ def draw_greytrack(self, track):
+ """Draw greytrack.
+
+ Arguments:
+ - track Track object
+
+ Put in a grey background to the current track in all fragments,
+ if track specifies that we should.
+ """
+ greytrack_bgs = [] # Holds grey track backgrounds
+ greytrack_labels = [] # Holds grey foreground labels
+
+ if not track.greytrack: # No greytrack required, return early
+ return [], []
+
+ # Get track location
+ btm, ctr, top = self.track_offsets[self.current_track_level]
+
+ start, end = self._current_track_start_end()
+ start_fragment, start_offset = self.canvas_location(start)
+ end_fragment, end_offset = self.canvas_location(end)
+
+ # Add greytrack to all fragments for this track
+ for fragment in range(start_fragment, end_fragment + 1):
+ tbtm = btm + self.fragment_lines[fragment][0]
+ tctr = ctr + self.fragment_lines[fragment][0]
+ ttop = top + self.fragment_lines[fragment][0]
+ if fragment == start_fragment:
+ x1 = self.x0 + start_offset
+ else:
+ x1 = self.x0
+ if fragment == end_fragment:
+ x2 = self.x0 + end_offset
+ else:
+ x2 = self.xlim
+ box = draw_box(
+ (x1, tbtm), (x2, ttop), colors.Color(0.96, 0.96, 0.96) # Grey track bg
+ ) # is just a box
+ greytrack_bgs.append(box)
+
+ if track.greytrack_labels: # If labels are required
+                # How far apart should the labels be?
+ labelstep = self.pagewidth / track.greytrack_labels
+ label = String(
+ 0,
+ 0,
+ track.name, # label contents
+ fontName=track.greytrack_font,
+ fontSize=track.greytrack_fontsize,
+ fillColor=track.greytrack_fontcolor,
+ )
+ # Create a new labelgroup at each position the label is required
+ for x in range(int(self.x0), int(self.xlim), int(labelstep)):
+ if fragment == start_fragment and x < start_offset:
+ continue
+ if (
+ fragment == end_fragment
+ and end_offset < x + label.getBounds()[2]
+ ):
+ continue
+ labelgroup = Group(label)
+ rotation = angle2trig(track.greytrack_font_rotation)
+ labelgroup.transform = (
+ rotation[0],
+ rotation[1],
+ rotation[2],
+ rotation[3],
+ x,
+ tbtm,
+ )
+                    if self.xlim - x > labelstep:
+ # Don't overlap the end of the track
+ greytrack_labels.append(labelgroup)
+
+ return greytrack_bgs, greytrack_labels
+
+ def draw_feature_set(self, set):
+ """Draw feature set.
+
+ Arguments:
+ - set FeatureSet object
+
+ Returns a tuple (list of elements describing features, list of
+ labels for elements).
+ """
+ # print("draw feature set")
+ feature_elements = [] # Holds diagram elements belonging to the features
+ label_elements = [] # Holds diagram elements belonging to feature labels
+
+ # Collect all the elements for the feature set
+ for feature in set.get_features():
+ if self.is_in_bounds(feature.start) or self.is_in_bounds(feature.end):
+ features, labels = self.draw_feature(feature) # get elements and labels
+ feature_elements += features
+ label_elements += labels
+
+ return feature_elements, label_elements
+
+ def draw_feature(self, feature):
+ """Draw feature.
+
+ Arguments:
+ - feature Feature containing location info
+
+ Returns tuple of (list of elements describing single feature, list
+ of labels for those elements).
+ """
+ if feature.hide: # Feature hidden, don't draw it...
+ return [], []
+
+ feature_elements = [] # Holds diagram elements belonging to the feature
+ label_elements = [] # Holds labels belonging to the feature
+
+ start, end = self._current_track_start_end()
+ # A single feature may be split into subfeatures, so loop over them
+ for locstart, locend in feature.locations:
+ if locend < start:
+ continue
+ locstart = max(locstart, start)
+ if end < locstart:
+ continue
+ locend = min(locend, end)
+ feature_boxes = self.draw_feature_location(feature, locstart, locend)
+ for box, label in feature_boxes:
+ feature_elements.append(box)
+ if label is not None:
+ label_elements.append(label)
+
+ return feature_elements, label_elements
+
+ def draw_feature_location(self, feature, locstart, locend):
+ """Draw feature location."""
+ feature_boxes = []
+ # Get start and end positions for feature/subfeatures
+ start_fragment, start_offset = self.canvas_location(locstart)
+ end_fragment, end_offset = self.canvas_location(locend)
+ # print("start_fragment, start_offset", start_fragment, start_offset)
+ # print("end_fragment, end_offset", end_fragment, end_offset)
+ # print("start, end", locstart, locend)
+
+ # Note that there is a strange situation where a feature may be in
+ # several parts, and one or more of those parts may end up being
+ # drawn on a non-existent fragment. So we check that the start and
+ # end fragments do actually exist in terms of the drawing
+ allowed_fragments = list(self.fragment_limits.keys())
+ if start_fragment in allowed_fragments and end_fragment in allowed_fragments:
+ # print(feature.name, feature.start, feature.end, start_offset, end_offset)
+ if start_fragment == end_fragment: # Feature is found on one fragment
+ feature_box, label = self.get_feature_sigil(
+ feature, start_offset, end_offset, start_fragment
+ )
+ feature_boxes.append((feature_box, label))
+ # feature_elements.append(feature_box)
+ # if label is not None: # There is a label for the feature
+ # label_elements.append(label)
+ else: # Feature is split over two or more fragments
+ fragment = start_fragment
+ start = start_offset
+ # The bit that runs up to the end of the first fragment,
+ # and any bits that subsequently span whole fragments
+ while self.fragment_limits[fragment][1] < locend:
+ # print(fragment, self.fragment_limits[fragment][1], locend)
+ feature_box, label = self.get_feature_sigil(
+ feature, start, self.pagewidth, fragment
+ )
+
+ fragment += 1 # move to next fragment
+ start = 0 # start next sigil from start of fragment
+ feature_boxes.append((feature_box, label))
+ # feature_elements.append(feature_box)
+ # if label is not None: # There's a label for the feature
+ # label_elements.append(label)
+ # The last bit of the feature
+ # print(locend, self.end, fragment)
+ # print(self.fragment_bases, self.length)
+ feature_box, label = self.get_feature_sigil(
+ feature, 0, end_offset, fragment
+ )
+ feature_boxes.append((feature_box, label))
+ # if locstart > locend:
+ # print(locstart, locend, feature.strand, feature_boxes, feature.name)
+ return feature_boxes
+
+ def draw_cross_link(self, cross_link):
+ """Draw cross-link between two features."""
+ startA = cross_link.startA
+ startB = cross_link.startB
+ endA = cross_link.endA
+ endB = cross_link.endB
+
+ if not self.is_in_bounds(startA) and not self.is_in_bounds(endA):
+ return None
+ if not self.is_in_bounds(startB) and not self.is_in_bounds(endB):
+ return None
+
+ if startA < self.start:
+ startA = self.start
+ if startB < self.start:
+ startB = self.start
+ if self.end < endA:
+ endA = self.end
+ if self.end < endB:
+ endB = self.end
+
+ trackobjA = cross_link._trackA(list(self._parent.tracks.values()))
+ trackobjB = cross_link._trackB(list(self._parent.tracks.values()))
+ assert trackobjA is not None
+ assert trackobjB is not None
+ if trackobjA == trackobjB:
+ raise NotImplementedError()
+
+ if trackobjA.start is not None:
+ if endA < trackobjA.start:
+ return
+ startA = max(startA, trackobjA.start)
+ if trackobjA.end is not None:
+ if trackobjA.end < startA:
+ return
+ endA = min(endA, trackobjA.end)
+ if trackobjB.start is not None:
+ if endB < trackobjB.start:
+ return
+ startB = max(startB, trackobjB.start)
+ if trackobjB.end is not None:
+ if trackobjB.end < startB:
+ return
+ endB = min(endB, trackobjB.end)
+
+ for track_level in self._parent.get_drawn_levels():
+ track = self._parent[track_level]
+ if track == trackobjA:
+ trackA = track_level
+ if track == trackobjB:
+ trackB = track_level
+ if trackA == trackB:
+ raise NotImplementedError()
+
+ strokecolor, fillcolor = _stroke_and_fill_colors(
+ cross_link.color, cross_link.border
+ )
+
+ allowed_fragments = list(self.fragment_limits.keys())
+
+ start_fragmentA, start_offsetA = self.canvas_location(startA)
+ end_fragmentA, end_offsetA = self.canvas_location(endA)
+ if (
+ start_fragmentA not in allowed_fragments
+ or end_fragmentA not in allowed_fragments
+ ):
+ return
+
+ start_fragmentB, start_offsetB = self.canvas_location(startB)
+ end_fragmentB, end_offsetB = self.canvas_location(endB)
+ if (
+ start_fragmentB not in allowed_fragments
+ or end_fragmentB not in allowed_fragments
+ ):
+ return
+
+ # TODO - Better drawing of flips when split between fragments
+
+ answer = []
+ for fragment in range(
+ min(start_fragmentA, start_fragmentB), max(end_fragmentA, end_fragmentB) + 1
+ ):
+ btmA, ctrA, topA = self.track_offsets[trackA]
+ btmA += self.fragment_lines[fragment][0]
+ ctrA += self.fragment_lines[fragment][0]
+ topA += self.fragment_lines[fragment][0]
+
+ btmB, ctrB, topB = self.track_offsets[trackB]
+ btmB += self.fragment_lines[fragment][0]
+ ctrB += self.fragment_lines[fragment][0]
+ topB += self.fragment_lines[fragment][0]
+
+ if self.fragment_limits[fragment][1] < endA:
+ xAe = self.x0 + self.pagewidth
+ crop_rightA = True
+ else:
+ xAe = self.x0 + end_offsetA
+ crop_rightA = False
+ if self.fragment_limits[fragment][1] < endB:
+ xBe = self.x0 + self.pagewidth
+ crop_rightB = True
+ else:
+ xBe = self.x0 + end_offsetB
+ crop_rightB = False
+
+ if fragment < start_fragmentA:
+ xAs = self.x0 + self.pagewidth
+ xAe = xAs
+ crop_leftA = False
+ elif fragment == start_fragmentA:
+ xAs = self.x0 + start_offsetA
+ crop_leftA = False
+ else:
+ xAs = self.x0
+ crop_leftA = True
+
+ if fragment < start_fragmentB:
+ xBs = self.x0 + self.pagewidth
+ xBe = xBs
+ crop_leftB = False
+ elif fragment == start_fragmentB:
+ xBs = self.x0 + start_offsetB
+ crop_leftB = False
+ else:
+ xBs = self.x0
+ crop_leftB = True
+
+ if ctrA < ctrB:
+ yA = topA
+ yB = btmB
+ else:
+ yA = btmA
+ yB = topB
+
+ if fragment < start_fragmentB or end_fragmentB < fragment:
+ if cross_link.flip:
+ # Just draw A as a triangle to left/right
+ if fragment < start_fragmentB:
+ extra = [self.x0 + self.pagewidth, 0.5 * (yA + yB)]
+ else:
+ extra = [self.x0, 0.5 * (yA + yB)]
+ else:
+ if fragment < start_fragmentB:
+ extra = [
+ self.x0 + self.pagewidth,
+ 0.7 * yA + 0.3 * yB,
+ self.x0 + self.pagewidth,
+ 0.3 * yA + 0.7 * yB,
+ ]
+ else:
+ extra = [
+ self.x0,
+ 0.3 * yA + 0.7 * yB,
+ self.x0,
+ 0.7 * yA + 0.3 * yB,
+ ]
+ answer.append(
+ Polygon(
+ deduplicate([xAs, yA, xAe, yA] + extra),
+ strokeColor=strokecolor,
+ fillColor=fillcolor,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+                    strokeWidth=0,
+ )
+ )
+ elif fragment < start_fragmentA or end_fragmentA < fragment:
+ if cross_link.flip:
+ # Just draw B as a triangle to left
+ if fragment < start_fragmentA:
+ extra = [self.x0 + self.pagewidth, 0.5 * (yA + yB)]
+ else:
+ extra = [self.x0, 0.5 * (yA + yB)]
+ else:
+ if fragment < start_fragmentA:
+ extra = [
+ self.x0 + self.pagewidth,
+ 0.3 * yA + 0.7 * yB,
+ self.x0 + self.pagewidth,
+ 0.7 * yA + 0.3 * yB,
+ ]
+ else:
+ extra = [
+ self.x0,
+ 0.7 * yA + 0.3 * yB,
+ self.x0,
+ 0.3 * yA + 0.7 * yB,
+ ]
+ answer.append(
+ Polygon(
+ deduplicate([xBs, yB, xBe, yB] + extra),
+ strokeColor=strokecolor,
+ fillColor=fillcolor,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+                    strokeWidth=0,
+ )
+ )
+ elif cross_link.flip and (
+ (crop_leftA and not crop_rightA) or (crop_leftB and not crop_rightB)
+ ):
+ # On left end of fragment... force "crossing" to margin
+ answer.append(
+ Polygon(
+ deduplicate(
+ [
+ xAs,
+ yA,
+ xAe,
+ yA,
+ self.x0,
+ 0.5 * (yA + yB),
+ xBe,
+ yB,
+ xBs,
+ yB,
+ ]
+ ),
+ strokeColor=strokecolor,
+ fillColor=fillcolor,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+                    strokeWidth=0,
+ )
+ )
+ elif cross_link.flip and (
+ (crop_rightA and not crop_leftA) or (crop_rightB and not crop_leftB)
+ ):
+ # On right end... force "crossing" to margin
+ answer.append(
+ Polygon(
+ deduplicate(
+ [
+ xAs,
+ yA,
+ xAe,
+ yA,
+ xBe,
+ yB,
+ xBs,
+ yB,
+ self.x0 + self.pagewidth,
+ 0.5 * (yA + yB),
+ ]
+ ),
+ strokeColor=strokecolor,
+ fillColor=fillcolor,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+                    strokeWidth=0,
+ )
+ )
+ elif cross_link.flip:
+ answer.append(
+ Polygon(
+ deduplicate([xAs, yA, xAe, yA, xBs, yB, xBe, yB]),
+ strokeColor=strokecolor,
+ fillColor=fillcolor,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+                    strokeWidth=0,
+ )
+ )
+ else:
+ answer.append(
+ Polygon(
+ deduplicate([xAs, yA, xAe, yA, xBe, yB, xBs, yB]),
+ strokeColor=strokecolor,
+ fillColor=fillcolor,
+ # default is mitre/miter which can stick out too much:
+ strokeLineJoin=1, # 1=round
+                    strokeWidth=0,
+ )
+ )
+ return answer
+
+ def get_feature_sigil(self, feature, x0, x1, fragment, **kwargs):
+ """Get feature sigil.
+
+ Arguments:
+ - feature Feature object
+ - x0 Start X co-ordinate on diagram
+ - x1 End X co-ordinate on diagram
+ - fragment The fragment on which the feature appears
+
+ Returns a drawable indicator of the feature, and any required label
+ for it.
+ """
+ # Establish co-ordinates for drawing
+ x0, x1 = self.x0 + x0, self.x0 + x1
+ btm, ctr, top = self.track_offsets[self.current_track_level]
+ try:
+ btm += self.fragment_lines[fragment][0]
+ ctr += self.fragment_lines[fragment][0]
+ top += self.fragment_lines[fragment][0]
+ except Exception: # Only called if the method screws up big time
+ print("We've got a screw-up")
+ print("%s %s" % (self.start, self.end))
+ print(self.fragment_bases)
+ print("%r %r" % (x0, x1))
+ for locstart, locend in feature.locations:
+ print(self.canvas_location(locstart))
+ print(self.canvas_location(locend))
+ print("FEATURE\n%s" % feature)
+ raise
+
+ # Distribution dictionary for various ways of drawing the feature
+ draw_methods = {
+ "BOX": self._draw_sigil_box,
+ "ARROW": self._draw_sigil_arrow,
+ "BIGARROW": self._draw_sigil_big_arrow,
+ "OCTO": self._draw_sigil_octo,
+ "JAGGY": self._draw_sigil_jaggy,
+ }
+
+ method = draw_methods[feature.sigil]
+ kwargs["head_length_ratio"] = feature.arrowhead_length
+ kwargs["shaft_height_ratio"] = feature.arrowshaft_height
+
+ # Support for clickable links... needs ReportLab 2.4 or later
+ # which added support for links in SVG output.
+ if hasattr(feature, "url"):
+ kwargs["hrefURL"] = feature.url
+ kwargs["hrefTitle"] = feature.name
+
+ # Get sigil for the feature, give it the bounding box straddling
+ # the axis (it decides strand specific placement)
+ sigil = method(
+ btm,
+ ctr,
+ top,
+ x0,
+ x1,
+ strand=feature.strand,
+ color=feature.color,
+ border=feature.border,
+ **kwargs
+ )
+
+ if feature.label_strand:
+ strand = feature.label_strand
+ else:
+ strand = feature.strand
+ if feature.label: # Feature requires a label
+ label = String(
+ 0,
+ 0,
+ feature.name,
+ fontName=feature.label_font,
+ fontSize=feature.label_size,
+ fillColor=feature.label_color,
+ )
+ labelgroup = Group(label)
+ # Feature is on top, or covers both strands (location affects
+ # the height and rotation of the label)
+ if strand != -1:
+ rotation = angle2trig(feature.label_angle)
+ if feature.label_position in ("end", "3'", "right"):
+ pos = x1
+ elif feature.label_position in ("middle", "center", "centre"):
+ pos = (x1 + x0) / 2.0
+ else:
+ # Default to start, i.e. 'start', "5'", 'left'
+ pos = x0
+ labelgroup.transform = (
+ rotation[0],
+ rotation[1],
+ rotation[2],
+ rotation[3],
+ pos,
+ top,
+ )
+ else: # Feature on bottom strand
+ rotation = angle2trig(feature.label_angle + 180)
+ if feature.label_position in ("end", "3'", "right"):
+ pos = x0
+ elif feature.label_position in ("middle", "center", "centre"):
+ pos = (x1 + x0) / 2.0
+ else:
+ # Default to start, i.e. 'start', "5'", 'left'
+ pos = x1
+ labelgroup.transform = (
+ rotation[0],
+ rotation[1],
+ rotation[2],
+ rotation[3],
+ pos,
+ btm,
+ )
+ else:
+ labelgroup = None
+ return sigil, labelgroup
+
+ def draw_graph_set(self, set):
+ """Draw graph set.
+
+ Arguments:
+ - set GraphSet object
+
+ Returns tuple (list of graph elements, list of graph labels).
+ """
+ # print('draw graph set')
+ elements = [] # Holds graph elements
+
+ # Distribution dictionary for how to draw the graph
+ style_methods = {
+ "line": self.draw_line_graph,
+ "heat": self.draw_heat_graph,
+ "bar": self.draw_bar_graph,
+ }
+
+ for graph in set.get_graphs():
+ elements += style_methods[graph.style](graph)
+
+ return elements, []
+
+ def draw_line_graph(self, graph):
+ """Return a line graph as a list of drawable elements.
+
+ Arguments:
+ - graph Graph object
+
+ """
+ # print('\tdraw_line_graph')
+ line_elements = [] # Holds drawable elements
+
+ # Get graph data
+ data_quartiles = graph.quartiles()
+ minval, maxval = data_quartiles[0], data_quartiles[4]
+ btm, ctr, top = self.track_offsets[self.current_track_level]
+ trackheight = 0.5 * (top - btm)
+ datarange = maxval - minval
+ if datarange == 0:
+ datarange = trackheight
+
+ start, end = self._current_track_start_end()
+        data = graph[start:end]
+        if not data:  # Guard against an empty slice (mirrors draw_heat_graph)
+            return []
+
+ # midval is the value at which the x-axis is plotted, and is the
+ # central ring in the track
+ if graph.center is None:
+ midval = (maxval + minval) / 2.0
+ else:
+ midval = graph.center
+ # Whichever is the greatest difference: max-midval or min-midval, is
+ # taken to specify the number of pixel units resolved along the
+ # y-axis
+ resolution = max((midval - minval), (maxval - midval))
+
+ # Start from first data point
+ pos, val = data[0]
+ lastfrag, lastx = self.canvas_location(pos)
+ lastx += self.x0 # Start xy co-ords
+ lasty = (
+ trackheight * (val - midval) / resolution
+ + self.fragment_lines[lastfrag][0]
+ + ctr
+ )
+ lastval = val
+ # Add a series of lines linking consecutive data points
+ for pos, val in data:
+ frag, x = self.canvas_location(pos)
+ x += self.x0 # next xy co-ords
+ y = (
+ trackheight * (val - midval) / resolution
+ + self.fragment_lines[frag][0]
+ + ctr
+ )
+ if frag == lastfrag: # Points on the same fragment: draw the line
+ line_elements.append(
+ Line(
+ lastx,
+ lasty,
+ x,
+ y,
+ strokeColor=graph.poscolor,
+ strokeWidth=graph.linewidth,
+ )
+ )
+ else: # Points not on the same fragment, so interpolate
+ tempy = (
+ trackheight * (val - midval) / resolution
+ + self.fragment_lines[lastfrag][0]
+ + ctr
+ )
+ line_elements.append(
+ Line(
+ lastx,
+ lasty,
+ self.xlim,
+ tempy,
+ strokeColor=graph.poscolor,
+ strokeWidth=graph.linewidth,
+ )
+ )
+ tempy = (
+ trackheight * (val - midval) / resolution
+ + self.fragment_lines[frag][0]
+ + ctr
+ )
+ line_elements.append(
+ Line(
+ self.x0,
+ tempy,
+ x,
+ y,
+ strokeColor=graph.poscolor,
+ strokeWidth=graph.linewidth,
+ )
+ )
+ lastfrag, lastx, lasty, lastval = frag, x, y, val
+
+ return line_elements
+
+ def draw_heat_graph(self, graph):
+ """Return a list of drawable elements for the heat graph."""
+ # print('\tdraw_heat_graph')
+ # At each point contained in the graph data, we draw a box that is the
+ # full height of the track, extending from the midpoint between the
+ # previous and current data points to the midpoint between the current
+ # and next data points
+ heat_elements = [] # Holds drawable elements for the graph
+
+ # Get graph data and information
+ data_quartiles = graph.quartiles()
+ minval, maxval = data_quartiles[0], data_quartiles[4]
+ midval = (maxval + minval) / 2.0 # mid is the value at the X-axis
+ btm, ctr, top = self.track_offsets[self.current_track_level]
+ trackheight = top - btm
+
+ start, end = self._current_track_start_end()
+ data = intermediate_points(start, end, graph[start:end])
+
+ if not data:
+ return []
+
+ # Create elements on the graph, indicating a large positive value by
+ # the graph's poscolor, and a large negative value by the graph's
+ # negcolor attributes
+ for pos0, pos1, val in data:
+ # assert start <= pos0 <= pos1 <= end
+ fragment0, x0 = self.canvas_location(pos0)
+ fragment1, x1 = self.canvas_location(pos1)
+ x0, x1 = self.x0 + x0, self.x0 + x1 # account for margin
+ # print('x1 before:', x1)
+
+ # Calculate the heat color, based on the differential between
+ # the value and the median value
+ heat = colors.linearlyInterpolatedColor(
+ graph.poscolor, graph.negcolor, maxval, minval, val
+ )
+
+ # Draw heat box
+ if fragment0 == fragment1: # Box is contiguous on one fragment
+ if pos1 >= self.fragment_limits[fragment0][1]:
+ x1 = self.xlim
+ ttop = top + self.fragment_lines[fragment0][0]
+ tbtm = btm + self.fragment_lines[fragment0][0]
+ # print('equal', pos0, pos1, val)
+ # print(pos0, pos1, fragment0, fragment1)
+ heat_elements.append(
+ draw_box((x0, tbtm), (x1, ttop), color=heat, border=None)
+ )
+ else: # box is split over two or more fragments
+ # if pos0 >= self.fragment_limits[fragment0][0]:
+ # fragment0 += 1
+ fragment = fragment0
+ start_x = x0
+ while self.fragment_limits[fragment][1] <= pos1:
+ # print(pos0, self.fragment_limits[fragment][1], pos1)
+ ttop = top + self.fragment_lines[fragment][0]
+ tbtm = btm + self.fragment_lines[fragment][0]
+ heat_elements.append(
+ draw_box(
+ (start_x, tbtm), (self.xlim, ttop), color=heat, border=None
+ )
+ )
+ fragment += 1
+ start_x = self.x0
+ ttop = top + self.fragment_lines[fragment][0]
+ tbtm = btm + self.fragment_lines[fragment][0]
+ # Add the last part of the bar
+ # print('x1 after:', x1, '\n')
+ heat_elements.append(
+ draw_box((self.x0, tbtm), (x1, ttop), color=heat, border=None)
+ )
+
+ return heat_elements
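+
+    # Worked example (illustrative): intermediate_points(0, 300,
+    # [(0, 1.0), (100, 2.0), (300, 3.0)]) bins each value over half the
+    # distance to its neighbours, giving [(0, 50, 1.0), (50, 200, 2.0),
+    # (200, 300, 3.0)], and each bin becomes one full-height heat box.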
+
+ def draw_bar_graph(self, graph):
+ """Return list of drawable elements for a bar graph."""
+ # print('\tdraw_bar_graph')
+ # At each point contained in the graph data, we draw a vertical bar
+ # from the track center to the height of the datapoint value (positive
+ # values go up in one color, negative go down in the alternative
+ # color).
+ bar_elements = [] # Holds drawable elements for the graph
+
+ # Set the number of pixels per unit for the data
+ data_quartiles = graph.quartiles()
+ minval, maxval = data_quartiles[0], data_quartiles[4]
+ btm, ctr, top = self.track_offsets[self.current_track_level]
+ trackheight = 0.5 * (top - btm)
+ datarange = maxval - minval
+ if datarange == 0:
+ datarange = trackheight
+ data = graph[self.start : self.end]
+        # midval is the value at which the x-axis is plotted, and sits at
+        # the vertical centre of the track
+ if graph.center is None:
+ midval = (maxval + minval) / 2.0
+ else:
+ midval = graph.center
+
+ # Convert data into 'binned' blocks, covering half the distance to the
+ # next data point on either side, accounting for the ends of fragments
+ # and tracks
+ start, end = self._current_track_start_end()
+ data = intermediate_points(start, end, graph[start:end])
+
+ if not data:
+ return []
+
+        # Whichever difference is greater, (maxval - midval) or
+        # (midval - minval), determines the data range that is mapped onto
+        # the half-track height along the y-axis
+ resolution = max((midval - minval), (maxval - midval))
+ if resolution == 0:
+ resolution = trackheight
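+        # e.g. with minval=0, maxval=10 and the default midval of 5
+        # (illustrative values), resolution is 5, so val=10 reaches the full
+        # half-track height above the axis and val=0 the full half-track
+        # height below it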
+
+ # Create elements for the bar graph based on newdata
+ for pos0, pos1, val in data:
+ fragment0, x0 = self.canvas_location(pos0)
+ fragment1, x1 = self.canvas_location(pos1)
+ x0, x1 = self.x0 + x0, self.x0 + x1 # account for margin
+ barval = trackheight * (val - midval) / resolution
+ if barval >= 0: # Different colors for bars that extend above...
+ barcolor = graph.poscolor
+ else: # ...or below the axis
+ barcolor = graph.negcolor
+
+ # Draw bar
+ if fragment0 == fragment1: # Box is contiguous
+ if pos1 >= self.fragment_limits[fragment0][1]:
+ x1 = self.xlim
+ tctr = ctr + self.fragment_lines[fragment0][0]
+ barval += tctr
+ bar_elements.append(draw_box((x0, tctr), (x1, barval), color=barcolor))
+ else: # Box is split over two or more fragments
+ fragment = fragment0
+ # if pos0 >= self.fragment_limits[fragment0][0]:
+ # fragment += 1
+ start = x0
+ while self.fragment_limits[fragment][1] < pos1:
+ tctr = ctr + self.fragment_lines[fragment][0]
+ thisbarval = barval + tctr
+ bar_elements.append(
+ draw_box((start, tctr), (self.xlim, thisbarval), color=barcolor)
+ )
+ fragment += 1
+ start = self.x0
+ tctr = ctr + self.fragment_lines[fragment1][0]
+ barval += tctr
+ # Add the last part of the bar
+ bar_elements.append(
+ draw_box((self.x0, tctr), (x1, barval), color=barcolor)
+ )
+
+ return bar_elements
+
+ def canvas_location(self, base):
+ """Canvas location of a base on the genome.
+
+ Arguments:
+ - base The base number on the genome sequence
+
+        Returns the fragment number and x-coordinate of a base on the
+        genome sequence, in the context of the current drawing setup
+ """
+ base = int(base - self.start) # number of bases we are from the start
+ fragment = int(base / self.fragment_bases)
+ if fragment < 1: # First fragment
+ base_offset = base
+ fragment = 0
+ elif fragment >= self.fragments:
+ fragment = self.fragments - 1
+ base_offset = self.fragment_bases
+ else: # Calculate number of bases from start of fragment
+ base_offset = base % self.fragment_bases
+ assert fragment < self.fragments, (
+ base,
+ self.start,
+ self.end,
+ self.length,
+ self.fragment_bases,
+ )
+ # Calculate number of pixels from start of fragment
+ x_offset = 1.0 * self.pagewidth * base_offset / self.fragment_bases
+ return fragment, x_offset
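+
+    # Example (illustrative values): with start=0, fragment_bases=10000 and
+    # pagewidth=600, canvas_location(12500) returns (1, 150.0): base 12500
+    # falls on the second fragment, a quarter of the way across the page.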
+
+ def _draw_sigil_box(self, bottom, center, top, x1, x2, strand, **kwargs):
+ """Draw BOX sigil (PRIVATE)."""
+ if strand == 1:
+ y1 = center
+ y2 = top
+ elif strand == -1:
+ y1 = bottom
+ y2 = center
+ else:
+ y1 = bottom
+ y2 = top
+ return draw_box((x1, y1), (x2, y2), **kwargs)
+
+ def _draw_sigil_octo(self, bottom, center, top, x1, x2, strand, **kwargs):
+ """Draw OCTO sigil, a box with the corners cut off (PRIVATE)."""
+ if strand == 1:
+ y1 = center
+ y2 = top
+ elif strand == -1:
+ y1 = bottom
+ y2 = center
+ else:
+ y1 = bottom
+ y2 = top
+ return draw_cut_corner_box((x1, y1), (x2, y2), **kwargs)
+
+ def _draw_sigil_jaggy(
+ self, bottom, center, top, x1, x2, strand, color, border=None, **kwargs
+ ):
+ """Draw JAGGY sigil (PRIVATE).
+
+ Although we may in future expose the head/tail jaggy lengths, for now
+ both the left and right edges are drawn jagged.
+ """
+ if strand == 1:
+ y1 = center
+ y2 = top
+ teeth = 2
+ elif strand == -1:
+ y1 = bottom
+ y2 = center
+ teeth = 2
+ else:
+ y1 = bottom
+ y2 = top
+ teeth = 4
+
+ xmin = min(x1, x2)
+ xmax = max(x1, x2)
+ height = y2 - y1
+ boxwidth = x2 - x1
+ tooth_length = min(height / teeth, boxwidth * 0.5)
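+        # Cap the tooth depth at half the feature's width so that the jagged
+        # left and right edges can never overlap on short features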
+
+ headlength = tooth_length
+ taillength = tooth_length
+
+ strokecolor, color = _stroke_and_fill_colors(color, border)
+
+ points = []
+ for i in range(teeth):
+ points.extend(
+ (
+ xmin,
+ y1 + i * height / teeth,
+ xmin + taillength,
+ y1 + (i + 1) * height / teeth,
+ )
+ )
+ for i in range(teeth):
+ points.extend(
+ (
+ xmax,
+ y1 + (teeth - i) * height / teeth,
+ xmax - headlength,
+ y1 + (teeth - i - 1) * height / teeth,
+ )
+ )
+
+ return Polygon(
+ deduplicate(points),
+ strokeColor=strokecolor,
+ strokeWidth=1,
+ strokeLineJoin=1, # 1=round
+ fillColor=color,
+ **kwargs
+ )
+
+ def _draw_sigil_arrow(self, bottom, center, top, x1, x2, strand, **kwargs):
+ """Draw ARROW sigil (PRIVATE)."""
+ if strand == 1:
+ y1 = center
+ y2 = top
+ orientation = "right"
+ elif strand == -1:
+ y1 = bottom
+ y2 = center
+ orientation = "left"
+ else:
+ y1 = bottom
+ y2 = top
+ orientation = "right" # backward compatibility
+ return draw_arrow((x1, y1), (x2, y2), orientation=orientation, **kwargs)
+
+ def _draw_sigil_big_arrow(self, bottom, center, top, x1, x2, strand, **kwargs):
+ """Draw BIGARROW sigil, like ARROW but straddles the axis (PRIVATE)."""
+ if strand == -1:
+ orientation = "left"
+ else:
+ orientation = "right"
+ return draw_arrow((x1, bottom), (x2, top), orientation=orientation, **kwargs)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/_Track.py b/code/lib/Bio/Graphics/GenomeDiagram/_Track.py
new file mode 100644
index 0000000..a6c67f9
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/_Track.py
@@ -0,0 +1,285 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+################################################################################
+
+"""Track module.
+
+Provides:
+ - Track - Container for a single track on the diagram, containing
+ FeatureSet and GraphSet objects
+
+For drawing capabilities, this module uses reportlab to draw and write
+the diagram: http://www.reportlab.com
+"""
+
+
+from reportlab.lib import colors
+
+# GenomeDiagram imports
+from ._FeatureSet import FeatureSet
+from ._GraphSet import GraphSet
+
+_grey = colors.Color(0.6, 0.6, 0.6)
+
+
+class Track:
+ """Track.
+
+ Attributes:
+     - height      Int describing the relative height to other tracks
+       in the diagram
+ - name String describing the track
+ - hide Boolean, 0 if the track is not to be drawn
+ - start, end Integers (or None) specifying start/end to draw just
+ a partial track.
+ - greytrack Boolean, 1 if a grey background to the track is to be
+ drawn
+ - greytrack_labels Int describing how many track-identifying labels
+ should be placed on the track at regular intervals
+ - greytrack_font String describing the font to use for the greytrack
+ labels
+ - greytrack_fontsize Int describing the font size to display the
+ labels on the grey track
+ - greytrack_font_rotation Int describing the angle through which to
+ rotate the grey track labels (Linear only)
+ - greytrack_font_color colors.Color describing the color to draw
+ the grey track labels
+ - scale Boolean, 1 if a scale is to be drawn on the track
+ - scale_format String, defaults to None, when scale values are written
+ as numerals. Setting this to 'SInt' invokes SI
+ unit-like multiples, such as Mbp, Kbp and so on.
+ - scale_color colors.Color to draw the elements of the scale
+ - scale_font String describing the font to use for the scale labels
+ - scale_fontsize Int describing the size of the scale label font
+ - scale_fontangle Int describing the angle at which to draw the scale
+ labels (linear only)
+ - scale_ticks Boolean, 1 if ticks should be drawn at all on the
+ scale
+ - scale_largeticks Float (0->1) describing the height of large
+ scale ticks relative to the track height.
+     - scale_smallticks   Float (0->1) describing the height of small
+       scale ticks relative to the track height.
+ - scale_largetick_interval Int, describing the number of bases that
+ should separate large ticks
+ - scale_smalltick_interval Int, describing the number of bases that
+ should separate small ticks
+ - scale_largetick_labels Boolean describing whether position labels
+ should be written over large ticks
+ - scale_smalltick_labels Boolean describing whether position labels
+ should be written over small ticks
+ - axis_labels Boolean describing whether the value labels should
+ be placed on the Y axes
+
+ """
+
+ def __init__(
+ self,
+ name=None,
+ height=1,
+ hide=0,
+ greytrack=0,
+ greytrack_labels=5,
+ greytrack_fontsize=8,
+ greytrack_font="Helvetica",
+ greytrack_font_rotation=0,
+ greytrack_font_color=_grey,
+ scale=1,
+ scale_format=None,
+ scale_color=colors.black,
+ scale_font="Helvetica",
+ scale_fontsize=6,
+ scale_fontangle=45,
+ scale_largeticks=0.5,
+ scale_ticks=1,
+ scale_smallticks=0.3,
+ scale_largetick_interval=1e6,
+ scale_smalltick_interval=1e4,
+ scale_largetick_labels=1,
+ scale_smalltick_labels=0,
+ axis_labels=1,
+ start=None,
+ end=None,
+ greytrack_font_colour=None,
+ scale_colour=None,
+ ):
+ """Initialize.
+
+ Arguments:
+ - height Int describing the relative height to other tracks in the
+ diagram
+ - name String describing the track
+ - hide Boolean, 0 if the track is not to be drawn
+ - greytrack Boolean, 1 if a grey background to the track is to be
+ drawn
+ - greytrack_labels Int describing how many track-identifying labels
+ should be placed on the track at regular intervals
+ - greytrack_font String describing the font to use for the greytrack
+ labels
+ - greytrack_fontsize Int describing the font size to display the
+ labels on the grey track
+ - greytrack_font_rotation Int describing the angle through which to
+ rotate the grey track labels (Linear only)
+ - greytrack_font_color colors.Color describing the color to draw
+ the grey track labels (overridden by backwards compatible argument
+ with UK spelling, colour).
+ - scale Boolean, 1 if a scale is to be drawn on the track
+ - scale_color colors.Color to draw the elements of the scale
+ (overridden by backwards compatible argument with UK
+ spelling, colour).
+ - scale_font String describing the font to use for the scale labels
+ - scale_fontsize Int describing the size of the scale label font
+ - scale_fontangle Int describing the angle at which to draw the scale
+ labels (linear only)
+ - scale_ticks Boolean, 1 if ticks should be drawn at all on the
+ scale
+ - scale_largeticks Float (0->1) describing the height of large
+ scale ticks relative to the track height.
+        - scale_smallticks   Float (0->1) describing the height of small
+          scale ticks relative to the track height.
+ - scale_largetick_interval Int, describing the number of bases that
+ should separate large ticks
+ - scale_smalltick_interval Int, describing the number of bases that
+ should separate small ticks
+ - scale_largetick_labels Boolean describing whether position labels
+ should be written over large ticks
+ - scale_smalltick_labels Boolean describing whether position labels
+ should be written over small ticks
+ - axis_labels Boolean describing whether the value labels should
+ be placed on the Y axes
+
+ """
+ # Let the UK spelling (colour) override the USA spelling (color)
+ if greytrack_font_colour is not None:
+ greytrack_font_color = greytrack_font_colour
+ if scale_colour is not None:
+ scale_color = scale_colour
+
+ self._next_id = 0 # This will count sets as they are added to the track
+ self._sets = {} # Holds sets, keyed by unique ID
+
+ # Assign attribute values from instantiation
+ self.height = height
+ if name is not None:
+ self.name = str(name)
+ else:
+ self.name = "Track"
+ self.hide = hide
+ self.start = start
+ self.end = end
+
+ # Attributes for the grey track background and labels
+ self.greytrack = greytrack
+ self.greytrack_labels = greytrack_labels
+ self.greytrack_fontsize = greytrack_fontsize
+ self.greytrack_font = greytrack_font
+ self.greytrack_font_rotation = greytrack_font_rotation
+ self.greytrack_fontcolor = greytrack_font_color
+
+ # Attributes for the track scale
+ self.scale = scale
+ self.scale_format = scale_format
+ self.scale_color = scale_color
+ self.scale_font = scale_font
+ self.scale_fontsize = scale_fontsize
+ self.scale_fontangle = scale_fontangle
+ self.scale_ticks = scale_ticks
+ self.scale_largeticks = scale_largeticks
+ self.scale_smallticks = scale_smallticks
+ self.scale_largetick_interval = scale_largetick_interval
+ self.scale_smalltick_interval = scale_smalltick_interval
+ self.scale_largetick_labels = scale_largetick_labels
+ self.scale_smalltick_labels = scale_smalltick_labels
+ self.axis_labels = axis_labels
+
+ def add_set(self, set):
+ """Add a preexisting FeatureSet or GraphSet object to the track."""
+ set.id = self._next_id # Assign unique id to set
+ set.parent = self # Make set's parent this track
+ self._sets[self._next_id] = set # Add set, keyed by unique id
+ self._next_id += 1 # Increment unique set ids
+
+ def new_set(self, type="feature", **args):
+ """Create a new FeatureSet or GraphSet object.
+
+ Create a new FeatureSet or GraphSet object, add it to the
+ track, and return for user manipulation
+ """
+ type_dict = {"feature": FeatureSet, "graph": GraphSet}
+ set = type_dict[type]()
+ for key in args:
+ setattr(set, key, args[key])
+ set.id = self._next_id # Assign unique id to set
+ set.parent = self # Make set's parent this track
+ self._sets[self._next_id] = set # Add set, keyed by unique id
+ self._next_id += 1 # Increment unique set ids
+ return set
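+
+    # Example (sketch): typical use when populating a track:
+    #
+    #     track = Track(name="Annotated Features", greytrack=1)
+    #     feature_set = track.new_set("feature", name="CDS")
+    #     graph_set = track.new_set("graph", name="GC content")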
+
+ def del_set(self, set_id):
+ """Remove the set with the passed id from the track."""
+ del self._sets[set_id]
+
+ def get_sets(self):
+ """Return the sets contained in this track."""
+ return list(self._sets.values())
+
+ def get_ids(self):
+ """Return the ids of all sets contained in this track."""
+ return list(self._sets.keys())
+
+ def range(self):
+ """Return the lowest and highest base (or mark) numbers as a tuple."""
+ lows, highs = [], [] # Holds set of low and high values from sets
+ if self.start is not None:
+ lows.append(self.start)
+ if self.end is not None:
+ highs.append(self.end)
+ for set in self._sets.values():
+ low, high = set.range() # Get each set range
+ lows.append(low)
+ highs.append(high)
+ if lows:
+ low = min(lows)
+ else:
+ low = None
+ if highs:
+ high = max(highs)
+ else:
+ high = None
+ return low, high # Return lowest and highest values
+
+ def to_string(self, verbose=0):
+ """Return a formatted string with information about the track.
+
+ Arguments:
+ - verbose - Boolean indicating whether a short or complete
+ account of the track is required
+
+ """
+ if not verbose: # Return the short description
+ return "%s" % self # Use __str__ method instead
+ else: # Return the long description
+ outstr = ["\n<%s: %s>" % (self.__class__, self.name)]
+ outstr.append("%d sets" % len(self._sets))
+ for key in self._sets:
+ outstr.append("set: %s" % self._sets[key])
+ return "\n".join(outstr)
+
+ def __getitem__(self, key):
+ """Return the set with the passed id."""
+ return self._sets[key]
+
+ def __str__(self):
+ """Return a formatted string with information about the Track."""
+ outstr = ["\n<%s: %s>" % (self.__class__, self.name)]
+ outstr.append("%d sets" % len(self._sets))
+ return "\n".join(outstr)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__init__.py b/code/lib/Bio/Graphics/GenomeDiagram/__init__.py
new file mode 100644
index 0000000..ca40d28
--- /dev/null
+++ b/code/lib/Bio/Graphics/GenomeDiagram/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2003-2008 by Leighton Pritchard. All rights reserved.
+# Revisions copyright 2009 by Peter Cock.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Contact: Leighton Pritchard, The James Hutton Institute,
+# Invergowrie, Dundee, Scotland, DD2 5DA, UK
+# Leighton.Pritchard@hutton.ac.uk
+# #############################################################################
+
+"""GenomeDiagram module integrated into Biopython."""
+
+# Local imports, to make these classes available directly under the
+# Bio.Graphics.GenomeDiagram namespace:
+
+from ._Diagram import Diagram
+from ._Track import Track
+from ._FeatureSet import FeatureSet
+from ._GraphSet import GraphSet
+from ._CrossLink import CrossLink
+from ._Colors import ColorTranslator
+from ._Feature import Feature
+from ._Graph import GraphData
+
+__all__ = (
+ "Diagram",
+ "Track",
+ "FeatureSet",
+ "Feature",
+ "GraphSet",
+ "GraphData",
+ "CrossLink",
+ "ColorTranslator",
+)
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_AbstractDrawer.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_AbstractDrawer.cpython-37.pyc
new file mode 100644
index 0000000..b0e8b84
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_AbstractDrawer.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CircularDrawer.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CircularDrawer.cpython-37.pyc
new file mode 100644
index 0000000..e580415
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CircularDrawer.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Colors.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Colors.cpython-37.pyc
new file mode 100644
index 0000000..a8989e5
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Colors.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CrossLink.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CrossLink.cpython-37.pyc
new file mode 100644
index 0000000..2b61193
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_CrossLink.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Diagram.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Diagram.cpython-37.pyc
new file mode 100644
index 0000000..9ae04f2
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Diagram.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Feature.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Feature.cpython-37.pyc
new file mode 100644
index 0000000..4dab43a
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Feature.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_FeatureSet.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_FeatureSet.cpython-37.pyc
new file mode 100644
index 0000000..ac57f08
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_FeatureSet.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Graph.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Graph.cpython-37.pyc
new file mode 100644
index 0000000..3aecb02
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Graph.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_GraphSet.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_GraphSet.cpython-37.pyc
new file mode 100644
index 0000000..e9a7dc7
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_GraphSet.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_LinearDrawer.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_LinearDrawer.cpython-37.pyc
new file mode 100644
index 0000000..bd7db74
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_LinearDrawer.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Track.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Track.cpython-37.pyc
new file mode 100644
index 0000000..959ca99
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/_Track.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..6f04517
Binary files /dev/null and b/code/lib/Bio/Graphics/GenomeDiagram/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/KGML_vis.py b/code/lib/Bio/Graphics/KGML_vis.py
new file mode 100644
index 0000000..9a09086
--- /dev/null
+++ b/code/lib/Bio/Graphics/KGML_vis.py
@@ -0,0 +1,443 @@
+# Copyright 2013 Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Classes and functions to visualise a KGML Pathway Map.
+
+The KGML definition is as of release KGML v0.7.1
+(http://www.kegg.jp/kegg/xml/docs/)
+
+Classes:
+ - KGMLCanvas - Reportlab Canvas-based representation of a KGML pathway map
+"""
+
+
+import os
+import tempfile
+from io import BytesIO
+
+try:
+ from reportlab.lib import colors
+ from reportlab.pdfgen import canvas
+except ImportError:
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError(
+ "Install reportlab if you want to use KGML_vis."
+ ) from None
+
+try:
+ from PIL import Image
+except ImportError:
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError(
+ "Install pillow if you want to use KGML_vis."
+ ) from None
+
+from urllib.request import urlopen
+
+from Bio.KEGG.KGML.KGML_pathway import Pathway
+
+
+def darken(color, factor=0.7):
+ """Return darkened color as a ReportLab RGB color.
+
+ Take a passed color and returns a Reportlab color that is darker by the
+ factor indicated in the parameter.
+ """
+ newcol = color_to_reportlab(color)
+ for a in ["red", "green", "blue"]:
+ setattr(newcol, a, factor * getattr(newcol, a))
+ return newcol
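+
+
+# Example (illustrative): darken("#CC0000") returns a Color whose RGB
+# channels are each scaled by the default factor of 0.7, i.e. a visibly
+# darker red.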
+
+
+def color_to_reportlab(color):
+ """Return the passed color in Reportlab Color format.
+
+ We allow colors to be specified as hex values, tuples, or Reportlab Color
+ objects, and with or without an alpha channel. This function acts as a
+ Rosetta stone for conversion of those formats to a Reportlab Color
+ object, with alpha value.
+
+ Any other color specification is returned directly
+ """
+ # Reportlab Color objects are in the format we want already
+ if isinstance(color, colors.Color):
+ return color
+ elif isinstance(color, str): # String implies hex color
+        if color.startswith("0x"):  # Standardise to octothorpe
+            color = color.replace("0x", "#")
+ if len(color) == 7:
+ return colors.HexColor(color)
+ else:
+ try:
+ return colors.HexColor(color, hasAlpha=True)
+ except TypeError: # Catch pre-2.7 Reportlab
+ raise RuntimeError(
+ "Your reportlab seems to be too old, try 2.7 onwards"
+ ) from None
+ elif isinstance(color, tuple): # Tuple implies RGB(alpha) tuple
+ return colors.Color(*color)
+ return color
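+
+
+# Example (sketch): each of the following yields an opaque red Color:
+# color_to_reportlab("#FF0000"), color_to_reportlab("0xFF0000") (normalised
+# to "#FF0000" first) and color_to_reportlab((1.0, 0.0, 0.0)).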
+
+
+def get_temp_imagefilename(url):
+ """Return filename of temporary file containing downloaded image.
+
+ Create a new temporary file to hold the image file at the passed URL
+ and return the filename.
+ """
+ img = urlopen(url).read()
+ im = Image.open(BytesIO(img))
+ # im.transpose(Image.FLIP_TOP_BOTTOM)
+ f = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
+ fname = f.name
+ f.close()
+ im.save(fname, "PNG")
+ return fname
+
+
+class KGMLCanvas:
+ """Reportlab Canvas-based representation of a KGML pathway map."""
+
+ def __init__(
+ self,
+ pathway,
+ import_imagemap=False,
+ label_compounds=True,
+ label_orthologs=True,
+ label_reaction_entries=True,
+ label_maps=True,
+ show_maps=False,
+ fontname="Helvetica",
+ fontsize=6,
+ draw_relations=True,
+ show_orthologs=True,
+ show_compounds=True,
+ show_genes=True,
+ show_reaction_entries=True,
+ margins=(0.02, 0.02),
+ ):
+ """Initialize the class."""
+ self.pathway = pathway
+ self.show_maps = show_maps
+ self.show_orthologs = show_orthologs
+ self.show_compounds = show_compounds
+ self.show_genes = show_genes
+ self.show_reaction_entries = show_reaction_entries
+ self.label_compounds = label_compounds
+ self.label_orthologs = label_orthologs
+ self.label_reaction_entries = label_reaction_entries
+ self.label_maps = label_maps
+ self.fontname = fontname
+ self.fontsize = fontsize
+ self.draw_relations = draw_relations
+ self.non_reactant_transparency = 0.3
+ self.import_imagemap = import_imagemap # Import the map .png from URL
+        # Fraction of the canvas to reserve as margin on either side in the
+        # X and Y directions
+ self.margins = margins
+
+ def draw(self, filename):
+ """Add the map elements to the drawing."""
+        # Instantiate the drawing first, sized x_max by y_max for now;
+        # the margins are applied further down
+ if self.import_imagemap:
+ # We're drawing directly on the image, so we set the canvas to the
+ # same size as the image
+ if os.path.isfile(self.pathway.image):
+ imfilename = self.pathway.image
+ else:
+ imfilename = get_temp_imagefilename(self.pathway.image)
+ im = Image.open(imfilename)
+ cwidth, cheight = im.size
+ else:
+ # No image, so we set the canvas size to accommodate visible
+ # elements
+ cwidth, cheight = (self.pathway.bounds[1][0], self.pathway.bounds[1][1])
+ # Instantiate canvas
+ self.drawing = canvas.Canvas(
+ filename,
+ bottomup=0,
+ pagesize=(
+ cwidth * (1 + 2 * self.margins[0]),
+ cheight * (1 + 2 * self.margins[1]),
+ ),
+ )
+ self.drawing.setFont(self.fontname, self.fontsize)
+ # Transform the canvas to add the margins
+ self.drawing.translate(
+ self.margins[0] * self.pathway.bounds[1][0],
+ self.margins[1] * self.pathway.bounds[1][1],
+ )
+ # Add the map image, if required
+ if self.import_imagemap:
+ self.drawing.saveState()
+ self.drawing.scale(1, -1)
+ self.drawing.translate(0, -cheight)
+ self.drawing.drawImage(imfilename, 0, 0)
+ self.drawing.restoreState()
+ # Add the reactions, compounds and maps
+ # Maps go on first, to be overlaid by more information.
+ # By default, they're slightly transparent.
+ if self.show_maps:
+ self.__add_maps()
+ if self.show_reaction_entries:
+ self.__add_reaction_entries()
+ if self.show_orthologs:
+ self.__add_orthologs()
+ if self.show_compounds:
+ self.__add_compounds()
+ if self.show_genes:
+ self.__add_genes()
+ # TODO: complete draw_relations code
+ # if self.draw_relations:
+ # self.__add_relations()
+ # Write the pathway map to PDF
+ self.drawing.save()
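+
+    # Example (sketch) of the intended workflow, assuming a Pathway object
+    # parsed with Bio.KEGG.KGML.KGML_parser:
+    #
+    #     pathway = KGML_parser.read(open("ko00010.xml"))
+    #     KGMLCanvas(pathway, import_imagemap=True).draw("ko00010.pdf")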
+
+ def __add_maps(self):
+ """Add maps to the drawing of the map (PRIVATE).
+
+ We do this first, as they're regional labels to be overlaid by
+ information. Also, we want to set the color to something subtle.
+
+ We're using Hex colors because that's what KGML uses, and
+ Reportlab doesn't mind.
+ """
+ for m in self.pathway.maps:
+ for g in m.graphics:
+ self.drawing.setStrokeColor("#888888")
+ self.drawing.setFillColor("#DDDDDD")
+ self.__add_graphics(g)
+ if self.label_maps:
+ self.drawing.setFillColor("#888888")
+ self.__add_labels(g)
+
+ def __add_graphics(self, graphics):
+ """Add the passed graphics object to the map (PRIVATE).
+
+        Text is added after the graphics object, for sane Z-ordering.
+ """
+ if graphics.type == "line":
+ p = self.drawing.beginPath()
+ x, y = graphics.coords[0]
+ # There are optional settings for lines that aren't necessarily
+ # part of the KGML DTD
+ if graphics.width is not None:
+ self.drawing.setLineWidth(graphics.width)
+ else:
+ self.drawing.setLineWidth(1)
+ p.moveTo(x, y)
+ for (x, y) in graphics.coords:
+ p.lineTo(x, y)
+ self.drawing.drawPath(p)
+ self.drawing.setLineWidth(1) # Return to default
+        # KGML defines the (x, y) coordinates as the centre of the circle/
+        # rectangle/roundrectangle, but Reportlab uses the coordinates of the
+        # lower-left corner, so we convert when drawing these shapes below.
+ if graphics.type == "circle":
+ self.drawing.circle(
+ graphics.x, graphics.y, graphics.width * 0.5, stroke=1, fill=1
+ )
+ elif graphics.type == "roundrectangle":
+ self.drawing.roundRect(
+ graphics.x - graphics.width * 0.5,
+ graphics.y - graphics.height * 0.5,
+ graphics.width,
+ graphics.height,
+ min(graphics.width, graphics.height) * 0.1,
+ stroke=1,
+ fill=1,
+ )
+ elif graphics.type == "rectangle":
+ self.drawing.rect(
+ graphics.x - graphics.width * 0.5,
+ graphics.y - graphics.height * 0.5,
+ graphics.width,
+ graphics.height,
+ stroke=1,
+ fill=1,
+ )
+
+ def __add_labels(self, graphics):
+ """Add labels for the passed graphics objects to the map (PRIVATE).
+
+ We don't check that the labels fit inside objects such as circles/
+ rectangles/roundrectangles.
+ """
+ if graphics.type == "line":
+ # We use the midpoint of the line - sort of - we take the median
+ # line segment (list-wise, not in terms of length), and use the
+ # midpoint of that line. We could have other options here,
+ # maybe even parameterising it to a proportion of the total line
+ # length.
+ mid_idx = len(graphics.coords) * 0.5
+ if not int(mid_idx) == mid_idx:
+ idx1, idx2 = int(mid_idx - 0.5), int(mid_idx + 0.5)
+ else:
+ idx1, idx2 = int(mid_idx - 1), int(mid_idx)
+ x1, y1 = graphics.coords[idx1]
+ x2, y2 = graphics.coords[idx2]
+ x, y = 0.5 * (x1 + x2), 0.5 * (y1 + y2)
+ elif graphics.type == "circle":
+ x, y = graphics.x, graphics.y
+ elif graphics.type in ("rectangle", "roundrectangle"):
+ x, y = graphics.x, graphics.y
+        # How big do we want the text, and how many characters?
+ if graphics._parent.type == "map":
+ text = graphics.name
+ self.drawing.setFont(self.fontname, self.fontsize + 2)
+ elif len(graphics.name) < 15:
+ text = graphics.name
+ else:
+ text = graphics.name[:12] + "..."
+ self.drawing.drawCentredString(x, y, text)
+ self.drawing.setFont(self.fontname, self.fontsize)
+
+ def __add_orthologs(self):
+ """Add 'ortholog' Entry elements to the drawing of the map (PRIVATE).
+
+ In KGML, these are typically line objects, so we render them
+ before the compound circles to cover the unsightly ends/junctions.
+ """
+ for ortholog in self.pathway.orthologs:
+ for g in ortholog.graphics:
+ self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor))
+ self.drawing.setFillColor(color_to_reportlab(g.bgcolor))
+ self.__add_graphics(g)
+ if self.label_orthologs:
+ # We want the label color to be slightly darker
+ # (where possible), so it can be read
+ self.drawing.setFillColor(darken(g.fgcolor))
+ self.__add_labels(g)
+
+ def __add_reaction_entries(self):
+ """Add Entry elements for Reactions to the map drawing (PRIVATE).
+
+ In KGML, these are typically line objects, so we render them
+ before the compound circles to cover the unsightly ends/junctions
+ """
+ for reaction in self.pathway.reaction_entries:
+ for g in reaction.graphics:
+ self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor))
+ self.drawing.setFillColor(color_to_reportlab(g.bgcolor))
+ self.__add_graphics(g)
+ if self.label_reaction_entries:
+ # We want the label color to be slightly darker
+ # (where possible), so it can be read
+ self.drawing.setFillColor(darken(g.fgcolor))
+ self.__add_labels(g)
+
+ def __add_compounds(self):
+ """Add compound elements to the drawing of the map (PRIVATE)."""
+ for compound in self.pathway.compounds:
+ for g in compound.graphics:
+ # Modify transparency of compounds that don't participate
+ # in reactions
+ fillcolor = color_to_reportlab(g.bgcolor)
+ if not compound.is_reactant:
+ fillcolor.alpha *= self.non_reactant_transparency
+ self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor))
+ self.drawing.setFillColor(fillcolor)
+ self.__add_graphics(g)
+ if self.label_compounds:
+ if not compound.is_reactant:
+ t = 0.3
+ else:
+ t = 1
+ self.drawing.setFillColor(colors.Color(0.2, 0.2, 0.2, t))
+ self.__add_labels(g)
+
+ def __add_genes(self):
+ """Add gene elements to the drawing of the map (PRIVATE)."""
+ for gene in self.pathway.genes:
+ for g in gene.graphics:
+ self.drawing.setStrokeColor(color_to_reportlab(g.fgcolor))
+ self.drawing.setFillColor(color_to_reportlab(g.bgcolor))
+ self.__add_graphics(g)
+                if self.label_compounds:  # NOTE: reuses the compound-label flag
+ self.drawing.setFillColor(darken(g.fgcolor))
+ self.__add_labels(g)
+
+ def __add_relations(self):
+ """Add relations to the map (PRIVATE).
+
+ This is tricky. There is no defined graphic in KGML for a
+ relation, and the corresponding entries are typically defined
+ as objects 'to be connected somehow'. KEGG uses KegSketch, which
+ is not public, and most third-party software draws straight line
+ arrows, with heads to indicate the appropriate direction
+ (at both ends for reversible reactions), using solid lines for
+ ECrel relation types, and dashed lines for maplink relation types.
+
+ The relation has:
+ - entry1: 'from' node
+ - entry2: 'to' node
+ - subtype: what the relation refers to
+
+ Typically we have entry1 = map/ortholog; entry2 = map/ortholog,
+ subtype = compound.
+ """
+ # Dashed lines for maplinks, solid for everything else
+ for relation in list(self.pathway.relations):
+ if relation.type == "maplink":
+ self.drawing.setDash(6, 3)
+ else:
+ self.drawing.setDash()
+ for s in relation.subtypes:
+ subtype = self.pathway.entries[s[1]]
+ # Our aim is to draw an arrow from the entry1 object to the
+ # entry2 object, via the subtype object.
+ # 1) Entry 1 to subtype
+ self.__draw_arrow(relation.entry1, subtype)
+ # 2) subtype to Entry 2
+ self.__draw_arrow(subtype, relation.entry2)
+
+ def __draw_arrow(self, g_from, g_to):
+ """Draw an arrow between given Entry objects (PRIVATE).
+
+ Draws an arrow from the g_from Entry object to the g_to
+ Entry object; both must have Graphics objects.
+ """
+        # Centre and bound co-ordinates for the from and to objects
+ bounds_from, bounds_to = g_from.bounds, g_to.bounds
+ centre_from = (
+ 0.5 * (bounds_from[0][0] + bounds_from[1][0]),
+ 0.5 * (bounds_from[0][1] + bounds_from[1][1]),
+ )
+ centre_to = (
+ 0.5 * (bounds_to[0][0] + bounds_to[1][0]),
+ 0.5 * (bounds_to[0][1] + bounds_to[1][1]),
+ )
+ p = self.drawing.beginPath()
+ # print(True, g_from.name, g_to.name, bounds_to, bounds_from)
+ # If the 'from' and 'to' graphics are vertically-aligned, draw a line
+ # from the 'from' to the 'to' entity
+ if bounds_to[0][0] < centre_from[0] < bounds_to[1][0]:
+ # print(True, g_from.name, g_to.name, bounds_to, bounds_from)
+ if centre_to[1] > centre_from[1]: # to above from
+ p.moveTo(centre_from[0], bounds_from[1][1])
+ p.lineTo(centre_from[0], bounds_to[0][1])
+ # Draw arrow point - TODO
+ else: # to below from
+ p.moveTo(centre_from[0], bounds_from[0][1])
+ p.lineTo(centre_from[0], bounds_to[1][1])
+ # Draw arrow point - TODO
+ elif bounds_from[0][0] < centre_to[0] < bounds_from[1][0]:
+ # print(True, g_from.name, g_to.name, bounds_to, bounds_from)
+ if centre_to[1] > centre_from[1]: # to above from
+ p.moveTo(centre_to[0], bounds_from[1][1])
+ p.lineTo(centre_to[0], bounds_to[0][1])
+ # Draw arrow point - TODO
+ else: # to below from
+ p.moveTo(centre_to[0], bounds_from[0][1])
+ p.lineTo(centre_to[0], bounds_to[1][1])
+ # Draw arrow point - TODO
+ self.drawing.drawPath(p) # Draw arrow shaft
+ # print(g_from)
+ # print(bounds_from)
+ # print(g_to)
+ # print(bounds_to)
diff --git a/code/lib/Bio/Graphics/__init__.py b/code/lib/Bio/Graphics/__init__.py
new file mode 100644
index 0000000..8720bb4
--- /dev/null
+++ b/code/lib/Bio/Graphics/__init__.py
@@ -0,0 +1,90 @@
+# Copyright 2008 by Brad Chapman. All rights reserved.
+# Copyright 2008 by Michiel de Hoon. All rights reserved.
+# Copyright 2009-2017 by Peter Cock. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""Bio.Graphics offers several graphical outputs, all using ReportLab."""
+
+# Check if ReportLab is installed.
+try:
+ import reportlab as r
+
+ del r
+except ImportError:
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError(
+ "Please install ReportLab if you want "
+ "to use Bio.Graphics. You can find ReportLab at "
+ "http://www.reportlab.com/software/opensource/"
+ ) from None
+
+
+# The following code is to allow all the Bio.Graphics
+# code to deal with the different ReportLab renderers
+# and the API quirks consistently.
+
+
+def _write(drawing, output_file, format, dpi=72):
+ """Standardize output to files (PRIVATE).
+
+ Writes the provided drawing out to a file in a prescribed format.
+
+ - drawing - suitable ReportLab drawing object.
+ - output_file - a handle to write to, or a filename to write to.
+ - format - String indicating output format, one of PS, PDF, SVG,
+ or provided the ReportLab renderPM module is installed,
+      one of the bitmap formats JPG, BMP, GIF, PNG, TIF or TIFF.
+ The format can be given in any case.
+ - dpi - Resolution (dots per inch) for bitmap formats.
+
+ No return value.
+ """
+ from reportlab.graphics import renderPS, renderPDF, renderSVG
+
+ try:
+ from reportlab.graphics import renderPM
+ except ImportError:
+ # This is an optional part of ReportLab, so may not be installed.
+ # We'll raise a missing dependency error if rendering to a
+ # bitmap format is attempted.
+ renderPM = None
+
+ formatdict = {
+ "PS": renderPS,
+ "EPS": renderPS,
+ # not sure which you actually get, PS or EPS, but
+ # GenomeDiagram used PS while other modules used EPS.
+ "PDF": renderPDF,
+ "SVG": renderSVG,
+ "JPG": renderPM,
+ "BMP": renderPM,
+ "GIF": renderPM,
+ "PNG": renderPM,
+ "TIFF": renderPM,
+ "TIF": renderPM,
+ }
+ try:
+        # If format is not a string, then .upper() will trigger
+        # an AttributeError...
+ drawmethod = formatdict[format.upper()] # select drawing method
+ except (KeyError, AttributeError):
+ raise ValueError(
+ "Output format should be one of %s" % ", ".join(formatdict)
+ ) from None
+
+ if drawmethod is None:
+ # i.e. We wanted renderPM but it isn't installed
+ # See the import at the top of the function.
+ from Bio import MissingPythonDependencyError
+
+ raise MissingPythonDependencyError("Please install ReportLab's renderPM module")
+
+ if drawmethod == renderPM:
+ # This has a different API to the other render objects
+ return drawmethod.drawToFile(drawing, output_file, format, dpi=dpi)
+ else:
+ return drawmethod.drawToFile(drawing, output_file)
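+
+
+# Example (sketch; _write is an internal helper): given a suitable ReportLab
+# Drawing object d, _write(d, "diagram.pdf", "pdf") renders a PDF, while
+# _write(d, "diagram.png", "png", dpi=300) additionally requires renderPM.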
diff --git a/code/lib/Bio/Graphics/__pycache__/BasicChromosome.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/BasicChromosome.cpython-37.pyc
new file mode 100644
index 0000000..366a8eb
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/BasicChromosome.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/__pycache__/ColorSpiral.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/ColorSpiral.cpython-37.pyc
new file mode 100644
index 0000000..fdef3ca
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/ColorSpiral.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/__pycache__/Comparative.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/Comparative.cpython-37.pyc
new file mode 100644
index 0000000..7a2e4da
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/Comparative.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/__pycache__/DisplayRepresentation.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/DisplayRepresentation.cpython-37.pyc
new file mode 100644
index 0000000..363b6b5
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/DisplayRepresentation.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/__pycache__/Distribution.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/Distribution.cpython-37.pyc
new file mode 100644
index 0000000..aa31c44
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/Distribution.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/__pycache__/KGML_vis.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/KGML_vis.cpython-37.pyc
new file mode 100644
index 0000000..f086a54
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/KGML_vis.cpython-37.pyc differ
diff --git a/code/lib/Bio/Graphics/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/Graphics/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..22ca765
Binary files /dev/null and b/code/lib/Bio/Graphics/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/DynamicProgramming.py b/code/lib/Bio/HMM/DynamicProgramming.py
new file mode 100644
index 0000000..9f9b095
--- /dev/null
+++ b/code/lib/Bio/HMM/DynamicProgramming.py
@@ -0,0 +1,326 @@
+# Copyright 2001 Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Dynamic Programming algorithms for general usage.
+
+This module contains classes which implement Dynamic Programming
+algorithms that can be used generally.
+"""
+
+
+class AbstractDPAlgorithms:
+ """An abstract class to calculate forward and backward probabilities.
+
+ This class should not be instantiated directly, but should be used
+ through a derived class which implements proper scaling of variables.
+
+ This class is just meant to encapsulate the basic forward and backward
+ algorithms, and allow derived classes to deal with the problems of
+ multiplying probabilities.
+
+ Derived class of this must implement:
+
+ - _forward_recursion -- Calculate the forward values in the recursion
+ using some kind of technique for preventing underflow errors.
+ - _backward_recursion -- Calculate the backward values in the recursion
+ step using some technique to prevent underflow errors.
+
+ """
+
+ def __init__(self, markov_model, sequence):
+ """Initialize to calculate forward and backward probabilities.
+
+ Arguments:
+ - markov_model -- The current Markov model we are working with.
+ - sequence -- A training sequence containing a set of emissions.
+
+ """
+ self._mm = markov_model
+ self._seq = sequence
+
+ def _forward_recursion(self, cur_state, sequence_pos, forward_vars):
+ """Calculate the forward recursion value (PRIVATE)."""
+ raise NotImplementedError("Subclasses must implement")
+
+ def forward_algorithm(self):
+ """Calculate sequence probability using the forward algorithm.
+
+ This implements the forward algorithm, as described on p57-58 of
+ Durbin et al.
+
+ Returns:
+ - A dictionary containing the forward variables. This has keys of the
+ form (state letter, position in the training sequence), and values
+ containing the calculated forward variable.
+ - The calculated probability of the sequence.
+
+ """
+ # all of the different letters that the state path can be in
+ state_letters = self._mm.state_alphabet
+
+ # -- initialize the algorithm
+ #
+ # NOTE: My index numbers are one less than what is given in Durbin
+ # et al, since we are indexing the sequence going from 0 to
+ # (Length - 1) not 1 to Length, like in Durbin et al.
+ #
+ forward_var = {}
+ # f_{0}(0) = 1
+ forward_var[(state_letters[0], -1)] = 1
+ # f_{k}(0) = 0, for k > 0
+ for k in range(1, len(state_letters)):
+ forward_var[(state_letters[k], -1)] = 0
+
+ # -- now do the recursion step
+ # loop over the training sequence
+ # Recursion step: (i = 1 .. L)
+ for i in range(len(self._seq.emissions)):
+ # now loop over the letters in the state path
+ for main_state in state_letters:
+ # calculate the forward value using the appropriate
+ # method to prevent underflow errors
+ forward_value = self._forward_recursion(main_state, i, forward_var)
+
+ if forward_value is not None:
+ forward_var[(main_state, i)] = forward_value
+
+ # -- termination step - calculate the probability of the sequence
+ first_state = state_letters[0]
+ seq_prob = 0
+
+ for state_item in state_letters:
+ # f_{k}(L)
+ forward_value = forward_var[(state_item, len(self._seq.emissions) - 1)]
+ # a_{k0}
+ transition_value = self._mm.transition_prob[(state_item, first_state)]
+
+ seq_prob += forward_value * transition_value
+
+ return forward_var, seq_prob
+
+    def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
+ """Calculate the backward recursion value (PRIVATE)."""
+ raise NotImplementedError("Subclasses must implement")
+
+ def backward_algorithm(self):
+ """Calculate sequence probability using the backward algorithm.
+
+ This implements the backward algorithm, as described on p58-59 of
+ Durbin et al.
+
+ Returns:
+ - A dictionary containing the backwards variables. This has keys
+ of the form (state letter, position in the training sequence),
+ and values containing the calculated backward variable.
+
+ """
+ # all of the different letters that the state path can be in
+ state_letters = self._mm.state_alphabet
+
+ # -- initialize the algorithm
+ #
+ # NOTE: My index numbers are one less than what is given in Durbin
+ # et al, since we are indexing the sequence going from 0 to
+ # (Length - 1) not 1 to Length, like in Durbin et al.
+ #
+ backward_var = {}
+
+ first_letter = state_letters[0]
+ # b_{k}(L) = a_{k0} for all k
+ for state in state_letters:
+ backward_var[
+ (state, len(self._seq.emissions) - 1)
+            ] = self._mm.transition_prob[(state, first_letter)]
+
+ # -- recursion
+ # first loop over the training sequence backwards
+ # Recursion step: (i = L - 1 ... 1)
+ all_indexes = list(range(len(self._seq.emissions) - 1))
+ all_indexes.reverse()
+ for i in all_indexes:
+ # now loop over the letters in the state path
+ for main_state in state_letters:
+ # calculate the backward value using the appropriate
+ # method to prevent underflow errors
+ backward_value = self._backward_recursion(main_state, i, backward_var)
+
+ if backward_value is not None:
+ backward_var[(main_state, i)] = backward_value
+
+ # skip the termination step to avoid recalculations -- you should
+ # get sequence probabilities using the forward algorithm
+
+ return backward_var
+
+
+class ScaledDPAlgorithms(AbstractDPAlgorithms):
+ """Implement forward and backward algorithms using a rescaling approach.
+
+ This scales the f and b variables, so that they remain within a
+ manageable numerical interval during calculations. This approach is
+ described in Durbin et al. on p 78.
+
+    This approach is a little more straightforward than log transformation,
+    but may still give underflow errors for some types of models. In these
+ cases, the LogDPAlgorithms class should be used.
+ """
+
+ def __init__(self, markov_model, sequence):
+ """Initialize the scaled approach to calculating probabilities.
+
+ Arguments:
+ - markov_model -- The current Markov model we are working with.
+ - sequence -- A TrainingSequence object that must have a
+ set of emissions to work with.
+
+ """
+ AbstractDPAlgorithms.__init__(self, markov_model, sequence)
+
+ self._s_values = {}
+
+ def _calculate_s_value(self, seq_pos, previous_vars):
+ """Calculate the next scaling variable for a sequence position (PRIVATE).
+
+ This utilizes the approach of choosing s values such that the
+ sum of all of the scaled f values is equal to 1.
+
+ Arguments:
+ - seq_pos -- The current position we are at in the sequence.
+ - previous_vars -- All of the forward or backward variables
+ calculated so far.
+
+ Returns:
+ - The calculated scaling variable for the sequence item.
+
+ """
+ # all of the different letters the state can have
+ state_letters = self._mm.state_alphabet
+
+ # loop over all of the possible states
+ s_value = 0
+ for main_state in state_letters:
+ emission = self._mm.emission_prob[
+ (main_state, self._seq.emissions[seq_pos])
+ ]
+
+ # now sum over all of the previous vars and transitions
+ trans_and_var_sum = 0
+ for second_state in self._mm.transitions_from(main_state):
+ # the value of the previous f or b value
+ var_value = previous_vars[(second_state, seq_pos - 1)]
+
+ # the transition probability
+ trans_value = self._mm.transition_prob[(second_state, main_state)]
+
+ trans_and_var_sum += var_value * trans_value
+
+ s_value += emission * trans_and_var_sum
+
+ return s_value
+
+ def _forward_recursion(self, cur_state, sequence_pos, forward_vars):
+ """Calculate the value of the forward recursion (PRIVATE).
+
+ Arguments:
+ - cur_state -- The letter of the state we are calculating the
+ forward variable for.
+ - sequence_pos -- The position we are at in the training seq.
+ - forward_vars -- The current set of forward variables
+
+ """
+ # calculate the s value, if we haven't done so already (ie. during
+ # a previous forward or backward recursion)
+ if sequence_pos not in self._s_values:
+ self._s_values[sequence_pos] = self._calculate_s_value(
+ sequence_pos, forward_vars
+ )
+
+ # e_{l}(x_{i})
+ seq_letter = self._seq.emissions[sequence_pos]
+ cur_emission_prob = self._mm.emission_prob[(cur_state, seq_letter)]
+ # divide by the scaling value
+ scale_emission_prob = float(cur_emission_prob) / float(
+ self._s_values[sequence_pos]
+ )
+
+ # loop over all of the possible states at the position
+ state_pos_sum = 0
+ have_transition = 0
+ for second_state in self._mm.transitions_from(cur_state):
+ have_transition = 1
+
+ # get the previous forward_var values
+ # f_{k}(i - 1)
+ prev_forward = forward_vars[(second_state, sequence_pos - 1)]
+
+ # a_{kl}
+ cur_trans_prob = self._mm.transition_prob[(second_state, cur_state)]
+ state_pos_sum += prev_forward * cur_trans_prob
+
+ # if we have the possibility of having a transition
+ # return the recursion value
+ if have_transition:
+ return scale_emission_prob * state_pos_sum
+ else:
+ return None
+
+ def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
+ """Calculate the value of the backward recursion (PRIVATE).
+
+ Arguments:
+        - cur_state -- The letter of the state we are calculating the
+          backward variable for.
+ - sequence_pos -- The position we are at in the training seq.
+ - backward_vars -- The current set of backward variables
+
+ """
+ # calculate the s value, if we haven't done so already (ie. during
+ # a previous forward or backward recursion)
+ if sequence_pos not in self._s_values:
+ self._s_values[sequence_pos] = self._calculate_s_value(
+ sequence_pos, backward_vars
+ )
+
+ # loop over all of the possible states at the position
+ state_pos_sum = 0
+ have_transition = 0
+ for second_state in self._mm.transitions_from(cur_state):
+ have_transition = 1
+ # e_{l}(x_{i + 1})
+ seq_letter = self._seq.emissions[sequence_pos + 1]
+ cur_emission_prob = self._mm.emission_prob[(cur_state, seq_letter)]
+
+ # get the previous backward_var value
+ # b_{l}(i + 1)
+ prev_backward = backward_vars[(second_state, sequence_pos + 1)]
+
+ # the transition probability -- a_{kl}
+ cur_transition_prob = self._mm.transition_prob[(cur_state, second_state)]
+
+ state_pos_sum += cur_emission_prob * prev_backward * cur_transition_prob
+
+ # if we have a probability for a transition, return it
+ if have_transition:
+ return state_pos_sum / float(self._s_values[sequence_pos])
+ # otherwise we have no probability (ie. we can't do this transition)
+ # and return None
+ else:
+ return None
+
+
+class LogDPAlgorithms(AbstractDPAlgorithms):
+ """Implement forward and backward algorithms using a log approach.
+
+ This uses the approach of calculating the sum of log probabilities
+ using a lookup table for common values.
+
+ XXX This is not implemented yet!
+ """
+
+ def __init__(self, markov_model, sequence):
+ """Initialize the class."""
+ raise NotImplementedError("Haven't coded this yet...")
diff --git a/code/lib/Bio/HMM/MarkovModel.py b/code/lib/Bio/HMM/MarkovModel.py
new file mode 100644
index 0000000..ef9fef6
--- /dev/null
+++ b/code/lib/Bio/HMM/MarkovModel.py
@@ -0,0 +1,677 @@
+# Copyright 2001 Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Deal with representations of Markov Models."""
+# standard modules
+import copy
+import math
+import random
+from collections import defaultdict
+
+from Bio.Seq import Seq
+
+
+def _gen_random_array(n):
+ """Return an array of n random numbers summing to 1.0 (PRIVATE)."""
+ randArray = [random.random() for _ in range(n)]
+ total = sum(randArray)
+ return [x / total for x in randArray]
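+
+# Example (illustrative): _gen_random_array(4) might return
+# [0.31, 0.12, 0.40, 0.17]; the exact values vary per call, but they
+# always sum to 1.0.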
+
+
+def _calculate_emissions(emission_probs):
+ """Calculate which symbols can be emitted in each state (PRIVATE)."""
+ # loop over all of the state-symbol duples, mapping states to
+ # lists of emitted symbols
+ emissions = defaultdict(list)
+ for state, symbol in emission_probs:
+ emissions[state].append(symbol)
+
+ return emissions
+
+
+def _calculate_from_transitions(trans_probs):
+ """Calculate which 'from transitions' are allowed for each state (PRIVATE).
+
+ This looks through all of the trans_probs, and uses this dictionary
+ to determine allowed transitions. It converts this information into
+ a dictionary, whose keys are source states and whose values are
+ lists of destination states reachable from the source state via a
+ transition.
+ """
+ transitions = defaultdict(list)
+ for from_state, to_state in trans_probs:
+ transitions[from_state].append(to_state)
+
+ return transitions
+
+
+def _calculate_to_transitions(trans_probs):
+ """Calculate which 'to transitions' are allowed for each state (PRIVATE).
+
+ This looks through all of the trans_probs, and uses this dictionary
+ to determine allowed transitions. It converts this information into
+ a dictionary, whose keys are destination states and whose values are
+ lists of source states from which the destination is reachable via a
+ transition.
+ """
+ transitions = defaultdict(list)
+ for from_state, to_state in trans_probs:
+ transitions[to_state].append(from_state)
+
+ return transitions
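+
+
+# Example (illustrative): if trans_probs has keys ("A", "B") and ("A", "A"),
+# _calculate_from_transitions returns {"A": ["B", "A"]} while
+# _calculate_to_transitions returns {"B": ["A"], "A": ["A"]}.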
+
+
+class MarkovModelBuilder:
+ """Interface to build up a Markov Model.
+
+ This class is designed to try to separate the task of specifying the
+ Markov Model from the actual model itself. This is in hopes of making
+ the actual Markov Model classes smaller.
+
+ So, this builder class should be used to create Markov models instead
+    of trying to instantiate a Markov Model directly.
+ """
+
+ # the default pseudo counts to use
+ DEFAULT_PSEUDO = 1
+
+ def __init__(self, state_alphabet, emission_alphabet):
+ """Initialize a builder to create Markov Models.
+
+ Arguments:
+ - state_alphabet -- An iterable (e.g., tuple or list) containing
+ all of the letters that can appear in the states
+ - emission_alphabet -- An iterable (e.g., tuple or list) containing
+ all of the letters for states that can be emitted by the HMM.
+
+ """
+ self._state_alphabet = tuple(state_alphabet)
+ self._emission_alphabet = tuple(emission_alphabet)
+
+ # probabilities for the initial state, initialized by calling
+ # set_initial_probabilities (required)
+ self.initial_prob = {}
+
+ # the probabilities for transitions and emissions
+ # by default we have no transitions and all possible emissions
+ self.transition_prob = {}
+ self.emission_prob = self._all_blank(state_alphabet, emission_alphabet)
+
+ # the default pseudocounts for transition and emission counting
+ self.transition_pseudo = {}
+ self.emission_pseudo = self._all_pseudo(state_alphabet, emission_alphabet)
+
+ def _all_blank(self, first_alphabet, second_alphabet):
+ """Return a dictionary with all counts set to zero (PRIVATE).
+
+ This uses the letters in the first and second alphabet to create
+ a dictionary with keys of two tuples organized as
+ (letter of first alphabet, letter of second alphabet). The values
+ are all set to 0.
+ """
+ all_blank = {}
+ for first_state in first_alphabet:
+ for second_state in second_alphabet:
+ all_blank[(first_state, second_state)] = 0
+
+ return all_blank
+
+ def _all_pseudo(self, first_alphabet, second_alphabet):
+ """Return a dictionary with all counts set to a default value (PRIVATE).
+
+ This takes the letters in first alphabet and second alphabet and
+ creates a dictionary with keys of two tuples organized as:
+ (letter of first alphabet, letter of second alphabet). The values
+ are all set to the value of the class attribute DEFAULT_PSEUDO.
+ """
+ all_counts = {}
+ for first_state in first_alphabet:
+ for second_state in second_alphabet:
+ all_counts[(first_state, second_state)] = self.DEFAULT_PSEUDO
+
+ return all_counts
+
+ def get_markov_model(self):
+ """Return the markov model corresponding with the current parameters.
+
+ Each markov model returned by a call to this function is unique
+ (ie. they don't influence each other).
+ """
+ # user must set initial probabilities
+ if not self.initial_prob:
+ raise Exception(
+ "set_initial_probabilities must be called to "
+ "fully initialize the Markov model"
+ )
+
+ initial_prob = copy.deepcopy(self.initial_prob)
+ transition_prob = copy.deepcopy(self.transition_prob)
+ emission_prob = copy.deepcopy(self.emission_prob)
+ transition_pseudo = copy.deepcopy(self.transition_pseudo)
+ emission_pseudo = copy.deepcopy(self.emission_pseudo)
+
+ return HiddenMarkovModel(
+ self._state_alphabet,
+ self._emission_alphabet,
+ initial_prob,
+ transition_prob,
+ emission_prob,
+ transition_pseudo,
+ emission_pseudo,
+ )
+
+ def set_initial_probabilities(self, initial_prob):
+ """Set initial state probabilities.
+
+ initial_prob is a dictionary mapping states to probabilities.
+        Suppose, for example, that the state alphabet is ('A', 'B'). Call
+        set_initial_probabilities({'A': 1}) to guarantee that the initial
+        state will be 'A'. Call set_initial_probabilities({'A': 0.5, 'B': 0.5})
+        to make each initial state equally probable.
+
+        This method must be called before the model is used;
+        get_markov_model raises an exception if no initial probabilities
+        have been set.
+
+ If initial probabilities are set for all states, then they should add up
+ to 1. Otherwise the sum should be <= 1. The residual probability is
+ divided up evenly between all the states for which the initial
+        probability has not been set. For example, calling
+        set_initial_probabilities({}) results in P('A') = 0.5 and P('B') = 0.5,
+ for the above example.
+ """
+ self.initial_prob = copy.copy(initial_prob)
+
+ # ensure that all referenced states are valid
+ for state in initial_prob:
+ if state not in self._state_alphabet:
+ raise ValueError(
+ "State %s was not found in the sequence alphabet" % state
+ )
+
+ # distribute the residual probability, if any
+ num_states_not_set = len(self._state_alphabet) - len(self.initial_prob)
+ if num_states_not_set < 0:
+ raise Exception("Initial probabilities can't exceed # of states")
+ prob_sum = sum(self.initial_prob.values())
+ if prob_sum > 1.0:
+ raise Exception("Total initial probability cannot exceed 1.0")
+ if num_states_not_set > 0:
+ prob = (1.0 - prob_sum) / num_states_not_set
+ for state in self._state_alphabet:
+ if state not in self.initial_prob:
+ self.initial_prob[state] = prob
+
+ def set_equal_probabilities(self):
+ """Reset all probabilities to be an average value.
+
+ Resets the values of all initial probabilities and all allowed
+ transitions and all allowed emissions to be equal to 1 divided by the
+ number of possible elements.
+
+ This is useful if you just want to initialize a Markov Model to
+ starting values (ie. if you have no prior notions of what the
+ probabilities should be -- or if you are just feeling too lazy
+ to calculate them :-).
+
+ Warning 1 -- this will reset all currently set probabilities.
+
+        Warning 2 -- This sets every transition and emission probability
+        to 1 divided by the total number of allowed transitions/emissions,
+        so it doesn't ensure that the probabilities out of each individual
+        state sum to 1.
+ """
+ # set initial state probabilities
+        # one over the number of states, not the number of transitions
+        new_initial_prob = float(1) / float(len(self._state_alphabet))
+ for state in self._state_alphabet:
+ self.initial_prob[state] = new_initial_prob
+
+ # set the transitions
+ new_trans_prob = float(1) / float(len(self.transition_prob))
+ for key in self.transition_prob:
+ self.transition_prob[key] = new_trans_prob
+
+ # set the emissions
+ new_emission_prob = float(1) / float(len(self.emission_prob))
+ for key in self.emission_prob:
+ self.emission_prob[key] = new_emission_prob
+
+ def set_random_initial_probabilities(self):
+ """Set all initial state probabilities to a randomly generated distribution.
+
+ Returns the dictionary containing the initial probabilities.
+ """
+ initial_freqs = _gen_random_array(len(self._state_alphabet))
+ for state in self._state_alphabet:
+ self.initial_prob[state] = initial_freqs.pop()
+
+ return self.initial_prob
+
+ def set_random_transition_probabilities(self):
+ """Set all allowed transition probabilities to a randomly generated distribution.
+
+ Returns the dictionary containing the transition probabilities.
+ """
+ if not self.transition_prob:
+ raise Exception(
+ "No transitions have been allowed yet. "
+ "Allow some or all transitions by calling "
+ "allow_transition or allow_all_transitions first."
+ )
+
+ transitions_from = _calculate_from_transitions(self.transition_prob)
+ for from_state in transitions_from:
+ freqs = _gen_random_array(len(transitions_from[from_state]))
+ for to_state in transitions_from[from_state]:
+ self.transition_prob[(from_state, to_state)] = freqs.pop()
+
+ return self.transition_prob
+
+ def set_random_emission_probabilities(self):
+ """Set all allowed emission probabilities to a randomly generated distribution.
+
+ Returns the dictionary containing the emission probabilities.
+ """
+ if not self.emission_prob:
+ raise Exception(
+ "No emissions have been allowed yet. Allow some or all emissions."
+ )
+
+ emissions = _calculate_emissions(self.emission_prob)
+ for state in emissions:
+ freqs = _gen_random_array(len(emissions[state]))
+ for symbol in emissions[state]:
+ self.emission_prob[(state, symbol)] = freqs.pop()
+
+ return self.emission_prob
+
+ def set_random_probabilities(self):
+ """Set all probabilities to randomly generated numbers.
+
+ Resets probabilities of all initial states, transitions, and
+ emissions to random values.
+ """
+ self.set_random_initial_probabilities()
+ self.set_random_transition_probabilities()
+ self.set_random_emission_probabilities()
+
+ # --- functions to deal with the transitions in the sequence
+
+ def allow_all_transitions(self):
+ """Create transitions between all states.
+
+ By default all transitions within the alphabet are disallowed;
+ this is a convenience function to change this to allow all
+ possible transitions.
+ """
+ # first get all probabilities and pseudo counts set
+ # to the default values
+ all_probs = self._all_blank(self._state_alphabet, self._state_alphabet)
+
+ all_pseudo = self._all_pseudo(self._state_alphabet, self._state_alphabet)
+
+ # now set any probabilities and pseudo counts that
+ # were previously set
+ for set_key in self.transition_prob:
+ all_probs[set_key] = self.transition_prob[set_key]
+
+ for set_key in self.transition_pseudo:
+ all_pseudo[set_key] = self.transition_pseudo[set_key]
+
+ # finally reinitialize the transition probs and pseudo counts
+ self.transition_prob = all_probs
+ self.transition_pseudo = all_pseudo
+
+ def allow_transition(
+ self, from_state, to_state, probability=None, pseudocount=None
+ ):
+ """Set a transition as being possible between the two states.
+
+ probability and pseudocount are optional arguments
+ specifying the probabilities and pseudo counts for the transition.
+ If these are not supplied, then the values are set to the
+ default values.
+
+ Raises:
+ KeyError -- if the two states already have an allowed transition.
+
+ """
+ # check the sanity of adding these states
+ for state in [from_state, to_state]:
+ if state not in self._state_alphabet:
+ raise ValueError(
+ "State %s was not found in the sequence alphabet" % state
+ )
+
+ # ensure that the states are not already set
+ if (from_state, to_state) not in self.transition_prob and (
+ from_state,
+ to_state,
+ ) not in self.transition_pseudo:
+ # set the initial probability
+ if probability is None:
+ probability = 0
+ self.transition_prob[(from_state, to_state)] = probability
+
+ # set the initial pseudocounts
+ if pseudocount is None:
+ pseudocount = self.DEFAULT_PSEUDO
+ self.transition_pseudo[(from_state, to_state)] = pseudocount
+ else:
+ raise KeyError(
+ "Transition from %s to %s is already allowed." % (from_state, to_state)
+ )
+
+ def destroy_transition(self, from_state, to_state):
+ """Restrict transitions between the two states.
+
+ Raises:
+ KeyError if the transition is not currently allowed.
+
+ """
+ try:
+ del self.transition_prob[(from_state, to_state)]
+ del self.transition_pseudo[(from_state, to_state)]
+ except KeyError:
+ raise KeyError(
+ "Transition from %s to %s is already disallowed."
+ % (from_state, to_state)
+ )
+
+ def set_transition_score(self, from_state, to_state, probability):
+ """Set the probability of a transition between two states.
+
+ Raises:
+ KeyError if the transition is not allowed.
+
+ """
+ if (from_state, to_state) in self.transition_prob:
+ self.transition_prob[(from_state, to_state)] = probability
+ else:
+ raise KeyError(
+ "Transition from %s to %s is not allowed." % (from_state, to_state)
+ )
+
+ def set_transition_pseudocount(self, from_state, to_state, count):
+ """Set the default pseudocount for a transition.
+
+ To avoid computational problems, it is helpful to be able to
+ set a 'default' pseudocount to start with for estimating
+        transition and emission probabilities (see p62 in Durbin et al
+        for more discussion on this). By default, all transitions have
+        a pseudocount of 1.
+
+ Raises:
+ KeyError if the transition is not allowed.
+
+ """
+ if (from_state, to_state) in self.transition_pseudo:
+ self.transition_pseudo[(from_state, to_state)] = count
+ else:
+ raise KeyError(
+ "Transition from %s to %s is not allowed." % (from_state, to_state)
+ )
+
+ # --- functions to deal with emissions from the sequence
+
+ def set_emission_score(self, seq_state, emission_state, probability):
+ """Set the probability of a emission from a particular state.
+
+ Raises:
+ KeyError if the emission from the given state is not allowed.
+
+ """
+ if (seq_state, emission_state) in self.emission_prob:
+ self.emission_prob[(seq_state, emission_state)] = probability
+ else:
+ raise KeyError(
+ "Emission of %s from %s is not allowed." % (emission_state, seq_state)
+ )
+
+ def set_emission_pseudocount(self, seq_state, emission_state, count):
+ """Set the default pseudocount for an emission.
+
+ To avoid computational problems, it is helpful to be able to
+ set a 'default' pseudocount to start with for estimating
+        transition and emission probabilities (see p62 in Durbin et al
+        for more discussion on this). By default, all emissions have
+        a pseudocount of 1.
+
+ Raises:
+ KeyError if the emission from the given state is not allowed.
+
+ """
+ if (seq_state, emission_state) in self.emission_pseudo:
+ self.emission_pseudo[(seq_state, emission_state)] = count
+ else:
+ raise KeyError(
+ "Emission of %s from %s is not allowed." % (emission_state, seq_state)
+ )
+
+
+class HiddenMarkovModel:
+ """Represent a hidden markov model that can be used for state estimation."""
+
+ def __init__(
+ self,
+ state_alphabet,
+ emission_alphabet,
+ initial_prob,
+ transition_prob,
+ emission_prob,
+ transition_pseudo,
+ emission_pseudo,
+ ):
+ """Initialize a Markov Model.
+
+        Note: You should use the MarkovModelBuilder class instead of
+        instantiating this class directly.
+
+ Arguments:
+ - state_alphabet -- A tuple containing all of the letters that can
+ appear in the states.
+        - emission_alphabet -- A tuple containing all of the letters that
+          can be emitted by the HMM.
+ - initial_prob - A dictionary of initial probabilities for all states.
+ - transition_prob -- A dictionary of transition probabilities for all
+ possible transitions in the sequence.
+ - emission_prob -- A dictionary of emission probabilities for all
+ possible emissions from the sequence states.
+ - transition_pseudo -- Pseudo-counts to be used for the transitions,
+ when counting for purposes of estimating transition probabilities.
+ - emission_pseudo -- Pseudo-counts to be used for the emissions,
+ when counting for purposes of estimating emission probabilities.
+
+ """
+ self.state_alphabet = state_alphabet
+ self.emission_alphabet = emission_alphabet
+
+ self.initial_prob = initial_prob
+
+ self._transition_pseudo = transition_pseudo
+ self._emission_pseudo = emission_pseudo
+
+ self.transition_prob = transition_prob
+ self.emission_prob = emission_prob
+
+ # a dictionary of the possible transitions from each state
+ # each key is a source state, mapped to a list of the destination states
+ # that are reachable from the source state via a transition
+ self._transitions_from = _calculate_from_transitions(self.transition_prob)
+
+ # a dictionary of the possible transitions to each state
+ # each key is a destination state, mapped to a list of source states
+ # from which the destination is reachable via a transition
+ self._transitions_to = _calculate_to_transitions(self.transition_prob)
+
+ def get_blank_transitions(self):
+ """Get the default transitions for the model.
+
+ Returns a dictionary of all of the default transitions between any
+ two letters in the sequence alphabet. The dictionary is structured
+ with keys as (letter1, letter2) and values as the starting number
+ of transitions.
+ """
+ return self._transition_pseudo
+
+ def get_blank_emissions(self):
+ """Get the starting default emmissions for each sequence.
+
+ This returns a dictionary of the default emmissions for each
+ letter. The dictionary is structured with keys as
+ (seq_letter, emmission_letter) and values as the starting number
+ of emmissions.
+ """
+ return self._emission_pseudo
+
+ def transitions_from(self, state_letter):
+ """Get all destination states which can transition from source state_letter.
+
+ This returns all letters which the given state_letter can transition
+ to, i.e. all the destination states reachable from state_letter.
+
+ An empty list is returned if state_letter has no outgoing transitions.
+ """
+ if state_letter in self._transitions_from:
+ return self._transitions_from[state_letter]
+ else:
+ return []
+
+ def transitions_to(self, state_letter):
+ """Get all source states which can transition to destination state_letter.
+
+        This returns all letters which the given state_letter is reachable
+        from, i.e. all the source states which can reach state_letter.
+
+ An empty list is returned if state_letter is unreachable.
+ """
+ if state_letter in self._transitions_to:
+ return self._transitions_to[state_letter]
+ else:
+ return []
+
+ def viterbi(self, sequence, state_alphabet):
+ """Calculate the most probable state path using the Viterbi algorithm.
+
+ This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
+ al for a full explanation -- this is where I took my implementation
+ ideas from), to allow decoding of the state path, given a sequence
+ of emissions.
+
+ Arguments:
+ - sequence -- A Seq object with the emission sequence that we
+ want to decode.
+ - state_alphabet -- An iterable (e.g., tuple or list) containing
+ all of the letters that can appear in the states
+
+ """
+ # calculate logarithms of the initial, transition, and emission probs
+ log_initial = self._log_transform(self.initial_prob)
+ log_trans = self._log_transform(self.transition_prob)
+ log_emission = self._log_transform(self.emission_prob)
+
+ viterbi_probs = {}
+ pred_state_seq = {}
+
+ # --- recursion
+        # loop over the training sequence (i = 1 .. L)
+ # NOTE: My index numbers are one less than what is given in Durbin
+ # et al, since we are indexing the sequence going from 0 to
+ # (Length - 1) not 1 to Length, like in Durbin et al.
+ for i in range(0, len(sequence)):
+ # loop over all of the possible i-th states in the state path
+ for cur_state in state_alphabet:
+ # e_{l}(x_{i})
+ emission_part = log_emission[(cur_state, sequence[i])]
+
+ max_prob = 0
+ if i == 0:
+ # for the first state, use the initial probability rather
+ # than looking back to previous states
+ max_prob = log_initial[cur_state]
+ else:
+ # loop over all possible (i-1)-th previous states
+ possible_state_probs = {}
+ for prev_state in self.transitions_to(cur_state):
+ # a_{kl}
+ trans_part = log_trans[(prev_state, cur_state)]
+
+ # v_{k}(i - 1)
+ viterbi_part = viterbi_probs[(prev_state, i - 1)]
+ cur_prob = viterbi_part + trans_part
+
+ possible_state_probs[prev_state] = cur_prob
+
+ # calculate the viterbi probability using the max
+ max_prob = max(possible_state_probs.values())
+
+ # v_{k}(i)
+ viterbi_probs[(cur_state, i)] = emission_part + max_prob
+
+ if i > 0:
+ # get the most likely prev_state leading to cur_state
+ for state in possible_state_probs:
+ if possible_state_probs[state] == max_prob:
+ pred_state_seq[(i - 1, cur_state)] = state
+ break
+
+ # --- termination
+ # calculate the probability of the state path
+ # loop over all states
+ all_probs = {}
+ for state in state_alphabet:
+ # v_{k}(L)
+ all_probs[state] = viterbi_probs[(state, len(sequence) - 1)]
+
+ state_path_prob = max(all_probs.values())
+
+ # find the last pointer we need to trace back from
+ last_state = ""
+ for state in all_probs:
+ if all_probs[state] == state_path_prob:
+ last_state = state
+
+ assert last_state != "", "Didn't find the last state to trace from!"
+
+ # --- traceback
+ traceback_seq = []
+
+ loop_seq = list(range(1, len(sequence)))
+ loop_seq.reverse()
+
+ # last_state is the last state in the most probable state sequence.
+ # Compute that sequence by walking backwards in time. From the i-th
+ # state in the sequence, find the (i-1)-th state as the most
+ # probable state preceding the i-th state.
+ state = last_state
+ traceback_seq.append(state)
+ for i in loop_seq:
+ state = pred_state_seq[(i - 1, state)]
+ traceback_seq.append(state)
+
+ # put the traceback sequence in the proper orientation
+ traceback_seq.reverse()
+ traceback_seq = "".join(traceback_seq)
+
+ return Seq(traceback_seq), state_path_prob
+
+ def _log_transform(self, probability):
+ """Return log transform of the given probability dictionary (PRIVATE).
+
+ When calculating the Viterbi equation, add logs of probabilities rather
+ than multiplying probabilities, to avoid underflow errors. This method
+ returns a new dictionary with the same keys as the given dictionary
+ and log-transformed values.
+ """
+ log_prob = copy.copy(probability)
+ for key in log_prob:
+ prob = log_prob[key]
+ if prob > 0:
+ log_prob[key] = math.log(log_prob[key])
+ else:
+ log_prob[key] = -math.inf
+
+ return log_prob
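A reviewer-oriented aside: the builder and model above are typically used together. Below is a minimal, hypothetical sketch; the fair/loaded-die states, the probabilities, and the roll sequence are invented for illustration, and only the class and method names come from the code in this diff.

```python
# Hypothetical usage sketch for MarkovModelBuilder / HiddenMarkovModel above.
from Bio.HMM.MarkovModel import MarkovModelBuilder

states = ("F", "L")                       # fair / loaded die (illustrative)
symbols = ("1", "2", "3", "4", "5", "6")

builder = MarkovModelBuilder(states, symbols)
builder.set_initial_probabilities({"F": 0.5, "L": 0.5})
builder.allow_all_transitions()
builder.set_transition_score("F", "F", 0.95)
builder.set_transition_score("F", "L", 0.05)
builder.set_transition_score("L", "L", 0.90)
builder.set_transition_score("L", "F", 0.10)
for symbol in symbols:
    builder.set_emission_score("F", symbol, 1.0 / 6)
    builder.set_emission_score("L", symbol, 0.5 if symbol == "6" else 0.1)

model = builder.get_markov_model()
# Decode the most probable state path for an observed roll sequence.
state_path, log_prob = model.viterbi("266616", states)
print(state_path, log_prob)
```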
diff --git a/code/lib/Bio/HMM/Trainer.py b/code/lib/Bio/HMM/Trainer.py
new file mode 100644
index 0000000..98e3703
--- /dev/null
+++ b/code/lib/Bio/HMM/Trainer.py
@@ -0,0 +1,430 @@
+# Copyright 2001 Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Provide trainers which estimate parameters based on training sequences.
+
+These should be used to 'train' a Markov Model prior to actually using
+it to decode state paths. When supplied training sequences and a model
+to work from, these classes will estimate parameters of the model.
+
+This aims to estimate two parameters:
+
+- a_{kl} -- the number of times there is a transition from k to l in the
+ training data.
+- e_{k}(b) -- the number of emissions of the letter b from the state k
+  in the training data.
+
+"""
+# standard modules
+import math
+
+# local stuff
+from .DynamicProgramming import ScaledDPAlgorithms
+
+
+class TrainingSequence:
+ """Hold a training sequence with emissions and optionally, a state path."""
+
+ def __init__(self, emissions, state_path):
+ """Initialize a training sequence.
+
+ Arguments:
+ - emissions - An iterable (e.g., a tuple, list, or Seq object)
+ containing the sequence of emissions in the training sequence.
+ - state_path - An iterable (e.g., a tuple or list) containing the
+ sequence of states. If there is no known state path, then the
+ sequence of states should be an empty iterable.
+
+ """
+ if len(state_path) > 0 and len(emissions) != len(state_path):
+ raise ValueError("State path does not match associated emissions.")
+ self.emissions = emissions
+ self.states = state_path
+
+
+class AbstractTrainer:
+ """Provide generic functionality needed in all trainers."""
+
+ def __init__(self, markov_model):
+ """Initialize the class."""
+ self._markov_model = markov_model
+
+ def log_likelihood(self, probabilities):
+ """Calculate the log likelihood of the training seqs.
+
+ Arguments:
+ - probabilities -- A list of the probabilities of each training
+ sequence under the current parameters, calculated using the
+ forward algorithm.
+
+ """
+ total_likelihood = 0
+ for probability in probabilities:
+ total_likelihood += math.log(probability)
+
+ return total_likelihood
+
+ def estimate_params(self, transition_counts, emission_counts):
+ """Get a maximum likelihood estimation of transition and emmission.
+
+ Arguments:
+ - transition_counts -- A dictionary with the total number of counts
+ of transitions between two states.
+        - emission_counts -- A dictionary with the total number of counts
+          of emissions of a particular emission letter by a state letter.
+
+ This then returns the maximum likelihood estimators for the
+ transitions and emissions, estimated by formulas 3.18 in
+ Durbin et al::
+
+ a_{kl} = A_{kl} / sum(A_{kl'})
+ e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))
+
+ Returns:
+ Transition and emission dictionaries containing the maximum
+ likelihood estimators.
+
+ """
+ # now calculate the information
+ ml_transitions = self.ml_estimator(transition_counts)
+ ml_emissions = self.ml_estimator(emission_counts)
+
+ return ml_transitions, ml_emissions
+
+ def ml_estimator(self, counts):
+ """Calculate the maximum likelihood estimator.
+
+ This can calculate maximum likelihoods for both transitions
+ and emissions.
+
+ Arguments:
+ - counts -- A dictionary of the counts for each item.
+
+ See estimate_params for a description of the formula used for
+ calculation.
+
+ """
+ # get an ordered list of all items
+ all_ordered = sorted(counts)
+
+ ml_estimation = {}
+
+ # the total counts for the current letter we are on
+ cur_letter = None
+ cur_letter_counts = 0
+
+ for cur_item in all_ordered:
+ # if we are on a new letter (ie. the first letter of the tuple)
+ if cur_item[0] != cur_letter:
+ # set the new letter we are working with
+ cur_letter = cur_item[0]
+
+ # count up the total counts for this letter
+ cur_letter_counts = counts[cur_item]
+
+ # add counts for all other items with the same first letter
+ cur_position = all_ordered.index(cur_item) + 1
+
+ # keep adding while we have the same first letter or until
+ # we get to the end of the ordered list
+ while (
+ cur_position < len(all_ordered)
+ and all_ordered[cur_position][0] == cur_item[0]
+ ):
+ cur_letter_counts += counts[all_ordered[cur_position]]
+ cur_position += 1
+ # otherwise we've already got the total counts for this letter
+ else:
+ pass
+
+ # now calculate the ml and add it to the estimation
+ cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
+ ml_estimation[cur_item] = cur_ml
+
+ return ml_estimation
+
+
+class BaumWelchTrainer(AbstractTrainer):
+ """Trainer that uses the Baum-Welch algorithm to estimate parameters.
+
+ These should be used when a training sequence for an HMM has unknown
+ paths for the actual states, and you need to make an estimation of the
+ model parameters from the observed emissions.
+
+ This uses the Baum-Welch algorithm, first described in
+ Baum, L.E. 1972. Inequalities. 3:1-8
+ This is based on the description in 'Biological Sequence Analysis' by
+ Durbin et al. in section 3.3
+
+    This algorithm is guaranteed to converge to a local maximum, but not
+    necessarily to the global maximum, so use with care!
+ """
+
+ def __init__(self, markov_model):
+ """Initialize the trainer.
+
+ Arguments:
+ - markov_model - The model we are going to estimate parameters for.
+ This should have the parameters with some initial estimates, that
+ we can build from.
+
+ """
+ AbstractTrainer.__init__(self, markov_model)
+
+ def train(self, training_seqs, stopping_criteria, dp_method=ScaledDPAlgorithms):
+ """Estimate the parameters using training sequences.
+
+ The algorithm for this is taken from Durbin et al. p64, so this
+ is a good place to go for a reference on what is going on.
+
+ Arguments:
+ - training_seqs -- A list of TrainingSequence objects to be used
+ for estimating the parameters.
+ - stopping_criteria -- A function, that when passed the change
+ in log likelihood and threshold, will indicate if we should stop
+ the estimation iterations.
+        - dp_method -- The dynamic programming class to use to calculate
+          the forward and backward variables (it is instantiated once per
+          training sequence). By default, we use the scaling method.
+
+ """
+ prev_log_likelihood = None
+ num_iterations = 1
+
+ while True:
+ transition_count = self._markov_model.get_blank_transitions()
+ emission_count = self._markov_model.get_blank_emissions()
+
+ # remember all of the sequence probabilities
+ all_probabilities = []
+
+ for training_seq in training_seqs:
+ # calculate the forward and backward variables
+ DP = dp_method(self._markov_model, training_seq)
+ forward_var, seq_prob = DP.forward_algorithm()
+ backward_var = DP.backward_algorithm()
+
+ all_probabilities.append(seq_prob)
+
+ # update the counts for transitions and emissions
+ transition_count = self.update_transitions(
+ transition_count, training_seq, forward_var, backward_var, seq_prob
+ )
+ emission_count = self.update_emissions(
+ emission_count, training_seq, forward_var, backward_var, seq_prob
+ )
+
+ # update the markov model with the new probabilities
+ ml_transitions, ml_emissions = self.estimate_params(
+ transition_count, emission_count
+ )
+ self._markov_model.transition_prob = ml_transitions
+ self._markov_model.emission_prob = ml_emissions
+
+ cur_log_likelihood = self.log_likelihood(all_probabilities)
+
+ # if we have previously calculated the log likelihood (ie.
+ # not the first round), see if we can finish
+ if prev_log_likelihood is not None:
+ # XXX log likelihoods are negatives -- am I calculating
+ # the change properly, or should I use the negatives...
+ # I'm not sure at all if this is right.
+ log_likelihood_change = abs(
+ abs(cur_log_likelihood) - abs(prev_log_likelihood)
+ )
+
+ # check whether we have completed enough iterations to have
+ # a good estimation
+ if stopping_criteria(log_likelihood_change, num_iterations):
+ break
+
+ # set up for another round of iterations
+ prev_log_likelihood = cur_log_likelihood
+ num_iterations += 1
+
+ return self._markov_model
+
+ def update_transitions(
+ self,
+ transition_counts,
+ training_seq,
+ forward_vars,
+ backward_vars,
+ training_seq_prob,
+ ):
+ """Add the contribution of a new training sequence to the transitions.
+
+ Arguments:
+ - transition_counts -- A dictionary of the current counts for the
+ transitions
+ - training_seq -- The training sequence we are working with
+ - forward_vars -- Probabilities calculated using the forward
+ algorithm.
+ - backward_vars -- Probabilities calculated using the backwards
+ algorithm.
+ - training_seq_prob - The probability of the current sequence.
+
+ This calculates A_{kl} (the estimated transition counts from state
+ k to state l) using formula 3.20 in Durbin et al.
+
+ """
+ # set up the transition and emission probabilities we are using
+ transitions = self._markov_model.transition_prob
+ emissions = self._markov_model.emission_prob
+
+ # loop over the possible combinations of state path letters
+ for k in self._markov_model.state_alphabet:
+ for l in self._markov_model.transitions_from(k):
+ estimated_counts = 0
+ # now loop over the entire training sequence
+ for i in range(len(training_seq.emissions) - 1):
+ # the forward value of k at the current position
+ forward_value = forward_vars[(k, i)]
+
+ # the backward value of l in the next position
+ backward_value = backward_vars[(l, i + 1)]
+
+ # the probability of a transition from k to l
+ trans_value = transitions[(k, l)]
+
+ # the probability of getting the emission at the next pos
+ emm_value = emissions[(l, training_seq.emissions[i + 1])]
+
+ estimated_counts += (
+ forward_value * trans_value * emm_value * backward_value
+ )
+
+ # update the transition approximation
+ transition_counts[(k, l)] += float(estimated_counts) / training_seq_prob
+
+ return transition_counts
+
+ def update_emissions(
+ self,
+ emission_counts,
+ training_seq,
+ forward_vars,
+ backward_vars,
+ training_seq_prob,
+ ):
+ """Add the contribution of a new training sequence to the emissions.
+
+ Arguments:
+ - emission_counts -- A dictionary of the current counts for the
+ emissions
+ - training_seq -- The training sequence we are working with
+ - forward_vars -- Probabilities calculated using the forward
+ algorithm.
+ - backward_vars -- Probabilities calculated using the backwards
+ algorithm.
+ - training_seq_prob - The probability of the current sequence.
+
+ This calculates E_{k}(b) (the estimated emission probability for
+ emission letter b from state k) using formula 3.21 in Durbin et al.
+
+ """
+ # loop over the possible combinations of state path letters
+ for k in self._markov_model.state_alphabet:
+ # now loop over all of the possible emissions
+ for b in self._markov_model.emission_alphabet:
+ expected_times = 0
+ # finally loop over the entire training sequence
+ for i in range(len(training_seq.emissions)):
+ # only count the forward and backward probability if the
+ # emission at the position is the same as b
+ if training_seq.emissions[i] == b:
+ # f_{k}(i) b_{k}(i)
+ expected_times += forward_vars[(k, i)] * backward_vars[(k, i)]
+
+ # add to E_{k}(b)
+ emission_counts[(k, b)] += float(expected_times) / training_seq_prob
+
+ return emission_counts
+
+
+class KnownStateTrainer(AbstractTrainer):
+ """Estimate probabilities with known state sequences.
+
+ This should be used for direct estimation of emission and transition
+ probabilities when both the state path and emission sequence are
+ known for the training examples.
+ """
+
+ def __init__(self, markov_model):
+ """Initialize the class."""
+ AbstractTrainer.__init__(self, markov_model)
+
+ def train(self, training_seqs):
+ """Estimate the Markov Model parameters with known state paths.
+
+ This trainer requires that both the state and the emissions are
+ known for all of the training sequences in the list of
+ TrainingSequence objects.
+ This training will then count all of the transitions and emissions,
+ and use this to estimate the parameters of the model.
+ """
+ # count up all of the transitions and emissions
+ transition_counts = self._markov_model.get_blank_transitions()
+ emission_counts = self._markov_model.get_blank_emissions()
+
+ for training_seq in training_seqs:
+ emission_counts = self._count_emissions(training_seq, emission_counts)
+ transition_counts = self._count_transitions(
+ training_seq.states, transition_counts
+ )
+
+ # update the markov model from the counts
+ ml_transitions, ml_emissions = self.estimate_params(
+ transition_counts, emission_counts
+ )
+ self._markov_model.transition_prob = ml_transitions
+ self._markov_model.emission_prob = ml_emissions
+
+ return self._markov_model
+
+ def _count_emissions(self, training_seq, emission_counts):
+ """Add emissions from the training sequence to the current counts (PRIVATE).
+
+ Arguments:
+ - training_seq -- A TrainingSequence with states and emissions
+ to get the counts from
+ - emission_counts -- The current emission counts to add to.
+
+ """
+ for index in range(len(training_seq.emissions)):
+ cur_state = training_seq.states[index]
+ cur_emission = training_seq.emissions[index]
+
+ try:
+ emission_counts[(cur_state, cur_emission)] += 1
+ except KeyError:
+ raise KeyError(
+ "Unexpected emission (%s, %s)" % (cur_state, cur_emission)
+ )
+ return emission_counts
+
+ def _count_transitions(self, state_seq, transition_counts):
+ """Add transitions from the training sequence to the current counts (PRIVATE).
+
+ Arguments:
+ - state_seq -- A Seq object with the states of the current training
+ sequence.
+ - transition_counts -- The current transition counts to add to.
+
+ """
+ for cur_pos in range(len(state_seq) - 1):
+ cur_state = state_seq[cur_pos]
+ next_state = state_seq[cur_pos + 1]
+
+ try:
+ transition_counts[(cur_state, next_state)] += 1
+ except KeyError:
+ raise KeyError(
+ "Unexpected transition (%s, %s)" % (cur_state, next_state)
+ )
+
+ return transition_counts
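Another hedged sketch, continuing the coin/dice example after MarkovModel.py above: KnownStateTrainer for labelled data, and BaumWelchTrainer with a user-supplied stopping criterion for unlabelled data. The training strings and the 0.01/10 stopping thresholds are invented for illustration.

```python
# Hypothetical training sketch for the trainers above.
from Bio.HMM.MarkovModel import MarkovModelBuilder
from Bio.HMM.Trainer import BaumWelchTrainer, KnownStateTrainer, TrainingSequence

builder = MarkovModelBuilder(("F", "L"), ("H", "T"))
builder.set_initial_probabilities({})   # uniform over the two states
builder.allow_all_transitions()
builder.set_random_probabilities()
model = builder.get_markov_model()

# Known state paths: direct maximum likelihood counting.
known = TrainingSequence("HTHHTH", "FFLLFF")
model = KnownStateTrainer(model).train([known])

# Unknown state paths: Baum-Welch, stopping on a small likelihood change.
def stop_training(log_likelihood_change, num_iterations):
    """Stop when the change is small or after ten iterations."""
    return log_likelihood_change < 0.01 or num_iterations >= 10

unknown = TrainingSequence("HTHHTHTTHH", ())
model = BaumWelchTrainer(model).train([unknown], stop_training)
print(model.transition_prob)
```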
diff --git a/code/lib/Bio/HMM/Utilities.py b/code/lib/Bio/HMM/Utilities.py
new file mode 100644
index 0000000..61d3b37
--- /dev/null
+++ b/code/lib/Bio/HMM/Utilities.py
@@ -0,0 +1,68 @@
+# Copyright 2001 Brad Chapman. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Generic functions which are useful for working with HMMs.
+
+This just collects general functions which you might like to use in
+dealing with HMMs.
+"""
+
+
+def pretty_print_prediction(
+ emissions,
+ real_state,
+ predicted_state,
+ emission_title="Emissions",
+ real_title="Real State",
+ predicted_title="Predicted State",
+ line_width=75,
+):
+ """Print out a state sequence prediction in a nice manner.
+
+ Arguments:
+ - emissions -- The sequence of emissions of the sequence you are
+ dealing with.
+ - real_state -- The actual state path that generated the emissions.
+ - predicted_state -- A state path predicted by some kind of HMM model.
+
+ """
+ # calculate the length of the titles and sequences
+ title_length = max(len(emission_title), len(real_title), len(predicted_title)) + 1
+ seq_length = line_width - title_length
+
+ # set up the titles so they'll print right
+ emission_title = emission_title.ljust(title_length)
+ real_title = real_title.ljust(title_length)
+ predicted_title = predicted_title.ljust(title_length)
+
+ cur_position = 0
+    # print the sequences in chunks of seq_length characters per line;
+    # slicing truncates the final chunk automatically
+    while True:
+
+ print(
+ "%s%s"
+ % (emission_title, emissions[cur_position : cur_position + seq_length])
+ )
+ print(
+ "%s%s" % (real_title, real_state[cur_position : cur_position + seq_length])
+ )
+ print(
+ "%s%s\n"
+ % (
+ predicted_title,
+ predicted_state[cur_position : cur_position + seq_length],
+ )
+ )
+
+        # stop once everything (including an exact final chunk) is printed
+        if len(emissions) <= (cur_position + seq_length):
+ break
+
+ cur_position += seq_length
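For example, with toy inputs (hypothetical; any three equal-length sequences work):

```python
# Hypothetical call of the helper above with invented toy sequences.
from Bio.HMM.Utilities import pretty_print_prediction

pretty_print_prediction("HTHHTH", "FFLLFF", "FFFLFF", line_width=40)
```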
diff --git a/code/lib/Bio/HMM/__init__.py b/code/lib/Bio/HMM/__init__.py
new file mode 100644
index 0000000..a477108
--- /dev/null
+++ b/code/lib/Bio/HMM/__init__.py
@@ -0,0 +1,5 @@
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+"""A selection of Hidden Markov Model code."""
diff --git a/code/lib/Bio/HMM/__pycache__/DynamicProgramming.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/DynamicProgramming.cpython-37.pyc
new file mode 100644
index 0000000..b9f86c9
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/DynamicProgramming.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/MarkovModel.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/MarkovModel.cpython-37.pyc
new file mode 100644
index 0000000..3d527bd
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/MarkovModel.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/Trainer.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/Trainer.cpython-37.pyc
new file mode 100644
index 0000000..89eb7b5
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/Trainer.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/Utilities.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/Utilities.cpython-37.pyc
new file mode 100644
index 0000000..6a1bd24
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/Utilities.cpython-37.pyc differ
diff --git a/code/lib/Bio/HMM/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/HMM/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..50bf067
Binary files /dev/null and b/code/lib/Bio/HMM/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/Compound/__init__.py b/code/lib/Bio/KEGG/Compound/__init__.py
new file mode 100644
index 0000000..000291a
--- /dev/null
+++ b/code/lib/Bio/KEGG/Compound/__init__.py
@@ -0,0 +1,175 @@
+# Copyright 2001 by Tarjei Mikkelsen. All rights reserved.
+# Copyright 2007 by Michiel de Hoon. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to work with the KEGG Ligand/Compound database.
+
+Functions:
+ - parse - Returns an iterator giving Record objects.
+
+Classes:
+ - Record - A representation of a KEGG Ligand/Compound.
+"""
+
+
+from Bio.KEGG import _default_wrap, _struct_wrap, _wrap_kegg, _write_kegg
+
+
+# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
+name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]
+id_wrap = _default_wrap
+struct_wrap = _struct_wrap
+
+
+class Record:
+ """Holds info from a KEGG Ligand/Compound record.
+
+ Attributes:
+ - entry The entry identifier.
+     - name       A list of the compound names.
+     - formula    The chemical formula for the compound.
+     - mass       The molecular weight of the compound.
+ - pathway A list of 3-tuples: ('PATH', pathway id, pathway)
+ - enzyme A list of the EC numbers.
+ - structures A list of 2-tuples: (database, list of struct ids)
+ - dblinks A list of 2-tuples: (database, list of link ids)
+
+ """
+
+ def __init__(self):
+ """Initialize as new record."""
+ self.entry = ""
+ self.name = []
+ self.formula = ""
+ self.mass = ""
+ self.pathway = []
+ self.enzyme = []
+ self.structures = []
+ self.dblinks = []
+
+ def __str__(self):
+ """Return a string representation of this Record."""
+ return (
+ self._entry()
+ + self._name()
+ + self._formula()
+ + self._mass()
+ + self._pathway()
+ + self._enzyme()
+ + self._structures()
+ + self._dblinks()
+ + "///"
+ )
+
+ def _entry(self):
+ return _write_kegg("ENTRY", [self.entry])
+
+ def _name(self):
+ return _write_kegg(
+ "NAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.name]
+ )
+
+ def _formula(self):
+ return _write_kegg("FORMULA", [self.formula])
+
+ def _mass(self):
+ return _write_kegg("MASS", [self.mass])
+
+ def _pathway(self):
+ s = []
+ for entry in self.pathway:
+ s.append(entry[0] + " " + entry[1])
+ return _write_kegg("PATHWAY", [_wrap_kegg(l, wrap_rule=id_wrap(16)) for l in s])
+
+ def _enzyme(self):
+ return _write_kegg(
+ "ENZYME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.enzyme]
+ )
+
+ def _structures(self):
+ s = []
+ for entry in self.structures:
+ s.append(entry[0] + ": " + " ".join(entry[1]) + " ")
+ return _write_kegg(
+ "STRUCTURES", [_wrap_kegg(l, wrap_rule=struct_wrap(5)) for l in s]
+ )
+
+ def _dblinks(self):
+ s = []
+ for entry in self.dblinks:
+ s.append(entry[0] + ": " + " ".join(entry[1]))
+ return _write_kegg("DBLINKS", [_wrap_kegg(l, wrap_rule=id_wrap(9)) for l in s])
+
+
+def parse(handle):
+ """Parse a KEGG Ligan/Compound file, returning Record objects.
+
+ This is an iterator function, typically used in a for loop. For
+ example, using one of the example KEGG files in the Biopython
+ test suite,
+
+ >>> with open("KEGG/compound.sample") as handle:
+ ... for record in parse(handle):
+ ... print("%s %s" % (record.entry, record.name[0]))
+ ...
+ C00023 Iron
+ C00017 Protein
+ C00099 beta-Alanine
+ C00294 Inosine
+ C00298 Trypsin
+ C00348 all-trans-Undecaprenyl phosphate
+ C00349 2-Methyl-3-oxopropanoate
+ C01386 NH2Mec
+
+ """
+ record = Record()
+ for line in handle:
+ if line[:3] == "///":
+ yield record
+ record = Record()
+ continue
+        if line[:12] != " " * 12:
+            keyword = line[:12]
+        data = line[12:].strip()
+ if keyword == "ENTRY ":
+ words = data.split()
+ record.entry = words[0]
+ elif keyword == "NAME ":
+ data = data.strip(";")
+ record.name.append(data)
+ elif keyword == "ENZYME ":
+ while data:
+ column = data[:16]
+ data = data[16:]
+ enzyme = column.strip()
+ record.enzyme.append(enzyme)
+ elif keyword == "PATHWAY ":
+                # the pathway id and name are separated by whitespace
+                map, name = data.split(None, 1)
+ pathway = ("PATH", map, name)
+ record.pathway.append(pathway)
+ elif keyword == "FORMULA ":
+ record.formula = data
+ elif keyword == "MASS ":
+ record.mass = data
+ elif keyword == "DBLINKS ":
+ if ":" in data:
+ key, values = data.split(":")
+ values = values.split()
+ row = (key, values)
+ record.dblinks.append(row)
+ else:
+ row = record.dblinks[-1]
+ key, values = row
+ values.extend(data.split())
+ row = key, values
+ record.dblinks[-1] = row
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
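Beyond the doctest above, parse() accepts any file-like handle. A hedged sketch with a hand-written record; note that each KEGG keyword field must be padded to 12 columns:

```python
# Hypothetical in-memory KEGG Compound record (keyword fields are 12 wide).
from io import StringIO

from Bio.KEGG.Compound import parse

sample = (
    "ENTRY       C00023                      Compound\n"
    "NAME        Iron\n"
    "FORMULA     Fe\n"
    "///\n"
)
for record in parse(StringIO(sample)):
    print(record.entry, record.name, record.formula)  # C00023 ['Iron'] Fe
```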
diff --git a/code/lib/Bio/KEGG/Compound/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Compound/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..7d9b138
Binary files /dev/null and b/code/lib/Bio/KEGG/Compound/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/Enzyme/__init__.py b/code/lib/Bio/KEGG/Enzyme/__init__.py
new file mode 100644
index 0000000..bb5bb7c
--- /dev/null
+++ b/code/lib/Bio/KEGG/Enzyme/__init__.py
@@ -0,0 +1,328 @@
+# Copyright 2001 by Tarjei Mikkelsen. All rights reserved.
+# Copyright 2007 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code to work with the KEGG Enzyme database.
+
+Functions:
+ - parse - Returns an iterator giving Record objects.
+
+Classes:
+ - Record - Holds the information from a KEGG Enzyme record.
+"""
+
+
+from Bio.KEGG import _default_wrap, _struct_wrap, _wrap_kegg, _write_kegg
+
+
+# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
+rxn_wrap = [
+ 0,
+ "",
+ (" + ", "", 1, 1),
+ (" = ", "", 1, 1),
+ (" ", "$", 1, 1),
+ ("-", "$", 1, 1),
+]
+name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]
+id_wrap = _default_wrap
+struct_wrap = _struct_wrap
+
+
+class Record:
+ """Holds info from a KEGG Enzyme record.
+
+ Attributes:
+     - entry       The EC number (without the 'EC ').
+ - name A list of the enzyme names.
+ - classname A list of the classification terms.
+ - sysname The systematic name of the enzyme.
+ - reaction A list of the reaction description strings.
+ - substrate A list of the substrates.
+ - product A list of the products.
+ - inhibitor A list of the inhibitors.
+ - cofactor A list of the cofactors.
+ - effector A list of the effectors.
+ - comment A list of the comment strings.
+ - pathway A list of 3-tuples: (database, id, pathway)
+ - genes A list of 2-tuples: (organism, list of gene ids)
+ - disease A list of 3-tuples: (database, id, disease)
+ - structures A list of 2-tuples: (database, list of struct ids)
+ - dblinks A list of 2-tuples: (database, list of db ids)
+
+ """
+
+ def __init__(self):
+ """Initialize a new Record."""
+ self.entry = ""
+ self.name = []
+ self.classname = []
+ self.sysname = []
+ self.reaction = []
+ self.substrate = []
+ self.product = []
+ self.inhibitor = []
+ self.cofactor = []
+ self.effector = []
+ self.comment = []
+ self.pathway = []
+ self.genes = []
+ self.disease = []
+ self.structures = []
+ self.dblinks = []
+
+ def __str__(self):
+ """Return a string representation of this Record."""
+ return (
+ self._entry()
+ + self._name()
+ + self._classname()
+ + self._sysname()
+ + self._reaction()
+ + self._substrate()
+ + self._product()
+ + self._inhibitor()
+ + self._cofactor()
+ + self._effector()
+ + self._comment()
+ + self._pathway()
+ + self._genes()
+ + self._disease()
+ + self._structures()
+ + self._dblinks()
+ + "///"
+ )
+
+ def _entry(self):
+ return _write_kegg("ENTRY", ["EC " + self.entry])
+
+ def _name(self):
+ return _write_kegg(
+ "NAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.name]
+ )
+
+ def _classname(self):
+ return _write_kegg("CLASS", self.classname)
+
+ def _sysname(self):
+ return _write_kegg(
+ "SYSNAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.sysname]
+ )
+
+ def _reaction(self):
+ return _write_kegg(
+ "REACTION", [_wrap_kegg(l, wrap_rule=rxn_wrap) for l in self.reaction]
+ )
+
+ def _substrate(self):
+ return _write_kegg(
+ "SUBSTRATE", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.substrate]
+ )
+
+ def _product(self):
+ return _write_kegg(
+ "PRODUCT", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.product]
+ )
+
+ def _inhibitor(self):
+ return _write_kegg(
+ "INHIBITOR", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.inhibitor]
+ )
+
+ def _cofactor(self):
+ return _write_kegg(
+ "COFACTOR", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.cofactor]
+ )
+
+ def _effector(self):
+ return _write_kegg(
+ "EFFECTOR", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.effector]
+ )
+
+ def _comment(self):
+ return _write_kegg(
+ "COMMENT", [_wrap_kegg(l, wrap_rule=id_wrap(0)) for l in self.comment]
+ )
+
+ def _pathway(self):
+ s = []
+ for entry in self.pathway:
+ s.append(entry[0] + ": " + entry[1] + " " + entry[2])
+ return _write_kegg("PATHWAY", [_wrap_kegg(l, wrap_rule=id_wrap(16)) for l in s])
+
+ def _genes(self):
+ s = []
+ for entry in self.genes:
+ s.append(entry[0] + ": " + " ".join(entry[1]))
+ return _write_kegg("GENES", [_wrap_kegg(l, wrap_rule=id_wrap(5)) for l in s])
+
+ def _disease(self):
+ s = []
+ for entry in self.disease:
+ s.append(entry[0] + ": " + entry[1] + " " + entry[2])
+ return _write_kegg("DISEASE", [_wrap_kegg(l, wrap_rule=id_wrap(13)) for l in s])
+
+ def _structures(self):
+ s = []
+ for entry in self.structures:
+ s.append(entry[0] + ": " + " ".join(entry[1]) + " ")
+ return _write_kegg(
+ "STRUCTURES", [_wrap_kegg(l, wrap_rule=struct_wrap(5)) for l in s]
+ )
+
+ def _dblinks(self):
+        # This is a bit of a cheat that won't work if enzyme entries
+        # have more than one link id per db id. For now, that's not
+        # the case; storing link ids in a list is only to make
+        # this class similar to the Compound.Record class.
+ s = []
+ for entry in self.dblinks:
+ s.append(entry[0] + ": " + " ".join(entry[1]))
+ return _write_kegg("DBLINKS", s)
+
+
+def parse(handle):
+ """Parse a KEGG Enzyme file, returning Record objects.
+
+ This is an iterator function, typically used in a for loop. For
+ example, using one of the example KEGG files in the Biopython
+ test suite,
+
+ >>> with open("KEGG/enzyme.sample") as handle:
+ ... for record in parse(handle):
+ ... print("%s %s" % (record.entry, record.name[0]))
+ ...
+ 1.1.1.1 alcohol dehydrogenase
+ 1.1.1.62 17beta-estradiol 17-dehydrogenase
+ 1.1.1.68 Transferred to 1.5.1.20
+ 1.6.5.3 NADH:ubiquinone reductase (H+-translocating)
+ 1.14.13.28 3,9-dihydroxypterocarpan 6a-monooxygenase
+ 2.4.1.68 glycoprotein 6-alpha-L-fucosyltransferase
+ 3.1.1.6 acetylesterase
+ 2.7.2.1 acetate kinase
+
+ """
+ record = Record()
+ for line in handle:
+ if line[:3] == "///":
+ yield record
+ record = Record()
+ continue
+        if line[:12] != " " * 12:
+            keyword = line[:12]
+        data = line[12:].strip()
+ if keyword == "ENTRY ":
+ words = data.split()
+ record.entry = words[1]
+ elif keyword == "CLASS ":
+ record.classname.append(data)
+ elif keyword == "COFACTOR ":
+ record.cofactor.append(data)
+ elif keyword == "COMMENT ":
+ record.comment.append(data)
+ elif keyword == "DBLINKS ":
+ if ":" in data:
+ key, values = data.split(":")
+ values = values.split()
+ row = (key, values)
+ record.dblinks.append(row)
+ else:
+ row = record.dblinks[-1]
+ key, values = row
+ values.extend(data.split())
+ row = key, values
+ record.dblinks[-1] = row
+ elif keyword == "DISEASE ":
+ if ":" in data:
+ database, data = data.split(":")
+ number, name = data.split(None, 1)
+ row = (database, number, name)
+ record.disease.append(row)
+ else:
+ row = record.disease[-1]
+ database, number, name = row
+ name = name + " " + data
+ row = database, number, name
+ record.disease[-1] = row
+ elif keyword == "EFFECTOR ":
+ record.effector.append(data.strip(";"))
+ elif keyword == "GENES ":
+ if data[3:5] == ": " or data[4:6] == ": ":
+ key, values = data.split(":", 1)
+ values = [value.split("(")[0] for value in values.split()]
+ row = (key, values)
+ record.genes.append(row)
+ else:
+ row = record.genes[-1]
+ key, values = row
+ for value in data.split():
+ value = value.split("(")[0]
+ values.append(value)
+ row = key, values
+ record.genes[-1] = row
+ elif keyword == "INHIBITOR ":
+ record.inhibitor.append(data.strip(";"))
+ elif keyword == "NAME ":
+ record.name.append(data.strip(";"))
+ elif keyword == "PATHWAY ":
+ if data[:5] == "PATH:":
+ _, map_num, name = data.split(None, 2)
+ pathway = ("PATH", map_num, name)
+ record.pathway.append(pathway)
+ else:
+ ec_num, name = data.split(None, 1)
+ pathway = "PATH", ec_num, name
+ record.pathway.append(pathway)
+ elif keyword == "PRODUCT ":
+ record.product.append(data.strip(";"))
+ elif keyword == "REACTION ":
+ record.reaction.append(data.strip(";"))
+ elif keyword == "STRUCTURES ":
+ if data[:4] == "PDB:":
+ database = data[:3]
+ accessions = data[4:].split()
+ row = (database, accessions)
+ record.structures.append(row)
+ else:
+ row = record.structures[-1]
+ database, accessions = row
+ accessions.extend(data.split())
+ row = (database, accessions)
+ record.structures[-1] = row
+ elif keyword == "SUBSTRATE ":
+ record.substrate.append(data.strip(";"))
+ elif keyword == "SYSNAME ":
+ record.sysname.append(data.strip(";"))
+
+
+def read(handle):
+ """Parse a KEGG Enzyme file with exactly one entry.
+
+ If the handle contains no records, or more than one record,
+ an exception is raised. For example:
+
+ >>> with open("KEGG/enzyme.new") as handle:
+ ... record = read(handle)
+ ... print("%s %s" % (record.entry, record.name[0]))
+ ...
+ 6.2.1.25 benzoate---CoA ligase
+ """
+ records = parse(handle)
+ try:
+ record = next(records)
+ except StopIteration:
+ raise ValueError("No records found in handle") from None
+ try:
+ next(records)
+ raise ValueError("More than one record found in handle")
+ except StopIteration:
+ pass
+ return record
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
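Similarly, read() consumes a handle holding exactly one record and raises ValueError otherwise; a hedged in-memory sketch:

```python
# Hypothetical single-record handle for the read() function above.
from io import StringIO

from Bio.KEGG.Enzyme import read

sample = (
    "ENTRY       EC 1.1.1.1                  Enzyme\n"
    "NAME        alcohol dehydrogenase;\n"
    "///\n"
)
record = read(StringIO(sample))
print(record.entry, record.name[0])  # 1.1.1.1 alcohol dehydrogenase
```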
diff --git a/code/lib/Bio/KEGG/Enzyme/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Enzyme/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..b6c2c5f
Binary files /dev/null and b/code/lib/Bio/KEGG/Enzyme/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/Gene/__init__.py b/code/lib/Bio/KEGG/Gene/__init__.py
new file mode 100644
index 0000000..8ffe5c2
--- /dev/null
+++ b/code/lib/Bio/KEGG/Gene/__init__.py
@@ -0,0 +1,140 @@
+# Copyright 2017 by Kozo Nishida. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to work with the KEGG Gene database.
+
+Functions:
+- parse - Returns an iterator giving Record objects.
+
+Classes:
+- Record - A representation of a KEGG Gene.
+
+"""
+
+
+from Bio.KEGG import _default_wrap, _wrap_kegg, _write_kegg
+
+
+# Set up line wrapping rules (see Bio.KEGG._wrap_kegg)
+name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]
+id_wrap = _default_wrap
+
+
+class Record:
+ """Holds info from a KEGG Gene record.
+
+ Attributes:
+ - entry The entry identifier.
+ - name A list of the gene names.
+ - definition The definition for the gene.
+ - orthology A list of 2-tuples: (orthology id, role)
+ - organism A tuple: (organism id, organism)
+ - position The position for the gene
+ - motif A list of 2-tuples: (database, list of link ids)
+ - dblinks A list of 2-tuples: (database, list of link ids)
+
+ """
+
+ def __init__(self):
+ """Initialize new record."""
+ self.entry = ""
+ self.name = []
+ self.definition = ""
+ self.orthology = []
+ self.organism = ""
+ self.position = ""
+ self.motif = []
+ self.dblinks = []
+
+ def __str__(self):
+ """Return a string representation of this Record."""
+ return self._entry() + self._name() + self._dblinks() + "///"
+
+ def _entry(self):
+ return _write_kegg("ENTRY", [self.entry])
+
+ def _name(self):
+ return _write_kegg(
+ "NAME", [_wrap_kegg(l, wrap_rule=name_wrap) for l in self.name]
+ )
+
+ def _definition(self):
+ return _write_kegg("DEFINITION", [self.definition])
+
+ def _dblinks(self):
+ s = []
+ for entry in self.dblinks:
+ s.append(entry[0] + ": " + " ".join(entry[1]))
+ return _write_kegg("DBLINKS", [_wrap_kegg(l, wrap_rule=id_wrap(9)) for l in s])
+
+
+def parse(handle):
+ """Parse a KEGG Gene file, returning Record objects.
+
+ This is an iterator function, typically used in a for loop. For
+ example, using one of the example KEGG files in the Biopython
+ test suite,
+
+ >>> with open("KEGG/gene.sample") as handle:
+ ... for record in parse(handle):
+ ... print("%s %s" % (record.entry, record.name[0]))
+ ...
+ b1174 minE
+ b1175 minD
+
+
+ """
+ record = Record()
+ for line in handle:
+ if line[:3] == "///":
+ yield record
+ record = Record()
+ continue
+        if line[:12] != " " * 12:
+            keyword = line[:12]
+        data = line[12:].strip()
+ if keyword == "ENTRY ":
+ words = data.split()
+ record.entry = words[0]
+ elif keyword == "NAME ":
+ data = data.strip(";")
+ record.name.append(data)
+ elif keyword == "DEFINITION ":
+ record.definition = data
+ elif keyword == "ORTHOLOGY ":
+                # orthology id and description are whitespace separated
+                id, name = data.split(None, 1)
+ orthology = (id, name)
+ record.orthology.append(orthology)
+ elif keyword == "ORGANISM ":
+                # organism id and name are whitespace separated
+                id, name = data.split(None, 1)
+ organism = (id, name)
+ record.organism = organism
+ elif keyword == "POSITION ":
+ record.position = data
+ elif keyword == "MOTIF ":
+ key, values = data.split(": ")
+ values = values.split()
+ row = (key, values)
+ record.motif.append(row)
+ elif keyword == "DBLINKS ":
+ if ":" in data:
+ key, values = data.split(": ")
+ values = values.split()
+ row = (key, values)
+ record.dblinks.append(row)
+ else:
+ row = record.dblinks[-1]
+ key, values = row
+ values.extend(data.split())
+ row = key, values
+ record.dblinks[-1] = row
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest()
diff --git a/code/lib/Bio/KEGG/Gene/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Gene/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..efa0935
Binary files /dev/null and b/code/lib/Bio/KEGG/Gene/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/KGML/KGML_parser.py b/code/lib/Bio/KEGG/KGML/KGML_parser.py
new file mode 100644
index 0000000..6405ce3
--- /dev/null
+++ b/code/lib/Bio/KEGG/KGML/KGML_parser.py
@@ -0,0 +1,189 @@
+# Copyright 2013 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Classes and functions to parse a KGML pathway map.
+
+The KGML pathway map is parsed into the object structure defined in
+KGML_Pathway.py in this module.
+
+Classes:
+ - KGMLParser - Parses KGML file
+
+Functions:
+ - read - Returns a single Pathway object, using KGMLParser internally
+
+"""
+
+from xml.etree import ElementTree
+
+from io import StringIO
+
+from Bio.KEGG.KGML.KGML_pathway import Component, Entry, Graphics
+from Bio.KEGG.KGML.KGML_pathway import Pathway, Reaction, Relation
+
+
+def read(handle):
+ """Parse a single KEGG Pathway from given file handle.
+
+ Returns a single Pathway object. There should be one and only
+ one pathway in each file, but there may well be pathological
+ examples out there.
+ """
+ pathways = parse(handle)
+ try:
+ pathway = next(pathways)
+ except StopIteration:
+ raise ValueError("No pathways found in handle") from None
+ try:
+ next(pathways)
+ raise ValueError("More than one pathway found in handle")
+ except StopIteration:
+ pass
+ return pathway
+
+
+def parse(handle):
+ """Return an iterator over Pathway elements.
+
+ Arguments:
+ - handle - file handle to a KGML file for parsing, or a KGML string
+
+ This is a generator for the return of multiple Pathway objects.
+
+ """
+ # Check handle
+ try:
+ handle.read(0)
+ except AttributeError:
+ try:
+ handle = StringIO(handle)
+ except TypeError:
+ raise TypeError(
+ "An XML-containing handle or an XML string must be provided"
+ ) from None
+ # Parse XML and return each Pathway
+ for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
+ if event == "end" and elem.tag == "pathway":
+ yield KGMLParser(elem).parse()
+ elem.clear()
+
+
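+# Illustrative sketch, not part of the upstream module: read() demands
+# exactly one pathway per handle, while parse() iterates. Both accept an
+# open file or a raw KGML string; "ko00010.xml" is a hypothetical local
+# download of the KGML for the glycolysis map.
+def _example_read_pathway():
+    """Read one pathway from a locally saved KGML file."""
+    with open("ko00010.xml") as handle:
+        pathway = read(handle)
+    print(pathway.title, len(pathway.entries))
+
+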
+class KGMLParser:
+ """Parses a KGML XML Pathway entry into a Pathway object.
+
+ Example: Read and parse large metabolism file
+
+ >>> from Bio.KEGG.KGML.KGML_parser import read
+ >>> pathway = read(open('KEGG/ko01100.xml', 'r'))
+ >>> print(len(pathway.entries))
+ 3628
+ >>> print(len(pathway.reactions))
+ 1672
+ >>> print(len(pathway.maps))
+ 149
+
+ >>> pathway = read(open('KEGG/ko00010.xml', 'r'))
+ >>> print(pathway) #doctest: +NORMALIZE_WHITESPACE
+ Pathway: Glycolysis / Gluconeogenesis
+ KEGG ID: path:ko00010
+ Image file: http://www.kegg.jp/kegg/pathway/ko/ko00010.png
+ Organism: ko
+ Entries: 99
+ Entry types:
+ ortholog: 61
+ compound: 31
+ map: 7
+
+ """
+
+ def __init__(self, elem):
+ """Initialize the class."""
+ self.entry = elem
+
+ def parse(self):
+ """Parse the input elements."""
+
+ def _parse_pathway(attrib):
+ for k, v in attrib.items():
+ self.pathway.__setattr__(k, v)
+
+ def _parse_entry(element):
+ new_entry = Entry()
+ for k, v in element.attrib.items():
+ new_entry.__setattr__(k, v)
+ for subelement in element:
+ if subelement.tag == "graphics":
+ _parse_graphics(subelement, new_entry)
+ elif subelement.tag == "component":
+ _parse_component(subelement, new_entry)
+ self.pathway.add_entry(new_entry)
+
+ def _parse_graphics(element, entry):
+ new_graphics = Graphics(entry)
+ for k, v in element.attrib.items():
+ new_graphics.__setattr__(k, v)
+ entry.add_graphics(new_graphics)
+
+ def _parse_component(element, entry):
+ new_component = Component(entry)
+ for k, v in element.attrib.items():
+ new_component.__setattr__(k, v)
+ entry.add_component(new_component)
+
+ def _parse_reaction(element):
+ new_reaction = Reaction()
+ for k, v in element.attrib.items():
+ new_reaction.__setattr__(k, v)
+ for subelement in element:
+ if subelement.tag == "substrate":
+ new_reaction.add_substrate(int(subelement.attrib["id"]))
+ elif subelement.tag == "product":
+ new_reaction.add_product(int(subelement.attrib["id"]))
+ self.pathway.add_reaction(new_reaction)
+
+ def _parse_relation(element):
+ new_relation = Relation()
+ new_relation.entry1 = int(element.attrib["entry1"])
+ new_relation.entry2 = int(element.attrib["entry2"])
+ new_relation.type = element.attrib["type"]
+ for subtype in element:
+ name, value = subtype.attrib["name"], subtype.attrib["value"]
+ if name in ("compound", "hidden compound"):
+ new_relation.subtypes.append((name, int(value)))
+ else:
+ new_relation.subtypes.append((name, value))
+ self.pathway.add_relation(new_relation)
+
+ # ==========
+ # Initialize Pathway
+ self.pathway = Pathway()
+ # Get information about the pathway itself
+ _parse_pathway(self.entry.attrib)
+ for element in self.entry:
+ if element.tag == "entry":
+ _parse_entry(element)
+ elif element.tag == "reaction":
+ _parse_reaction(element)
+ elif element.tag == "relation":
+ _parse_relation(element)
+ # Parsing of some elements not implemented - no examples yet
+ else:
+ # This should warn us of any unimplemented tags
+ import warnings
+ from Bio import BiopythonParserWarning
+
+ warnings.warn(
+ "Warning: tag %s not implemented in parser" % element.tag,
+ BiopythonParserWarning,
+ )
+ return self.pathway
+
+
+if __name__ == "__main__":
+ from Bio._utils import run_doctest
+
+ run_doctest(verbose=0)
diff --git a/code/lib/Bio/KEGG/KGML/KGML_pathway.py b/code/lib/Bio/KEGG/KGML/KGML_pathway.py
new file mode 100644
index 0000000..12dd8aa
--- /dev/null
+++ b/code/lib/Bio/KEGG/KGML/KGML_pathway.py
@@ -0,0 +1,859 @@
+# Copyright 2013 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Classes to represent a KGML Pathway Map.
+
+The KGML definition is as of release KGML v0.7.2
+(http://www.kegg.jp/kegg/xml/docs/)
+
+Classes:
+ - Pathway - Specifies graph information for the pathway map
+ - Relation - Specifies a relationship between two proteins or KOs,
+ or protein and compound. There is an implied direction to the
+ relationship in some cases.
+ - Reaction - A specific chemical reaction between a substrate and
+ a product.
+ - Entry - A node in the pathway graph
+ - Graphics - Entry subelement describing its visual representation
+
+"""
+
+import time
+from itertools import chain
+from xml.dom import minidom
+import xml.etree.ElementTree as ET
+
+
+# Pathway
+class Pathway:
+ """Represents a KGML pathway from KEGG.
+
+ Specifies graph information for the pathway map, as described in
+ release KGML v0.7.2 (http://www.kegg.jp/kegg/xml/docs/)
+
+ Attributes:
+ - name - KEGGID of the pathway map
+ - org - ko/ec/[org prefix]
+ - number - map number (integer)
+ - title - the map title
+ - image - URL of the image map for the pathway
+ - link - URL of information about the pathway
+ - entries - Dictionary of entries in the pathway, keyed by node ID
+ - reactions - Set of reactions in the pathway
+
+ The name attribute has a restricted format, so we make it a property and
+ enforce the formatting.
+
+ The Pathway object is the only allowed route for adding/removing
+ Entry, Reaction, or Relation elements.
+
+ Entries are held in a dictionary and keyed by the node ID for the
+ pathway graph - this allows for ready access via the Reaction/Relation
+ etc. elements. Entries must be added before reference by any other
+ element.
+
+ Reactions are held in a dictionary, keyed by node ID for the path.
+ The elements referred to in the reaction must be added before the
+ reaction itself.
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self._name = ""
+ self.org = ""
+ self._number = None
+ self.title = ""
+ self.image = ""
+ self.link = ""
+ self.entries = {}
+ self._reactions = {}
+ self._relations = set()
+
+ def get_KGML(self):
+ """Return the pathway as a string in prettified KGML format."""
+ header = "\n".join(
+ [
+                '<?xml version="1.0" encoding="utf-8"?>',
+                "<!DOCTYPE pathway SYSTEM "
+                '"http://www.genome.jp/kegg/xml/KGML_v0.7.2_.dtd">',
+                "<!-- Created by KGML_pathway.py %s -->" % time.asctime(),
+ ]
+ )
+ rough_xml = header + ET.tostring(self.element, "utf-8").decode()
+ reparsed = minidom.parseString(rough_xml)
+ return reparsed.toprettyxml(indent=" ")
+
+ def add_entry(self, entry):
+ """Add an Entry element to the pathway."""
+ # We insist that the node ID is an integer
+ if not isinstance(entry.id, int):
+ raise TypeError(
+ "Node ID must be an integer, got %s (%s)" % (type(entry.id), entry.id)
+ )
+ entry._pathway = self # Let the entry know about the pathway
+ self.entries[entry.id] = entry
+
+ def remove_entry(self, entry):
+ """Remove an Entry element from the pathway."""
+ if not isinstance(entry.id, int):
+ raise TypeError(
+ "Node ID must be an integer, got %s (%s)" % (type(entry.id), entry.id)
+ )
+ # We need to remove the entry from any other elements that may
+ # contain it, which means removing those elements
+ # TODO
+ del self.entries[entry.id]
+
+ def add_reaction(self, reaction):
+ """Add a Reaction element to the pathway."""
+ # We insist that the node ID is an integer and corresponds to an entry
+ if not isinstance(reaction.id, int):
+ raise ValueError(
+ "Node ID must be an integer, got %s (%s)"
+ % (type(reaction.id), reaction.id)
+ )
+ if reaction.id not in self.entries:
+ raise ValueError("Reaction ID %d has no corresponding entry" % reaction.id)
+ reaction._pathway = self # Let the reaction know about the pathway
+ self._reactions[reaction.id] = reaction
+
+ def remove_reaction(self, reaction):
+ """Remove a Reaction element from the pathway."""
+ if not isinstance(reaction.id, int):
+ raise TypeError(
+ "Node ID must be an integer, got %s (%s)"
+ % (type(reaction.id), reaction.id)
+ )
+ # We need to remove the reaction from any other elements that may
+ # contain it, which means removing those elements
+ # TODO
+ del self._reactions[reaction.id]
+
+ def add_relation(self, relation):
+ """Add a Relation element to the pathway."""
+ relation._pathway = self # Let the reaction know about the pathway
+ self._relations.add(relation)
+
+ def remove_relation(self, relation):
+ """Remove a Relation element from the pathway."""
+ self._relations.remove(relation)
+
+ def __str__(self):
+ """Return a readable summary description string."""
+ outstr = [
+ "Pathway: %s" % self.title,
+ "KEGG ID: %s" % self.name,
+ "Image file: %s" % self.image,
+ "Organism: %s" % self.org,
+ "Entries: %d" % len(self.entries),
+ "Entry types:",
+ ]
+ for t in ["ortholog", "enzyme", "reaction", "gene", "group", "compound", "map"]:
+ etype = [e for e in self.entries.values() if e.type == t]
+ if len(etype):
+ outstr.append("\t%s: %d" % (t, len(etype)))
+ return "\n".join(outstr) + "\n"
+
+ # Assert correct formatting of the pathway name, and other attributes
+ def _getname(self):
+ return self._name
+
+ def _setname(self, value):
+ if not value.startswith("path:"):
+ raise ValueError("Pathway name should begin with 'path:', got %s" % value)
+ self._name = value
+
+ def _delname(self):
+ del self._name
+
+ name = property(_getname, _setname, _delname, "The KEGGID for the pathway map.")
+
+ def _getnumber(self):
+ return self._number
+
+ def _setnumber(self, value):
+ self._number = int(value)
+
+ def _delnumber(self):
+ del self._number
+
+ number = property(_getnumber, _setnumber, _delnumber, "The KEGG map number.")
+
+ @property
+ def compounds(self):
+ """Get a list of entries of type compound."""
+ return [e for e in self.entries.values() if e.type == "compound"]
+
+ @property
+ def maps(self):
+ """Get a list of entries of type map."""
+ return [e for e in self.entries.values() if e.type == "map"]
+
+ @property
+ def orthologs(self):
+ """Get a list of entries of type ortholog."""
+ return [e for e in self.entries.values() if e.type == "ortholog"]
+
+ @property
+ def genes(self):
+ """Get a list of entries of type gene."""
+ return [e for e in self.entries.values() if e.type == "gene"]
+
+ @property
+ def reactions(self):
+ """Get a list of reactions in the pathway."""
+ return self._reactions.values()
+
+ @property
+ def reaction_entries(self):
+ """List of entries corresponding to each reaction in the pathway."""
+ return [self.entries[i] for i in self._reactions]
+
+ @property
+ def relations(self):
+ """Get a list of relations in the pathway."""
+ return list(self._relations)
+
+ @property
+ def element(self):
+ """Return the Pathway as a valid KGML element."""
+ # The root is this Pathway element
+ pathway = ET.Element("pathway")
+ pathway.attrib = {
+ "name": self._name,
+ "org": self.org,
+ "number": str(self._number),
+ "title": self.title,
+ "image": self.image,
+ "link": self.link,
+ }
+ # We add the Entries in node ID order
+ for eid, entry in sorted(self.entries.items()):
+ pathway.append(entry.element)
+ # Next we add Relations
+ for relation in self._relations:
+ pathway.append(relation.element)
+ for eid, reaction in sorted(self._reactions.items()):
+ pathway.append(reaction.element)
+ return pathway
+
+ @property
+ def bounds(self):
+ """Coordinate bounds for all Graphics elements in the Pathway.
+
+ Returns the [(xmin, ymin), (xmax, ymax)] coordinates for all
+ Graphics elements in the Pathway
+ """
+ xlist, ylist = [], []
+ for b in [g.bounds for g in self.entries.values()]:
+ xlist.extend([b[0][0], b[1][0]])
+ ylist.extend([b[0][1], b[1][1]])
+ return [(min(xlist), min(ylist)), (max(xlist), max(ylist))]
+
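+
+# Illustrative sketch, not part of the upstream module: Pathway is the only
+# route for adding elements, and every node ID must be registered as an
+# Entry before anything else refers to it. All IDs and names are invented.
+def _example_build_pathway():
+    """Assemble a tiny two-node pathway by hand and serialize it."""
+    pathway = Pathway()
+    pathway.name = "path:ko99999"  # the setter enforces the "path:" prefix
+    pathway.number = "99999"
+    for node_id, kegg_name, etype in [
+        (1, "ko:K00001", "ortholog"),
+        (2, "cpd:C00001", "compound"),
+    ]:
+        entry = Entry()
+        entry.id = node_id  # the setter coerces this to int
+        entry.name = kegg_name
+        entry.type = etype
+        pathway.add_entry(entry)
+    return ET.tostring(pathway.element, "unicode")
+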
+
+# Entry
+class Entry:
+ """Represent an Entry from KGML.
+
+ Each Entry element is a node in the pathway graph, as described in
+ release KGML v0.7.2 (http://www.kegg.jp/kegg/xml/docs/)
+
+ Attributes:
+ - id - The ID of the entry in the pathway map (integer)
+ - names - List of KEGG IDs for the entry
+ - type - The type of the entry
+ - link - URL of information about the entry
+ - reaction - List of KEGG IDs of the corresponding reactions
+ (integer)
+ - graphics - List of Graphics objects describing the Entry's visual
+ representation
+      - components - List of component node IDs for this Entry ('group')
+ - alt - List of alternate names for the Entry
+
+ NOTE: The alt attribute represents a subelement of the substrate and
+ product elements in the KGML file
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self._id = None
+ self._names = []
+ self.type = ""
+ self.image = ""
+ self.link = ""
+ self.graphics = []
+ self.components = set()
+ self.alt = []
+ self._pathway = None
+ self._reactions = []
+
+ def __str__(self):
+ """Return readable descriptive string."""
+ outstr = [
+ "Entry node ID: %d" % self.id,
+ "Names: %s" % self.name,
+ "Type: %s" % self.type,
+ "Components: %s" % self.components,
+ "Reactions: %s" % self.reaction,
+ "Graphics elements: %d %s" % (len(self.graphics), self.graphics),
+ ]
+ return "\n".join(outstr) + "\n"
+
+ def add_component(self, element):
+ """Add an element to the entry.
+
+ If the Entry is already part of a pathway, make sure
+ the component already exists.
+ """
+ if self._pathway is not None:
+ if element.id not in self._pathway.entries:
+ raise ValueError(
+ "Component %s is not an entry in the pathway" % element.id
+ )
+ self.components.add(element)
+
+ def remove_component(self, value):
+ """Remove the entry with the passed ID from the group."""
+ self.components.remove(value)
+
+ def add_graphics(self, entry):
+ """Add the Graphics entry."""
+ self.graphics.append(entry)
+
+ def remove_graphics(self, entry):
+ """Remove the Graphics entry with the passed ID from the group."""
+ self.graphics.remove(entry)
+
+ # Names may be given as a space-separated list of KEGG identifiers
+ def _getname(self):
+ return " ".join(self._names)
+
+ def _setname(self, value):
+ self._names = value.split()
+
+ def _delname(self):
+ self._names = []
+
+ name = property(
+ _getname, _setname, _delname, "List of KEGG identifiers for the Entry."
+ )
+
+ # Reactions may be given as a space-separated list of KEGG identifiers
+ def _getreaction(self):
+ return " ".join(self._reactions)
+
+ def _setreaction(self, value):
+ self._reactions = value.split()
+
+ def _delreaction(self):
+ self._reactions = []
+
+ reaction = property(
+ _getreaction,
+ _setreaction,
+ _delreaction,
+ "List of reaction KEGG IDs for this Entry.",
+ )
+
+ # We make sure that the node ID is an integer
+ def _getid(self):
+ return self._id
+
+ def _setid(self, value):
+ self._id = int(value)
+
+ def _delid(self):
+ del self._id
+
+ id = property(_getid, _setid, _delid, "The pathway graph node ID for the Entry.")
+
+ @property
+ def element(self):
+ """Return the Entry as a valid KGML element."""
+ # The root is this Entry element
+ entry = ET.Element("entry")
+ entry.attrib = {
+ "id": str(self._id),
+ "name": self.name,
+ "link": self.link,
+ "type": self.type,
+ }
+ if len(self._reactions):
+ entry.attrib["reaction"] = self.reaction
+ if len(self.graphics):
+ for g in self.graphics:
+ entry.append(g.element)
+ if len(self.components):
+ for c in self.components:
+ entry.append(c.element)
+ return entry
+
+ @property
+ def bounds(self):
+ """Coordinate bounds for all Graphics elements in the Entry.
+
+ Return the [(xmin, ymin), (xmax, ymax)] co-ordinates for the Entry
+ Graphics elements.
+ """
+ xlist, ylist = [], []
+ for b in [g.bounds for g in self.graphics]:
+ xlist.extend([b[0][0], b[1][0]])
+ ylist.extend([b[0][1], b[1][1]])
+ return [(min(xlist), min(ylist)), (max(xlist), max(ylist))]
+
+ @property
+ def is_reactant(self):
+ """Return true if this Entry participates in any reaction in its parent pathway."""
+ for rxn in self._pathway.reactions:
+ if self._id in rxn.reactant_ids:
+ return True
+ return False
+
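+
+# Illustrative sketch, not part of the upstream module: Entry.name is a
+# property backed by a list, so a space-separated string of KEGG IDs
+# round-trips through splitting and joining. The IDs are arbitrary examples.
+def _example_entry_names():
+    """Show the space-separated handling of Entry.name."""
+    entry = Entry()
+    entry.id = 42
+    entry.name = "hsa:10458 ece:Z5100"
+    assert entry._names == ["hsa:10458", "ece:Z5100"]
+    assert entry.name == "hsa:10458 ece:Z5100"
+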
+
+# Component
+class Component:
+ """An Entry subelement used to represents a complex node.
+
+ A subelement of the Entry element, used when the Entry is a complex
+ node, as described in release KGML v0.7.2
+ (http://www.kegg.jp/kegg/xml/docs/)
+
+ The Component acts as a collection (with type 'group', and typically
+ its own Graphics subelement), having only an ID.
+ """
+
+ def __init__(self, parent):
+ """Initialize the class."""
+ self._id = None
+ self._parent = parent
+
+ # We make sure that the node ID is an integer
+ def _getid(self):
+ return self._id
+
+ def _setid(self, value):
+ self._id = int(value)
+
+ def _delid(self):
+ del self._id
+
+ id = property(_getid, _setid, _delid, "The pathway graph node ID for the Entry")
+
+ @property
+ def element(self):
+ """Return the Component as a valid KGML element."""
+ # The root is this Component element
+ component = ET.Element("component")
+ component.attrib = {"id": str(self._id)}
+ return component
+
+
+# Graphics
+class Graphics:
+ """An Entry subelement used to represents the visual representation.
+
+ A subelement of Entry, specifying its visual representation, as
+ described in release KGML v0.7.2 (http://www.kegg.jp/kegg/xml/docs/)
+
+ Attributes:
+ - name Label for the graphics object
+ - x X-axis position of the object (int)
+ - y Y-axis position of the object (int)
+ - coords polyline co-ordinates, list of (int, int) tuples
+ - type object shape
+ - width object width (int)
+ - height object height (int)
+ - fgcolor object foreground color (hex RGB)
+ - bgcolor object background color (hex RGB)
+
+ Some attributes are present only for specific graphics types. For
+ example, line types do not (typically) have a width.
+ We permit non-DTD attributes and attribute settings, such as
+
+ dash List of ints, describing an on/off pattern for dashes
+
+ """
+
+ def __init__(self, parent):
+ """Initialize the class."""
+ self.name = ""
+ self._x = None
+ self._y = None
+ self._coords = None
+ self.type = ""
+ self._width = None
+ self._height = None
+ self.fgcolor = ""
+ self.bgcolor = ""
+ self._parent = parent
+
+ # We make sure that the XY coordinates, width and height are numbers
+ def _getx(self):
+ return self._x
+
+ def _setx(self, value):
+ self._x = float(value)
+
+ def _delx(self):
+ del self._x
+
+ x = property(_getx, _setx, _delx, "The X coordinate for the graphics element.")
+
+ def _gety(self):
+ return self._y
+
+ def _sety(self, value):
+ self._y = float(value)
+
+ def _dely(self):
+ del self._y
+
+ y = property(_gety, _sety, _dely, "The Y coordinate for the graphics element.")
+
+ def _getwidth(self):
+ return self._width
+
+ def _setwidth(self, value):
+ self._width = float(value)
+
+ def _delwidth(self):
+ del self._width
+
+ width = property(
+ _getwidth, _setwidth, _delwidth, "The width of the graphics element."
+ )
+
+ def _getheight(self):
+ return self._height
+
+ def _setheight(self, value):
+ self._height = float(value)
+
+ def _delheight(self):
+ del self._height
+
+ height = property(
+ _getheight, _setheight, _delheight, "The height of the graphics element."
+ )
+
+ # We make sure that the polyline co-ordinates are integers, too
+ def _getcoords(self):
+ return self._coords
+
+ def _setcoords(self, value):
+ clist = [int(e) for e in value.split(",")]
+ self._coords = [tuple(clist[i : i + 2]) for i in range(0, len(clist), 2)]
+
+ def _delcoords(self):
+ del self._coords
+
+ coords = property(
+ _getcoords,
+ _setcoords,
+ _delcoords,
+ "Polyline coordinates for the graphics element.",
+ )
+
+ # Set default colors
+ def _getfgcolor(self):
+ return self._fgcolor
+
+ def _setfgcolor(self, value):
+ if value == "none":
+ self._fgcolor = "#000000" # this default defined in KGML spec
+ else:
+ self._fgcolor = value
+
+ def _delfgcolor(self):
+ del self._fgcolor
+
+ fgcolor = property(_getfgcolor, _setfgcolor, _delfgcolor, "Foreground color.")
+
+ def _getbgcolor(self):
+ return self._bgcolor
+
+ def _setbgcolor(self, value):
+ if value == "none":
+ self._bgcolor = "#000000" # this default defined in KGML spec
+ else:
+ self._bgcolor = value
+
+ def _delbgcolor(self):
+ del self._bgcolor
+
+ bgcolor = property(_getbgcolor, _setbgcolor, _delbgcolor, "Background color.")
+
+ @property
+ def element(self):
+ """Return the Graphics as a valid KGML element."""
+ # The root is this Component element
+ graphics = ET.Element("graphics")
+ if isinstance(self.fgcolor, str): # Assumes that string is hexstring
+ fghex = self.fgcolor
+ else: # Assumes ReportLab Color object
+ fghex = "#" + self.fgcolor.hexval()[2:]
+ if isinstance(self.bgcolor, str): # Assumes that string is hexstring
+ bghex = self.bgcolor
+ else: # Assumes ReportLab Color object
+ bghex = "#" + self.bgcolor.hexval()[2:]
+ graphics.attrib = {
+ "name": self.name,
+ "type": self.type,
+ "fgcolor": fghex,
+ "bgcolor": bghex,
+ }
+ for (n, attr) in [
+ ("x", "_x"),
+ ("y", "_y"),
+ ("width", "_width"),
+ ("height", "_height"),
+ ]:
+ if getattr(self, attr) is not None:
+ graphics.attrib[n] = str(getattr(self, attr))
+ if self.type == "line": # Need to write polycoords
+ graphics.attrib["coords"] = ",".join(
+ [str(e) for e in chain.from_iterable(self.coords)]
+ )
+ return graphics
+
+ @property
+ def bounds(self):
+ """Coordinate bounds for the Graphics element.
+
+ Return the bounds of the Graphics object as an [(xmin, ymin),
+ (xmax, ymax)] tuple. Co-ordinates give the centre of the
+ circle, rectangle, roundrectangle elements, so we have to
+ adjust for the relevant width/height.
+ """
+ if self.type == "line":
+ xlist = [x for x, y in self.coords]
+ ylist = [y for x, y in self.coords]
+ return [(min(xlist), min(ylist)), (max(xlist), max(ylist))]
+ else:
+ return [
+ (self.x - self.width * 0.5, self.y - self.height * 0.5),
+ (self.x + self.width * 0.5, self.y + self.height * 0.5),
+ ]
+
+ @property
+ def centre(self):
+ """Return the centre of the Graphics object as an (x, y) tuple."""
+ return (
+ 0.5 * (self.bounds[0][0] + self.bounds[1][0]),
+ 0.5 * (self.bounds[0][1] + self.bounds[1][1]),
+ )
+
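+
+# Illustrative sketch, not part of the upstream module: for box-like shapes
+# the stored x/y give the centre, so bounds offsets them by half the
+# width/height and centre recovers the midpoint. All numbers are arbitrary.
+def _example_graphics_bounds():
+    """Relate Graphics.bounds to Graphics.centre for a rectangle."""
+    g = Graphics(parent=None)
+    g.type = "rectangle"
+    g.x, g.y = 100, 50  # centre coordinates, per the KGML convention
+    g.width, g.height = 40, 20
+    assert g.bounds == [(80.0, 40.0), (120.0, 60.0)]
+    assert g.centre == (100.0, 50.0)
+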
+
+# Reaction
+class Reaction:
+ """A specific chemical reaction with substrates and products.
+
+ This describes a specific chemical reaction between one or more
+ substrates and one or more products.
+
+ Attributes:
+ - id Pathway graph node ID of the entry
+ - names List of KEGG identifier(s) from the REACTION database
+ - type String: reversible or irreversible
+ - substrate Entry object of the substrate
+ - product Entry object of the product
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self._id = None
+ self._names = []
+ self.type = ""
+ self._substrates = set()
+ self._products = set()
+ self._pathway = None
+
+ def __str__(self):
+ """Return an informative human-readable string."""
+ outstr = [
+ "Reaction node ID: %s" % self.id,
+ "Reaction KEGG IDs: %s" % self.name,
+ "Type: %s" % self.type,
+ "Substrates: %s" % ",".join([s.name for s in self.substrates]),
+ "Products: %s" % ",".join([s.name for s in self.products]),
+ ]
+ return "\n".join(outstr) + "\n"
+
+ def add_substrate(self, substrate_id):
+ """Add a substrate, identified by its node ID, to the reaction."""
+ if self._pathway is not None:
+ if int(substrate_id) not in self._pathway.entries:
+ raise ValueError(
+ "Couldn't add substrate, no node ID %d in Pathway"
+ % int(substrate_id)
+ )
+ self._substrates.add(substrate_id)
+
+ def add_product(self, product_id):
+ """Add a product, identified by its node ID, to the reaction."""
+ if self._pathway is not None:
+ if int(product_id) not in self._pathway.entries:
+ raise ValueError(
+ "Couldn't add product, no node ID %d in Pathway" % product_id
+ )
+ self._products.add(int(product_id))
+
+ # The node ID is also the node ID of the Entry that corresponds to the
+ # reaction; we get the corresponding Entry when there is an associated
+ # Pathway
+ def _getid(self):
+ return self._id
+
+ def _setid(self, value):
+ self._id = int(value)
+
+ def _delid(self):
+ del self._id
+
+ id = property(_getid, _setid, _delid, "Node ID for the reaction.")
+
+ # Names may show up as a space-separated list of several KEGG identifiers
+ def _getnames(self):
+ return " ".join(self._names)
+
+ def _setnames(self, value):
+ self._names.extend(value.split())
+
+ def _delnames(self):
+        del self._names
+
+ name = property(
+ _getnames, _setnames, _delnames, "List of KEGG identifiers for the reaction."
+ )
+
+ # products and substrates are read-only properties, returning lists
+ # of Entry objects
+ @property
+ def substrates(self):
+ """Return list of substrate Entry elements."""
+ return [self._pathway.entries[sid] for sid in self._substrates]
+
+ @property
+ def products(self):
+ """Return list of product Entry elements."""
+ return [self._pathway.entries[pid] for pid in self._products]
+
+ @property
+ def entry(self):
+ """Return the Entry corresponding to this reaction."""
+ return self._pathway.entries[self._id]
+
+ @property
+ def reactant_ids(self):
+ """Return a list of substrate and product reactant IDs."""
+ return self._products.union(self._substrates)
+
+ @property
+ def element(self):
+ """Return KGML element describing the Reaction."""
+ # The root is this Relation element
+ reaction = ET.Element("reaction")
+ reaction.attrib = {"id": str(self.id), "name": self.name, "type": self.type}
+ for s in self._substrates:
+ substrate = ET.Element("substrate")
+ substrate.attrib["id"] = str(s)
+ substrate.attrib["name"] = self._pathway.entries[s].name
+ reaction.append(substrate)
+ for p in self._products:
+ product = ET.Element("product")
+ product.attrib["id"] = str(p)
+ product.attrib["name"] = self._pathway.entries[p].name
+ reaction.append(product)
+ return reaction
+
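+
+# Illustrative sketch, not part of the upstream module: a Reaction shares
+# its node ID with an Entry, and substrates/products are stored as node IDs
+# that resolve to Entry objects through the parent Pathway. This assumes a
+# pathway whose entries already include node IDs 1, 2 and 3.
+def _example_add_reaction(pathway):
+    """Attach a reaction to an existing pathway and resolve its entries."""
+    reaction = Reaction()
+    reaction.id = 1  # must match an existing Entry node ID
+    reaction.name = "rn:R00001"
+    reaction.type = "irreversible"
+    pathway.add_reaction(reaction)  # validates the ID against the entries
+    reaction.add_substrate(2)  # also validated against pathway.entries
+    reaction.add_product(3)
+    return reaction.substrates, reaction.products
+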
+
+# Relation
+class Relation:
+ """A relationship between to products, KOs, or protein and compound.
+
+ This describes a relationship between two products, KOs, or protein
+ and compound, as described in release KGML v0.7.2
+ (http://www.kegg.jp/kegg/xml/docs/)
+
+ Attributes:
+ - entry1 - The first Entry object node ID defining the
+ relation (int)
+ - entry2 - The second Entry object node ID defining the
+ relation (int)
+ - type - The relation type
+ - subtypes - List of subtypes for the relation, as a list of
+ (name, value) tuples
+
+ """
+
+ def __init__(self):
+ """Initialize the class."""
+ self._entry1 = None
+ self._entry2 = None
+ self.type = ""
+ self.subtypes = []
+ self._pathway = None
+
+ def __str__(self):
+ """Return a useful human-readable string."""
+ outstr = [
+ "Relation (subtypes: %d):" % len(self.subtypes),
+ "Entry1:",
+ str(self.entry1),
+ "Entry2:",
+ str(self.entry2),
+ ]
+ for s in self.subtypes:
+ outstr.extend(["Subtype: %s" % s[0], str(s[1])])
+ return "\n".join(outstr)
+
+ # Properties entry1 and entry2
+ def _getentry1(self):
+ if self._pathway is not None:
+ return self._pathway.entries[self._entry1]
+ return self._entry1
+
+ def _setentry1(self, value):
+ self._entry1 = int(value)
+
+ def _delentry1(self):
+ del self._entry1
+
+ entry1 = property(_getentry1, _setentry1, _delentry1, "Entry1 of the relation.")
+
+ def _getentry2(self):
+ if self._pathway is not None:
+ return self._pathway.entries[self._entry2]
+ return self._entry2
+
+ def _setentry2(self, value):
+ self._entry2 = int(value)
+
+ def _delentry2(self):
+ del self._entry2
+
+ entry2 = property(_getentry2, _setentry2, _delentry2, "Entry2 of the relation.")
+
+ @property
+ def element(self):
+ """Return KGML element describing the Relation."""
+ # The root is this Relation element
+ relation = ET.Element("relation")
+ relation.attrib = {
+ "entry1": str(self._entry1),
+ "entry2": str(self._entry2),
+ "type": self.type,
+ }
+ for (name, value) in self.subtypes:
+ subtype = ET.Element("subtype")
+ subtype.attrib = {"name": name, "value": str(value)}
+ relation.append(subtype)
+ return relation
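+
+
+# Illustrative sketch, not part of the upstream module: once a Relation is
+# added to a Pathway, entry1/entry2 resolve to Entry objects instead of raw
+# node IDs. This assumes a pathway with entries whose node IDs are 1 and 2.
+def _example_relation_resolution(pathway):
+    """Link two existing entries with an ECrel relation."""
+    relation = Relation()
+    relation.entry1 = 1
+    relation.entry2 = 2
+    relation.type = "ECrel"
+    pathway.add_relation(relation)
+    return relation.entry1  # now the Entry object for node ID 1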
diff --git a/code/lib/Bio/KEGG/KGML/__init__.py b/code/lib/Bio/KEGG/KGML/__init__.py
new file mode 100644
index 0000000..9063911
--- /dev/null
+++ b/code/lib/Bio/KEGG/KGML/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2013 by Leighton Pritchard. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Code to work with data from the KEGG database.
+
+References:
+Kanehisa, M. and Goto, S.; KEGG: Kyoto Encyclopedia of Genes and Genomes.
+Nucleic Acids Res. 28, 29-34 (2000).
+
+URL: http://www.genome.ad.jp/kegg/
+
+"""
diff --git a/code/lib/Bio/KEGG/KGML/__pycache__/KGML_parser.cpython-37.pyc b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_parser.cpython-37.pyc
new file mode 100644
index 0000000..9ed45a0
Binary files /dev/null and b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_parser.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/KGML/__pycache__/KGML_pathway.cpython-37.pyc b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_pathway.cpython-37.pyc
new file mode 100644
index 0000000..687a3d2
Binary files /dev/null and b/code/lib/Bio/KEGG/KGML/__pycache__/KGML_pathway.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/KGML/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/KGML/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..236575f
Binary files /dev/null and b/code/lib/Bio/KEGG/KGML/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/Map/__init__.py b/code/lib/Bio/KEGG/Map/__init__.py
new file mode 100644
index 0000000..e1b37f7
--- /dev/null
+++ b/code/lib/Bio/KEGG/Map/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2001 by Tarjei Mikkelsen. All rights reserved.
+# Copyright 2007 by Michiel de Hoon. All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Load KEGG Pathway maps for use with the Biopython Pathway module.
+
+The pathway maps are in the format::
+
+ RXXXXX:[X.X.X.X:] A + 2 B <=> C
+ RXXXXX:[X.X.X.X:] 3C <=> 2 D + E
+ ...
+
+where RXXXXX is a five-digit reaction id, and X.X.X.X is the optional
+EC number of the enzyme that catalyzes the reaction.
+"""
+
+from Bio.Pathway import Reaction
+
+
+def parse(handle):
+ """Parse a KEGG pathway map."""
+ for line in handle:
+ data, catalysts, reaction = line.split(":")
+ catalysts = [(catalysts,)]
+ reactants = {}
+ before, after = reaction.split("<=>")
+ compounds = before.split(" + ")
+ for compound in compounds:
+ compound = compound.strip()
+ try:
+ number, compound = compound.split()
+ number = -int(number)
+ except ValueError:
+ number = -1
+ reactants[compound] = number
+ compounds = after.split(" + ")
+ for compound in compounds:
+ compound = compound.strip()
+ try:
+ number, compound = compound.split()
+ number = int(number)
+ except ValueError:
+ number = +1
+ reactants[compound] = number
+ yield Reaction(reactants, catalysts, True, data)
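+
+
+# Illustrative sketch, not part of the upstream module: each map line yields
+# a Bio.Pathway Reaction whose reactants dict maps compound names to signed
+# stoichiometries (negative for substrates). The line below is made up.
+def _example_parse_map_line():
+    """Parse one hypothetical pathway-map line."""
+    from io import StringIO
+
+    line = "R00001:1.2.3.4: A + 2 B <=> C\n"
+    reaction = next(parse(StringIO(line)))
+    # reaction.reactants == {'A': -1, 'B': -2, 'C': 1}
+    return reaction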
diff --git a/code/lib/Bio/KEGG/Map/__pycache__/__init__.cpython-37.pyc b/code/lib/Bio/KEGG/Map/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..c220247
Binary files /dev/null and b/code/lib/Bio/KEGG/Map/__pycache__/__init__.cpython-37.pyc differ
diff --git a/code/lib/Bio/KEGG/REST.py b/code/lib/Bio/KEGG/REST.py
new file mode 100644
index 0000000..11f9f98
--- /dev/null
+++ b/code/lib/Bio/KEGG/REST.py
@@ -0,0 +1,315 @@
+# Copyright 2014 by Kevin Wu.
+# Revisions copyright 2014 by Peter Cock.
+# All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+
+"""Provides code to access the REST-style KEGG online API.
+
+This module aims to make the KEGG online REST-style API easier to use. See:
+http://www.kegg.jp/kegg/rest/keggapi.html
+
+The KEGG REST-style API provides simple access to a range of KEGG databases.
+This works using simple URLs (which this module will construct for you),
+with any errors indicated via HTTP error levels.
+
+The functionality is somewhat similar to Biopython's Bio.TogoWS and Bio.Entrez
+modules.
+
+Currently KEGG does not provide any usage guidelines (unlike the NCBI whose
+requirements are reasonably clear). To avoid risking overloading the service,
+Biopython will only allow three calls per second.
+
+References:
+Kanehisa, M. and Goto, S.; KEGG: Kyoto Encyclopedia of Genes and Genomes.
+Nucleic Acids Res. 28, 29-34 (2000).
+
+"""
+
+import io
+from urllib.request import urlopen
+
+
+def _q(op, arg1, arg2=None, arg3=None):
+ URL = "http://rest.kegg.jp/%s"
+ if arg2 and arg3:
+ args = "%s/%s/%s/%s" % (op, arg1, arg2, arg3)
+ elif arg2:
+ args = "%s/%s/%s" % (op, arg1, arg2)
+ else:
+ args = "%s/%s" % (op, arg1)
+ resp = urlopen(URL % (args))
+
+ if "image" == arg2:
+ return resp
+
+ handle = io.TextIOWrapper(resp, encoding="UTF-8")
+ handle.url = resp.url
+ return handle
+
+
+# http://www.kegg.jp/kegg/rest/keggapi.html
+def kegg_info(database):
+ """KEGG info - Displays the current statistics of a given database.
+
+ db - database or organism (string)
+
+ The argument db can be a KEGG database name (e.g. 'pathway' or its
+ official abbreviation, 'path'), or a KEGG organism code or T number
+ (e.g. 'hsa' or 'T01001' for human).
+
+ A valid list of organism codes and their T numbers can be obtained
+ via kegg_info('organism') or http://rest.kegg.jp/list/organism
+
+ """
+ # TODO - return a string (rather than the handle?)
+    # TODO - cache and validate the organism code / T numbers?
+    # TODO - can we parse the somewhat formatted output?
+    #
+    # http://rest.kegg.jp/info/<database>
+    #
+    # <database> = pathway | brite | module | disease | drug | environ |
+    #              ko | genome | <org> | compound | glycan | reaction |
+    #              rpair | rclass | enzyme | genomes | genes | ligand | kegg
+    # <org> = KEGG organism code or T number
+ return _q("info", database)
+
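+
+# Illustrative sketch, not part of the upstream module: kegg_info returns a
+# text handle, so the caller reads (and closes) it. Requires live network
+# access to rest.kegg.jp.
+def _example_kegg_info():
+    """Print the current statistics for the human organism code."""
+    with kegg_info("hsa") as handle:
+        print(handle.read())
+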
+
+def kegg_list(database, org=None):
+ """KEGG list - Entry list for database, or specified database entries.
+
+ db - database or organism (string)
+ org - optional organism (string), see below.
+
+ For the pathway and module databases the optional organism can be
+ used to restrict the results.
+
+ """
+ # TODO - split into two functions (dbentries seems separate)?
+ #
+    # http://rest.kegg.jp/list/<database>/<org>
+    #
+    # <database> = pathway | module
+    # <org> = KEGG organism code
+ if database in ("pathway", "module") and org:
+ resp = _q("list", database, org)
+ elif isinstance(database, str) and database and org:
+ raise ValueError("Invalid database arg for kegg list request.")
+
+    # http://rest.kegg.jp/list/<database>
+    #
+    # <database> = pathway | brite | module | disease | drug | environ |
+    #              ko | genome | <org> | compound | glycan | reaction |
+    #              rpair | rclass | enzyme | organism
+    # <org> = KEGG organism code or T number
+    #
+    #
+    # http://rest.kegg.jp/list/<dbentries>
+    #
+    # <dbentries> = KEGG database entries involving the following <database>
+    # <database> = pathway | brite | module | disease | drug | environ |
+    #              ko | genome | <org> | compound | glycan | reaction |
+    #              rpair | rclass | enzyme
+    # <org> = KEGG organism code or T number
+ else:
+ if isinstance(database, list):
+ if len(database) > 100:
+ raise ValueError(
+ "Maximum number of databases is 100 for kegg list query"
+ )
+ database = ("+").join(database)
+ resp = _q("list", database)
+
+ return resp
+
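+
+# Illustrative sketch, not part of the upstream module: an org-restricted
+# pathway listing, then a joined multi-entry query (the function caps these
+# at 100 entries). Requires live network access to rest.kegg.jp.
+def _example_kegg_list():
+    """Fetch the human pathway list and a two-compound entry list."""
+    pathways = kegg_list("pathway", "hsa").read()
+    compounds = kegg_list(["C00001", "C00002"]).read()
+    return pathways, compounds
+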
+
+def kegg_find(database, query, option=None):
+ """KEGG find - Data search.
+
+ Finds entries with matching query keywords or other query data in
+ a given database.
+
+ db - database or organism (string)
+ query - search terms (string)
+ option - search option (string), see below.
+
+ For the compound and drug database, set option to the string 'formula',
+ 'exact_mass' or 'mol_weight' to search on that field only. The
+ chemical formula search is a partial match irrespective of the order
+ of atoms given. The exact mass (or molecular weight) is checked by
+ rounding off to the same decimal place as the query data. A range of
+ values may also be specified with the minus(-) sign.
+
+ """
+ # TODO - return list of tuples?
+ #
+    # http://rest.kegg.jp/find/<database>/<query>/<option>